├── .gitignore ├── LICENSE ├── README.md ├── SUMMARY.md ├── chapter1 ├── README.md ├── section1.1.md └── section1.2.md ├── chapter2 ├── README.md ├── section2.1.md └── section2.2.md ├── chapter3 ├── README.md ├── section3.1.md ├── section3.2.md └── section3.3.md ├── chapter4 ├── README.md ├── section4.1.md ├── section4.2.md ├── section4.3.md └── section4.4.md └── chapter5 ├── README.md ├── section5.1.md ├── section5.2.md └── section5.3.md /.gitignore: -------------------------------------------------------------------------------- 1 | _book/* 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # studying-containerd-notes 2 | Brief notes on lessons learned while studying containerd 1.2. 3 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [Introduction](README.md) 4 | * [Getting Started](chapter1/README.md) 5 | * [Overview](chapter1/section1.1.md) 6 | * [Startup Flow](chapter1/section1.2.md) 7 | * [API Layer](chapter2/README.md) 8 | * [Containers API](chapter2/section2.1.md) 9 | * [Task API](chapter2/section2.2.md) 10 | * [Execution Engine](chapter3/README.md) 11 | * [ctr run Flow](chapter3/section3.1.md) 12 | * [Container I/O](chapter3/section3.2.md) 13 | * [runc](chapter3/section3.3.md) 14 | * [Images](chapter4/README.md) 15 | * [image fetch](chapter4/section4.1.md) 16 | * [image unpack](chapter4/section4.2.md) 17 | * [snapshotter](chapter4/section4.3.md) 18 | * [pouch commit Implementation](chapter4/section4.4.md) 19 | * [Storage](chapter5/README.md) 20 | * [native](chapter5/section5.1.md) 21 | * [overlayfs](chapter5/section5.2.md) 22 | * [btrfs](chapter5/section5.3.md) 23 | 24 | -------------------------------------------------------------------------------- /chapter1/README.md: -------------------------------------------------------------------------------- 1 | # Getting Started 2 | This chapter introduces the overall architecture of containerd and its startup flow. 3 | -------------------------------------------------------------------------------- /chapter1/section1.1.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | Containerd is an industry-standard container runtime that emphasizes simplicity, robustness, and portability. It can manage the complete container lifecycle on a host: image transfer and storage, container execution and supervision, storage, networking, and so on. 4 | 5 | ![image.png-16.5kB][1] 6 | The figure above shows where containerd sits in the overall Docker Engine architecture. containerd is a fairly thin layer acting as an abstract container runtime, while containerd-shim is the parent process of the actual container process, monitoring and reaping it so that the container remains controllable. 7 | 8 | The figure below shows containerd's components: low-level storage is handled by boltdb (implemented in Go), containers live inside containerd as metadata, containers are created and destroyed through the Task interface, and the underlying runtime is provided by OCI-compliant runtimes such as runc/runv. 9 | ![image.png-105.9kB][2] 10 | 11 | Online gRPC API documentation: 12 | https://dnephin.github.io/containerd/ 13 | 14 | 15 | [1]: http://static.zybuluo.com/myecho/frkxdgjsk7f4xwjboadifk4n/image.png 16 | [2]: http://static.zybuluo.com/myecho/i7csubxibdandpywp1wl0jew/image.png 17 | -------------------------------------------------------------------------------- /chapter1/section1.2.md: -------------------------------------------------------------------------------- 1 | # Startup Flow 2 | 3 | Entry path: containerd/cmd/containerd/command/main.go 4 | 5 | app.Action = func(context *cli.Context) error { 6 | var ( 7 | start = time.Now() 8 | signals = make(chan os.Signal, 2048) 9 | serverC = make(chan *server.Server, 1) 10 | ctx = gocontext.Background() 11 | config = defaultConfig() 12 | // the default config root is /var/lib/containerd 13 | ) 14 | 15 | done := handleSignals(ctx, signals, serverC) 16 | // start the signal handler as soon as we can to make sure that 17 | // we don't miss any signals during boot 18 | signal.Notify(signals, handledSignals...)
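        // What follows below: load the TOML config passed via --config, apply the CLI
        // flags on top of it, clean up temp mounts left over from a previous run, then
        // build the server and expose the debug, metrics and main gRPC endpoints.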
19 | 20 | if err := srvconfig.LoadConfig(context.GlobalString("config"), config); err != nil && !os.IsNotExist(err) { 21 | return err 22 | } 23 | // apply flags to the config 24 | if err := applyFlags(context, config); err != nil { 25 | return err 26 | } 27 | // cleanup temp mounts 28 | if err := mount.SetTempMountLocation(filepath.Join(config.Root, "tmpmounts")); err != nil { 29 | return errors.Wrap(err, "creating temp mount location") 30 | } 31 | // unmount all temp mounts on boot for the server 32 | warnings, err := mount.CleanupTempMounts(0) 33 | if err != nil { 34 | log.G(ctx).WithError(err).Error("unmounting temp mounts") 35 | } 36 | for _, w := range warnings { 37 | log.G(ctx).WithError(w).Warn("cleanup temp mount") 38 | } 39 | address := config.GRPC.Address 40 | if address == "" { 41 | return errors.New("grpc address cannot be empty") 42 | } 43 | log.G(ctx).WithFields(logrus.Fields{ 44 | "version": version.Version, 45 | "revision": version.Revision, 46 | }).Info("starting containerd") 47 | 48 | server, err := server.New(ctx, config) 49 | if err != nil { 50 | return err 51 | } 52 | serverC <- server 53 | if config.Debug.Address != "" { 54 | var l net.Listener 55 | if filepath.IsAbs(config.Debug.Address) { 56 | if l, err = sys.GetLocalListener(config.Debug.Address, config.Debug.UID, config.Debug.GID); err != nil { 57 | return errors.Wrapf(err, "failed to get listener for debug endpoint") 58 | } 59 | } else { 60 | if l, err = net.Listen("tcp", config.Debug.Address); err != nil { 61 | return errors.Wrapf(err, "failed to get listener for debug endpoint") 62 | } 63 | } 64 | //初始化debug的接口 65 | serve(ctx, l, server.ServeDebug) 66 | } 67 | if config.Metrics.Address != "" { 68 | l, err := net.Listen("tcp", config.Metrics.Address) 69 | if err != nil { 70 | return errors.Wrapf(err, "failed to get listener for metrics endpoint") 71 | } 72 | //初始化ServeMetrics的接口 73 | serve(ctx, l, server.ServeMetrics) 74 | } 75 | 76 | //产生unix-socket被grpcServer使用来监听, 默认在/run/containerd/containerd.sock 77 | l, err := sys.GetLocalListener(address, config.GRPC.UID, config.GRPC.GID) 78 | if err != nil { 79 | return errors.Wrapf(err, "failed to get listener for main endpoint") 80 | } 81 | //rpc在Server.New中被初始化 82 | serve(ctx, l, server.ServeGRPC) 83 | 84 | log.G(ctx).Infof("containerd successfully booted in %fs", time.Since(start).Seconds()) 85 | <-done 86 | return nil 87 | } 88 | return app 89 | } 90 | 91 | 92 | 93 | // New creates and initializes a new containerd server 94 | func New(ctx context.Context, config *srvconfig.Config) (*Server, error) { 95 | switch { 96 | case config.Root == "": 97 | return nil, errors.New("root must be specified") 98 | case config.State == "": 99 | return nil, errors.New("state must be specified") 100 | case config.Root == config.State: 101 | return nil, errors.New("root and state must be different paths") 102 | } 103 | 104 | if err := os.MkdirAll(config.Root, 0711); err != nil { 105 | return nil, err 106 | } 107 | if err := os.MkdirAll(config.State, 0711); err != nil { 108 | return nil, err 109 | } 110 | if err := apply(ctx, config); err != nil { 111 | return nil, err 112 | } 113 | //加载插件,比如snapshottor, metaStore,content等,但是其中还有个proxyPlugin,看起来是为某些服务提供的代理 114 | plugins, err := LoadPlugins(ctx, config) 115 | if err != nil { 116 | return nil, err 117 | } 118 | 119 | serverOpts := []grpc.ServerOption{ 120 | grpc.UnaryInterceptor(grpc_prometheus.UnaryServerInterceptor), 121 | grpc.StreamInterceptor(grpc_prometheus.StreamServerInterceptor), 122 | } 123 | if config.GRPC.MaxRecvMsgSize > 0 { 
124 | serverOpts = append(serverOpts, grpc.MaxRecvMsgSize(config.GRPC.MaxRecvMsgSize)) 125 | } 126 | if config.GRPC.MaxSendMsgSize > 0 { 127 | serverOpts = append(serverOpts, grpc.MaxSendMsgSize(config.GRPC.MaxSendMsgSize)) 128 | } 129 | rpc := grpc.NewServer(serverOpts...) 130 | var ( 131 | services []plugin.Service 132 | s = &Server{ 133 | rpc: rpc, 134 | events: exchange.NewExchange(), 135 | config: config, 136 | } 137 | initialized = plugin.NewPluginSet() 138 | ) 139 | //上边加载好了plugin后,由下边调用init方法初始化每个plugin,并将需要注册的grpc路由收集到services []plugin.Service中 140 | for _, p := range plugins { 141 | id := p.URI() 142 | log.G(ctx).WithField("type", p.Type).Infof("loading plugin %q...", id) 143 | 144 | initContext := plugin.NewContext( 145 | ctx, 146 | p, 147 | initialized, 148 | config.Root, 149 | config.State, 150 | ) 151 | initContext.Events = s.events 152 | initContext.Address = config.GRPC.Address 153 | 154 | // load the plugin specific configuration if it is provided 155 | if p.Config != nil { 156 | pluginConfig, err := config.Decode(p.ID, p.Config) 157 | if err != nil { 158 | return nil, err 159 | } 160 | initContext.Config = pluginConfig 161 | } 162 | result := p.Init(initContext) 163 | if err := initialized.Add(result); err != nil { 164 | return nil, errors.Wrapf(err, "could not add plugin result to plugin set") 165 | } 166 | 167 | instance, err := result.Instance() 168 | if err != nil { 169 | if plugin.IsSkipPlugin(err) { 170 | log.G(ctx).WithField("type", p.Type).Infof("skip loading plugin %q...", id) 171 | } else { 172 | log.G(ctx).WithError(err).Warnf("failed to load plugin %s", id) 173 | } 174 | continue 175 | } 176 | // check for grpc services that should be registered with the server 177 | if service, ok := instance.(plugin.Service); ok { 178 | services = append(services, service) 179 | } 180 | s.plugins = append(s.plugins, result) 181 | } 182 | // register services after all plugins have been initialized(注册rpc路由信息) 183 | for _, service := range services { 184 | if err := service.Register(rpc); err != nil { 185 | return nil, err 186 | } 187 | } 188 | return s, nil 189 | } 190 | 191 | 下面简介一下plugin的类型 192 | 193 | const ( 194 | // InternalPlugin implements an internal plugin to containerd 195 | InternalPlugin Type = "io.containerd.internal.v1" 196 | // RuntimePlugin implements a runtime 197 | RuntimePlugin Type = "io.containerd.runtime.v1" 198 | // RuntimePluginV2 implements a runtime v2 199 | RuntimePluginV2 Type = "io.containerd.runtime.v2" 200 | // ServicePlugin implements a internal service 201 | ServicePlugin Type = "io.containerd.service.v1" 202 | // GRPCPlugin implements a grpc service 203 | GRPCPlugin Type = "io.containerd.grpc.v1" 204 | // SnapshotPlugin implements a snapshotter 205 | SnapshotPlugin Type = "io.containerd.snapshotter.v1" 206 | // TaskMonitorPlugin implements a task monitor 207 | TaskMonitorPlugin Type = "io.containerd.monitor.v1" 208 | // DiffPlugin implements a differ 209 | DiffPlugin Type = "io.containerd.differ.v1" 210 | // MetadataPlugin implements a metadata store 211 | MetadataPlugin Type = "io.containerd.metadata.v1" 212 | // ContentPlugin implements a content store 213 | ContentPlugin Type = "io.containerd.content.v1" 214 | // GCPlugin implements garbage collection policy 215 | GCPlugin Type = "io.containerd.gc.v1" 216 | ) 217 | 218 | func init() { 219 | plugin.Register(&plugin.Registration{ 220 | ID: "btrfs", 221 | Type: plugin.SnapshotPlugin, 222 | Init: func(ic *plugin.InitContext) (interface{}, error) { 223 | return NewSnapshotter(ic.Root) 224 | }, 
225 | }) 226 | } 227 | 228 | func init() { 229 | plugin.Register(&plugin.Registration{ 230 | Type: plugin.SnapshotPlugin, 231 | ID: "overlayfs", 232 | Init: func(ic *plugin.InitContext) (interface{}, error) { 233 | return NewSnapshotter(ic.Root) 234 | }, 235 | }) 236 | } 237 | //可以看到containerd的snapshottor机制也是通过底层的这些storage driver实现的。 238 | 239 | 更详细的plugin介绍在后边涉及到的时候再深入研究。 240 | 241 | ### 总结-流程图 242 | ![应用添加模块.png-12.2kB][1] 243 | 244 | 245 | 246 | [1]: http://static.zybuluo.com/myecho/tz2m8o9i6ppbp8kvrj1r56q7/%E5%BA%94%E7%94%A8%E6%B7%BB%E5%8A%A0%E6%A8%A1%E5%9D%97.png 247 | -------------------------------------------------------------------------------- /chapter2/README.md: -------------------------------------------------------------------------------- 1 | # API层 2 | 3 | -------------------------------------------------------------------------------- /chapter2/section2.1.md: -------------------------------------------------------------------------------- 1 | # Containers API 2 | 3 | 4 | 入口path: containerd/api/services,其中的每个service中都定义各自负责模块的rpc接口。 5 | 6 | ### List 7 | 以containerd list为例子 8 | 首先是函数入口: 9 | 10 | func _Containers_List_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 11 | in := new(ListContainersRequest) 12 | if err := dec(in); err != nil { 13 | return nil, err 14 | } 15 | if interceptor == nil { 16 | return srv.(ContainersServer).List(ctx, in) 17 | } 18 | info := &grpc.UnaryServerInfo{ 19 | Server: srv, 20 | FullMethod: "/containerd.services.containers.v1.Containers/List", 21 | } 22 | //这个handler就是API具体的处理逻辑 23 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 24 | return srv.(ContainersServer).List(ctx, req.(*ListContainersRequest)) 25 | } 26 | return interceptor(ctx, in, info, handler) 27 | } 28 | 29 | 对应接口的实现都在path: containerd/services,对应service的plugin的path在: containerd/services/containers/service.go, 通过对应Service的client为上层提供调用接口,上文中提到的List接口,在path: containerd/services/containers/local.go中, 30 | 31 | func (l *local) List(ctx context.Context, req *api.ListContainersRequest, _ ...grpc.CallOption) (*api.ListContainersResponse, error) { 32 | var resp api.ListContainersResponse 33 | return &resp, errdefs.ToGRPC(l.withStoreView(ctx, func(ctx context.Context, store containers.Store) error { 34 | containers, err := store.List(ctx, req.Filters...) 35 | if err != nil { 36 | return err 37 | } 38 | resp.Containers = containersToProto(containers) 39 | return nil 40 | })) 41 | } 42 | 43 | 根据 withStore 函数可以得到 store 为 metadata.NewContainerStore,路径 containerd/metadata/containers.go 中,containerStore 结构体是包裹的是操作数据库。 44 | 45 | func (s *containerStore) List(ctx context.Context, fs ...string) ([]containers.Container, error) { 46 | namespace, err := namespaces.NamespaceRequired(ctx) 47 | if err != nil { 48 | return nil, err 49 | } 50 | 51 | filter, err := filters.ParseAll(fs...) 
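        // fs holds the filter expressions from the request (e.g. something like
        // "labels.foo==bar"); an empty list matches every container in the namespace.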
52 | if err != nil { 53 | return nil, errors.Wrapf(errdefs.ErrInvalidArgument, err.Error()) 54 | } 55 | 56 | bkt := getContainersBucket(s.tx, namespace) 57 | if bkt == nil { 58 | return nil, nil // empty store 59 | } 60 | 61 | var m []containers.Container 62 | if err := bkt.ForEach(func(k, v []byte) error { 63 | cbkt := bkt.Bucket(k) 64 | if cbkt == nil { 65 | return nil 66 | } 67 | container := containers.Container{ID: string(k)} 68 | //从boltdb的存储格式转化为Container结构 69 | if err := readContainer(&container, cbkt); err != nil { 70 | return errors.Wrapf(err, "failed to read container %q", string(k)) 71 | } 72 | 73 | if filter.Match(adaptContainer(container)) { 74 | m = append(m, container) 75 | } 76 | return nil 77 | }); err != nil { 78 | return nil, err 79 | } 80 | 81 | return m, nil 82 | } 83 | 84 | 85 | func getContainersBucket(tx *bolt.Tx, namespace string) *bolt.Bucket { 86 | return getBucket(tx, bucketKeyVersion, []byte(namespace), bucketKeyObjectContainers) 87 | } 88 | 89 | //是个嵌套的bucket结构,从最后的name为:bucketKeyObjectContainers的Bucket中取出value 90 | func getBucket(tx *bolt.Tx, keys ...[]byte) *bolt.Bucket { 91 | bkt := tx.Bucket(keys[0]) 92 | 93 | for _, key := range keys[1:] { 94 | if bkt == nil { 95 | break 96 | } 97 | bkt = bkt.Bucket(key) 98 | } 99 | return bkt 100 | } 101 | 102 | ### Create 103 | 入口 104 | 105 | func _Containers_Create_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { 106 | in := new(CreateContainerRequest) 107 | if err := dec(in); err != nil { 108 | return nil, err 109 | } 110 | if interceptor == nil { 111 | return srv.(ContainersServer).Create(ctx, in) 112 | } 113 | info := &grpc.UnaryServerInfo{ 114 | Server: srv, 115 | FullMethod: "/containerd.services.containers.v1.Containers/Create", 116 | } 117 | handler := func(ctx context.Context, req interface{}) (interface{}, error) { 118 | return srv.(ContainersServer).Create(ctx, req.(*CreateContainerRequest)) 119 | } 120 | return interceptor(ctx, in, info, handler) 121 | } 122 | 123 | 执行逻辑同样位于在path: containerd/services/containers/local.go中 124 | 125 | func (l *local) Create(ctx context.Context, req *api.CreateContainerRequest, _ ...grpc.CallOption) (*api.CreateContainerResponse, error) { 126 | var resp api.CreateContainerResponse 127 | 128 | if err := l.withStoreUpdate(ctx, func(ctx context.Context, store containers.Store) error { 129 | container := containerFromProto(&req.Container) 130 | 131 | created, err := store.Create(ctx, container) 132 | if err != nil { 133 | return err 134 | } 135 | 136 | resp.Container = containerToProto(&created) 137 | 138 | return nil 139 | }); err != nil { 140 | return &resp, errdefs.ToGRPC(err) 141 | } 142 | // 发出事件Event 143 | if err := l.publisher.Publish(ctx, "/containers/create", &eventstypes.ContainerCreate{ 144 | ID: resp.Container.ID, 145 | Image: resp.Container.Image, 146 | Runtime: &eventstypes.ContainerCreate_Runtime{ 147 | Name: resp.Container.Runtime.Name, 148 | Options: resp.Container.Runtime.Options, 149 | }, 150 | }); err != nil { 151 | return &resp, err 152 | } 153 | 154 | return &resp, nil 155 | } 156 | 157 | //最后落到存储层 158 | func (s *containerStore) Create(ctx context.Context, container containers.Container) (containers.Container, error) { 159 | namespace, err := namespaces.NamespaceRequired(ctx) 160 | if err != nil { 161 | return containers.Container{}, err 162 | } 163 | 164 | if err := validateContainer(&container); err != nil { 165 | return containers.Container{}, errors.Wrap(err, "create 
container failed validation") 166 | } 167 | 168 | //如果没有创建过才去创建 169 | bkt, err := createContainersBucket(s.tx, namespace) 170 | if err != nil { 171 | return containers.Container{}, err 172 | } 173 | 174 | //又去创建了个子bucket,在containersBucket下边 175 | cbkt, err := bkt.CreateBucket([]byte(container.ID)) 176 | if err != nil { 177 | if err == bolt.ErrBucketExists { 178 | err = errors.Wrapf(errdefs.ErrAlreadyExists, "container %q", container.ID) 179 | } 180 | return containers.Container{}, err 181 | } 182 | 183 | container.CreatedAt = time.Now().UTC() 184 | container.UpdatedAt = container.CreatedAt 185 | if err := writeContainer(cbkt, &container); err != nil { 186 | return containers.Container{}, errors.Wrapf(err, "failed to write container %q", container.ID) 187 | } 188 | 189 | return container, nil 190 | } 191 | 192 | 综上所述,Containers相关的接口基本上是在维护容器相关的metadata.想要创建完整的容器还需要依赖其他的接口比如Task相关的接口。 193 | 194 | > A container is a metadata object that resources are allocated and 195 | > attached to 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /chapter2/section2.2.md: -------------------------------------------------------------------------------- 1 | # Task API 2 | 3 | 首先看一下关于Task API都有哪些接口, path在 containerd/api/services/tasks/v1/tasks.pb.go, 4 | 5 | var _Tasks_serviceDesc = grpc.ServiceDesc{ 6 | ServiceName: "containerd.services.tasks.v1.Tasks", 7 | HandlerType: (*TasksServer)(nil), 8 | Methods: []grpc.MethodDesc{ 9 | { 10 | MethodName: "Create", 11 | Handler: _Tasks_Create_Handler, 12 | }, 13 | { 14 | MethodName: "Start", 15 | Handler: _Tasks_Start_Handler, 16 | }, 17 | { 18 | MethodName: "Delete", 19 | Handler: _Tasks_Delete_Handler, 20 | }, 21 | { 22 | MethodName: "DeleteProcess", 23 | Handler: _Tasks_DeleteProcess_Handler, 24 | }, 25 | { 26 | MethodName: "Get", 27 | Handler: _Tasks_Get_Handler, 28 | }, 29 | { 30 | MethodName: "List", 31 | Handler: _Tasks_List_Handler, 32 | }, 33 | { 34 | MethodName: "Kill", 35 | Handler: _Tasks_Kill_Handler, 36 | }, 37 | { 38 | MethodName: "Exec", 39 | Handler: _Tasks_Exec_Handler, 40 | }, 41 | { 42 | MethodName: "ResizePty", 43 | Handler: _Tasks_ResizePty_Handler, 44 | }, 45 | { //关于容器I/O的部分我们后边会着重看一下 46 | MethodName: "CloseIO", 47 | Handler: _Tasks_CloseIO_Handler, 48 | }, 49 | { 50 | MethodName: "Pause", 51 | Handler: _Tasks_Pause_Handler, 52 | }, 53 | { 54 | MethodName: "Resume", 55 | Handler: _Tasks_Resume_Handler, 56 | }, 57 | { 58 | MethodName: "ListPids", 59 | Handler: _Tasks_ListPids_Handler, 60 | }, 61 | { 62 | MethodName: "Checkpoint", 63 | Handler: _Tasks_Checkpoint_Handler, 64 | }, 65 | { 66 | MethodName: "Update", 67 | Handler: _Tasks_Update_Handler, 68 | }, 69 | { 70 | MethodName: "Metrics", 71 | Handler: _Tasks_Metrics_Handler, 72 | }, 73 | { 74 | MethodName: "Wait", 75 | Handler: _Tasks_Wait_Handler, 76 | }, 77 | }, 78 | Streams: []grpc.StreamDesc{}, 79 | Metadata: "github.com/containerd/containerd/api/services/tasks/v1/tasks.proto", 80 | } 81 | 82 | 来看一下run一个容器的逻辑链路。 83 | 84 | path: containerd/cmd/ctr/commands/run/run.go 85 | 具体代码的就不截图了,而实现的主要逻辑如下所示: 86 | 87 | NewContainer -> c.ContainerService().Create(ctx, container)//创建container这个metadata 88 | //其中还会针对是否有无checkpoint来进行区别 89 | 90 | NewTask -> container.NewTask(ctx, ioCreator, opts...) 
-> c.client.TaskService().Create(ctx, request) //也验证了必须首先有container的metadata才能够创建,只有当Task被执行的时候才会创建一个真正的容器 91 | 92 | task.Start(ctx) 93 | 94 | //如果收到退出信号 95 | task.Delete(ctx) 96 | //返回状态码 97 | 98 | 99 | -------------------------------------------------------------------------------- /chapter3/README.md: -------------------------------------------------------------------------------- 1 | # 执行引擎 2 | 3 | 在节介绍有关containerd执行引擎的内容,包括容器I/O、runc的运行时、containerd client的运行流程。 4 | -------------------------------------------------------------------------------- /chapter3/section3.1.md: -------------------------------------------------------------------------------- 1 | # ctr run流程 2 | 3 | 我们接下来从TaskService的Create接口看起,看一下containerd是如何与底层的containerd-shim以及最后的runc进行结合工作的。 4 | 5 | opts := runtime.CreateOpts{ 6 | Spec: container.Spec, 7 | //这看起来实际操作I/O还是底层的容器执行器 8 | IO: runtime.IO{ 9 | Stdin: r.Stdin, 10 | Stdout: r.Stdout, 11 | Stderr: r.Stderr, 12 | Terminal: r.Terminal, 13 | }, 14 | Checkpoint: checkpointPath, 15 | Runtime: container.Runtime.Name, 16 | RuntimeOptions: container.Runtime.Options, 17 | TaskOptions: r.Options, 18 | } 19 | c, err := runtime.Create(ctx, r.ContainerID, opts) 20 | 21 | // Create a new task 22 | func (r *Runtime) Create(ctx context.Context, id string, opts runtime.CreateOpts) (_ runtime.Task, err error) { 23 | namespace, err := namespaces.NamespaceRequired(ctx) 24 | if err != nil { 25 | return nil, err 26 | } 27 | 28 | if err := identifiers.Validate(id); err != nil { 29 | return nil, errors.Wrapf(err, "invalid task id") 30 | } 31 | 32 | ropts, err := r.getRuncOptions(ctx, id) 33 | if err != nil { 34 | return nil, err 35 | } 36 | // newBundle 根据传入的路径和 ID 创建目录文件,/var/lib/containerd/io.containerd.runtime.v1.linux/default 37 | bundle, err := newBundle(id, 38 | filepath.Join(r.state, namespace), 39 | filepath.Join(r.root, namespace), 40 | opts.Spec.Value) 41 | if err != nil { 42 | return nil, err 43 | } 44 | defer func() { 45 | if err != nil { 46 | bundle.Delete() 47 | } 48 | }() 49 | 50 | shimopt := ShimLocal(r.config, r.events) 51 | if !r.config.NoShim { 52 | var cgroup string 53 | if opts.TaskOptions != nil { 54 | v, err := typeurl.UnmarshalAny(opts.TaskOptions) 55 | if err != nil { 56 | return nil, err 57 | } 58 | cgroup = v.(*runctypes.CreateOptions).ShimCgroup 59 | } 60 | exitHandler := func() { 61 | log.G(ctx).WithField("id", id).Info("shim reaped") 62 | t, err := r.tasks.Get(ctx, id) 63 | if err != nil { 64 | // Task was never started or was already successfully deleted 65 | return 66 | } 67 | lc := t.(*Task) 68 | 69 | log.G(ctx).WithFields(logrus.Fields{ 70 | "id": id, 71 | "namespace": namespace, 72 | }).Warn("cleaning up after killed shim") 73 | if err = r.cleanupAfterDeadShim(context.Background(), bundle, namespace, id, lc.pid); err != nil { 74 | log.G(ctx).WithError(err).WithFields(logrus.Fields{ 75 | "id": id, 76 | "namespace": namespace, 77 | }).Warn("failed to clean up after killed shim") 78 | } 79 | } 80 | //在这里产生了一个新的shim进程,adress地址就是containerd daemon的地址,调用了WithStart func 81 | shimopt = ShimRemote(r.config, r.address, cgroup, exitHandler) 82 | } 83 | 84 | // 产生一个shim client 85 | s, err := bundle.NewShimClient(ctx, namespace, shimopt, ropts) 86 | if err != nil { 87 | return nil, err 88 | } 89 | defer func() { 90 | if err != nil { 91 | if kerr := s.KillShim(ctx); kerr != nil { 92 | log.G(ctx).WithError(err).Error("failed to kill shim") 93 | } 94 | } 95 | }() 96 | 97 | rt := r.config.Runtime 98 | if ropts != nil && ropts.Runtime != "" { 99 | rt = ropts.Runtime 100 | } 
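        // At this point the bundle directory exists on disk and a shim client has been
        // set up (a separate containerd-shim process in the default, non-NoShim case);
        // everything below just packages the create request and sends it to that shim.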
101 | //填充 CreateTaskRequest 结构体,发送 GRPC 给 shim 创建,Create 路径为containerd/runtime/v1/shim/service.go 102 | sopts := &shim.CreateTaskRequest{ 103 | ID: id, 104 | Bundle: bundle.path, 105 | Runtime: rt, 106 | Stdin: opts.IO.Stdin, 107 | Stdout: opts.IO.Stdout, 108 | Stderr: opts.IO.Stderr, 109 | Terminal: opts.IO.Terminal, 110 | Checkpoint: opts.Checkpoint, 111 | Options: opts.TaskOptions, 112 | } 113 | for _, m := range opts.Rootfs { 114 | sopts.Rootfs = append(sopts.Rootfs, &types.Mount{ 115 | Type: m.Type, 116 | Source: m.Source, 117 | Options: m.Options, 118 | }) 119 | } 120 | //将创建的请求交给shim处理,我们下边看看shim会再如何处理得到的这个create请求 121 | cr, err := s.Create(ctx, sopts) 122 | if err != nil { 123 | return nil, errdefs.FromGRPC(err) 124 | } 125 | t, err := newTask(id, namespace, int(cr.Pid), s, r.events, r.tasks, bundle) 126 | if err != nil { 127 | return nil, err 128 | } 129 | if err := r.tasks.Add(ctx, t); err != nil { 130 | return nil, err 131 | } 132 | r.events.Publish(ctx, runtime.TaskCreateEventTopic, &eventstypes.TaskCreate{ 133 | ContainerID: sopts.ID, 134 | Bundle: sopts.Bundle, 135 | Rootfs: sopts.Rootfs, 136 | IO: &eventstypes.TaskIO{ 137 | Stdin: sopts.Stdin, 138 | Stdout: sopts.Stdout, 139 | Stderr: sopts.Stderr, 140 | Terminal: sopts.Terminal, 141 | }, 142 | Checkpoint: sopts.Checkpoint, 143 | Pid: uint32(t.pid), 144 | }) 145 | 146 | return t, nil 147 | } 148 | 149 | 150 | func WithStart(binary, address, daemonAddress, cgroup string, debug bool, exitHandler func()) Opt { 151 | return func(ctx context.Context, config shim.Config) (_ shimapi.ShimService, _ io.Closer, err error) { 152 | // 这个address是shim-address,用来containerd和shim进程之间的通信 153 | // filepath.Join(string(filepath.Separator), "containerd-shim", namespace, bundle.id, "shim.sock") 154 | socket, err := newSocket(address) 155 | if err != nil { 156 | return nil, nil, err 157 | } 158 | defer socket.Close() 159 | f, err := socket.File() 160 | if err != nil { 161 | return nil, nil, errors.Wrapf(err, "failed to get fd for socket %s", address) 162 | } 163 | defer f.Close() 164 | // 执行containerd-shim --namespace default --address /run/containerd/containerd.sock命令 165 | cmd, err := newCommand(binary, daemonAddress, debug, config, f) 166 | if err != nil { 167 | return nil, nil, err 168 | } 169 | if err := cmd.Start(); err != nil { 170 | return nil, nil, errors.Wrapf(err, "failed to start shim") 171 | } 172 | defer func() { 173 | if err != nil { 174 | cmd.Process.Kill() 175 | } 176 | }() 177 | go func() { 178 | cmd.Wait() 179 | exitHandler() 180 | }() 181 | log.G(ctx).WithFields(logrus.Fields{ 182 | "pid": cmd.Process.Pid, 183 | "address": address, 184 | "debug": debug, 185 | }).Infof("shim %s started", binary) 186 | // set shim in cgroup if it is provided 187 | if cgroup != "" { 188 | if err := setCgroup(cgroup, cmd); err != nil { 189 | return nil, nil, err 190 | } 191 | log.G(ctx).WithFields(logrus.Fields{ 192 | "pid": cmd.Process.Pid, 193 | "address": address, 194 | }).Infof("shim placed in cgroup %s", cgroup) 195 | } 196 | if err = sys.SetOOMScore(cmd.Process.Pid, sys.OOMScoreMaxKillable); err != nil { 197 | return nil, nil, errors.Wrap(err, "failed to set OOM Score on shim") 198 | } 199 | //通过前边介绍的shim-address产生一个client然后进行通信,使用的也是unix socket通信 200 | c, clo, err := WithConnect(address, func() {})(ctx, config) 201 | if err != nil { 202 | return nil, nil, errors.Wrap(err, "failed to connect") 203 | } 204 | return c, clo, nil 205 | } 206 | } 207 | 208 | ### shim处理逻辑 209 | path: containerd/runtime/v1/shim/service.go 210 | api list: 211 | 212 
| type ShimService interface { 213 | State(ctx context.Context, req *StateRequest) (*StateResponse, error) 214 | Create(ctx context.Context, req *CreateTaskRequest) (*CreateTaskResponse, error) 215 | Start(ctx context.Context, req *StartRequest) (*StartResponse, error) 216 | Delete(ctx context.Context, req *google_protobuf1.Empty) (*DeleteResponse, error) 217 | DeleteProcess(ctx context.Context, req *DeleteProcessRequest) (*DeleteResponse, error) 218 | ListPids(ctx context.Context, req *ListPidsRequest) (*ListPidsResponse, error) 219 | Pause(ctx context.Context, req *google_protobuf1.Empty) (*google_protobuf1.Empty, error) 220 | Resume(ctx context.Context, req *google_protobuf1.Empty) (*google_protobuf1.Empty, error) 221 | Checkpoint(ctx context.Context, req *CheckpointTaskRequest) (*google_protobuf1.Empty, error) 222 | Kill(ctx context.Context, req *KillRequest) (*google_protobuf1.Empty, error) 223 | Exec(ctx context.Context, req *ExecProcessRequest) (*google_protobuf1.Empty, error) 224 | ResizePty(ctx context.Context, req *ResizePtyRequest) (*google_protobuf1.Empty, error) 225 | CloseIO(ctx context.Context, req *CloseIORequest) (*google_protobuf1.Empty, error) 226 | ShimInfo(ctx context.Context, req *google_protobuf1.Empty) (*ShimInfoResponse, error) 227 | Update(ctx context.Context, req *UpdateTaskRequest) (*google_protobuf1.Empty, error) 228 | Wait(ctx context.Context, req *WaitRequest) (*WaitResponse, error) 229 | } 230 | 231 | // 上文中提到过的CreateRequest通过shim的unix socket交给这个函数来进行处理 232 | // Create a new initial process and container with the underlying OCI runtime 233 | func (s *Service) Create(ctx context.Context, r *shimapi.CreateTaskRequest) (_ *shimapi.CreateTaskResponse, err error) { 234 | s.mu.Lock() 235 | defer s.mu.Unlock() 236 | 237 | var mounts []proc.Mount 238 | for _, m := range r.Rootfs { 239 | mounts = append(mounts, proc.Mount{ 240 | Type: m.Type, 241 | Source: m.Source, 242 | Target: m.Target, 243 | Options: m.Options, 244 | }) 245 | } 246 | 247 | config := &proc.CreateConfig{ 248 | ID: r.ID, 249 | Bundle: r.Bundle, 250 | Runtime: r.Runtime, 251 | Rootfs: mounts, 252 | Terminal: r.Terminal, 253 | Stdin: r.Stdin, 254 | Stdout: r.Stdout, 255 | Stderr: r.Stderr, 256 | Checkpoint: r.Checkpoint, 257 | ParentCheckpoint: r.ParentCheckpoint, 258 | Options: r.Options, 259 | } 260 | rootfs := filepath.Join(r.Bundle, "rootfs") 261 | defer func() { 262 | if err != nil { 263 | if err2 := mount.UnmountAll(rootfs, 0); err2 != nil { 264 | log.G(ctx).WithError(err2).Warn("Failed to cleanup rootfs mount") 265 | } 266 | } 267 | }() 268 | for _, rm := range mounts { 269 | m := &mount.Mount{ 270 | Type: rm.Type, 271 | Source: rm.Source, 272 | Options: rm.Options, 273 | } 274 | if err := m.Mount(rootfs); err != nil { 275 | return nil, errors.Wrapf(err, "failed to mount rootfs component %v", m) 276 | } 277 | } 278 | //下文有介绍 279 | process, err := newInit( 280 | ctx, 281 | s.config.Path, 282 | s.config.WorkDir, 283 | s.config.RuntimeRoot, 284 | s.config.Namespace, 285 | s.config.Criu, 286 | s.config.SystemdCgroup, 287 | s.platform, 288 | config, 289 | ) 290 | if err != nil { 291 | return nil, errdefs.ToGRPC(err) 292 | } 293 | //在这里函数里边开始创建物理容器进程 294 | //最终在func (r *Runc) Create(context context.Context, id, bundle string, opts *CreateOpts) 里边通过/run/containerd/runc命令创建符合OCI标准的容器物理进程 295 | if err := process.Create(ctx, config); err != nil { 296 | return nil, errdefs.ToGRPC(err) 297 | } 298 | // save the main task id and bundle to the shim for additional requests 299 | s.id = r.ID 300 | s.bundle = 
r.Bundle 301 | pid := process.Pid() 302 | s.processes[r.ID] = process 303 | return &shimapi.CreateTaskResponse{ 304 | Pid: uint32(pid), 305 | }, nil 306 | } 307 | 308 | //在这个func里边返回一个process描述对象供以后使用,这里这里已经是在shim进程里边了。 309 | func newInit(ctx context.Context, path, workDir, runtimeRoot, namespace, criu string, systemdCgroup bool, platform rproc.Platform, r *proc.CreateConfig) (*proc.Init, error) { 310 | var options runctypes.CreateOptions 311 | if r.Options != nil { 312 | v, err := typeurl.UnmarshalAny(r.Options) 313 | if err != nil { 314 | return nil, err 315 | } 316 | options = *v.(*runctypes.CreateOptions) 317 | } 318 | 319 | rootfs := filepath.Join(path, "rootfs") 320 | runtime := proc.NewRunc(runtimeRoot, path, namespace, r.Runtime, criu, systemdCgroup) 321 | //创建进程描述对象,这个时候还没有创建起物理的容器进程 322 | p := proc.New(r.ID, runtime, rproc.Stdio{ 323 | Stdin: r.Stdin, 324 | Stdout: r.Stdout, 325 | Stderr: r.Stderr, 326 | Terminal: r.Terminal, 327 | }) 328 | p.Bundle = r.Bundle 329 | p.Platform = platform 330 | p.Rootfs = rootfs 331 | p.WorkDir = workDir 332 | p.IoUID = int(options.IoUid) 333 | p.IoGID = int(options.IoGid) 334 | p.NoPivotRoot = options.NoPivotRoot 335 | p.NoNewKeyring = options.NoNewKeyring 336 | return p, nil 337 | } 338 | 339 | ### 总结 340 | containerd daemon看起来只是一层比较薄的逻辑。 341 | containerd-shim则相当于是容器物理进程,比如会在容器物理挂掉后接管它,避免它被init进程监管等。 342 | 具体的执行还是都交给了底层的runc来完成。 343 | 344 | 流程图如下所示 345 | ![未命名文件.png-34.3kB][1] 346 | 347 | 348 | ### 参考 349 | https://blog.csdn.net/zhonglinzhang/article/details/76615127 350 | https://blog.csdn.net/zhonglinzhang/article/category/3271199 351 | https://blog.csdn.net/zhonglinzhang/article/details/76683925 352 | 353 | 354 | [1]: http://static.zybuluo.com/myecho/r5rshm7wg9ahrv56cqk07zy5/%E6%9C%AA%E5%91%BD%E5%90%8D%E6%96%87%E4%BB%B6.png 355 | -------------------------------------------------------------------------------- /chapter3/section3.2.md: -------------------------------------------------------------------------------- 1 | # Container I/O 2 | 3 | 那么在整个运作过程容器的I/O是如何通过containerd的API来操作的呢?I/O如何被打开?I/O如何被关闭?I/O之间如何进行数据交换的操作,都将在这一节得到答案。 4 | 5 | path: containerd/cmd/ctr/commands/tasks/tasks_unix.go 6 | 7 | // NewTask creates a new task 8 | func NewTask(ctx gocontext.Context, client *containerd.Client, container containerd.Container, checkpoint string, con console.Console, nullIO bool, ioOpts []cio.Opt, opts ...containerd.NewTaskOpts) (containerd.Task, error) { 9 | //默认是使用stdin来初始化client的I/O,如os.stdout/os.stdin/os.stderr 10 | stdio := cio.NewCreator(append([]cio.Opt{cio.WithStdio}, ioOpts...)...) 11 | if checkpoint != "" { 12 | im, err := client.GetImage(ctx, checkpoint) 13 | if err != nil { 14 | return nil, err 15 | } 16 | opts = append(opts, containerd.WithTaskCheckpoint(im)) 17 | } 18 | ioCreator := stdio 19 | // 如果指定了tty 20 | if con != nil { 21 | // 改变client端的stdion, stdin -> console, stdout -> console, stderr->nil 22 | ioCreator = cio.NewCreator(append([]cio.Opt{cio.WithStreams(con, con, nil), cio.WithTerminal}, ioOpts...)...) 23 | } 24 | if nullIO { 25 | if con != nil { 26 | return nil, errors.New("tty and null-io cannot be used together") 27 | } 28 | ioCreator = cio.NullIO 29 | } 30 | //最终会调用containerd的taskService, 这里创建的FIFO-DIR也会作为参数传入 31 | return container.NewTask(ctx, ioCreator, opts...) 
32 | } 33 | 34 | // NewCreator returns an IO creator from the options 35 | func NewCreator(opts ...Opt) Creator { 36 | streams := &Streams{} 37 | for _, opt := range opts { 38 | opt(streams) 39 | } 40 | if streams.FIFODir == "" { 41 | streams.FIFODir = defaults.DefaultFIFODir 42 | } 43 | return func(id string) (IO, error) { 44 | //创建containerd对外的I/O管道,所有的I/O都是通过fifo与client端的I/O进行数据交换的 45 | // path形如:filepath.Join(dir, id+"-stdin"), id就是container id 46 | fifos, err := NewFIFOSetInDir(streams.FIFODir, id, streams.Terminal) 47 | if err != nil { 48 | return nil, err 49 | } 50 | if streams.Stdin == nil { 51 | fifos.Stdin = "" 52 | } 53 | if streams.Stdout == nil { 54 | fifos.Stdout = "" 55 | } 56 | if streams.Stderr == nil { 57 | fifos.Stderr = "" 58 | } 59 | //真正的数据交换过程 60 | return copyIO(fifos, streams) 61 | } 62 | } 63 | 64 | //ioset对应的就是containerd client端的I/O 65 | func copyIO(fifos *FIFOSet, ioset *Streams) (*cio, error) { 66 | var ctx, cancel = context.WithCancel(context.Background()) 67 | //注重看下打开io的方式 68 | pipes, err := openFifos(ctx, fifos) 69 | if err != nil { 70 | cancel() 71 | return nil, err 72 | } 73 | 74 | if fifos.Stdin != "" { 75 | go func() { 76 | p := bufPool.Get().(*[]byte) 77 | defer bufPool.Put(p) 78 | 79 | io.CopyBuffer(pipes.Stdin, ioset.Stdin, *p) 80 | pipes.Stdin.Close() 81 | }() 82 | } 83 | 84 | var wg = &sync.WaitGroup{} 85 | wg.Add(1) 86 | go func() { 87 | p := bufPool.Get().(*[]byte) 88 | defer bufPool.Put(p) 89 | 90 | io.CopyBuffer(ioset.Stdout, pipes.Stdout, *p) 91 | pipes.Stdout.Close() 92 | wg.Done() 93 | }() 94 | 95 | //对于terminal的环境来说,stdout/stderr统一拷贝到stdout,两者不区分 96 | if !fifos.Terminal { 97 | wg.Add(1) 98 | go func() { 99 | p := bufPool.Get().(*[]byte) 100 | defer bufPool.Put(p) 101 | 102 | io.CopyBuffer(ioset.Stderr, pipes.Stderr, *p) 103 | pipes.Stderr.Close() 104 | wg.Done() 105 | }() 106 | } 107 | return &cio{ 108 | config: fifos.Config, 109 | wg: wg, 110 | closers: append(pipes.closers(), fifos), 111 | cancel: cancel, 112 | }, nil 113 | } 114 | 115 | func openFifos(ctx context.Context, fifos *FIFOSet) (pipes, error) { 116 | var err error 117 | defer func() { 118 | if err != nil { 119 | fifos.Close() 120 | } 121 | }() 122 | 123 | var f pipes 124 | if fifos.Stdin != "" { 125 | //非堵塞方式打开写端 126 | if f.Stdin, err = fifo.OpenFifo(ctx, fifos.Stdin, syscall.O_WRONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 127 | return f, errors.Wrapf(err, "failed to open stdin fifo") 128 | } 129 | defer func() { 130 | if err != nil && f.Stdin != nil { 131 | f.Stdin.Close() 132 | } 133 | }() 134 | } 135 | if fifos.Stdout != "" { 136 | //非堵塞方式打开读端 137 | if f.Stdout, err = fifo.OpenFifo(ctx, fifos.Stdout, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 138 | return f, errors.Wrapf(err, "failed to open stdout fifo") 139 | } 140 | defer func() { 141 | if err != nil && f.Stdout != nil { 142 | f.Stdout.Close() 143 | } 144 | }() 145 | } 146 | if fifos.Stderr != "" { 147 | //非堵塞方式打开读端 148 | if f.Stderr, err = fifo.OpenFifo(ctx, fifos.Stderr, syscall.O_RDONLY|syscall.O_CREAT|syscall.O_NONBLOCK, 0700); err != nil { 149 | return f, errors.Wrapf(err, "failed to open stderr fifo") 150 | } 151 | } 152 | return f, nil 153 | } 154 | 155 | 对于container的daemon来说,它只需要将I/O通过fifo管道写出去即可(其实还是通过底层runc和shim交互来完成的),不需要关心谁从FIFO管道消费。 156 | 157 | 接下来再看下,containerd daemon拿到fifo-dir后会如何处理: 158 | 159 | path: containerd/services/tasks/local.go 160 | 161 | opts := runtime.CreateOpts{ 162 | Spec: container.Spec, 163 | //这里的I/O还是fifo-dir目录下的fifo 164 | IO: runtime.IO{ 165 | 
Stdin: r.Stdin, 166 | Stdout: r.Stdout, 167 | Stderr: r.Stderr, 168 | Terminal: r.Terminal, 169 | }, 170 | Checkpoint: checkpointPath, 171 | Runtime: container.Runtime.Name, 172 | RuntimeOptions: container.Runtime.Options, 173 | TaskOptions: r.Options, 174 | } 175 | c, err := runtime.Create(ctx, r.ContainerID, opts) 176 | 177 | path: containerd/runtime/v1/linux/runtime.go 178 | 179 | sopts := &shim.CreateTaskRequest{ 180 | ID: id, 181 | Bundle: bundle.path, 182 | Runtime: rt, 183 | Stdin: opts.IO.Stdin, 184 | Stdout: opts.IO.Stdout, 185 | Stderr: opts.IO.Stderr, 186 | Terminal: opts.IO.Terminal, 187 | Checkpoint: opts.Checkpoint, 188 | Options: opts.TaskOptions, 189 | } 190 | cr, err := s.Create(ctx, sopts) 191 | //封装成task请求然后又交给shim进程来处理了 192 | 193 | path: containerd/runtime/v1/shim/service.go 194 | 195 | if err := process.Create(ctx, config); err != nil { 196 | return nil, errdefs.ToGRPC(err) 197 | } 198 | 199 | path: containerd/runtime/v1/linux/proc/init.go 200 | 201 | // Create the process with the provided config 202 | func (p *Init) Create(ctx context.Context, r *CreateConfig) error { 203 | var ( 204 | err error 205 | socket *runc.Socket 206 | ) 207 | //根据是否tty有两种区分,也能体现在后边copyI/O的步骤上 208 | if r.Terminal { 209 | if socket, err = runc.NewTempConsoleSocket(); err != nil { 210 | return errors.Wrap(err, "failed to create OCI runtime console socket") 211 | } 212 | defer socket.Close() 213 | } else if hasNoIO(r) { 214 | if p.io, err = runc.NewNullIO(); err != nil { 215 | return errors.Wrap(err, "creating new NULL IO") 216 | } 217 | } else { 218 | //这里是创建与容器物理进程通信的Pipe, 不是前边介绍的fifo-dir目录下的fifo 219 | if p.io, err = runc.NewPipeIO(p.IoUID, p.IoGID, withConditionalIO(p.stdio)); err != nil { 220 | return errors.Wrap(err, "failed to create OCI runtime io pipes") 221 | } 222 | } 223 | pidFile := filepath.Join(p.Bundle, InitPidFile) 224 | if r.Checkpoint != "" { 225 | opts := &runc.RestoreOpts{ 226 | CheckpointOpts: runc.CheckpointOpts{ 227 | ImagePath: r.Checkpoint, 228 | WorkDir: p.WorkDir, 229 | ParentPath: r.ParentCheckpoint, 230 | }, 231 | PidFile: pidFile, 232 | IO: p.io, 233 | NoPivot: p.NoPivotRoot, 234 | Detach: true, 235 | NoSubreaper: true, 236 | } 237 | p.initState = &createdCheckpointState{ 238 | p: p, 239 | opts: opts, 240 | } 241 | return nil 242 | } 243 | opts := &runc.CreateOpts{ 244 | PidFile: pidFile, 245 | IO: p.io, 246 | NoPivot: p.NoPivotRoot, 247 | NoNewKeyring: p.NoNewKeyring, 248 | } 249 | if socket != nil { 250 | opts.ConsoleSocket = socket 251 | } 252 | //这是真正创建物理容器进程的步骤,在其中通过 opts.Set(cmd)将pipe I/O set进去 253 | if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil { 254 | return p.runtimeError(err, "OCI runtime create failed") 255 | } 256 | //这里为什么打开r.stdin的fifo写端,在CloseIO的时候关闭的就是它 257 | //如果只有读端,如果外部进程异常退出了,那么容器只能认为容器输入结束了(因为对于fifo来说,唯一的写关闭了,读就会自动关闭) 258 | //而如果增加了写端,就能够区分正常和异常结束,同时可以将关闭的主动权握在自己手里,正常结束通过closeIO来关闭写端来结束 259 | if r.Stdin != "" { 260 | sc, err := fifo.OpenFifo(ctx, r.Stdin, syscall.O_WRONLY|syscall.O_NONBLOCK, 0) 261 | if err != nil { 262 | return errors.Wrapf(err, "failed to open stdin fifo %s", r.Stdin) 263 | } 264 | p.stdin = sc 265 | p.closers = append(p.closers, sc) 266 | } 267 | var copyWaitGroup sync.WaitGroup 268 | //下边的逻辑是处理从pipe拷贝到fifo的过程 269 | if socket != nil { 270 | console, err := socket.ReceiveMaster() 271 | if err != nil { 272 | return errors.Wrap(err, "failed to retrieve console master") 273 | } 274 | console, err = p.Platform.CopyConsole(ctx, console, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup) 275 | if err != nil { 276 | 
return errors.Wrap(err, "failed to start console copy") 277 | } 278 | p.console = console 279 | } else if !hasNoIO(r) { 280 | if err := copyPipes(ctx, p.io, r.Stdin, r.Stdout, r.Stderr, &p.wg, ©WaitGroup); err != nil { 281 | return errors.Wrap(err, "failed to start io pipe copy") 282 | } 283 | } 284 | 285 | copyWaitGroup.Wait() 286 | pid, err := runc.ReadPidFile(pidFile) 287 | if err != nil { 288 | return errors.Wrap(err, "failed to retrieve OCI runtime container pid") 289 | } 290 | p.pid = pid 291 | return nil 292 | } 293 | 294 | //以非tty的情况下举例子 295 | 296 | func copyPipes(ctx context.Context, rio runc.IO, stdin, stdout, stderr string, wg, cwg *sync.WaitGroup) error { 297 | var sameFile io.WriteCloser 298 | for _, i := range []struct { 299 | name string 300 | dest func(wc io.WriteCloser, rc io.Closer) 301 | }{ 302 | { 303 | name: stdout, 304 | dest: func(wc io.WriteCloser, rc io.Closer) { 305 | wg.Add(1) 306 | cwg.Add(1) 307 | go func() { 308 | cwg.Done() 309 | p := bufPool.Get().(*[]byte) 310 | defer bufPool.Put(p) 311 | io.CopyBuffer(wc, rio.Stdout(), *p) 312 | wg.Done() 313 | wc.Close() 314 | if rc != nil { 315 | rc.Close() 316 | } 317 | }() 318 | }, 319 | }, { 320 | name: stderr, 321 | dest: func(wc io.WriteCloser, rc io.Closer) { 322 | wg.Add(1) 323 | cwg.Add(1) 324 | go func() { 325 | cwg.Done() 326 | p := bufPool.Get().(*[]byte) 327 | defer bufPool.Put(p) 328 | io.CopyBuffer(wc, rio.Stderr(), *p) 329 | wg.Done() 330 | wc.Close() 331 | if rc != nil { 332 | rc.Close() 333 | } 334 | }() 335 | }, 336 | }, 337 | } {//开始对stdout和stderr处理 338 | ok, err := isFifo(i.name) 339 | if err != nil { 340 | return err 341 | } 342 | var ( 343 | fw io.WriteCloser 344 | fr io.Closer 345 | ) 346 | if ok { 347 | //为什么stdout和stderr读端和写端都打开?这里还是为了防止fifo的自动关闭,开了写再开个读,什么时候关闭全由自己决定 348 | if fw, err = fifo.OpenFifo(ctx, i.name, syscall.O_WRONLY, 0); err != nil { 349 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 350 | } 351 | if fr, err = fifo.OpenFifo(ctx, i.name, syscall.O_RDONLY, 0); err != nil { 352 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 353 | } 354 | } else { 355 | if sameFile != nil { 356 | i.dest(sameFile, nil) 357 | continue 358 | } 359 | if fw, err = os.OpenFile(i.name, syscall.O_WRONLY|syscall.O_APPEND, 0); err != nil { 360 | return fmt.Errorf("containerd-shim: opening %s failed: %s", i.name, err) 361 | } 362 | if stdout == stderr { 363 | sameFile = fw 364 | } 365 | } 366 | i.dest(fw, fr) 367 | } 368 | if stdin == "" { 369 | return nil 370 | } 371 | //stdin打开读端用作正常使用 372 | f, err := fifo.OpenFifo(ctx, stdin, syscall.O_RDONLY|syscall.O_NONBLOCK, 0) 373 | if err != nil { 374 | return fmt.Errorf("containerd-shim: opening %s failed: %s", stdin, err) 375 | } 376 | cwg.Add(1) 377 | go func() { 378 | cwg.Done() 379 | p := bufPool.Get().(*[]byte) 380 | defer bufPool.Put(p) 381 | 382 | io.CopyBuffer(rio.Stdin(), f, *p) 383 | rio.Stdin().Close() 384 | f.Close() 385 | }() 386 | return nil 387 | } 388 | 389 | #### Container I/O的关闭 390 | 下边关注一下`_Tasks_CloseIO_Handler`,也就是处理I/O关闭的handler. 
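Before looking at the daemon side, here is a minimal client-side sketch (an illustration, not code from this repo) of how stdin normally gets closed once input is finished, using the containerd Go client; `task` is assumed to be a `containerd.Task` handle obtained from the client, and this call is what eventually reaches the handler shown next:

    package main

    import (
    	"context"

    	"github.com/containerd/containerd"
    )

    // closeStdin asks containerd to close the task's stdin. The request goes through
    // the CloseIO handler in services/tasks and is then forwarded on to the shim.
    func closeStdin(ctx context.Context, task containerd.Task) error {
    	return task.CloseIO(ctx, containerd.WithStdinCloser)
    }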
391 | 392 | path: containerd/services/tasks/local.go 393 | 394 | func (l *local) CloseIO(ctx context.Context, r *api.CloseIORequest, _ ...grpc.CallOption) (*ptypes.Empty, error) { 395 | t, err := l.getTask(ctx, r.ContainerID) 396 | if err != nil { 397 | return nil, err 398 | } 399 | p := runtime.Process(t) 400 | if r.ExecID != "" { 401 | if p, err = t.Process(ctx, r.ExecID); err != nil { 402 | return nil, errdefs.ToGRPC(err) 403 | } 404 | } 405 | if r.Stdin { 406 | if err := p.CloseIO(ctx); err != nil { 407 | return nil, err 408 | } 409 | } 410 | return empty, nil 411 | } 412 | 413 | // CloseIO closes the provided IO pipe for the process 414 | func (p *process) CloseIO(ctx context.Context) error { 415 | _, err := p.shim.task.CloseIO(ctx, &task.CloseIORequest{ 416 | ID: p.shim.ID(), 417 | ExecID: p.id, 418 | Stdin: true, 419 | }) 420 | if err != nil { 421 | return errdefs.FromGRPC(err) 422 | } 423 | return nil 424 | } 425 | 426 | // CloseIO of a process 427 | func (s *Service) CloseIO(ctx context.Context, r *shimapi.CloseIORequest) (*ptypes.Empty, error) { 428 | s.mu.Lock() 429 | defer s.mu.Unlock() 430 | p := s.processes[r.ID] 431 | if p == nil { 432 | return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", r.ID) 433 | } 434 | // here we can see that what actually gets closed is stdin (this corresponds to the write end of r.Stdin opened earlier) 435 | if stdin := p.Stdin(); stdin != nil { 436 | if err := stdin.Close(); err != nil { 437 | return nil, errors.Wrap(err, "close stdin") 438 | } 439 | } 440 | return empty, nil 441 | } 442 | 443 | 444 | ### Summary 445 | In the tty case, runc and the shim communicate over a unix socket, as can be seen from `runc create --console-socket value`; stdout and stderr are not distinguished and both go through the socket. In the non-tty case, communication happens over pipes between the parent and child processes, and stdout and stderr are kept separate. 446 | 447 | ### Summary flowchart 448 | ![angular.js.png-8.9kB][1] 449 | 450 | 451 | ### Pouch I/O design 452 | ![image.png-79.5kB][2] 453 | 454 | https://github.com/alibaba/pouch/pull/2375/files 455 | 456 | Pay particular attention to understanding the following passage: 457 | 458 | > The contained-shim will open fifo twice for reading and writing. For 459 | > the writing mode, the shim doesn't close stdin fifo until the client 460 | > calls CloseIO. In some case, the pouch daemon might be crash before 461 | > finishing the input. If shim doesn't hold writing mode fifo, the 462 | > process in container will consider that it is EOF signal and exit. 463 | > 464 | > Based on this case, if the client sends EOF signal to input channel, 465 | > the pouchd should send the CloseIO to shim to let the process exit. 466 | > 467 | > StdinOnce in container's config is used by attach request. If the 468 | > StdinOnce is true, when one attach request finishes stream copy, the 469 | > pouchd will closes the input of process. So other attach requests to 470 | > the same container will be stopped. 471 | > 472 | > If the user wants StdinOnce, it should set it to true during creating 473 | > container. 474 | 475 | Thanks to @fuwid for the explanation.
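To make the fifo behaviour described above concrete, here is a small, standalone Go sketch (illustration only, not containerd code, with error handling mostly elided): a fifo reader only sees EOF once every write end has been closed, which is exactly why the shim keeps an extra write fd open until an explicit CloseIO.

    package main

    import (
    	"fmt"
    	"io"
    	"os"
    	"path/filepath"
    	"syscall"
    	"time"
    )

    func main() {
    	path := filepath.Join(os.TempDir(), "stdin-demo.fifo")
    	_ = os.Remove(path) // clean up any leftover fifo from a previous run
    	if err := syscall.Mkfifo(path, 0600); err != nil {
    		panic(err)
    	}
    	defer os.Remove(path)

    	done := make(chan struct{})
    	go func() {
    		// the "shim" side: blocks until a writer shows up, then copies until EOF
    		r, _ := os.OpenFile(path, os.O_RDONLY, 0)
    		io.Copy(os.Stdout, r) // returns only once the LAST writer has closed
    		fmt.Println("reader saw EOF")
    		close(done)
    	}()

    	// extra write end held by the "shim": while it stays open, the reader above
    	// can never see EOF, even if individual clients come and go
    	guard, _ := os.OpenFile(path, os.O_WRONLY, 0)

    	// a "client" writes some stdin and then disappears without a CloseIO
    	w, _ := os.OpenFile(path, os.O_WRONLY, 0)
    	w.WriteString("hello from a client\n")
    	w.Close()

    	time.Sleep(100 * time.Millisecond) // the reader is still blocked: no EOF yet

    	guard.Close() // the explicit CloseIO: drop the last write end ...
    	<-done        // ... and only now does the reader observe EOF
    }

This is also the same reason copyPipes above opens the stdout/stderr fifos for both reading and writing: the extra fd keeps the fifo's lifetime under the shim's control instead of letting it be decided by whichever peer happens to exit first.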
476 | 477 | ### 参考 478 | http://hustcat.github.io/terminal-and-docker/ 479 | https://github.com/alibaba/pouch/pull/2375/files 480 | 481 | 482 | [1]: http://static.zybuluo.com/myecho/3xdwajlta9567m0bcjq77qc6/angular.js.png 483 | [2]: http://static.zybuluo.com/myecho/lig45a4gvcubxul5b9bqv05d/image.png 484 | -------------------------------------------------------------------------------- /chapter3/section3.3.md: -------------------------------------------------------------------------------- 1 | # runc 2 | 3 | ### OCI runtime spec 4 | 介绍的比较好的文章: 5 | https://segmentfault.com/a/1190000009583199 6 | 7 | https://github.com/opencontainers/runtime-spec 8 | https://github.com/opencontainers/runtime-tools 9 | 10 | ### runc使用实例 11 | 12 | 1. 首先通过 13 | 14 | $ docker pull busybox 15 | $ mkdir -p /tmp/mycontainer/rootfs 16 | $ cd /tmp/mycontainer 17 | $ docker export $(docker create busybox) | tar -C rootfs -xvf - 18 | 19 | 产生一个rootfs,当然还可以通过其他的runtime-tools来直接生成 20 | 2. 通过runc spec产生一个符合runtime spec的bundle config.json,下边是一个busybox镜像导出的config.json的例子 21 | 22 | { 23 | "ociVersion": "1.0.0", 24 | //表示进入容器后要执行什么命令,后边还会提到 25 | "process": { 26 | "terminal": true, 27 | "user": { 28 | "uid": 0, 29 | "gid": 0 30 | }, 31 | "args": [ 32 | "sh" 33 | ], 34 | "env": [ 35 | "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 36 | "TERM=xterm" 37 | ], 38 | "cwd": "/", 39 | "capabilities": { 40 | "bounding": [ 41 | "CAP_AUDIT_WRITE", 42 | "CAP_KILL", 43 | "CAP_NET_BIND_SERVICE" 44 | ], 45 | "effective": [ 46 | "CAP_AUDIT_WRITE", 47 | "CAP_KILL", 48 | "CAP_NET_BIND_SERVICE" 49 | ], 50 | "inheritable": [ 51 | "CAP_AUDIT_WRITE", 52 | "CAP_KILL", 53 | "CAP_NET_BIND_SERVICE" 54 | ], 55 | "permitted": [ 56 | "CAP_AUDIT_WRITE", 57 | "CAP_KILL", 58 | "CAP_NET_BIND_SERVICE" 59 | ], 60 | "ambient": [ 61 | "CAP_AUDIT_WRITE", 62 | "CAP_KILL", 63 | "CAP_NET_BIND_SERVICE" 64 | ] 65 | }, 66 | "rlimits": [ 67 | { 68 | "type": "RLIMIT_NOFILE", 69 | "hard": 1024, 70 | "soft": 1024 71 | } 72 | ], 73 | "noNewPrivileges": true 74 | }, 75 | "root": { 76 | "path": "rootfs", 77 | "readonly": true 78 | }, 79 | "hostname": "runc", 80 | "mounts": [ 81 | { 82 | "destination": "/proc", 83 | "type": "proc", 84 | "source": "proc" 85 | }, 86 | { 87 | "destination": "/dev", 88 | "type": "tmpfs", 89 | "source": "tmpfs", 90 | "options": [ 91 | "nosuid", 92 | "strictatime", 93 | "mode=755", 94 | "size=65536k" 95 | ] 96 | }, 97 | { 98 | "destination": "/dev/pts", 99 | "type": "devpts", 100 | "source": "devpts", 101 | "options": [ 102 | "nosuid", 103 | "noexec", 104 | "newinstance", 105 | "ptmxmode=0666", 106 | "mode=0620", 107 | "gid=5" 108 | ] 109 | }, 110 | { 111 | "destination": "/dev/shm", 112 | "type": "tmpfs", 113 | "source": "shm", 114 | "options": [ 115 | "nosuid", 116 | "noexec", 117 | "nodev", 118 | "mode=1777", 119 | "size=65536k" 120 | ] 121 | }, 122 | { 123 | "destination": "/dev/mqueue", 124 | "type": "mqueue", 125 | "source": "mqueue", 126 | "options": [ 127 | "nosuid", 128 | "noexec", 129 | "nodev" 130 | ] 131 | }, 132 | { 133 | "destination": "/sys", 134 | "type": "sysfs", 135 | "source": "sysfs", 136 | "options": [ 137 | "nosuid", 138 | "noexec", 139 | "nodev", 140 | "ro" 141 | ] 142 | }, 143 | { 144 | "destination": "/sys/fs/cgroup", 145 | "type": "cgroup", 146 | "source": "cgroup", 147 | "options": [ 148 | "nosuid", 149 | "noexec", 150 | "nodev", 151 | "relatime", 152 | "ro" 153 | ] 154 | } 155 | ], 156 | "linux": { 157 | "resources": { 158 | "devices": [ 159 | { 160 | "allow": false, 161 | "access": "rwm" 162 | } 163 | 
] 164 | }, 165 | "namespaces": [ 166 | { 167 | "type": "pid" 168 | }, 169 | { 170 | "type": "network", 171 | "path": "/var/run/netns/runc1" 172 | }, 173 | { 174 | "type": "ipc" 175 | }, 176 | { 177 | "type": "uts" 178 | }, 179 | { 180 | "type": "mount" 181 | } 182 | ], 183 | "maskedPaths": [ 184 | "/proc/kcore", 185 | "/proc/latency_stats", 186 | "/proc/timer_list", 187 | "/proc/timer_stats", 188 | "/proc/sched_debug", 189 | "/sys/firmware" 190 | ], 191 | "readonlyPaths": [ 192 | "/proc/asound", 193 | "/proc/bus", 194 | "/proc/fs", 195 | "/proc/irq", 196 | "/proc/sys", 197 | "/proc/sysrq-trigger" 198 | ] 199 | } 200 | } 201 | 202 | 3. runc run启动容器 203 | 4. runc list查看目前已经有的容器 204 | 205 | ### 源码分析 206 | 本质上runc是对libContainer的一层封装,将符合OCI的config.json转化为libContainer需要的配置文件,然后通过libContainer将容器启动。 207 | 208 | #### 容器创建 209 | path: opencontainers/runc/create.go 210 | 211 | Action: func(context *cli.Context) error { 212 | if err := checkArgs(context, 1, exactArgs); err != nil { 213 | return err 214 | } 215 | if err := revisePidFile(context); err != nil { 216 | return err 217 | } 218 | //load config.json到内存来 219 | spec, err := setupSpec(context) 220 | if err != nil { 221 | return err 222 | } 223 | // CT_ACT_CREATE参数,表示首次创建容器 224 | status, err := startContainer(context, spec, CT_ACT_CREATE, nil) 225 | if err != nil { 226 | return err 227 | } 228 | // exit with the container's exit status so any external supervisor is 229 | // notified of the exit with the correct exit status. 230 | os.Exit(status) 231 | return nil 232 | } 233 | 234 | func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) { 235 | //获取容器id 236 | id := context.Args().First() 237 | if id == "" { 238 | return -1, errEmptyID 239 | } 240 | 241 | notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id) 242 | if notifySocket != nil { 243 | //如果systemd支持的话,给容器添加对应的socket通信路径 244 | notifySocket.setupSpec(context, spec) 245 | } 246 | //根据spec中Container相关的内容,调用libcontainer 创建容器对象,且容器的状态设置为Stopped。仅仅只是一个内存中的数据结构,并没有与之对应的进程 247 | container, err := createContainer(context, id, spec) 248 | if err != nil { 249 | return -1, err 250 | } 251 | 252 | if notifySocket != nil { 253 | err := notifySocket.setupSocket() 254 | if err != nil { 255 | return -1, err 256 | } 257 | } 258 | 259 | // Support on-demand socket activation by passing file descriptors into the container init process. 
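    // 补充说明:LISTEN_FDS / LISTEN_PID 是 systemd socket activation 的约定,
    // systemd 会把已打开的 socket 从 fd 3 开始传给子进程,并用 LISTEN_FDS 标明个数;
    // 这里 runc 收集这些 fd 并透传给容器 init 进程(见下方 process.ExtraFiles),实现按需激活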
260 | listenFDs := []*os.File{} 261 | if os.Getenv("LISTEN_FDS") != "" { 262 | listenFDs = activation.Files(false) 263 | } 264 | r := &runner{ 265 | enableSubreaper: !context.Bool("no-subreaper"), 266 | shouldDestroy: true, 267 | container: container, 268 | listenFDs: listenFDs, 269 | notifySocket: notifySocket, 270 | consoleSocket: context.String("console-socket"), 271 | detach: context.Bool("detach"), 272 | pidFile: context.String("pid-file"), 273 | preserveFDs: context.Int("preserve-fds"), 274 | action: action, 275 | criuOpts: criuOpts, 276 | init: true, 277 | } 278 | //将spec中的Process转换成libcontainer兼容的模式,并对容器的IO进行配置 279 | return r.run(spec.Process) 280 | } 281 | 282 | func (r *runner) run(config *specs.Process) (int, error) { 283 | //检查有关tty的设置,其中的console-socket就是tty mode下需要用的unix-socket 284 | if err := r.checkTerminal(config); err != nil { 285 | r.destroy() 286 | return -1, err 287 | } 288 | //将spec的Process转换为libcontainer要求的Process配置格式 289 | process, err := newProcess(*config, r.init) 290 | if err != nil { 291 | r.destroy() 292 | return -1, err 293 | } 294 | if len(r.listenFDs) > 0 { 295 | process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1") 296 | process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) 297 | } 298 | baseFd := 3 + len(process.ExtraFiles) 299 | for i := baseFd; i < baseFd+r.preserveFDs; i++ { 300 | process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) 301 | } 302 | rootuid, err := r.container.Config().HostRootUID() 303 | if err != nil { 304 | r.destroy() 305 | return -1, err 306 | } 307 | rootgid, err := r.container.Config().HostRootGID() 308 | if err != nil { 309 | r.destroy() 310 | return -1, err 311 | } 312 | var ( 313 | detach = r.detach || (r.action == CT_ACT_CREATE) 314 | ) 315 | // Setting up IO is a two stage process. We need to modify process to deal 316 | // with detaching containers, and then we get a tty after the container has 317 | // started. 
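    // 补充说明:tty 模式下,容器里的 runc init 会创建 pty,并把 master 端 fd
    // 通过 --console-socket 指定的 unix socket(SCM_RIGHTS)发回给调用方(例如 containerd-shim);
    // non-tty 模式下则直接使用父子进程间继承的 pipe,下面的 setupIO 就是在处理这两种情况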
318 | / 319 | handler := newSignalHandler(r.enableSubreaper, r.notifySocket) 320 | //配置容器I/O,前边有章节专门介绍过 321 | tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket) 322 | if err != nil { 323 | r.destroy() 324 | return -1, err 325 | } 326 | defer tty.Close() 327 | 328 | //根据调用方法传入参数的不同,调用不同的执行步骤,在这里就直接start 329 | switch r.action { 330 | case CT_ACT_CREATE: 331 | err = r.container.Start(process) 332 | case CT_ACT_RESTORE: 333 | err = r.container.Restore(process, r.criuOpts) 334 | case CT_ACT_RUN: 335 | err = r.container.Run(process) 336 | default: 337 | panic("Unknown action") 338 | } 339 | if err != nil { 340 | r.destroy() 341 | return -1, err 342 | } 343 | //以下都是完成一些start之后的后续工作 344 | if err := tty.waitConsole(); err != nil { 345 | r.terminate(process) 346 | r.destroy() 347 | return -1, err 348 | } 349 | if err = tty.ClosePostStart(); err != nil { 350 | r.terminate(process) 351 | r.destroy() 352 | return -1, err 353 | } 354 | if r.pidFile != "" { 355 | //为容器创建一个pid-file 356 | if err = createPidFile(r.pidFile, process); err != nil { 357 | r.terminate(process) 358 | r.destroy() 359 | return -1, err 360 | } 361 | } 362 | status, err := handler.forward(process, tty, detach) 363 | if err != nil { 364 | r.terminate(process) 365 | } 366 | if detach { 367 | return 0, nil 368 | } 369 | r.destroy() 370 | return status, err 371 | } 372 | 373 | func (c *linuxContainer) Start(process *Process) error { 374 | c.m.Lock() 375 | defer c.m.Unlock() 376 | if process.Init { 377 | // 创建一个路径为/run/runc/$ID/exec.fifo的管道文件 378 | if err := c.createExecFifo(); err != nil { 379 | return err 380 | } 381 | } 382 | // 真正启动容器进程,runc与容器进程之间的通信通过创建的init管道或者环境变量 383 | if err := c.start(process); err != nil { 384 | if process.Init { 385 | //失败了需要删除刚才创建的管道 386 | c.deleteExecFifo() 387 | } 388 | return err 389 | } 390 | return nil 391 | } 392 | 393 | 容器进程在产生后必须从runc读取配置才能够继续进行,path: opencontainers/runc/libcontainer/factory_linux.go 394 | 395 | func (l *LinuxFactory) StartInitialization() (err error) { 396 | var ( 397 | pipefd, fifofd int 398 | consoleSocket *os.File 399 | envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") 400 | envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") 401 | envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") 402 | ) 403 | 404 | // Get the INITPIPE. 405 | pipefd, err = strconv.Atoi(envInitPipe) 406 | if err != nil { 407 | return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) 408 | } 409 | 410 | var ( 411 | pipe = os.NewFile(uintptr(pipefd), "pipe") 412 | // 判断是`runc create`还是`runc exec` 413 | it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) 414 | ) 415 | defer pipe.Close() 416 | 417 | // Only init processes have FIFOFD. 418 | // 只有init进程有FIFOFD 419 | fifofd = -1 420 | if it == initStandard { 421 | if fifofd, err = strconv.Atoi(envFifoFd); err != nil { 422 | return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) 423 | } 424 | } 425 | ... 426 | // 会从管道中读取config,然后返回Init的接口对象 427 | i, err := newContainerInit(it, pipe, consoleSocket, fifofd) 428 | if err != nil { 429 | return err 430 | } 431 | // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. 432 | //下边的代码片段就是展示这个方法 433 | return i.Init() 434 | } 435 | 436 | 437 | path: opencontainers/runc/libcontainer/standard_init_linux.go 438 | func (l *linuxStandardInit) Init() error { 439 | ... 440 | // 配置network, 配置路由等等 441 | ... 
442 | // 准备rootfs 443 | if err := prepareRootfs(l.pipe, l.config); err != nil { 444 | return err 445 | } 446 | // 配置console, hostname, apparmor, process label, sysctl等等 447 | ... 448 | // 告诉父进程我们已经准备好Exec了 449 | if err := syncParentReady(l.pipe); err != nil { 450 | return err 451 | } 452 | // 配置seccomp 453 | ... 454 | // 设置正确的capability,用户以及工作目录 455 | if err := finalizeNamespace(l.config); err != nil { 456 | return err 457 | } 458 | ... 459 | // 确定用户指定的容器进程在容器文件系统中的路径 460 | name, err := exec.LookPath(l.config.Args[0]) 461 | if err != nil { 462 | return err 463 | } 464 | // 关闭init管道,告诉runC进程,我们已经完成了初始化工作 465 | l.pipe.Close() 466 | // 在exec用户进程之前等待exec.fifo管道在另一端被打开 467 | // 我们通过/proc/self/fd/$fd打开它 468 | fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) 469 | ... 470 | // 向exec.fifo管道写数据,阻塞,直到用户调用`runc start`,读取管道中的数据 471 | if _, err := unix.Write(fd, []byte("0")); err != nil { 472 | return newSystemErrorWithCause(err, "write 0 exec fifo") 473 | } 474 | ... 475 | // 调用exec命令,执行用户进程,也就是我们在config文件中看到的process描述的命令 476 | if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { 477 | return newSystemErrorWithCause(err, "exec user process") 478 | } 479 | return nil 480 | } 481 | 482 | path: opencontainers/runc/libcontainer/rootfs_linux.go 483 | 484 | func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { 485 | ... 486 | // 配置mounts, dev,将mounts挂载到rootfs等 487 | ... 488 | // 通知父进程运行pre-start hooks 489 | if err := syncParentHooks(pipe); err != nil { 490 | return err 491 | } 492 | ... 493 | if config.NoPivotRoot { 494 | err = msMoveRoot(config.Rootfs) 495 | } else if config.Namespaces.Contains(configs.NEWNS) { 496 | err = pivotRoot(config.Rootfs) 497 | } else { 498 | //最后还是通过chroot来切换文件系统的视角 499 | err = chroot(config.Rootfs) 500 | } 501 | ... 
502 | return nil 503 | 504 | > prepareRootfs先对容器的Mounts和Dev等信息进行配置,之后再调用syncParentHooks,通过init管道向runC进程发送procHooks信号。runC进程接收到procHooks信号之后,执行容器的PreStart 505 | > Hook回调函数,再通过init管道给容器初始化进程发送信号procResume,通知其继续执行。可见容器的PreStart 506 | > Hook是在根目录尚未切换之前执行完成的。最终,调用chroot函数,切换根目录。至此,容器的文件系统切换完毕。 507 | > 508 | > 在文件系统准备完成之后,Init方法还会对Console, 509 | > hostname等属性进行配置。当一切就绪之后,调用syncParentReady通过init管道通知runC进程,获取响应之后,关闭init管道,同步结束,准备开始执行用户指定的容器进程。 510 | > 511 | > 不过在找到了用户指定的容器程序在容器文件系统的执行路径之后,初始化进程又打开了我们之前多次提到的exec.fifo这个管道,并且往里面写入了一个字节,之后才执行Exec系统调用,切换到用户程序。既然exec.fifo是一个管道,那么我们在这一端写入之后,就必须有消费者在另外一端进行读取,否则写进程就会一直处于阻塞状态。 512 | > 513 | > 事实上,此处对exec.fifo管道的写阻塞正是runc create和runc 514 | > start执行流的分界点。容器的创建工作,在容器初始化进程往exec.fifo管道进行写操作的那一刻,就全部结束了。 515 | 516 | ### 容器启动 517 | 518 | path: opencontainers/runc/start.go 519 | 520 | Action: func(context *cli.Context) error { 521 | if err := checkArgs(context, 1, exactArgs); err != nil { 522 | return err 523 | } 524 | container, err := getContainer(context) 525 | if err != nil { 526 | return err 527 | } 528 | status, err := container.Status() 529 | if err != nil { 530 | return err 531 | } 532 | switch status { 533 | case libcontainer.Created: 534 | // runc start的执行路径到这 535 | return container.Exec() 536 | case libcontainer.Stopped: 537 | return errors.New("cannot start a container that has stopped") 538 | case libcontainer.Running: 539 | return errors.New("cannot start an already running container") 540 | default: 541 | return fmt.Errorf("cannot start a container in the %s state\n", status) 542 | } 543 | } 544 | 545 | path: opencontainers/runc/libcontainer/container_linux.go 546 | 547 | func (c *linuxContainer) exec() error { 548 | path := filepath.Join(c.root, execFifoFilename) 549 | 550 | fifoOpen := make(chan struct{}) 551 | select { 552 | case <-awaitProcessExit(c.initProcess.pid(), fifoOpen): 553 | return errors.New("container process is already dead") 554 | //打开fifo,以解开刚才创建容器过程中exec-fifo的写堵塞 555 | case result := <-awaitFifoOpen(path): 556 | close(fifoOpen) 557 | if result.err != nil { 558 | return result.err 559 | } 560 | f := result.file 561 | defer f.Close() 562 | //读取exec-fifo中的内容,也就是刚才写入的那个字节 563 | if err := readFromExecFifo(f); err != nil { 564 | return err 565 | } 566 | return os.Remove(path) 567 | } 568 | } 569 | 570 | > 可是这一路分析下来,似乎并没有对容器的namespace进行配置的操作?事实上,子进程runc 571 | > init的执行流在进入Go语言的运行时之前,会被包/runc/libcontainer/nsenter劫持,先去执行一段C代码。这段C代码同样会从init管道中读取容器的配置,主要是namespace的路径,clone 572 | > flag等等,并根据这些配置,调用setns系统调用,将容器进程加入到合适的namespace中。之后再进入Go的运行时,完成上文所述的各种初始化操作。 573 | 574 | ### 总结 575 | 摘一张来自[zju blog][1]的图片 576 | ![image.png-68.8kB][2] 577 | 578 | ### 参考 579 | 1. http://www.sel.zju.edu.cn/?p=840 580 | 2. https://cizixs.com/2017/11/05/oci-and-runc/ 581 | 3. https://blog.csdn.net/zhonglinzhang/article/details/76757277 582 | 4. 
https://segmentfault.com/a/1190000016366810 583 | 584 | [1]: http://www.sel.zju.edu.cn/?p=840 585 | [2]: http://static.zybuluo.com/myecho/5ey61f8u33wpawhc9i30wyu2/image.png 586 | -------------------------------------------------------------------------------- /chapter4/README.md: -------------------------------------------------------------------------------- 1 | # 镜像 2 | 3 | 介绍containerd如何完成镜像的拉取以及使用过程,注意containerd 没有build image的功能。 4 | -------------------------------------------------------------------------------- /chapter4/section4.1.md: -------------------------------------------------------------------------------- 1 | # image fetch 2 | 3 | 如下图所展示pull images的流程,总体来说containerd的pull流程可以分为fetch和unpack两部分。本节主要关注fetch步骤。 4 | 5 | ![image.png-351.2kB][1] 6 | 7 | ### oci distribution spec 8 | 目前containerd当前同时支持docker版的和oci版的registry api。我们首先来看一下oci distribution spec定义了哪些内容。 9 | 10 | path: https://github.com/opencontainers/distribution-spec/blob/master/spec.md 11 | 12 | An "image" is a combination of a JSON manifest and individual layer files. The process of pulling an image centers around retrieving these two components. 13 | 14 | 想要pull image首先要从registry中获取manifests,包含以下的fields. 15 | 16 | | field | description | 17 | |-----------|------------------------------------------------| 18 | | name | The name of the image. | 19 | | tag | The tag for this version of the image. | 20 | | fsLayers | A list of layer descriptors (including digest) | 21 | | signature | A JWS used to verify the manifest content | 22 | 23 | 更详细的manifest的example可以在`https://github.com/ZYecho/image-spec/blob/master/manifest.md`中找到。 24 | 对应的API为`GET /v2//manifests/`,the reference may include a tag or digest. 25 | 26 | 然后根据manifests中包含的digest去获取每层layer的blob,通过API `GET /v2//blobs/`进行获取。 27 | 28 | 值得注意在整个拉取过程中没有出现image id的用处,image id是docker的用法,image id和manifest中config的digest相同,本质上是image configuration JSON的digest。 29 | 30 | Docker-Content-Digest header: 31 | 这个头部后边还需要用到,先放到这里了解一下。 32 | > To provide verification of http content, any response may include a 33 | > Docker-Content-Digest header. This will include the digest of the 34 | > target entity returned in the response. For blobs, this is the entire 35 | > blob content. For manifests, this is the manifest body without the 36 | > signature content, also known as the JWS payload. Note that the 37 | > commonly used canonicalization for digest calculation may be dependent 38 | > on the mediatype of the content, such as with manifests. 39 | 40 | ### fetch流程 41 | path: containerd/cmd/ctr/commands/images/pull.go 42 | ctr image pull操作流程如下: 43 | 44 | 1. resolve用户需要下载的镜像 45 | 2. 从registry pull镜像,把镜像层内容和config保存进content服务,把镜像相关的元数据保存进images元数据服务 46 | 3. 
unpack进snapshot服务 47 | 48 | //ref就是拉取用的reference, 形如docker.io/library/redis:alpine 49 | img, err := content.Fetch(ctx, client, ref, config) 50 | if err != nil { 51 | return err 52 | } 53 | 54 | 这里直接调用content实现的Fetch函数,是和`ctr content fetch`用的同样的处理逻辑。 55 | 56 | // Fetch loads all resources into the content store and returns the image 57 | func Fetch(ctx context.Context, client *containerd.Client, ref string, config *FetchConfig) (images.Image, error) { 58 | ongoing := newJobs(ref) 59 | 60 | pctx, stopProgress := context.WithCancel(ctx) 61 | progress := make(chan struct{}) 62 | 63 | go func() { 64 | if config.ProgressOutput != nil { 65 | // no progress bar, because it hides some debug logs 66 | showProgress(pctx, ongoing, client.ContentStore(), config.ProgressOutput) 67 | } 68 | close(progress) 69 | }() 70 | 71 | h := images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 72 | // 将非manifest的digest添加到Ongoing中,在showProgress中会使用到 73 | if desc.MediaType != images.MediaTypeDockerSchema1Manifest { 74 | ongoing.add(desc) 75 | } 76 | return nil, nil 77 | }) 78 | 79 | log.G(pctx).WithField("image", ref).Debug("fetching") 80 | labels := commands.LabelArgs(config.Labels) 81 | opts := []containerd.RemoteOpt{ 82 | containerd.WithPullLabels(labels), 83 | containerd.WithResolver(config.Resolver), 84 | containerd.WithImageHandler(h), 85 | containerd.WithSchema1Conversion, 86 | } 87 | for _, platform := range config.Platforms { 88 | opts = append(opts, containerd.WithPlatform(platform)) 89 | } 90 | //调用containerd的grpc服务 91 | img, err := client.Fetch(pctx, ref, opts...) 92 | stopProgress() 93 | if err != nil { 94 | return images.Image{}, err 95 | } 96 | 97 | <-progress 98 | return img, nil 99 | } 100 | 101 | path: containerd/containerd/client.go 102 | 103 | func (c *Client) fetch(ctx context.Context, rCtx *RemoteContext, ref string, limit int) (images.Image, error) { 104 | store := c.ContentStore() 105 | //第一步首先将ref解析为descriptor,descriptor可以理解为一个可以用来下载的描述对象 106 | name, desc, err := rCtx.Resolver.Resolve(ctx, ref) 107 | if err != nil { 108 | return images.Image{}, errors.Wrapf(err, "failed to resolve reference %q", ref) 109 | } 110 | 111 | fetcher, err := rCtx.Resolver.Fetcher(ctx, name) 112 | if err != nil { 113 | return images.Image{}, errors.Wrapf(err, "failed to get fetcher for %q", name) 114 | } 115 | 116 | var ( 117 | schema1Converter *schema1.Converter 118 | handler images.Handler 119 | ) 120 | if desc.MediaType == images.MediaTypeDockerSchema1Manifest && rCtx.ConvertSchema1 { 121 | // 兼容逻辑 122 | schema1Converter = schema1.NewConverter(store, fetcher) 123 | handler = images.Handlers(append(rCtx.BaseHandlers, schema1Converter)...) 124 | } else { 125 | // Get all the children for a descriptor 126 | childrenHandler := images.ChildrenHandler(store) 127 | // Set any children labels for that content 128 | childrenHandler = images.SetChildrenLabels(store, childrenHandler) 129 | // Filter children by platforms 130 | childrenHandler = images.FilterPlatforms(childrenHandler, rCtx.PlatformMatcher) 131 | // Sort and limit manifests if a finite number is needed 132 | if limit > 0 { 133 | childrenHandler = images.LimitManifests(childrenHandler, rCtx.PlatformMatcher, limit) 134 | } 135 | // 会在后边分别介绍这几个Handler, Handlers返回一个新的Handler(具备顺序遍历的功能) 136 | handler = images.Handlers(append(rCtx.BaseHandlers, 137 | remotes.FetchHandler(store, fetcher), 138 | childrenHandler, 139 | )...) 
140 | } 141 | 142 | //这个真正开始下载的流程,包括manifest以及镜像层 143 | if err := images.Dispatch(ctx, handler, desc); err != nil { 144 | return images.Image{}, err 145 | } 146 | if schema1Converter != nil { 147 | desc, err = schema1Converter.Convert(ctx) 148 | if err != nil { 149 | return images.Image{}, err 150 | } 151 | } 152 | 153 | img := images.Image{ 154 | Name: name, 155 | Target: desc, 156 | Labels: rCtx.Labels, 157 | } 158 | 159 | is := c.ImageService() 160 | for { 161 | //调用images服务存储镜像元数据 162 | if created, err := is.Create(ctx, img); err != nil { 163 | if !errdefs.IsAlreadyExists(err) { 164 | return images.Image{}, err 165 | } 166 | 167 | updated, err := is.Update(ctx, img) 168 | if err != nil { 169 | // if image was removed, try create again 170 | if errdefs.IsNotFound(err) { 171 | continue 172 | } 173 | return images.Image{}, err 174 | } 175 | 176 | img = updated 177 | } else { 178 | img = created 179 | } 180 | 181 | return img, nil 182 | } 183 | } 184 | 185 | ### 构造descriptor对象 186 | 下边看一下第一步ref是如何转到为descriptor的。 187 | 188 | func (r *dockerResolver) Resolve(ctx context.Context, ref string) (string, ocispec.Descriptor, error) { 189 | //首先将ref转化为locator(docker.io/library/redis)和object(tag -> 2.7.8或者是digest)两部分 190 | refspec, err := reference.Parse(ref) 191 | if err != nil { 192 | return "", ocispec.Descriptor{}, err 193 | } 194 | 195 | if refspec.Object == "" { 196 | return "", ocispec.Descriptor{}, reference.ErrObjectRequired 197 | } 198 | 199 | //在其中提取出了镜像repo的地址,为了后期方便组装镜像的地址 200 | base, err := r.base(refspec) 201 | if err != nil { 202 | return "", ocispec.Descriptor{}, err 203 | } 204 | 205 | fetcher := dockerFetcher{ 206 | dockerBase: base, 207 | } 208 | 209 | var ( 210 | urls []string 211 | dgst = refspec.Digest() 212 | ) 213 | 214 | //如果是Object是用digest来表示的 215 | if dgst != "" { 216 | if err := dgst.Validate(); err != nil { 217 | // need to fail here, since we can't actually resolve the invalid 218 | // digest. 219 | return "", ocispec.Descriptor{}, err 220 | } 221 | 222 | //首先尝试registry-1.docker.io/v2/library/ubuntu/manifest/sha256:xxx 223 | // turns out, we have a valid digest, make a url. 224 | urls = append(urls, fetcher.url("manifests", dgst.String())) 225 | 226 | // fallback to blobs on not found. 227 | //如果失败再使用registry-1.docker.io/v2/library/ubuntu/blobs/sha256:xxx 228 | urls = append(urls, fetcher.url("blobs", dgst.String())) 229 | } else { 230 | //直接使用tag来进行访问,registry-1.docker.io/v2/library/redis/manifest/alpine 231 | urls = append(urls, fetcher.url("manifests", refspec.Object)) 232 | } 233 | 234 | ctx, err = contextWithRepositoryScope(ctx, refspec, false) 235 | if err != nil { 236 | return "", ocispec.Descriptor{}, err 237 | } 238 | for _, u := range urls { 239 | //注意这里是HEAD,也就是对应着spec中的Existing Manifests API,先看看Manifests在不在,在的话再去下载之 240 | req, err := http.NewRequest(http.MethodHead, u, nil) 241 | if err != nil { 242 | return "", ocispec.Descriptor{}, err 243 | } 244 | 245 | // set headers for all the types we support for resolution. 
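    // 补充:Accept 里同时声明了 docker schema2 manifest、manifest list 以及 OCI manifest/index
    // 等 media type,registry 会按内容协商返回其中一种;由于这里发的是 HEAD 请求,
    // 构造 descriptor 所需的信息(digest、media type、size)全部来自响应头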
246 | //构建HTTP请求对象头部 247 | req.Header.Set("Accept", strings.Join([]string{ 248 | images.MediaTypeDockerSchema2Manifest, 249 | images.MediaTypeDockerSchema2ManifestList, 250 | ocispec.MediaTypeImageManifest, 251 | ocispec.MediaTypeImageIndex, "*"}, ", ")) 252 | 253 | log.G(ctx).Debug("resolving") 254 | resp, err := fetcher.doRequestWithRetries(ctx, req, nil) 255 | if err != nil { 256 | if errors.Cause(err) == ErrInvalidAuthorization { 257 | err = errors.Wrapf(err, "pull access denied, repository does not exist or may require authorization") 258 | } 259 | return "", ocispec.Descriptor{}, err 260 | } 261 | //在构建descriptor过程中没有使用到resp body的内容 262 | resp.Body.Close() // don't care about body contents. 263 | 264 | if resp.StatusCode > 299 { 265 | if resp.StatusCode == http.StatusNotFound { 266 | continue 267 | } 268 | return "", ocispec.Descriptor{}, errors.Errorf("unexpected status code %v: %v", u, resp.Status) 269 | } 270 | 271 | // this is the only point at which we trust the registry. we use the 272 | // content headers to assemble a descriptor for the name. when this becomes 273 | // more robust, we mostly get this information from a secure trust store. 274 | //关于这个头部,在前边介绍OCI distribution spec的时候有提到,目的还是为了校验 275 | dgstHeader := digest.Digest(resp.Header.Get("Docker-Content-Digest")) 276 | 277 | if dgstHeader != "" { 278 | if err := dgstHeader.Validate(); err != nil { 279 | return "", ocispec.Descriptor{}, errors.Wrapf(err, "%q in header not a valid digest", dgstHeader) 280 | } 281 | dgst = dgstHeader 282 | } 283 | 284 | if dgst == "" { 285 | return "", ocispec.Descriptor{}, errors.Errorf("could not resolve digest for %v", ref) 286 | } 287 | 288 | var ( 289 | size int64 290 | sizeHeader = resp.Header.Get("Content-Length") 291 | ) 292 | 293 | size, err = strconv.ParseInt(sizeHeader, 10, 64) 294 | if err != nil { 295 | 296 | return "", ocispec.Descriptor{}, errors.Wrapf(err, "invalid size header: %q", sizeHeader) 297 | } 298 | if size < 0 { 299 | return "", ocispec.Descriptor{}, errors.Errorf("%q in header not a valid size", sizeHeader) 300 | } 301 | 302 | desc := ocispec.Descriptor{ 303 | //可以看到这三个值都是从头部中拿到的 304 | Digest: dgst, 305 | //可能为application/vnd.docker.distribution.manifest.list.v2+json, 对应着不同平台的多个image, containerd在向registry下载manifest list之后,再去选择下载其中的某个平台的镜像。 306 | MediaType: resp.Header.Get("Content-Type"), // need to strip disposition? 307 | Size: size, 308 | } 309 | 310 | log.G(ctx).WithField("desc.digest", desc.Digest).Debug("resolved") 311 | return ref, desc, nil 312 | } 313 | 314 | return "", ocispec.Descriptor{}, errors.Errorf("%v not found", ref) 315 | } 316 | 317 | //执行下载的主要框架,具体的执行逻辑在handler中,在下一节进行介绍 318 | func Dispatch(ctx context.Context, handler Handler, descs ...ocispec.Descriptor) error { 319 | eg, ctx := errgroup.WithContext(ctx) 320 | for _, desc := range descs { 321 | desc := desc 322 | //一个协程池,对于不同的descriptor来说下载是并发的 323 | eg.Go(func() error { 324 | desc := desc 325 | 326 | children, err := handler.Handle(ctx, desc) 327 | if err != nil { 328 | if errors.Cause(err) == ErrSkipDesc { 329 | return nil // don't traverse the children. 330 | } 331 | return err 332 | } 333 | 334 | if len(children) > 0 { 335 | // 本质上是个dfs过程 336 | return Dispatch(ctx, handler, children...) 
337 | } 338 | 339 | return nil 340 | }) 341 | } 342 | 343 | return eg.Wait() 344 | } 345 | 346 | #### Handlers介绍 347 | 348 | 下载的主要是任务都是通过Handler来完成的,看一下其最原始的定义。 349 | path: /home/zhangyue/go/src/github.com/containerd/containerd/images/handlers.go, 也关注一下其中的一些utils函数 350 | 351 | // HandlerFunc function implementing the Handler interface 352 | type HandlerFunc func(ctx context.Context, desc ocispec.Descriptor) (subdescs []ocispec.Descriptor, err error) 353 | 354 | BaseHandler: 355 | 通过connext注册过来的,只是负责将当前的desc进行登记, 356 | 357 | h := images.HandlerFunc(func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 358 | if desc.MediaType != images.MediaTypeDockerSchema1Manifest { 359 | ongoing.add(desc) 360 | } 361 | return nil, nil 362 | }) 363 | 364 | FetchHandler: containerd/containerd/remotes/handlers.go 365 | 366 | func fetch(ctx context.Context, ingester content.Ingester, fetcher Fetcher, desc ocispec.Descriptor) error { 367 | log.G(ctx).Debug("fetch") 368 | 369 | cw, err := content.OpenWriter(ctx, ingester, content.WithRef(MakeRefKey(ctx, desc)), content.WithDescriptor(desc)) 370 | if err != nil { 371 | if errdefs.IsAlreadyExists(err) { 372 | return nil 373 | } 374 | return err 375 | } 376 | defer cw.Close() 377 | 378 | ws, err := cw.Status() 379 | if err != nil { 380 | return err 381 | } 382 | 383 | if ws.Offset == desc.Size { 384 | // If writer is already complete, commit and return 385 | err := cw.Commit(ctx, desc.Size, desc.Digest) 386 | if err != nil && !errdefs.IsAlreadyExists(err) { 387 | return errors.Wrapf(err, "failed commit on ref %q", ws.Ref) 388 | } 389 | return nil 390 | } 391 | 392 | //通过http get得到Reader 393 | rc, err := fetcher.Fetch(ctx, desc) 394 | if err != nil { 395 | return err 396 | } 397 | defer rc.Close() 398 | 399 | //将得到的rc写入到content中去 400 | return content.Copy(ctx, cw, rc, desc.Size, desc.Digest) 401 | } 402 | 403 | path: containerd/containerd/remotes/docker/fetcher.go 404 | 405 | func (r dockerFetcher) Fetch(ctx context.Context, desc ocispec.Descriptor) (io.ReadCloser, error) { 406 | ctx = log.WithLogger(ctx, log.G(ctx).WithFields( 407 | logrus.Fields{ 408 | "base": r.base.String(), 409 | "digest": desc.Digest, 410 | }, 411 | )) 412 | // 如果是manifest的话首先是*/manifests/* 413 | // 其次是走*/blobs/*即可 414 | // 如果不是manifest的话直接走*/blobs/*即可 415 | urls, err := r.getV2URLPaths(ctx, desc) 416 | if err != nil { 417 | return nil, err 418 | } 419 | 420 | ctx, err = contextWithRepositoryScope(ctx, r.refspec, false) 421 | if err != nil { 422 | return nil, err 423 | } 424 | 425 | return newHTTPReadSeeker(desc.Size, func(offset int64) (io.ReadCloser, error) { 426 | for _, u := range urls { 427 | rc, err := r.open(ctx, u, desc.MediaType, offset) 428 | if err != nil { 429 | if errdefs.IsNotFound(err) { 430 | continue // try one of the other urls. 431 | } 432 | 433 | return nil, err 434 | } 435 | 436 | return rc, nil 437 | } 438 | 439 | return nil, errors.Wrapf(errdefs.ErrNotFound, 440 | "could not fetch content descriptor %v (%v) from remote", 441 | desc.Digest, desc.MediaType) 442 | 443 | }) 444 | } 445 | 446 | childrenHandler: 447 | 448 | // Get all the children for a descriptor 449 | // 这是个解析manifest的handler,返回[]Descriptor 450 | childrenHandler := images.ChildrenHandler(store) 451 | // Set any children labels for that content 452 | childrenHandler = images.SetChildrenLabels(store, childrenHandler) 453 | // Filter children by platforms 454 | childrenHandler = images.FilterPlatforms(childrenHandler, pullCtx.Platforms...) 
455 | 456 | 中间的处理handler, path: containerd/containerd/images/handlers.go 457 | 458 | func SetChildrenLabels(manager content.Manager, f HandlerFunc) HandlerFunc { 459 | return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 460 | children, err := f(ctx, desc) 461 | if err != nil { 462 | return children, err 463 | } 464 | 465 | if len(children) > 0 {//如果在上一步处理结束后发现包含子descriptor 466 | info := content.Info{ 467 | Digest: desc.Digest, 468 | Labels: map[string]string{}, 469 | } 470 | fields := []string{} 471 | for i, ch := range children { 472 | info.Labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i)] = ch.Digest.String() 473 | fields = append(fields, fmt.Sprintf("labels.containerd.io/gc.ref.content.%d", i)) 474 | } 475 | //将子descriptor作为descriptor的label 476 | _, err := manager.Update(ctx, info, fields...) 477 | if err != nil { 478 | return nil, err 479 | } 480 | } 481 | 482 | return children, err 483 | } 484 | } 485 | 486 | //主要作用是在manifest list上添加label来表示包含关系 487 | func SetChildrenLabels(manager content.Manager, f HandlerFunc) HandlerFunc { 488 | return func(ctx context.Context, desc ocispec.Descriptor) ([]ocispec.Descriptor, error) { 489 | children, err := f(ctx, desc) 490 | if err != nil { 491 | return children, err 492 | } 493 | 494 | if len(children) > 0 {//如果在上一步处理结束后发现包含子descriptor 495 | info := content.Info{ 496 | Digest: desc.Digest, 497 | Labels: map[string]string{}, 498 | } 499 | fields := []string{} 500 | for i, ch := range children { 501 | info.Labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i)] = ch.Digest.String() 502 | fields = append(fields, fmt.Sprintf("labels.containerd.io/gc.ref.content.%d", i)) 503 | } 504 | //将子descriptor作为descriptor的label 505 | _, err := manager.Update(ctx, info, fields...) 506 | if err != nil { 507 | return nil, err 508 | } 509 | } 510 | 511 | return children, err 512 | } 513 | } 514 | 515 | 要注意这些handler在使用过程中是按照顺序来调用,因此首先是baseHandler将任务添加到onGoning任务列表,然后是fetchHandler完成descreiptor的下载任务(落入content对象),最后是childrenHandler的3个子handler完成任务(完成解析,打标签,过滤的操作); 同时要注意dispatch函数是一个深度优先遍历的过程,同时其内部如果有多个descriptor的话,那么本身是一个并行的过程,也就是说不同layer之间实际上是并行下载的。 516 | 517 | ### 参考 518 | http://www.sel.zju.edu.cn/?p=921 519 | 520 | [1]: http://static.zybuluo.com/myecho/5ge7ybzdsb69920sxvkd958k/image.png 521 | -------------------------------------------------------------------------------- /chapter4/section4.2.md: -------------------------------------------------------------------------------- 1 | # image unpack 2 | 3 | ![pull image][1] 4 | 前边介绍过fetch把镜像层内容和config保存进content服务,把镜像相关的元数据保存进images元数据服务中,而unpack过程中如下所述 5 | 6 | > Once the image is pulled, the user can instruct the bundle controller 7 | > to unpack the image into a bundle. Consuming from the content store, 8 | > layers from the image are unpacked into the snapshot component. 
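在进入细节之前,先给出一个用 containerd client 完成 pull 并随即 unpack 的最小示例(socket 路径、namespace、镜像名均为示意),后文分析的 Unpack 流程就是由类似的调用触发的:

    package main

    import (
        "context"
        "log"

        "github.com/containerd/containerd"
        "github.com/containerd/containerd/namespaces"
    )

    func main() {
        // 连接本机 containerd(此处为默认 socket 路径)
        client, err := containerd.New("/run/containerd/containerd.sock")
        if err != nil {
            log.Fatal(err)
        }
        defer client.Close()

        ctx := namespaces.WithNamespace(context.Background(), "default")
        // WithPullUnpack 表示 pull 完成后随即 unpack 到默认的 snapshotter
        img, err := client.Pull(ctx, "docker.io/library/redis:alpine", containerd.WithPullUnpack)
        if err != nil {
            log.Fatal(err)
        }
        log.Println("unpacked image:", img.Name())
    }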
9 | 10 | ### OCI image spec 11 | ![镜像之间关系图][2] 12 | Image Index: 可以理解为Manifest list, 是镜像在不同平台的Manifest的集合, path: https://github.com/ZYecho/image-spec/blob/master/image-index.md 13 | 14 | Manifest是一个镜像描述文件,包含config和layers两个主体,path:https://github.com/ZYecho/image-spec/blob/master/manifest.md 15 | 16 | layers有很多层,其digest对应压缩后的格式比如gzip,tar的sha256值。 17 | 18 | 这里要区分下Manifest和镜像的描述文件的区别,下边给出两个例子 19 | 20 | // Manifest文件,其digest在Image Index中 21 | 22 | { 23 | "schemaVersion": 2, 24 | "mediaType": "application/vnd.docker.distribution.manifest.v2+json", 25 | "config": { 26 | "mediaType": "application/vnd.docker.container.image.v1+json", 27 | "size": 5108, 28 | //对应config文件的digest,需要再去下载 29 | "digest": "sha256:80581db8c700155a91bec6fd6398dad9733135e7c58a19472aa679e8367692ab" 30 | }, 31 | "layers": [ 32 | { 33 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 34 | "size": 2206542, 35 | "digest": "sha256:8e3ba11ec2a2b39ab372c60c16b421536e50e5ce64a0bc81765c2e38381bcff6" 36 | }, 37 | { 38 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 39 | "size": 1249, 40 | "digest": "sha256:1f20bd2a5c234ffab42de6cbf83522946614b21b642a8208dca6b0fd614c31db" 41 | }, 42 | { 43 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 44 | "size": 9071, 45 | "digest": "sha256:782ff7702b5cd0a7c0109740838c74945fc27e4ce34e1028c24bf73f8249a63a" 46 | }, 47 | { 48 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 49 | "size": 9407568, 50 | "digest": "sha256:cd719ead7ee305492514a8dfa2afcd0979a16e8192836b4aaed98d8d932973c0" 51 | }, 52 | { 53 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 54 | "size": 98, 55 | "digest": "sha256:01018940af9a67873ad6737c275cb134372cdf1cda565af58dd14a1e3b85ab2a" 56 | }, 57 | { 58 | "mediaType": "application/vnd.docker.image.rootfs.diff.tar.gzip", 59 | "size": 398, 60 | "digest": "sha256:3f1bfdda9588f5c0643b485580060b460b21b5331f4760778ef3279680e20966" 61 | } 62 | ] 63 | } 64 | 65 | 这是config文件,path: https://github.com/ZYecho/image-spec/blob/master/config.md 66 | 67 | { 68 | "architecture":"amd64", 69 | "config":{ 70 | "Hostname":"", 71 | "Domainname":"", 72 | "User":"", 73 | "AttachStdin":false, 74 | "AttachStdout":false, 75 | "AttachStderr":false, 76 | "ExposedPorts":{ 77 | "6379/tcp":{ 78 | 79 | } 80 | }, 81 | ... 
82 | "history":[ 83 | { 84 | "created":"2018-07-06T14:14:06.165546783Z", 85 | "created_by":"/bin/sh -c #(nop) ADD file:25f61d70254b9807a40cd3e8d820f6a5ec0e1e596de04e325f6a33810393e95a in / " 86 | }, 87 | { 88 | "created":"2018-07-11T00:55:43.769226605Z", 89 | "created_by":"/bin/sh -c apk add --no-cache 'su-exec\u003e=0.2'" 90 | }, 91 | { 92 | "created":"2018-07-11T00:57:21.027940339Z", 93 | "created_by":"/bin/sh -c #(nop) VOLUME [/data]", 94 | "empty_layer":true 95 | }, 96 | { 97 | "created":"2018-07-11T00:57:21.312115762Z", 98 | "created_by":"/bin/sh -c #(nop) WORKDIR /data", 99 | "empty_layer":true 100 | } 101 | ], 102 | "os":"linux", 103 | //这个很重要 104 | "rootfs":{ 105 | "type":"layers", 106 | "diff_ids":[ 107 | "sha256:73046094a9b835e443af1a9d736fcfc11a994107500e474d0abf399499ed280c", 108 | "sha256:9f8f870604a08909589f09337944210db2bf72b2a71f0f707642b3aa9d225f9b", 109 | "sha256:221a0f51690d5f9063c9a113aa5e1340b50ac4474e7525efb9c60b945589110f", 110 | "sha256:1b44c45cbb1c6711042de1c2e8785b554da0d25ed03bbd2a6a13fb498eceb6ae", 111 | "sha256:211a9f3eb69c634f0368994600e7c93df7510f87870da57e51460f177c603ca9", 112 | "sha256:151bcb0152a97b3a445bc2e2ed29432fc77d662fcb746b05b87d77a8d7bbf023" 113 | ] 114 | } 115 | } 116 | 117 | 注意区别一下diffid和manifest layer digest的区别: 118 | 119 | > A layer DiffID is the digest over the layer's uncompressed tar archive 120 | > and serialized in the descriptor digest format 121 | > 122 | > NOTE: Do not confuse DiffIDs with layer digests, often referenced in 123 | > the manifest, which are digests over compressed or uncompressed 124 | > content. 125 | 126 | 127 | Image layout: 128 | https://github.com/ZYecho/image-spec/blob/master/image-layout.md 129 | 这个是说镜像体现在文件系统上的layout,当然可能是存在于tar包或者nfs等。 130 | 131 | ### 流程解析 132 | //snapshotterName default为overlay 133 | func (i *image) Unpack(ctx context.Context, snapshotterName string) error { 134 | // lease的意义?应该是和GC相关 135 | ctx, done, err := i.client.WithLease(ctx) 136 | if err != nil { 137 | return err 138 | } 139 | defer done(ctx) 140 | 141 | layers, err := i.getLayers(ctx, i.platform) 142 | if err != nil { 143 | return err 144 | } 145 | 146 | var ( 147 | sn = i.client.SnapshotService(snapshotterName) 148 | a = i.client.DiffService() 149 | cs = i.client.ContentStore() 150 | 151 | chain []digest.Digest 152 | unpacked bool 153 | ) 154 | for _, layer := range layers { 155 | unpacked, err = rootfs.ApplyLayer(ctx, layer, chain, sn, a) 156 | if err != nil { 157 | return err 158 | } 159 | //unpack成功了 160 | if unpacked { 161 | // Set the uncompressed label after the uncompressed 162 | // digest has been verified through apply. 
163 | // 看起来这个意思是会把compressed的变成uncompressed的?有待确认 164 | cinfo := content.Info{ 165 | Digest: layer.Blob.Digest, 166 | Labels: map[string]string{ 167 | "containerd.io/uncompressed": layer.Diff.Digest.String(), 168 | }, 169 | } 170 | if _, err := cs.Update(ctx, cinfo, "labels.containerd.io/uncompressed"); err != nil { 171 | return err 172 | } 173 | } 174 | 175 | chain = append(chain, layer.Diff.Digest) 176 | } 177 | //最后一层也unpack成功了 178 | if unpacked { 179 | desc, err := i.i.Config(ctx, cs, i.platform) 180 | if err != nil { 181 | return err 182 | } 183 | 184 | rootfs := identity.ChainID(chain).String() 185 | 186 | cinfo := content.Info{ 187 | Digest: desc.Digest, 188 | Labels: map[string]string{ 189 | fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs, 190 | }, 191 | } 192 | if _, err := cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName)); err != nil { 193 | return err 194 | } 195 | } 196 | 197 | return nil 198 | } 199 | 200 | 201 | func (i *image) getLayers(ctx context.Context, platform platforms.MatchComparer) ([]rootfs.Layer, error) { 202 | cs := i.client.ContentStore() 203 | //从content中根据manifest的desc读出符合条件的Manifest对象 204 | manifest, err := images.Manifest(ctx, cs, i.i.Target, platform) 205 | if err != nil { 206 | return nil, err 207 | } 208 | //拿到image config里的diffids 209 | diffIDs, err := i.i.RootFS(ctx, cs, platform) 210 | if err != nil { 211 | return nil, errors.Wrap(err, "failed to resolve rootfs") 212 | } 213 | // 查看镜像层数是否相同 214 | if len(diffIDs) != len(manifest.Layers) { 215 | return nil, errors.Errorf("mismatched image rootfs and manifest layers") 216 | } 217 | layers := make([]rootfs.Layer, len(diffIDs)) 218 | for i := range diffIDs { 219 | //这个是image Config的desc, -> tar 220 | layers[i].Diff = ocispec.Descriptor{ 221 | // TODO: derive media type from compressed type 222 | MediaType: ocispec.MediaTypeImageLayer, 223 | Digest: diffIDs[i], 224 | } 225 | //这个是Manifest里边的desec, -> tar+gzip 226 | layers[i].Blob = manifest.Layers[i] 227 | } 228 | return layers, nil 229 | } 230 | 231 | path: containerd/containerd/rootfs/apply.go 232 | 233 | func applyLayers(ctx context.Context, layers []Layer, chain []digest.Digest, sn snapshots.Snapshotter, a diff.Applier, opts ...snapshots.Opt) error { 234 | var ( 235 | parent = identity.ChainID(chain[:len(chain)-1]) 236 | chainID = identity.ChainID(chain) 237 | layer = layers[len(layers)-1] 238 | diff ocispec.Descriptor 239 | key string 240 | mounts []mount.Mount 241 | err error 242 | ) 243 | 244 | for { 245 | // 注意这个Key并不是完全等价于chainID 246 | key = fmt.Sprintf("extract-%s %s", uniquePart(), chainID) 247 | 248 | // Prepare snapshot with from parent, label as root 249 | // step1 获取到经过COW之后的可挂载的mounts(type可能为bind brtfs等,目录和moby存储目录类似) 250 | mounts, err = sn.Prepare(ctx, key, parent.String(), opts...) 
251 | if err != nil { 252 | if errdefs.IsNotFound(err) && len(layers) > 1 { 253 | if err := applyLayers(ctx, layers[:len(layers)-1], chain[:len(chain)-1], sn, a); err != nil { 254 | if !errdefs.IsAlreadyExists(err) { 255 | return err 256 | } 257 | } 258 | // Do no try applying layers again 259 | layers = nil 260 | continue 261 | } else if errdefs.IsAlreadyExists(err) { 262 | // Try a different key 263 | continue 264 | } 265 | 266 | // Already exists should have the caller retry 267 | return errors.Wrapf(err, "failed to prepare extraction snapshot %q", key) 268 | 269 | } 270 | break 271 | } 272 | defer func() { 273 | if err != nil { 274 | if !errdefs.IsAlreadyExists(err) { 275 | log.G(ctx).WithError(err).WithField("key", key).Infof("apply failure, attempting cleanup") 276 | } 277 | 278 | if rerr := sn.Remove(ctx, key); rerr != nil { 279 | log.G(ctx).WithError(rerr).WithField("key", key).Warnf("extraction snapshot removal failed") 280 | } 281 | } 282 | }() 283 | 284 | // step2, Blob依然是tar+gzip的形式 285 | // 先将mounts挂载到一个temp dir上并且applyDiff,要明白DIffs在mount的path原有挂载点目录里也是可见的,和temp dir在哪没有关系 286 | diff, err = a.Apply(ctx, layer.Blob, mounts) 287 | if err != nil { 288 | err = errors.Wrapf(err, "failed to extract layer %s", layer.Diff.Digest) 289 | return err 290 | } 291 | // 判断一下是否符合预期 292 | if diff.Digest != layer.Diff.Digest { 293 | err = errors.Errorf("wrong diff id calculated on extraction %q", diff.Digest) 294 | return err 295 | } 296 | //step3 297 | if err = sn.Commit(ctx, chainID.String(), key, opts...); err != nil { 298 | err = errors.Wrapf(err, "failed to commit snapshot %s", key) 299 | return err 300 | } 301 | 302 | return nil 303 | } 304 | 305 | // 有兴趣可以跟一下apply的细节 306 | // Apply applies a tar stream of an OCI style diff tar. 307 | // See https://github.com/opencontainers/image-spec/blob/master/layer.md#applying-changesets 308 | func Apply(ctx context.Context, root string, r io.Reader, opts ...ApplyOpt) (int64, error) { 309 | root = filepath.Clean(root) 310 | 311 | var options ApplyOptions 312 | for _, opt := range opts { 313 | if err := opt(&options); err != nil { 314 | return 0, errors.Wrap(err, "failed to apply option") 315 | } 316 | } 317 | if options.Filter == nil { 318 | options.Filter = all 319 | } 320 | 321 | return apply(ctx, root, tar.NewReader(r), options) 322 | } 323 | 324 | 这一节最后有关snapshotter的细节比较多,特别是step1-step3,会在介绍snapshotter和具体storage driver的时候看一下其工作机制和step1-3的细节,并总结回顾一下这部分内容。 325 | 326 | [1]: http://static.zybuluo.com/myecho/5ge7ybzdsb69920sxvkd958k/image.png 327 | [2]: http://static.zybuluo.com/myecho/75bv8w7hnh82usvhwe6kok02/image.png 328 | -------------------------------------------------------------------------------- /chapter4/section4.3.md: -------------------------------------------------------------------------------- 1 | # snapshotter 2 | 3 | ### 为什么要有snapshotter? 4 | https://blog.mobyproject.org/where-are-containerds-graph-drivers-145fc9b7255 5 | 6 | > These differ from the concept of the graphdriver in that the 7 | > Snapshotter has no knowledge of images or containers. Users simply 8 | > prepare and commit directories. We also avoid the integration between 9 | > graph drivers and the tar format used to represent the changesets. 10 | > 11 | > The Snapshotter will only provide mount-oriented snapshot access with 12 | > minimal metadata. Serialization, hashing, unpacking, packing and 13 | > mounting are not included in this design, opting for common 14 | > implementations between graphdrivers, rather than specialized ones. 
15 | > This is less of a problem for performance since direct access to 16 | > changesets is provided in the interface. 17 | > 18 | > The Snapshotter provides an API for allocating, snapshotting and 19 | > mounting abstract, layer-based filesystems. The model works by 20 | > building up sets of directories with parent-child relationships, known 21 | > as Snapshots. 22 | 23 | ### 整体架构和Model 24 | 25 | ![image.png-119.7kB][1] 26 | ![image.png-104.8kB][2] 27 | 28 | > Snapshots are best understood by their lifecycle. Active snapshots are 29 | > always created with Prepare or View from a Committed snapshot 30 | > (including the empty snapshot). Committed snapshots are always created 31 | > with Commit from an Active snapshot. Active snapshots never become 32 | > committed snapshots and vice versa. All snapshots may be removed. 33 | > 34 | > After mounting an Active snapshot, changes can be made to the 35 | > snapshot. The act of committing creates a Committed snapshot. The 36 | > committed snapshot will inherit the parent of the active snapshot. The 37 | > committed snapshot can then be used as a parent. Active snapshots can 38 | > never be used as a parent. 39 | > 40 | > In this diagram, you can see that the active snapshot a is created by 41 | > calling Prepare with the committed snapshot P0. After modification, a 42 | > becomes a' and a committed snapshot P1 is created by calling Commit. 43 | > a' can be further modified as a'' and a second committed snapshot can 44 | > be created as P2 by calling Commit again. Note here that P2's parent 45 | > is P0 and not P1. 46 | 47 | 要搞清楚这个p2的parent为什么是p0,而不是p1? 48 | --- 本质上a''还是从a做改动来的,而p2是会继承a''的parent的,也就是a的来源,p0 snapshot。 49 | 50 | > Types of container filesystems In the container world we use two types 51 | > of filesystems: overlays and snapshotting filesystems. AUFS and 52 | > OverlayFS are overlay filesystems which have multiple directories with 53 | > file diffs for each “layer” in an image. Snapshotting filesystems 54 | > include devicemapper, btrfs, and ZFS which handle file diffs at the 55 | > block level. Overlays usually work on common filesystem types such as 56 | > EXT4 and XFS whereas snapshotting filesystems only run on volumes 57 | > formatted for them. 58 | 59 | 快照类型的文件系统需要底层块设备提前format成对应的特定格式,不能直接运行在common filsystem上比如ext4等 60 | 61 | 目前支持的存储引擎有哪些? 62 | 63 | * native 对应着vfs?对每个image layer直接无脑拷贝,不存在COW的语义 64 | * lcow http://dockone.io/article/3299, https://github.com/moby/moby/pull/34859 65 | * overlay 66 | * aufs 67 | * btrfs 68 | * zfs 69 | 70 | ### 基本流程 71 | 72 | #### Importing a Layer 73 | To import a layer, we simply have the Snapshotter provide a list of mounts to be applied such that our destination will capture a changeset. We start out by getting a path to the layer tar file and creating a temp location to unpack it to: 74 | 75 | layerPath, tmpDir := getLayerPath(), mkTmpDir() // just a path to layer tar file. 76 | 77 | We start by using a Snapshotter to Prepare a new snapshot transaction, using a key and descending from the empty parent "": 78 | 79 | mounts, err := snapshotter.Prepare(key, "") 80 | if err != nil { ... } 81 | 82 | We get back a list of mounts from Snapshotter.Prepare, with the key identifying the active snapshot. Mount this to the temporary location with the following: 83 | 84 | if err := mount.All(mounts, tmpDir); err != nil { ... } 85 | 86 | Once the mounts are performed, our temporary location is ready to capture a diff. In practice, this works similar to a filesystem transaction. 
The next step is to unpack the layer. We have a special function unpackLayer that applies the contents of the layer to target location and calculates the DiffID of the unpacked layer (this is a requirement for docker implementation): 87 | 88 | layer, err := os.Open(layerPath) 89 | if err != nil { ... } 90 | digest, err := unpackLayer(tmpLocation, layer) // unpack into layer location 91 | if err != nil { ... } 92 | 93 | When the above completes, we should have a filesystem the represents the contents of the layer. Careful implementations should verify that digest matches the expected DiffID. When completed, we unmount the mounts: 94 | 95 | unmount(mounts) // optional, for now 96 | 97 | Now that we've verified and unpacked our layer, we commit the active snapshot to a name. For this example, we are just going to use the layer digest, but in practice, this will probably be the ChainID: 98 | 99 | if err := snapshotter.Commit(digest.String(), key); err != nil { ... } 100 | 101 | Now, we have a layer in the Snapshotter that can be accessed with the digest provided during commit. Once you have committed the snapshot, the active snapshot can be removed with the following: 102 | 103 | snapshotter.Remove(key) 104 | 105 | 从上边的描述我们可以看出,上一节unpack中的提到的step1-step3中只有step1 prepare以及step3 commit属于snapshotter的工作范畴,而step2的apply实际上调用的diff服务实现的接口。 106 | 107 | #### Importing the Next Layer 108 | Making a layer depend on the above is identical to the process described above except that the parent is provided as parent when calling Snapshotter.Prepare, assuming a clean tmpLocation: 109 | 110 | mounts, err := snapshotter.Prepare(tmpLocation, parentDigest) 111 | 112 | We then mount, apply and commit, as we did above. The new snapshot will be based on the content of the previous one. 113 | 114 | #### Running a Container 115 | To run a container, we simply provide Snapshotter.Prepare the committed image snapshot as the parent. After mounting, the prepared path can be used directly as the container's filesystem: 116 | 117 | mounts, err := snapshotter.Prepare(containerKey, imageRootFSChainID) 118 | 119 | The returned mounts can then be passed directly to the container runtime. If one would like to create a new image from the filesystem, Snapshotter.Commit is called: 120 | 121 | if err := snapshotter.Commit(newImageSnapshot, containerKey); err != nil { ... } 122 | 123 | Alternatively, for most container runs, Snapshotter.Remove will be called to signal the Snapshotter to abandon the changes. 124 | 125 | #### ctr snapshot命令 126 | 127 | COMMANDS: 128 | commit commit an active snapshot into the provided name 129 | info get info about a snapshot 130 | list, ls list snapshots 131 | mounts, m, mount mount gets mount commands for the snapshots 132 | prepare prepare a snapshot from a committed snapshot 133 | remove, rm remove snapshots 134 | label add labels to content 135 | tree display tree view of snapshot branches 136 | unpack unpack applies layers from a manifest to a snapshot 137 | usage usage snapshots 138 | view create a read-only snapshot from a committed snapshot 139 | 140 | 涵盖了上文中介绍过的operation. 
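把上面 Importing a Layer 的几个片段串起来,可以得到一个导入单层 layer 的简化 Go 示意(unpackLayer 与上文描述一致、此处不展开,错误处理与 DiffID 校验从略,变量名均为假设):

    import (
        "context"
        "io"

        "github.com/containerd/containerd/mount"
        "github.com/containerd/containerd/snapshots"
    )

    func importLayer(ctx context.Context, sn snapshots.Snapshotter,
        layer io.Reader, key, parent, tmpDir string) (string, error) {
        // 1. Prepare:基于 parent 生成 active snapshot,拿到一组 mount
        mounts, err := sn.Prepare(ctx, key, parent)
        if err != nil {
            return "", err
        }
        // 2. 挂载到临时目录,把 layer 内容解包进去(unpackLayer 会顺便算出 DiffID)
        if err := mount.All(mounts, tmpDir); err != nil {
            return "", err
        }
        dgst, err := unpackLayer(tmpDir, layer)
        mount.Unmount(tmpDir, 0) // 示意用,忽略 unmount 错误
        if err != nil {
            return "", err
        }
        // 3. Commit:把 active snapshot 固化为 committed snapshot
        //    这里直接用 digest 命名,实际实现中一般使用 ChainID
        if err := sn.Commit(ctx, dgst.String(), key); err != nil {
            return "", err
        }
        return dgst.String(), nil
    }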
141 | 142 | ### 参考 143 | https://github.com/containerd/containerd/blob/master/design/snapshots.md 144 | https://integratedcode.us/2016/08/30/storage-drivers-in-docker-a-deep-dive/ 145 | https://www.cnblogs.com/breezey/p/9589288.html 146 | 147 | [1]: http://static.zybuluo.com/myecho/yo7ugmvzsg9fauj7b5nqv7x6/image.png 148 | [2]: http://static.zybuluo.com/myecho/qkfamwyg8x3f3vb2ylctkyqg/image.png 149 | 150 | -------------------------------------------------------------------------------- /chapter4/section4.4.md: -------------------------------------------------------------------------------- 1 | # pouch commit实现 2 | 3 | 我们借助pouch commit的实现来看一下,如何使用containerd的diff和snapshot服务。 4 | 首先要明白containerd是不支持build/commit功能的,那么pouch是如何实现的commit命令呢? 5 | 6 | ### docker commit 7 | 要明白pouch commit如何实现,我们首先看一下docker commit是如何实现的,在功能上基本相关,只不过前者依赖containerd提供的grpc服务。 8 | 9 | > 深入学习docker commit 的原理前,我不妨先来看看一下 docker help 中关于 commit 命令的阐述: commit 10 | > Create a new image from a container's changes 结合上图与命令docker commit 11 | > 的描述,我们可以发现有三个关键字Image、Container 与Changes 。如何理解这三个关键字,我们可以从以下三个步骤入手: 12 | 13 | > 1. Docker Daemon 会通过一个 Docker 镜像运行一个 Docker 容器,Docker 通过层级文件系统为 Docker 容器提供文件系统视角,最上层的是可读可写层(Read-Write Layer)。 14 | > 15 | > 2. Docker 容器初始的可读可写层内容均为空,Docker 容器对文件系统的内容更新将全部更新于可读可写层(Read-Write Layer)。 16 | > 17 | > 3. 实现 docker commit 操作时,Docker 仅仅是将可读可写层(Read-Write Layer)中的更新内容,打包为一个全新的镜像。 18 | 19 | 简言之,所谓的commit功能就是将容器最上层的R/W层重新打包成一个新的镜像,至于说如何构造镜像要依赖不同的实现。 20 | 21 | 22 | ### pouch commit实现 23 | 有了上边的介绍,我们对commit的功能有了一个基本的了解,如果让我们在pouch里边去实现commit的话,无非依赖于以下几个操作,Diff操作来提取出R/W读写层,Content服务写入tar包,然后unpack到snapshot中去,我们实际看一下pouch commit的实现看是不是这样做的。 24 | 25 | path: alibaba/pouch/ctrd/image_commit.go 26 | 27 | func (c *Client) Commit(ctx context.Context, config *CommitConfig) (_ digest.Digest, err0 error) { 28 | // get a containerd client 29 | wrapperCli, err := c.Get(ctx) 30 | if err != nil { 31 | return "", fmt.Errorf("failed to get a containerd grpc client: %v", err) 32 | } 33 | client := wrapperCli.client 34 | 35 | var ( 36 | sn = client.SnapshotService(defaultSnapshotterName) 37 | cs = client.ContentStore() 38 | differ = client.DiffService() 39 | ) 40 | 41 | // export new layer 42 | snapshot, err := c.GetSnapshot(ctx, config.ContainerID) 43 | if err != nil { 44 | return "", fmt.Errorf("failed to get snapshot: %s", err) 45 | } 46 | // 调用diff服务,返回描述这一层的descriptor 47 | layer, diffIDStr, err := exportLayer(ctx, snapshot.Name, sn, cs, differ) 48 | if err != nil { 49 | return "", err 50 | } 51 | 52 | // create child image 53 | diffIDDigest, err := digest.Parse(diffIDStr) 54 | if err != nil { 55 | return "", err 56 | } 57 | 58 | //结合parent image构建一个符合oci spec的img描述体 59 | childImg, err := newChildImage(ctx, config, diffIDDigest) 60 | if err != nil { 61 | return "", err 62 | } 63 | 64 | // create new snapshot for new layer 65 | //产生新镜像chainId作为镜像的snapshotKey 66 | snapshotKey := identity.ChainID(childImg.RootFS.DiffIDs).String() 67 | //还是按照pull镜像时候的3步走 先prepare -> apply -> commit 68 | if err = newSnapshot(ctx, config.Image, sn, differ, layer, snapshotKey, diffIDStr); err != nil { 69 | return "", err 70 | } 71 | defer func() { 72 | if err0 != nil { 73 | logrus.Warnf("remove snapshot %s cause commit image failed", snapshotKey) 74 | client.SnapshotService(defaultSnapshotterName).Remove(ctx, snapshotKey) 75 | } 76 | }() 77 | 78 | imgJSON, err := json.Marshal(childImg) 79 | if err != nil { 80 | return "", err 81 | } 82 | 83 | // 以下分别构建新镜像config descriptor的layer descriptor 84 | 85 | // new config descriptor 86 | configDesc := 
ocispec.Descriptor{ 87 | MediaType: configType, 88 | Digest: digest.FromBytes(imgJSON), 89 | Size: int64(len(imgJSON)), 90 | } 91 | 92 | // get parent image layer descriptor 93 | pmfst, err := images.Manifest(ctx, cs, config.CImage.Target(), platforms.Default()) 94 | if err != nil { 95 | return "", err 96 | } 97 | 98 | // new layer descriptor 99 | layers := append(pmfst.Layers, layer) 100 | labels := map[string]string{ 101 | "containerd.io/gc.ref.content.0": configDesc.Digest.String(), 102 | } 103 | for i, l := range layers { 104 | labels[fmt.Sprintf("containerd.io/gc.ref.content.%d", i+1)] = l.Digest.String() 105 | } 106 | 107 | // new manifest descriptor 108 | mfst := ocispec.Manifest{ 109 | Versioned: specs.Versioned{ 110 | SchemaVersion: 2, 111 | }, 112 | Config: configDesc, 113 | Layers: layers, 114 | } 115 | 116 | mfstJSON, err := json.MarshalIndent(mfst, "", " ") 117 | if err != nil { 118 | return "", errors.Wrap(err, "failed to marshal manifest") 119 | } 120 | 121 | mfstDigest := digest.FromBytes(mfstJSON) 122 | mfstDesc := ocispec.Descriptor{ 123 | Digest: mfstDigest, 124 | Size: int64(len(mfstJSON)), 125 | } 126 | 127 | desc := ocispec.Descriptor{ 128 | MediaType: manifestType, 129 | Digest: mfstDigest, 130 | Size: int64(len(mfstJSON)), 131 | } 132 | 133 | // image create 134 | img := images.Image{ 135 | Name: config.Reference, 136 | Target: desc, 137 | CreatedAt: time.Now(), 138 | } 139 | 140 | // register containerd image metadata. 141 | // 向containerd的images注册镜像的元数据 142 | if _, err := client.ImageService().Update(ctx, img); err != nil { 143 | if !errdefs.IsNotFound(err) { 144 | return "", fmt.Errorf("failed to cover exist image %s", err) 145 | } 146 | if _, err := client.ImageService().Create(ctx, img); err != nil { 147 | return "", fmt.Errorf("failed to create new image %s", err) 148 | } 149 | } 150 | 151 | 152 | // write manifest content 153 | //镜像为单位的,在unpack那个环节有介绍 154 | if err := content.WriteBlob(ctx, cs, mfstDigest.String(), bytes.NewReader(mfstJSON), mfstDesc.Size, mfstDesc.Digest, content.WithLabels(labels)); err != nil { 155 | return "", errors.Wrapf(err, "error writing manifest blob %s", mfstDigest) 156 | } 157 | 158 | // write config content 159 | // 实际上就是前边那个oci spec img 160 | if err := content.WriteBlob(ctx, cs, configDesc.Digest.String(), bytes.NewReader(imgJSON), configDesc.Size, configDesc.Digest); err != nil { 161 | return "", errors.Wrap(err, "error writing config blob") 162 | } 163 | 164 | // pouch record config descriptor digest as image id. 
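    // 这与 4.1 节提到的约定一致:image id 本质上就是 image config JSON 的 digest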
165 |         return configDesc.Digest, nil
166 |     }
167 | 
168 | ### 总结
169 | Looking at the commit implementation, the division of responsibility between the diff service and the snapshot service becomes clear: the diff service works on tar streams (packing up the R/W layer, applying diffs), while the snapshot service plays more of a bookkeeping role and connects to the other services through mount points.
170 | 
171 | ### 参考
172 | https://github.com/alibaba/pouch/pull/2125
173 | http://guide.daocloud.io/dcs/docker-commit-9153991.html
174 | 
--------------------------------------------------------------------------------
/chapter5/README.md:
--------------------------------------------------------------------------------
1 | # 存储
2 | 
3 | This chapter takes overlayfs and btrfs as its typical examples to show how union filesystems and snapshot-based filesystems implement the snapshotter.
4 | 
--------------------------------------------------------------------------------
/chapter5/section5.1.md:
--------------------------------------------------------------------------------
1 | # native
2 | 
3 | The native snapshotter does its work on top of an ordinary Linux filesystem such as ext4. It is rarely used in real deployments, but because its details are simple it is a good reference for implementing your own snapshotter and a good way to understand the main snapshotter flow.
4 | 
5 | Our study of the snapshotter focuses on the Prepare and Commit methods.
6 | 
7 |     // the key here is expected to be a chainID
8 |     func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) {
9 |         return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts)
10 |     }
11 | 
12 |     func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) {
13 |         var (
14 |             err error
15 |             path, td string
16 |         )
17 | 
18 |         if kind == snapshots.KindActive || parent == "" {
19 |             // for an active snapshot, or one without a parent, create a directory to hold this layer's content (the equivalent of moby's R/W layer)
20 |             td, err = ioutil.TempDir(filepath.Join(o.root, "snapshots"), "new-")
21 |             if err != nil {
22 |                 return nil, errors.Wrap(err, "failed to create temp dir")
23 |             }
24 |             if err := os.Chmod(td, 0755); err != nil {
25 |                 return nil, errors.Wrapf(err, "failed to chmod %s to 0755", td)
26 |             }
27 |             defer func() {
28 |                 if err != nil {
29 |                     if td != "" {
30 |                         if err1 := os.RemoveAll(td); err1 != nil {
31 |                             err = errors.Wrapf(err, "remove failed: %v", err1)
32 |                         }
33 |                     }
34 |                     if path != "" {
35 |                         if err1 := os.RemoveAll(path); err1 != nil {
36 |                             err = errors.Wrapf(err, "failed to remove path: %v", err1)
37 |                         }
38 |                     }
39 |                 }
40 |             }()
41 |         }
42 | 
43 |         // open a transaction on the MetaStore
44 |         ctx, t, err := o.ms.TransactionContext(ctx, true)
45 |         if err != nil {
46 |             return nil, err
47 |         }
48 |         // record the snapshot's metadata at the storage layer
49 |         s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...)
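        // the returned storage.Snapshot carries the internal ID and the parent ID chain;
        // the directory handling below names the on-disk snapshot dir after s.ID and
        // copies content from the first (direct) parent when one exists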
50 | if err != nil { 51 | if rerr := t.Rollback(); rerr != nil { 52 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 53 | } 54 | return nil, errors.Wrap(err, "failed to create snapshot") 55 | } 56 | 57 | // 如果是需要创建新的目录的话 58 | if td != "" { 59 | // layer have parent 60 | if len(s.ParentIDs) > 0 { 61 | // 直接拿直系parent的即可,不支持多层的parent没有意义 62 | parent := o.getSnapshotDir(s.ParentIDs[0]) 63 | // 因为native filesystem没有提供COW的能力,因此必须全量拷贝parent的目录 64 | if err := fs.CopyDir(td, parent); err != nil { 65 | return nil, errors.Wrap(err, "copying of parent failed") 66 | } 67 | } 68 | // s.ID应该每个snapshot唯一,不能冲突 69 | path = o.getSnapshotDir(s.ID) 70 | // 重命名临时目录,因此path对于snapshot应该是唯一的,和s.ID一一对应 71 | if err := os.Rename(td, path); err != nil { 72 | if rerr := t.Rollback(); rerr != nil { 73 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 74 | } 75 | return nil, errors.Wrap(err, "failed to rename") 76 | } 77 | td = "" 78 | } 79 | 80 | if err := t.Commit(); err != nil { 81 | return nil, errors.Wrap(err, "commit failed") 82 | } 83 | 84 | return o.mounts(s), nil 85 | } 86 | 87 | // 其实就是在bolt的元数据的存储中登记一下 88 | func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount { 89 | var ( 90 | roFlag string 91 | source string 92 | ) 93 | 94 | // 区分是否可读写 95 | if s.Kind == snapshots.KindView { 96 | roFlag = "ro" 97 | } else { 98 | roFlag = "rw" 99 | } 100 | 101 | if len(s.ParentIDs) == 0 || s.Kind == snapshots.KindActive { 102 | source = o.getSnapshotDir(s.ID) 103 | } else { 104 | // 只读的话直接拿parent的即可 105 | source = o.getSnapshotDir(s.ParentIDs[0]) 106 | } 107 | 108 | return []mount.Mount{ 109 | { 110 | Source: source, 111 | // 联合挂载,可在本机上实验一下 112 | Type: "bind", 113 | Options: []string{ 114 | roFlag, 115 | // rbind表示会把原来目录下的挂载点也会一起挂载过去 116 | "rbind", 117 | }, 118 | }, 119 | } 120 | } 121 | 122 | // 在bolt元数据存储中标记一下,表示snapshot为commited状态 123 | func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error { 124 | ctx, t, err := o.ms.TransactionContext(ctx, true) 125 | if err != nil { 126 | return err 127 | } 128 | 129 | id, _, _, err := storage.GetInfo(ctx, key) 130 | if err != nil { 131 | return err 132 | } 133 | 134 | usage, err := fs.DiskUsage(ctx, o.getSnapshotDir(id)) 135 | if err != nil { 136 | return err 137 | } 138 | 139 | if _, err := storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); err != nil { 140 | if rerr := t.Rollback(); rerr != nil { 141 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 142 | } 143 | return errors.Wrap(err, "failed to commit snapshot") 144 | } 145 | return t.Commit() 146 | } 147 | -------------------------------------------------------------------------------- /chapter5/section5.2.md: -------------------------------------------------------------------------------- 1 | # overlayfs 2 | 3 | 4 | > OverlayFS is a modern union filesystem that is similar to AUFS, but 5 | > faster and with a simpler implementation. Docker provides two storage 6 | > drivers for OverlayFS: the original overlay, and the newer and more 7 | > stable overlay2. 8 | 9 | ### 预备知识 10 | https://docs.docker.com/storage/storagedriver/overlayfs-driver/ 11 | 12 | overlay vs overlay2 13 | > If you are still using the overlay driver rather than overlay2, see 14 | > How the overlay driver works instead. 15 | > 16 | > OverlayFS layers two directories on a single Linux host and presents 17 | > them as a single directory. These directories are called layers and 18 | > the unification process is referred to as a union mount. 
OverlayFS 19 | > refers to the lower directory as lowerdir and the upper directory a 20 | > upperdir. The unified view is exposed through its own directory called 21 | > merged. 22 | > 23 | > While the overlay driver only works with a single lower OverlayFS 24 | > layer and hence requires hard links for implementation of 25 | > multi-layered images, the overlay2 driver natively supports up to 128 26 | > lower OverlayFS layers. This capability provides better performance 27 | > for layer-related Docker commands such as docker build and docker 28 | > commit, and consumes fewer inodes on the backing filesystem. 29 | 30 | overlay1存在的问题: 31 | 32 | > While the overlay driver only works with a single lower OverlayFS 33 | > layer and hence requires hard links for implementation of 34 | > multi-layered images, the overlay2 driver natively supports up to 128 35 | > lower OverlayFS layers. This capability provides better performance 36 | > for layer-related Docker commands such as docker build and docker 37 | > commit, and consumes fewer inodes on the backing filesystem. 38 | 39 | 如果还是在同一个文件系统的时候,比如pull镜像的时候是可以使用hard-link的方式来引用低层的镜像层的数据的。 40 | 41 | To create a container, the overlay driver combines the directory representing the image’s top layer plus a new directory for the container. The image’s top layer is the lowerdir in the overlay and is read-only. The new directory for the container is the upperdir and is writable. 42 | 43 | moby overlay的代码:moby/moby/daemon/graphdriver/overlay 44 | * `Create()`方法也就是产生rootfs的过程中都是`copy.DirCopy(parentUpperDir, upperDir, copy.Content, true)`也就是拷贝的内容(将parentUpperDir的内容拷贝到upperDir,因为只支持一层lowerDir,同时不能将parentUpperDir hard-link到upperDir目录,因为upper层是可读写的,hard-link也会破坏原有的parent) 45 | * `ApplyDiff()`也就是构建镜像过程中调用的函数都是使用的hard-link的方式`copy.DirCopy(parentRootDir, tmpRootDir, copy.Hardlink, true)` 46 | 47 | docker commit过程需要`Create() -> Diff() -> ApplyDiff()`的过程,因为`Create()`会拷贝文件内容,因此会消耗多余的inode。 48 | 49 | ### 具体实现 50 | 51 | func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { 52 | return o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts) 53 | } 54 | 55 | func (o *snapshotter) createSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) { 56 | ctx, t, err := o.ms.TransactionContext(ctx, true) 57 | if err != nil { 58 | return nil, err 59 | } 60 | 61 | var td, path string 62 | defer func() { 63 | if err != nil { 64 | if td != "" { 65 | if err1 := os.RemoveAll(td); err1 != nil { 66 | log.G(ctx).WithError(err1).Warn("failed to cleanup temp snapshot directory") 67 | } 68 | } 69 | if path != "" { 70 | if err1 := os.RemoveAll(path); err1 != nil { 71 | log.G(ctx).WithError(err1).WithField("path", path).Error("failed to reclaim snapshot directory, directory may need removal") 72 | err = errors.Wrapf(err, "failed to remove path: %v", err1) 73 | } 74 | } 75 | } 76 | }() 77 | 78 | snapshotDir := filepath.Join(o.root, "snapshots") 79 | td, err = o.prepareDirectory(ctx, snapshotDir, kind) 80 | if err != nil { 81 | if rerr := t.Rollback(); rerr != nil { 82 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 83 | } 84 | return nil, errors.Wrap(err, "failed to create prepare snapshot dir") 85 | } 86 | rollback := true 87 | defer func() { 88 | if rollback { 89 | if rerr := t.Rollback(); rerr != nil { 90 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 91 | } 92 | } 93 | }() 94 | 95 | s, err := storage.CreateSnapshot(ctx, 
kind, key, parent, opts...) 96 | if err != nil { 97 | return nil, errors.Wrap(err, "failed to create snapshot") 98 | } 99 | 100 | if len(s.ParentIDs) > 0 { 101 | st, err := os.Stat(o.upperPath(s.ParentIDs[0])) 102 | if err != nil { 103 | return nil, errors.Wrap(err, "failed to stat parent") 104 | } 105 | 106 | stat := st.Sys().(*syscall.Stat_t) 107 | 108 | // 设置目录权限和parent的相同 109 | if err := os.Lchown(filepath.Join(td, "fs"), int(stat.Uid), int(stat.Gid)); err != nil { 110 | if rerr := t.Rollback(); rerr != nil { 111 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 112 | } 113 | return nil, errors.Wrap(err, "failed to chown") 114 | } 115 | } 116 | 117 | path = filepath.Join(snapshotDir, s.ID) 118 | if err = os.Rename(td, path); err != nil { 119 | return nil, errors.Wrap(err, "failed to rename") 120 | } 121 | td = "" 122 | 123 | // 这里不太理解为什么rollback=false? 124 | rollback = false 125 | if err = t.Commit(); err != nil { 126 | return nil, errors.Wrap(err, "commit failed") 127 | } 128 | 129 | // merge目录不由snapshotter决定,挂载在哪,哪就是merge层 130 | return o.mounts(s), nil 131 | } 132 | 133 | func (o *snapshotter) mounts(s storage.Snapshot) []mount.Mount { 134 | if len(s.ParentIDs) == 0 { 135 | // if we only have one layer/no parents then just return a bind mount as overlay 136 | // will not work 137 | roFlag := "rw" 138 | if s.Kind == snapshots.KindView { 139 | roFlag = "ro" 140 | } 141 | 142 | return []mount.Mount{ 143 | { 144 | Source: o.upperPath(s.ID), 145 | Type: "bind", 146 | Options: []string{ 147 | roFlag, 148 | "rbind", 149 | }, 150 | }, 151 | } 152 | } 153 | var options []string 154 | 155 | if s.Kind == snapshots.KindActive { 156 | options = append(options, 157 | // filepath.Join(o.root, "snapshots", id, "work") 158 | // 文件系统挂载后用于存放临时和间接文件的工作基目录 159 | fmt.Sprintf("workdir=%s", o.workPath(s.ID)), 160 | // upper目录,也就是文件系统存储的主目录,可以认为是container的RW层 161 | // filepath.Join(o.root, "snapshots", id, "fs") 162 | fmt.Sprintf("upperdir=%s", o.upperPath(s.ID)), 163 | ) 164 | } else if len(s.ParentIDs) == 1 { 165 | // 只有一层且只是返回可读层的时候上边注释也有说明,直接返回父亲的bind mount即可 166 | return []mount.Mount{ 167 | { 168 | Source: o.upperPath(s.ParentIDs[0]), 169 | Type: "bind", 170 | Options: []string{ 171 | "ro", 172 | "rbind", 173 | }, 174 | }, 175 | } 176 | } 177 | 178 | // 不论是返回读写层都需要把lowerdir给放到options中去 179 | parentPaths := make([]string, len(s.ParentIDs)) 180 | for i := range s.ParentIDs { 181 | parentPaths[i] = o.upperPath(s.ParentIDs[i]) 182 | } 183 | 184 | options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":"))) 185 | // 具体如何处理由mount包处理,参考containerd/containerd/sys/mount_linux.go 186 | return []mount.Mount{ 187 | { 188 | Type: "overlay", 189 | Source: "overlay", 190 | Options: options, 191 | }, 192 | } 193 | 194 | } 195 | 196 | // 和native的一样的,没什么好说的 197 | func (o *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) error { 198 | ctx, t, err := o.ms.TransactionContext(ctx, true) 199 | if err != nil { 200 | return err 201 | } 202 | 203 | defer func() { 204 | if err != nil { 205 | if rerr := t.Rollback(); rerr != nil { 206 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 207 | } 208 | } 209 | }() 210 | 211 | // grab the existing id 212 | id, _, _, err := storage.GetInfo(ctx, key) 213 | if err != nil { 214 | return err 215 | } 216 | 217 | usage, err := fs.DiskUsage(ctx, o.upperPath(id)) 218 | if err != nil { 219 | return err 220 | } 221 | 222 | if _, err = storage.CommitActive(ctx, key, name, snapshots.Usage(usage), opts...); 
err != nil { 223 | return errors.Wrap(err, "failed to commit snapshot") 224 | } 225 | return t.Commit() 226 | } 227 | 228 | 229 | -------------------------------------------------------------------------------- /chapter5/section5.3.md: -------------------------------------------------------------------------------- 1 | # btrfs 2 | 3 | 4 | ### 预备知识 5 | 对于Docker 社区版本来说,不同linux发行版的选择如下: 6 | ![不同系统支持][1] 7 | 对于不同的文件系统,推荐如下: 8 | ![文件系统所需要支持][2] 9 | 10 | https://docs.docker.com/storage/storagedriver/btrfs-driver/ 11 | 12 | > Btrfs is a next generation copy-on-write filesystem that supports many 13 | > advanced storage technologies that make it a good fit for Docker. 14 | > Btrfs is included in the mainline Linux kernel. 15 | > btrfs requires a dedicated block storage device such as a physical 16 | > disk. This block device must be formatted for Btrfs and mounted into 17 | > /var/lib/docker/. 18 | > 19 | > One of the benefits of Btrfs is the ease of managing Btrfs filesystems 20 | > without the need to unmount the filesystem or restart Docker. 21 | > 22 | > When space gets low, Btrfs automatically expands the volume in chunks 23 | > of roughly 1 GB. 24 | > 25 | > To add a block device to a Btrfs volume, use the btrfs device add and 26 | > btrfs filesystem balance commands. 27 | > 28 | > $ sudo btrfs device add /dev/svdh /var/lib/docker 29 | > $ sudo btrfs filesystem balance /var/lib/docker 30 | > 31 | > With Btrfs, writing and updating lots of small files can result in 32 | > slow performance. 33 | 34 | 适合大I/O的场景,且由于其日志的实现,顺序写的性能也不会很高。 35 | 36 | ### 具体实现 37 | 38 | func (b *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { 39 | return b.makeSnapshot(ctx, snapshots.KindActive, key, parent, opts) 40 | } 41 | 42 | func (b *snapshotter) makeSnapshot(ctx context.Context, kind snapshots.Kind, key, parent string, opts []snapshots.Opt) ([]mount.Mount, error) { 43 | ctx, t, err := b.ms.TransactionContext(ctx, true) 44 | if err != nil { 45 | return nil, err 46 | } 47 | defer func() { 48 | if err != nil && t != nil { 49 | if rerr := t.Rollback(); rerr != nil { 50 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 51 | } 52 | } 53 | }() 54 | 55 | s, err := storage.CreateSnapshot(ctx, kind, key, parent, opts...) 56 | if err != nil { 57 | return nil, err 58 | } 59 | // 形如root/active/snapshotId 60 | // 这里为什么要加kind?不能直接全部放到snapshots里边去? 
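        // (presumably the kind is in the path to keep in-progress subvolumes apart from
        // committed ones: Commit() below re-snapshots root/active/<id> into
        // root/snapshots/<id> as read-only and then deletes the active copy)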
61 | target := filepath.Join(b.root, strings.ToLower(s.Kind.String()), s.ID) 62 | 63 | if len(s.ParentIDs) == 0 { 64 | // create new subvolume 65 | // btrfs subvolume create /dir 66 | // 不做快照直接创建subvolume 67 | if err = btrfs.SubvolCreate(target); err != nil { 68 | return nil, err 69 | } 70 | } else { 71 | // root/snapshots/parent (单亲关系,不存在多个父节点的情况) 72 | /* 73 | var ( 74 | active = filepath.Join(root, "active") 75 | view = filepath.Join(root, "view") 76 | snapshots = filepath.Join(root, "snapshots") 77 | ) 78 | */ 79 | parentp := filepath.Join(b.root, "snapshots", s.ParentIDs[0]) 80 | 81 | var readonly bool 82 | if kind == snapshots.KindView { 83 | readonly = true 84 | } 85 | 86 | // btrfs subvolume snapshot /parent /subvol 87 | // 通过snapshot进行创建是因为其底层共享storage pool的数据,符合COW的语义 88 | if err = btrfs.SubvolSnapshot(target, parentp, readonly); err != nil { 89 | return nil, err 90 | } 91 | } 92 | err = t.Commit() 93 | t = nil 94 | if err != nil { 95 | if derr := btrfs.SubvolDelete(target); derr != nil { 96 | log.G(ctx).WithError(derr).WithField("subvolume", target).Error("failed to delete subvolume") 97 | } 98 | return nil, err 99 | } 100 | 101 | return b.mounts(target, s) 102 | } 103 | 104 | func (b *snapshotter) mounts(dir string, s storage.Snapshot) ([]mount.Mount, error) { 105 | var options []string 106 | 107 | // get the subvolume id back out for the mount 108 | sid, err := btrfs.SubvolID(dir) 109 | if err != nil { 110 | return nil, err 111 | } 112 | 113 | options = append(options, fmt.Sprintf("subvolid=%d", sid)) 114 | 115 | if s.Kind != snapshots.KindActive { 116 | options = append(options, "ro") 117 | } 118 | 119 | return []mount.Mount{ 120 | { 121 | Type: "btrfs", 122 | Source: b.device, 123 | // NOTE(stevvooe): While it would be nice to use to uuids for 124 | // mounts, they don't work reliably if the uuids are missing. 125 | Options: options, 126 | }, 127 | }, nil 128 | } 129 | 130 | // 这里和前边简介的commit操作有所不同,不是只做了元数据的处理 131 | func (b *snapshotter) Commit(ctx context.Context, name, key string, opts ...snapshots.Opt) (err error) { 132 | usage, err := b.usage(ctx, key) 133 | if err != nil { 134 | return errors.Wrap(err, "failed to compute usage") 135 | } 136 | 137 | ctx, t, err := b.ms.TransactionContext(ctx, true) 138 | if err != nil { 139 | return err 140 | } 141 | defer func() { 142 | if err != nil && t != nil { 143 | if rerr := t.Rollback(); rerr != nil { 144 | log.G(ctx).WithError(rerr).Warn("failed to rollback transaction") 145 | } 146 | } 147 | }() 148 | 149 | id, err := storage.CommitActive(ctx, key, name, usage, opts...) 
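        // CommitActive marks the metadata record as committed under the given name and
        // returns the internal snapshot ID, which is reused below as the subvolume directory name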
// TODO(stevvooe): Resolve a usage value for btrfs
150 |         if err != nil {
151 |             return errors.Wrap(err, "failed to commit")
152 |         }
153 | 
154 |         source := filepath.Join(b.root, "active", id)
155 |         target := filepath.Join(b.root, "snapshots", id)
156 | 
157 |         // the active subvolume must be turned into a read-only snapshot under root/snapshots, because Prepare resolves parents from the snapshots directory
158 |         // at Commit time: active -> snapshot
159 |         // at Prepare time: parent snapshot -> new (active) snapshot, which keeps the parent snapshot read-only
160 |         if err := btrfs.SubvolSnapshot(target, source, true); err != nil {
161 |             return err
162 |         }
163 | 
164 |         err = t.Commit()
165 |         t = nil
166 |         if err != nil {
167 |             if derr := btrfs.SubvolDelete(target); derr != nil {
168 |                 log.G(ctx).WithError(derr).WithField("subvolume", target).Error("failed to delete subvolume")
169 |             }
170 |             return err
171 |         }
172 | 
173 |         if derr := btrfs.SubvolDelete(source); derr != nil {
174 |             // Log as warning, only needed for cleanup, will not cause name collision
175 |             log.G(ctx).WithError(derr).WithField("subvolume", source).Warn("failed to delete subvolume")
176 |         }
177 | 
178 |         return nil
179 |     }
180 | 
181 | [1]: https://raw.githubusercontent.com/zhangchenchen/zhangchenchen.github.io/hexo/images/20180309151325-linux-distribution.jpg
182 | [2]: https://raw.githubusercontent.com/zhangchenchen/zhangchenchen.github.io/hexo/images/20180309151640-file-system.jpg
183 | 
184 | 
--------------------------------------------------------------------------------