├── LICENSE
├── README.md
└── flink
    ├── .DS_Store
    ├── flink-scheduler
        ├── .DS_Store
        ├── flink-scheduler.md
        ├── flink-slot-group.png
        ├── flink-streaming-deploy-flow.png
        └── slots_parallelism.svg
    ├── flink-watermark-checkpoint
        ├── .DS_Store
        ├── checkpoint-event-flow.png
        ├── flink-watermark-checkpoint.md
        ├── fs-snapshort-extend.png
        ├── state-backend-extend.png
        ├── state-describtor.png
        └── value-state-extend.png
    ├── flink基本组件和JobGraph的生成
        ├── .DS_Store
        ├── flink-cluster-start-flow.png
        ├── flink-datastream-extend.png
        ├── flink-job-graph-create.png
        ├── flink-on-yarn-arch.png
        ├── flink基本组件和逻辑计划生成.md
        ├── index.md
        ├── stream-operator-extend.png
        └── transformation-to-node.png
    ├── flink对用户代码异常处理
        └── flink用户异常处理.md
    ├── flink物理计划生成
        ├── .DS_Store
        ├── execution-many-one.png
        ├── execution-one-many.png
        ├── execution-vertex-one-to-one.png
        ├── flink-job-vertex-to-execution.png
        ├── flink物理计划生成.md
        ├── job-graph-node-sort.png
        └── jobclient-to-jobmanager.png
    ├── flink算子的生命周期
        ├── .DS_Store
        ├── flink-operator-extend.png
        ├── flink算子生命周期.md
        ├── op-chain-internal.png
        ├── op-chian-chianable.png
        ├── operator-chain-simple.png
        └── stream-task-extend.png
    ├── flink网络栈
        ├── .DS_Store
        ├── flink-network-dataflow.png
        ├── flink网络栈.md
        ├── intermediate-result.png
        ├── netty-client-server.png
        └── sub-partition.png
    ├── jobmanager基本组件
        ├── .DS_Store
        ├── blob-server-cache-store.png
        ├── blob-server-contact.png
        ├── blob-server-store-dirctory-tree.png
        ├── blob-service-extends-arch.png
        ├── jobmanager基本组件.md
        └── zk-state-handle-storage.png
    ├── taskmanager基本组件
        ├── .DS_Store
        ├── file-channel-entend.png
        ├── io-manager-async.png
        ├── memory-segment-extend.png
        └── taskmanager基本组件.md
    └── 简历-陈玉兆.pdf


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # flink-source-code-analysis
 2 | *Apache Flink 源码分析系列，基于 git tag 1.1.2*
 3 | 
 4 | Apache Flink 被视为第四代的大数据处理框架，它融合了流式计算和批处理【批处理被视为流式计算的特例】
 5 | 
 6 | 在流式计算方面，使用分布式快照【Checkpoint】实现了高效的数据不丢的机制从而实现 exactly-once 的计算语义；使用 WaterMark 技术实现了窗口计算中延迟数据的处理，同时对流式计算的窗口时间加以分类：processing time、ingestion time、event time
 7 | 
 8 | 本人觉得 flink 的这些特性一定程度上可以窥探出大数据的未来方向，所以花了些时间来阅读源码，先共享出来希望和大家一起探讨
 9 | 
10 | 目前分为以下几个专题：
11 | 
12 | - flink 基本组件和逻辑计划：介绍了 flink 的基本组件、集群构建的过程、以及客户端逻辑计划的生成过程
13 | - flink 物理计划生成：介绍了 flink JobManager 对逻辑计划的运行时抽象，运行时物理计划的生成和管理等
14 | - jobmanager 基本组件：介绍了 JobManager 的核心组件，它们各自承担的作用
15 | - flink 算子的生命周期：介绍了 flink 的算子从构建、生成、运行、及销毁的过程
16 | - taskmanager 的基本组件：介绍了 flink TaskManager 的核心组件，它们在执行节点上发挥的作用
17 | - flink 网络栈：介绍了 flink 网络层的抽象，包括中间结果抽象、输入输出管理、BackPressure 技术、Netty 连接等
18 | - flink-watermark-checkpoint：介绍 flink 的核心特性：watermark 对计算时间的管理、checkpoint 实现 exactly-once 计算语义
19 | - flink-scheduler：介绍 flink 的任务调度算法及负载均衡
20 | - flink对用户代码异常处理：介绍作业的代码异常后 flink 的处理逻辑，从而更好的理解 flink 是如何保证了 exactly-once 的计算语义
21 | 
22 | 
23 | 
24 | 另：本人已将内容更新到博客上： [玉兆的博客](http://chenyuzhao.me) 欢迎访问和吐槽！
25 | 
26 | 
27 | 
28 | 本人会陆续更新，欢迎随时交流！
29 | 
30 | 


--------------------------------------------------------------------------------
/flink/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/.DS_Store


--------------------------------------------------------------------------------
/flink/flink-scheduler/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-scheduler/.DS_Store


--------------------------------------------------------------------------------
/flink/flink-scheduler/flink-scheduler.md:
--------------------------------------------------------------------------------
  1 | # flink任务调度与负载均衡
  2 | 
  3 | ## 前言
  4 | 
  5 | 前面已经介绍了一系列的 flink 任务抽象、网络传输、可靠性机制等细节，有了这些铺垫，终于可以开心的介绍 flink 的任务调度机制了，也是不易^_^
  6 | 
  7 | 因为没有这些铺垫，就无法明白 flink 为什么要设计这样的一套调度机制！所以本章节讲解时会多穿插一些为什么
  8 | 
  9 | ## 资源组
 10 | 
 11 | ### 资源组模型
 12 | 
 13 | flink 的一个 Instance 可以被划分出多个 Slot，通过初始参数可以指定，他们既可以是 SimpleSlot，也可以是同时跑多个 task 的 SharedSlot，为了约束 task 之间的运行时的绑定关系，flink 抽象出了 SlotSharingGroup 和 CoLocationGroup 的概念。
 14 | 
 15 | 一个 SlotSharingGroup 规定了一个 Job 的 DAG 图中的哪些 JobVertex 的 sub task 可以部署到一个 SharedSlot 上，这是一个软限制，并不是一定会满足，只是调度的时候有位置偏好，而 CoLocationGroup 是在 SlotSharingGroup 的基础上的硬限制，它限定了 CoLocationGroup 中的 JobVertex 中的 sub task 运行必须是一一对应的：假如 CoLocationGroup 限定了 JobVertex A 和 B ，那么 A 的编号为 i 的 sub task 必须和 B 的编号为 i 的 sub task 跑在一起。假如一个 job 的运算逻辑包括 source -> head -> tail -> sink，那么它的 task 运行时限制关系见下图：
 16 | 
 17 | ![flink-slot-group.png](flink-slot-group.png)
 18 | 
 19 | ### 资源组
 20 | 
 21 | #### SlotSharingGroup
 22 | 
 23 | 上面已经提到 SlotSharingGroup 具有绑定 JobVertex 的 sub task 运行的作用，用户可以自己为 JobVertex 定义一个 SlotSharingGroup，如果不定义的话使用名为 default 的 SlotSharingGroup，定义的接口如下：
 24 | 
 25 | `someStream.filter(...).slotSharingGroup("name");`
 26 | 
 27 | #### ColocationGroup
 28 | 
 29 | ColocationGroup 通过 CoLocationConstraint 来管理一个 SharedSlot 上的 sub task
 30 | 
 31 | 用户同样可以通过 api 定义 ColocationGroup：
 32 | 
 33 | ### 资源Slot
 34 | 
 35 | 一个 TaskManager 在初始化时可以指定自己最大持有的 Slot 数，包括 SharedSlot 和 SimpleSlot。
 36 | 
 37 | flink 使用 slot 作为资源抽象【主要是 cpu 和 memory】，一个 Instance 可以持有多个 SharedSlot，一个 SharedSlot 可以并行执行多个 sub task，对于 PIPELINED 来说，一种典型的模式就是一个 SharedSlot 同时执行一个 job 每个 JobVertex 上的一个并行 task，这样不仅可以尽量保证每个 Instance 上的任务负载尽量均匀，也能最大化的利用 PIPELINED 的流水线处理特性优化网络传输。
 38 | 
 39 | flink 的 slot 有两种：SharedSlot 和 SimpleSlot，前者可以绑定执行多个 sub task，后者代表一个 task 的资源占用。
 40 | 
 41 | #### SharedSlot
 42 | 
 43 | 一个 SharedSlot 可以拥有多个 SimpleSlot，也可以包含嵌套的 SharedSlot【ColocationConstraint】，这样便形成了树形结构，SimpleSlot 和 SharedSlot 继承自共同的接口：Slot，它们都包含如下的关键信息：
 44 | 
 45 | - jobID：被哪个 job 占有
 46 | - groupID：属于哪个 SlotSharingGroup
 47 | - instance：属于哪个 TaskManager，或者属于哪个物理节点
 48 | - status：当前分配状态，共有四种状态 ALLOCATED_AND_ALIVE、CANCELLED、RELEASED、unknown
 49 | 
 50 | *只有定义了 SlotSharingGroup 时才会通过 SharedSlot 来绑定 sub task 的执行*
 51 | 
 52 | #### SimpleSlot
 53 | 
 54 | SimpleSlot 是执行单个 task 的 slot 抽象，它既可以在 TaskManager 上独立存在，也可以作为 SharedSlot 的子节点，内部封装了一个 task 的一次 Execution
 55 | 
 56 | 继承关系如下图：
 57 | 
 58 | SharedSlot 可以视作管理 SimpleSlot 的工具，那么 SharedSlot 自身又由什么方式管理呢？
 59 | 
 60 | #### SlotSharingGroupAssignment
 61 | 
 62 | flink 通过抽象 SlotSharingGroupAssignment 来管理 SharedSlot，这里的资源以 JobVertex 为粒度划分 group，也就是一个 JobVertex 占有一个资源 group。
 63 | 
 64 | ##### Slot初始划分
 65 | 
 66 | SlotSharingGroupAssignment 是如何添加一个初始的 SharedSlot 节点的呢？
 67 | 
 68 | ```java
 69 | //SlotSharingGroupAssignment line174
 70 | private SimpleSlot addSharedSlotAndAllocateSubSlot(SharedSlot sharedSlot, Locality locality,
 71 | 													JobVertexID groupId, CoLocationConstraint constraint) {
 72 | 		// sanity checks
 73 | 		if (!sharedSlot.isRootAndEmpty()) {
 74 | 			throw new IllegalArgumentException("The given slot is not an empty root slot.");
 75 | 		}
 76 | ```
 77 | 
 78 | 总结其逻辑：
 79 | 
 80 | - 这里必须是一个根节点 SharedSlot【没有父亲节点和子节点】，也就是 TaskManager 被配置的一个 Slot
 81 | - 如果没有强制的 task 位置绑定【ColocationConstraint】，从根 SharedSlot 上分配一个 SimpleSlot，编号为递增的 simple slot 个数
 82 | - 如果有 ColocationConstraint 限制，为传入的 SharedSlot 生成一个子 SharedSlot 并分配 SimpleSlot 注册到 ColocationConstraint 中
 83 | - 如果申请到 slot【simple or shared】，设置位置偏好：LOCAL、NON_LOCAL、UNCONSTRAINED【相对于持有的 Instance 来说】，并且将这个 SharedSlot 加入到其它 JobVertex 的可调度资源队列中，也就是说其它的 JobVertex 都可以在这个 SharedSlot 上部署自己的 sub task
 84 | 
 85 | ##### 为Task分配 SharedSlot
 86 | 
 87 | 最底层的分配策略：
 88 | 
 89 | ```java
 90 | //SlotSharingGroupAssignment line408
 91 | private Pair<SharedSlot, Locality> getSlotForTaskInternal(AbstractID groupId,
 92 | 																Iterable<Instance> preferredLocations,
 93 | 																boolean localOnly)
 94 | 	{
 95 | 		// check if there is anything at all in this group assignment
 96 | 		if (allSlots.isEmpty()) {
 97 | 			return null;
 98 | 		}
 99 | 
100 | 		// get the available slots for the group
101 | 		Map<Instance, List<SharedSlot>> slotsForGroup = availableSlotsPerJid.get(groupId);
102 | ```
103 | 
104 | 总结其逻辑：
105 | 
106 | - 判断对应的 group 是否有资源，如果没有将 SlotSharingGroupAssignment 目前所有的 SharedSlot 槽位视为可用资源
107 | - 如果对偏好位置有要求，从 group 里筛选是否有满足的 SharedSlot，如果有设置 Locality 为 LOCAL 并返回，同时从该 group 的资源组中移除该 SharedSlot
108 | - 如果对位置有要求但是没有找到符合的 SharedSlot，则从资源组里选择第一个可用的 SharedSlot，并将 Locality 设置为 `Locality.NON_LOCAL`
109 | - 如果对位置没要求，则从资源组里选择第一个可用的 SharedSlot，并将 Locality 设置为 `Locality.UNCONSTRAINED`
110 | - 如果没资源，返回 null
111 | - 一旦一个 group 中的 SharedSlot 被分出去就会被从资源池中删除，也就是说一个 SharedSlot 不可能分配一个 JobVertex 的两个 sub tasks，这一点非常重要
112 | 
113 | ###### 无 CoLocationConstraint 限制的资源划分策略
114 | 
115 | 主要是走上面的逻辑，细节这里就不说了
116 | 
117 | ###### 有 CoLocationConstraint 限制的资源划分策略
118 | 
119 | 有 CoLocationConstraint 限制的时候，优先考虑 CoLocationConstraint 中的 SharedSlot【如果之前 CoLocationGroup 中的其它 task 分配过】，如果 CoLocationConstraint 中还没有分配 SharedSlot 则重新分配，并且再分配一个 SharedSlot 子节点，在这个节点上划出 SimpleSlot 供 task 使用
120 | 
121 | ```java
122 | //SlotSharingGroupAssignment line333
123 | SimpleSlot getSlotForTask(CoLocationConstraint constraint, Iterable<Instance> locationPreferences) {
124 | 		synchronized (lock) {
125 | 			if (constraint.isAssignedAndAlive()) {
126 | 				// the shared slot of the co-location group is initialized and set we allocate a sub-slot
127 | 				final SharedSlot shared = constraint.getSharedSlot();
128 | 				SimpleSlot subslot = shared.allocateSubSlot(null);
129 | 				subslot.setLocality(Locality.LOCAL);
130 | 				return subslot;
131 | 			}
132 | 			else if (constraint.isAssigned()) {
133 | 				// we had an assignment before.
134 | ```
135 | 
136 | - 如果之前 CoLocationGroup 中的其它 sub task 有分配过资源，直接复用这个资源，对应 Locality 属性为 `Locality.LOCAL`
137 | - 如果之前分配过，但是该 SharedSlot 却被标记死亡，那么依据之前的 SharedSlot 的所在节点重新分配一次 SharedSlot，在此基础上再分配一个 SharedSlot，后分配 SimpleSlot 返回，对应 Locality 属性为 `Locality.LOCAL`
138 | - 如果是第一次分配，依据节点的偏好位置为参考，并在此基础上再分配一个 SharedSlot，后分配 SimpleSlot 返回，对应 Locality 属性为 `Locality.LOCAL`
139 | 
140 | ## 调度器
141 | 
142 | flink 调度器的调度单位被抽象为一个 ScheduledUnit，一个 ScheduledUnit 封装了以下信息：Execution、SlotSharingGroup、CoLocationConstraint
143 | 
144 | flink 的关于调度的细节全部集成于 Scheduler
145 | 
146 | ### 调度细节
147 | 
148 |  首先来明确下 Scheduler 的调度核心：
149 | 
150 | ```java
151 | //Scheduler line156
152 | private Object scheduleTask(ScheduledUnit task, boolean queueIfNoResource) throws NoResourceAvailableException {
153 | 		if (task == null) {
154 | 			throw new NullPointerException();
155 | 		}
156 | 		if (LOG.isDebugEnabled()) {
157 | 			LOG.debug("Scheduling task " + task);
158 | 		}
159 | ```
160 | 
161 | 总结其逻辑：
162 | 
163 | - 先判断该类型的 ExecutionVertex 是否已强制指定执行节点
164 | - 如果有设置 SlotSharingGroup ，拿到对应的 SlotSharingGroupAssignment 和 ColocationConstraint，走上面描述的接口拿到分配的 SimpleSlot A，如果有 ColocationConstraint，会锁定位置，表示已经获取 SharedSlot 并分配完成，如果这个 SimpleSlot 的 Locality 是 LOCAL【预期的位置】，则立即返回，否则走下面的流程
165 | - 如果上面的过程没有获取到 SimpleSlot，那么表示当前已经没有符合要求的 SharedSlot，这时候会重新分配一个新的 SharedSlot，方式是先遍历当前有资源的 Instance 依据偏好位置找到其中一个，并在 SlotSharingGroupAssignment 中注册返回一个新的 SimpleSlot B【细节上面介绍 SlotSharingGroupAssignment 时已说明】
166 | - 如果已走到这一步，比较 A 与 B 的优劣，主要是调度位置是否符合预期的比较，选择更优的，释放掉另一个
167 | - 如果没有 SlotSharingGroup 的约束，直接从 Instance 上申请一个根 SimpleSlot 来执行这个 task
168 | - 以上调度器在分配 SharedSlot 的时候维护了一个队列：`instancesWithAvailableResources`，每次有 Slot 资源的 Instance 被加入队列尾部，消费过的 Slot 会被 remove，这样可以轮询机器，可以使机器的 SharedSlot 分配尽量均衡
169 | 
170 | #### 约束信息的生成
171 | 
172 | 影响上面调度预期位置有三个重要因素：SlotSharingGroup、ColocationConstraint、preferredLocations，我们逐一分析它们的生成逻辑：
173 | 
174 | ##### SlotSharingGroup
175 | 
176 | 向上追溯我们发现，Scheduler 的调度逻辑由 Execution 触发：
177 | 
178 | ```java
179 | //Execution line265
180 | public boolean scheduleForExecution(Scheduler scheduler, boolean queued) throws NoResourceAvailableException {
181 |    if (scheduler == null) {
182 |       throw new IllegalArgumentException("Cannot send null Scheduler when scheduling execution.");
183 |    }
184 | 
185 |    final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
186 |    final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();
187 | 
188 |    // sanity check
189 |    if (locationConstraint != null && sharingGroup == null) {
190 |       throw new RuntimeException("Trying to schedule with co-location constraint but without slot sharing allowed.");
191 |    }
192 |  
193 | //StreamingJobGraphGenerator line416
194 | private void setSlotSharing() {
195 | 
196 | 		Map<String, SlotSharingGroup> slotSharingGroups = new HashMap<>();
197 | 
198 | 		for (Entry<Integer, JobVertex> entry : jobVertices.entrySet()) {
199 | 
200 | 			String slotSharingGroup = streamGraph.getStreamNode(entry.getKey()).getSlotSharingGroup();
201 | 
202 | 			SlotSharingGroup group = slotSharingGroups.get(slotSharingGroup);
203 | 			if (group == null) {
204 | 				group = new SlotSharingGroup();
205 | 				slotSharingGroups.put(slotSharingGroup, group);
206 | 			}
207 | 			entry.getValue().setSlotSharingGroup(group);
208 | 		}
209 |   
210 | //StreamGraphGenerator line577
211 | private String determineSlotSharingGroup(String specifiedGroup, Collection<Integer> inputIds) {
212 | 		if (specifiedGroup != null) {
213 | 			return specifiedGroup;
214 | 		} else {
215 | 			String inputGroup = null;
216 | 			for (int id: inputIds) {
217 | 				String inputGroupCandidate = streamGraph.getSlotSharingGroup(id);
218 | 				if (inputGroup == null) {
219 | 					inputGroup = inputGroupCandidate;
220 | 				} else if (!inputGroup.equals(inputGroupCandidate)) {
221 | 					return "default";
222 | 				}
223 | 			}
224 | 			return inputGroup == null ? "default" : inputGroup;
225 | 		}
226 | 	}
227 | ```
228 | 
229 | 总结其逻辑：
230 | 
231 | - 如果用户没有为 JobVertex 指定 SlotSharingGroup ，则生成名为 'default' 的 SlotSharingGroup，否则为每个用户指定的名字定义一个 SlotSharingGroup
232 | - 同一个 SlotSharingGroup 中的节点的 sub task 会共享 SharedSlot 资源
233 | 
234 | ##### ColocationConstraint
235 | 
236 | ```java
237 | //CoLocationGroup line81
238 | public CoLocationConstraint getLocationConstraint(int subtask) {
239 | 		ensureConstraints(subtask + 1);
240 | 		return constraints.get(subtask);
241 | 	}
242 | 	
243 | 	private void ensureConstraints(int num) {
244 | 		if (constraints == null) {
245 | 			constraints = new ArrayList<CoLocationConstraint>(num);
246 | 		} else {
247 | 			constraints.ensureCapacity(num);
248 | 		}
249 | 		
250 | 		if (num > constraints.size()) {
251 | 			constraints.ensureCapacity(num);
252 | 			for (int i = constraints.size(); i < num; i++) {
253 | 				constraints.add(new CoLocationConstraint(this));
254 | 			}
255 | 		}
256 | 	}
257 | ```
258 | 
259 | 总结其逻辑：
260 | 
261 | - CoLocationGroup 的生成逻辑和 SlotSharingGroup 类似，不过这个没有默认，需要用户手动指定，并且前提是需要有 SlotSharingGroup
262 | - 从 CoLocationGroup 内 sub task 的 index 获取一个 ColocationConstraint，这样便实现了一一对应关系
263 | 
264 | ##### preferredLocations
265 | 
266 | 这是调度位置信息的关键，不管是 CoLocationGroup 还是 SlotSharingGroup，都会优先参考节点偏好来申请资源，那么 flink 是依据什么信息来生成偏好位置的呢？
267 | 
268 | ```java
269 | //ExecutionVertex line373
270 | public Iterable<Instance> getPreferredLocations() {
271 | 		// if we have hard location constraints, use those
272 | 		List<Instance> constraintInstances = this.locationConstraintInstances;
273 | 		if (constraintInstances != null && !constraintInstances.isEmpty()) {
274 | 			return constraintInstances;
275 | 		}
276 | 
277 | 		// otherwise, base the preferred locations on the input connections
278 | 		if (inputEdges == null) {
279 | 			return Collections.emptySet();
280 | 		}
281 | ```
282 | 
283 | 总结其逻辑：
284 | 
285 | - flink 计算节点的上游所有生产者所在节点，并作为自己的偏好位置
286 | - 清空之前旧的偏好位置
287 | 
288 | *这样就形成了最开始【资源组模型】一节中的调度模式，因为一开始的 source task 显然没有 preferredLocations，由调度细节可以知道 flink 会轮询集群的不同 Instance，将 source task 分配在这些机器上，后面的 source task 的 consumer task 会优先调度到 source task 的节点上，这样便形成了一开始的调度模式！*
289 | 
290 | ### 触发调度
291 | 
292 | - 第一次提交任务时
293 | - 有新的 Instance 或者新的 Slot 获取时，会轮询排队的调度任务进行调度
294 | 
295 | ### 调度流程
296 | 
297 | *这里只介绍 streaming 的流程，批处理类似，有兴趣的童鞋自行研究！*
298 | 
299 | 先梳理代码逻辑：
300 | 
301 | ```java
302 | //ExecutionGraph line 716
303 | //schedule from source JobVertex first
304 | public void scheduleForExecution(Scheduler scheduler) throws JobException {
305 | 		if (scheduler == null) {
306 | 			throw new IllegalArgumentException("Scheduler must not be null.");
307 | 		}
308 | 
309 | 		if (this.scheduler != null && this.scheduler != scheduler) {
310 | 			throw new IllegalArgumentException("Cannot use different schedulers for the same job");
311 | 		}
312 | 
313 | 		if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
314 | 			this.scheduler = scheduler;
315 | 
316 | 			switch (scheduleMode) {
317 | 
318 | 				case FROM_SOURCES:
319 | 					// simply take the vertices without inputs.
320 | 					for (ExecutionJobVertex ejv : this.tasks.values()) {
321 | 						if (ejv.getJobVertex().isInputVertex()) {
322 | 							ejv.scheduleAll(scheduler, allowQueuedScheduling);
323 | 						}
324 | 					}
325 | 					break;
326 | 
327 | //NetworkEnvironment line348
328 | //For PIPELINED, eagerlyDeployConsumers is true: once the source tasks are deployed and registered in NetworkEnvironment, deployment of the consumer tasks is triggered
329 | for (ResultPartition partition : producedPartitions) {
330 | 			// Eagerly notify consumers if required.
331 | 			if (partition.getEagerlyDeployConsumers()) {
332 | 				jobManagerNotifier.notifyPartitionConsumable(
333 | 						partition.getJobId(), partition.getPartitionId());
334 | 			}
335 | 		}
336 |                 
337 | //NetworkEnvironment line467
338 | public void notifyPartitionConsumable(JobID jobId, final ResultPartitionID partitionId) {
339 | 
340 | 			final ScheduleOrUpdateConsumers msg = new ScheduleOrUpdateConsumers(jobId, partitionId);
341 | 
342 | 			Future<Object> futureResponse = jobManager.ask(msg, jobManagerMessageTimeout);
343 | 
344 | 			...
345 | //Explain why for PIPELINED, eagerlyDeployConsumers is always true here.
346 | //JobVertex line371
347 | public JobEdge connectNewDataSetAsInput(
348 | 			JobVertex input,
349 | 			DistributionPattern distPattern,
350 | 			ResultPartitionType partitionType,
351 | 			boolean eagerlyDeployConsumers) {
352 | 
353 | 		IntermediateDataSet dataSet = input.createAndAddResultDataSet(partitionType);
354 | 		dataSet.setEagerlyDeployConsumers(eagerlyDeployConsumers);
355 | 
356 | 		JobEdge edge = new JobEdge(dataSet, this, distPattern);
357 | 		this.inputs.add(edge);
358 | 		dataSet.addConsumer(edge);
359 | 		return edge;
360 | 	}
361 |   
362 | //StreamingJobGraphGenerator line371
363 | if (partitioner instanceof ForwardPartitioner) {
364 | 			downStreamVertex.connectNewDataSetAsInput(
365 | 				headVertex,
366 | 				DistributionPattern.POINTWISE,
367 | 				ResultPartitionType.PIPELINED,
368 | 				true);
369 | 		} else if (partitioner instanceof RescalePartitioner){
370 | 			downStreamVertex.connectNewDataSetAsInput(
371 | 				headVertex,
372 | 				DistributionPattern.POINTWISE,
373 | 				ResultPartitionType.PIPELINED,
374 | 				true);
375 | 		} else {
376 | 			downStreamVertex.connectNewDataSetAsInput(
377 | 					headVertex,
378 | 					DistributionPattern.ALL_TO_ALL,
379 | 					ResultPartitionType.PIPELINED,
380 | 					true);
381 | 		}
382 |               
383 | //ResultPartition line244
384 | //Also, for a ResultPartition, once it has produced its first buffer, it will notify the JobManager to deploy its consumer tasks
385 | public void add(Buffer buffer, int subpartitionIndex) throws IOException {
386 | 		boolean success = false;
387 | 
388 | 		try {
389 | 			checkInProduceState();
390 | 
391 | 			final ResultSubpartition subpartition = subpartitions[subpartitionIndex];
392 | 
393 | 			synchronized (subpartition) {
394 | 				success = subpartition.add(buffer);
395 | 
396 | 				// Update statistics
397 | 				totalNumberOfBuffers++;
398 | 				totalNumberOfBytes += buffer.getSize();
399 | 			}
400 | 		}
401 | 		finally {
402 | 			if (success) {
403 | 				notifyPipelinedConsumers();
404 | 			}
405 | 			else {
406 | 				buffer.recycle();
407 | 			}
408 | 		}
409 | 	}
410 |   
411 | //ResultPartition line440
412 | //Only PIPELINED will work here
413 | private void notifyPipelinedConsumers() throws IOException {
414 | 		if (partitionType.isPipelined() && !hasNotifiedPipelinedConsumers) {
415 | 			partitionConsumableNotifier.notifyPartitionConsumable(jobId, partitionId);
416 | 
417 | 			hasNotifiedPipelinedConsumers = true;
418 | 		}
419 | 	}
420 | ```
421 | 
422 | 简单总结：
423 | 
424 | - 对于 PIPELINED，在 source tasks 部署完成后立马会触发一次下游 consumer tasks 的部署
425 | - 在生产者 task 产生第一个 buffer 数据的时候也会触发一次 consumer tasks 的部署
426 | 
427 | 附一张图解释该流程：
428 | 
429 | ![flink-streaming-deploy-flow.png](flink-streaming-deploy-flow.png)
430 | 
431 | 再附上官方的一张经典调度图：
432 | 
433 | ![slots_parallelism.svg](slots_parallelism.svg)
434 | 
435 | ### Slot的资源隔离
436 | 
437 | 待更新


--------------------------------------------------------------------------------
/flink/flink-scheduler/flink-slot-group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-scheduler/flink-slot-group.png


--------------------------------------------------------------------------------
/flink/flink-scheduler/flink-streaming-deploy-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-scheduler/flink-streaming-deploy-flow.png


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/.DS_Store


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/checkpoint-event-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/checkpoint-event-flow.png


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/flink-watermark-checkpoint.md:
--------------------------------------------------------------------------------
  1 | # Flink Watermark And Checkpoint
  2 | 
  3 | ## 前言
  4 | 
  5 | 在前面一章 flink 网络栈的讲解中，我们介绍了 Barrier 的改变以及 Barrier 在 InputGate 消费数据的过程中扮演的时间对齐作用，同时，我们介绍了 InputProcessor 负责数据读取，同时会追踪 watermark 时间并分发到下游。这里我们从 InputProcessor 开始讲起
  6 | 
  7 | 为什么将 Watermark 和 Checkpoint 放在一起讲，是因为它们在原理上有相似之处：上游节点逐级广播消息给下游节点来完成一次行为
  8 | 
  9 | ## WaterMark
 10 | 
 11 | ### WaterMark是什么
 12 | 
 13 | Watermark 是协调窗口计算的一种方式，它告诉了算子时间不大于 WaterMark 的消息不应该再被接收【如果出现意味着延迟到达】。WaterMark 从源算子开始 emit，并逐级向下游算子传递，算子需要依据自己的缓存策略在适当的时机将 WaterMark 传递到下游。当源算子关闭时，会发射一个携带 `Long.MAX_VALUE` 值时间戳的 WaterMark，下游算子接收到之后便知道不会再有消息到达。
 14 | 
 15 | Flink 提供三种消息时间特性：EventTime【消息产生的时间】、ProcessingTime【消息处理时间】 和 IngestionTime【消息流入 flink 框架的时间】，WaterMark 只在时间特性 EventTime 和 IngestionTime 起作用，并且 IngestionTime 的时间等同于消息的 ingestion 时间。
 16 | 
 17 | ### WaterMark的协调与分发
 18 | 
 19 | 对于 watermark 的协调与分发集中在 InputProcessor 的 processInput 方法中，下面我们来详细分析其逻辑：
 20 | 
 21 | ```java
 22 | //StreamInputProcessor line134
 23 | public boolean processInput(OneInputStreamOperator<IN, ?> streamOperator, final Object lock) throws Exception {
 24 | 		if (isFinished) {
 25 | 			return false;
 26 | 		}
 27 | 		if (numRecordsIn == null) {
 28 | 			numRecordsIn = streamOperator.getMetricGroup().counter("numRecordsIn");
 29 | 		}
 30 | 
 31 | 		while (true) {
 32 | 			if (currentRecordDeserializer != null) {
 33 | 				DeserializationResult result = currentRecordDeserializer.getNextRecord(deserializationDelegate);
 34 | 
 35 | 				if (result.isBufferConsumed()) {
 36 | 					currentRecordDeserializer.getCurrentBuffer().recycle();
 37 | 					currentRecordDeserializer = null;
 38 | 				}
 39 | ```
 40 | 
 41 | 总结其逻辑：
 42 | 
 43 | - 如果消费到的消息是一个 WaterMark，获得其对应的 source channel id 并将时间更新进去，同时记录下当前所有 channel 的最小 WaterMark 时间
 44 | - 如果当前最小 WaterMark 时间【所有的 channel 都至少消费到该时间】大于上次发射给下游的 WaterMark 时间，则更新 WaterMark 时间并将其交给算子处理
 45 | - 通常算子在处理【尤其是涉及了窗口计算或者需要时间缓存策略的算子】后会将 WaterMark 继续往下游广播发送
 46 | 
 47 | ### WaterMark 的来源
 48 | 
 49 | 上面我们提到 WaterMark 最初由源算子负责发射到下游，那么它的生成规则是什么呢？又是如何协调的？
 50 | 
 51 | *我们来看一个源算子的实现便知*
 52 | 
 53 | 在第一章 flink 逻辑计划生成，我们了解了 flink 所有的源算子都继承自 SourceFunction 接口，SourceFunction 定义了管理消息发射环境的接口 SourceContext，SourceContext 的具体实现在 StreamSource 中，一共有三种：NonTimestampContext、AutomaticWatermarkContext、ManualWatermarkContext，我们来逐一分析。
 54 | 
 55 | #### NonTimestampContext
 56 | 
 57 | 适用于时间特性：TimeCharacteristic#ProcessingTime，顾名思义，不会 emit 任何 WaterMark
 58 | 
 59 | #### AutomaticWatermarkContext
 60 | 
 61 | 自动发射 WaterMark，适用于 TimeCharacteristic#IngestionTime ，也就是源算子的处理时间。flink 起了一个timer task 专门以一定的 interval 间隔发射 WaterMark，一个 Interval 内所有 Record 的发射时间处于上次 emit 的 WaterMark 和下次将要 emit 的 WaterMark 之间，Interval 边界到达后会提升下一个 WaterMark 时间，计算本次的 WaterMark 时间并 emit 出去。
 62 | 
 63 | 自动 emit watermark 的 interval 默认是 200ms，这个默认值是在设置时间特性时写死的（用户仍可以通过 `ExecutionConfig#setAutoWatermarkInterval` 自行覆盖），具体见：
 64 | 
 65 | ```java
 66 | //StreamExecutionEnvironment line598
 67 | public void setStreamTimeCharacteristic(TimeCharacteristic characteristic) {
 68 | 		this.timeCharacteristic = Preconditions.checkNotNull(characteristic);
 69 | 		if (characteristic == TimeCharacteristic.ProcessingTime) {
 70 | 			getConfig().setAutoWatermarkInterval(0);
 71 | 		} else {
 72 | 			getConfig().setAutoWatermarkInterval(200);
 73 | 		}
 74 | 	}
 75 | ```
 76 | 
 77 | 
 78 | 
 79 | #### ManualWatermarkContext
 80 | 
 81 | 用户自己指定 WaterMark 时间，适用于 TimeCharacteristic#EventTime，用户需要提供依据源消息提取 Watermark 时间的工具 function
 82 | 
 83 | *那么用户有哪些指定时间的逻辑呢？*
 84 | 
 85 | ##### TimestampAssigner
 86 | 
 87 | flink 通过接口 TimestampAssigner 来让用户依据消息的格式自己抽取可能被用于 WaterMark 的 timestamp，它只定义了一个接口：`long extractTimestamp(T element, long previousElementTimestamp);`
 88 | 
 89 | 而 TimestampAssigner 的两个继承接口 AssignerWithPunctuatedWatermarks 以及 AssignerWithPeriodicWatermarks 定义了两种典型的时间戳生成规则：
 90 | 
 91 | - AssignerWithPunctuatedWatermarks：依据消息中事件元素及自带 timestamp 来驱动 watermark 的递增
 92 | - AssignerWithPeriodicWatermarks：依据消息中的 timestamp 周期性地驱动 watermark 的递增
 93 | 
 94 | 各个场景可以依据业务的需求去继承和实现
 95 | 
 96 | ## Checkpoint
 97 | 
 98 | CheckPoint 是 flink 保证消息不丢的机制，通过 Barrier 的方式来协调时机，那么什么是 Barrier 呢？
 99 | 
100 | 其实前一章介绍 flink 网络栈 的时候已经有介绍在消费端 flink 对于不同的 Barrier 处理，实际上，Barrier 是用来校准 checkpoint 的方式。由于对于一个拓扑结构，只有上游算子 checkpoint 完，下游算子的 checkpoint 才能开始并有意义，同时下游算子的消费速率并不统一【有的 channel 快，有的 channel 慢】，而 Barrier 就是这样一种协调上下游算子的机制。
101 | 
102 | JobManager 统一通知源算子发射 Barrier 事件，并向下游广播，当下游算子收到这样的事件后，它就知道自己处于两次 checkpoint 之间【一次新的 checkpoint 将被发起】
103 | 
104 | 当下游算子收到了它所有的 InputChannel 的 Barrier 事件后，它便知道上游算子的一次 checkpoint 已完成，自己也可以做 checkpoint 了，完成之后继续将 checkpoint 事件广播到下游算子
105 | 
106 | 在 Exactly-once 语义下，消费端会延迟消费并校准不同 channel 的消费速率，这在 flink 网络栈一章有详细介绍！
107 | 
108 | ### Checkpoint 的协调与发起
109 | 
110 | 前面提到过 checkpoint 统一由 JobManager 发起，我们来看相关逻辑：
111 | 
112 | #### CheckpointCoordinator
113 | 
114 | flink 的 checkpoint 统一由 CheckpointCoordinator 来协调，通过将 checkpoint 命令事件发送给相关的 tasks 【源 tasks】，它发起 checkpoint 并且收集 checkpoint 的 ack 消息。
115 | 
116 | ##### 构造参数
117 | 
118 | 这里有必要了解一些关键参数，以便我们更加了解 Checkpoint 的细节策略
119 | 
120 | - baseInterval：快照的间隔
121 | - checkpointTimeout：一次 checkpoint 的超时时间，超时的 checkpoint 会被取消
122 | - maxConcurrentCheckpointAttempts：最多可同时存在的 checkpoint 任务，是对于整个 flink job
123 | - tasksToTrigger：触发分布式 Checkpoint 的起始 tasks，也就是 source tasks
124 | 
125 | ##### Checkpoint的发起
126 | 
127 | 前面的章节我们介绍过 ExecutionGraph【flink物理计划抽象】，它有一个核心接口 `enableSnapshotCheckpointing` ，这个接口在 JobManager 提交作业的时候被执行，具体见`JobManager line1238 from method submitJob`。这个接口的逻辑总结如下：
128 | 
129 | - 获取 checkpoint 的发起节点【源节点】，需要 ack 和 commit 的节点【所有节点】
130 | - 先关闭已有 checkpoint
131 | - 实例化 CheckpointCoordinator 和它的监听 Akka 系统 CheckpointCoordinatorDeActivator，并将 CheckpointCoordinatorDeActivator 注册为 ExecutionGraph 的 listener，当作业的执行状态变为 RUNNING 时，会通知 CheckpointCoordinatorDeActivator 启动 CheckpointCoordinator 的 checkpoint 线程
132 | 
133 | *那么 CheckpointCoordinator 在收到这样的消息后会怎么处理呢？*
134 | 
135 | 它会发起一个 timer task，定时执行，并且传入的时间为当前的系统时间，由于 CheckpointCoordinator 全局只有一个，这个时间也是全局递增并且唯一的：
136 | 
137 | ```java
138 | //CheckpointCoordinator line 1020
139 | private class ScheduledTrigger extends TimerTask {
140 | 
141 | 		@Override
142 | 		public void run() {
143 | 			try {
144 | 				triggerCheckpoint(System.currentTimeMillis());
145 | 			}
146 | 			catch (Exception e) {
147 | 				LOG.error("Exception while triggering checkpoint", e);
148 | 			}
149 | 		}
150 | 	}
151 | ```
152 | 
153 | *下面我们来具体分析 checkpoint 的一些核心动作*
154 | 
155 | ###### checkpoint 的触发
156 | 
157 | ```java
158 | //CheckpointCoordinator line389
159 | public boolean triggerCheckpoint(long timestamp, long nextCheckpointId) throws Exception {
160 | 		// make some eager pre-checks
161 | 		synchronized (lock) {
162 | 			// abort if the coordinator has been shutdown in the meantime
163 | 			if (shutdown) {
164 | 				return false;
165 | 			}
166 | //...
167 | ```
168 | 
169 | 总结逻辑：
170 | 
171 | - 如果已关闭、需要优先处理排队请求，或者并发任务总数超过限制，都会取消此次 checkpoint 的发起
172 | - 如果最小间隔时间未达到，也会取消此次 checkpoint
173 | - check 所有的发起节点【源节点】与其他节点都为 RUNNING 状态后才会发起 checkpoint
174 | - 发起 checkpoint 并生成一个 PendingCheckpoint 【已经发起但尚未 ack 的 checkpoint】
175 | - 每个源节点都会发一条消息给自己的 TaskManager 进行 checkpoint
176 | 
177 | ###### 取消 CheckPoint 消息的处理
178 | 
179 | ```java
180 | //CheckpointCoordinator line568
181 | public boolean receiveDeclineMessage(DeclineCheckpoint message) throws Exception {
182 |    if (shutdown || message == null) {
183 |       return false;
184 |    }
185 |    if (!job.equals(message.getJob())) {
186 |       LOG.error("Received DeclineCheckpoint message for wrong job: {}", message);
187 |       return false;
188 |    }
189 | //...
190 | ```
191 | 
192 | 总结其逻辑：
193 | 
194 | - 如果有对应的 PendingCheckpoint ，取消掉并且如果在其之后还有其它 checkpoint 的话，重新发起它们的 checkpoint 任务
195 | 
196 | ###### Ack Checkpoint 消息的处理
197 | 
198 | ```java
199 | //CheckpointCoordinator line651
200 | public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws Exception {
201 | 		if (shutdown || message == null) {
202 | 			return false;
203 | 		}
204 | 		if (!job.equals(message.getJob())) {
205 | 			LOG.error("Received AcknowledgeCheckpoint message for wrong job: {}", message);
206 | 			return false;
207 | 		}
208 | ```
209 | 
210 | 总结其逻辑：
211 | 
212 | - 通过消息里的 checkpoint id 找到对应的  PendingCheckpoint，记录下 对应的 JobVertex 下某个 ExecutionVertex 的 ack 状态
213 | - PendingCheckpoint 里维护了该次 checkpoint 需要 ack 的全部 ExecutionVertex
214 | - 如果全部 ack 完成，则清除 PendingCheckpoint 里维护的状态数据并将句柄转化给 CompletedCheckpoint 来维护
215 | - 丢弃过时的 checkpoint 任务，并重新触发新的 checkpoint
216 | - 如果全部 ack 完成，通知对应的 TaskManager checkpoint 已完成【checkpoint commit 阶段】，然后通过 CompletedCheckpointStore 将 CompletedCheckpoint 序列化并存储，高可用模式下为 ZK 的方式，具体细节见章节：【flink job manager 基本组件】，将来恢复时，将每个节点需要的句柄注入到状态中，之后算子启动时将状态数据附属于 TaskDeploymentDescriptor 之中分发给 TaskManager 去执行
217 | 
218 | ### Checkpoint 的消息流
219 | 
220 | 上面我们说到 TaskManager 收到 AbstractCheckpointMessage 消息，并处理，我们来看核心逻辑：
221 | 
222 | ```java
223 | //TaskManager line520
224 | private def handleCheckpointingMessage(actorMessage: AbstractCheckpointMessage): Unit = {
225 | 
226 |     actorMessage match {
227 |       case message: TriggerCheckpoint =>
228 |         val taskExecutionId = message.getTaskExecutionId
229 |         val checkpointId = message.getCheckpointId
230 |         val timestamp = message.getTimestamp
231 | 
232 |         log.debug(s"Receiver TriggerCheckpoint $checkpointId@$timestamp for $taskExecutionId.")
233 |         
234 | //...
235 | //Task.java line927
236 | public void triggerCheckpointBarrier(final long checkpointID, final long checkpointTimestamp) {
237 | 		AbstractInvokable invokable = this.invokable;
238 | 
239 | 		if (executionState == ExecutionState.RUNNING && invokable != null) {
240 | 			if (invokable instanceof StatefulTask) {
241 | 
242 | 				// build a local closure
243 | 				final StatefulTask<?> statefulTask = (StatefulTask<?>) invokable;
244 | 				final String taskName = taskNameWithSubtask;       
245 | 
246 | //...
247 | //StreamTask line577
248 | protected boolean performCheckpoint(final long checkpointId, final long timestamp) throws Exception {
249 | 		LOG.debug("Starting checkpoint {} on task {}", checkpointId, getName());
250 | 		
251 | 		synchronized (lock) {
252 | 			if (isRunning) {
253 | ```
254 | 
255 | 总结其逻辑：
256 | 
257 | - 先是通过 TaskManager 进行消息路由，对于 TriggerCheckpoint 消息，会路由给相应的 Task 做处理
258 | - Task 会起一个异步 task 进行 checkpoint，内部是调用 StreamTask 的 performCheckpoint 方法
259 | - performCheckpoint 内部首先先将此次 checkpoint 的 barrier 广播到下游，以便让下游快速 checkpoint
260 | - 后执行具体的 checkpoint，将状态持久化，目前支持的持久化方式有：FileSystem、Memory、RocksDB，成功后通知 JobManager 进行 ack，否则取消此次 checkpoint
261 | - 如果是 ack 消息，依据具体情况通知对应的 KVState
262 | 
263 | 附一张图描述交互过程：
264 | 
265 | ![checkpoint-event-flow.png](checkpoint-event-flow.png)
266 | 
267 | ### Checkpoint 的存储和恢复
268 | 
269 | Checkpoint 的存储和恢复均是通过 AbstractStateBackend 来完成，AbstractStateBackend 有三个实现类，FsStateBackend 是通过 HDFS 来存储 checkpoint 状态，继承关系如下：
270 | 
271 | ![state-backend-extend.png](state-backend-extend.png)
272 | 
273 | *我们来看最常见的一种 FsStateBackend*，AbstractStateBackend 内部通过 State 来管理状态数据，依据状态数据的不同特性，状态分为四种：
274 | 
275 | - ValueState ：最简单的状态，一个 key 一个单值 value，可以更新和删除
276 | - ListState：一个 key 对应一个 value list
277 | - ReducingState：一个 key 对应的 value 可以进行 reduce 操作
278 | - FoldingState：一个key，后续添加的值都会通过 folding 函数附加到第一个值上
279 | 
280 | AbstractStateBackend 内部通过 KvState 接口来管理用户自定义的 kv 数据，我们来看 FsValueState 的继承关系：
281 | 
282 | ![value-state-extend.png](value-state-extend.png)
283 | 
284 | 那么如何获取这些 State 呢？flink 抽象了另一套接口：StateDescriptor 来获取 State，通过绑定特定的 StateBackend 来获取。这样一层抽象，解耦了 State 的类型和底层的具体的存储实现。我们来看 StateDescriptor 的继承关系：
285 | 
286 | ![state-describtor.png](state-describtor.png)
287 | 
288 | 那么这些抽象是如何协调工作的呢？
289 | 
290 | ```java
291 | //KvState 的初始化和获取
292 | //AbstractKeyedCEPPatternOperator line93
293 | public void open() throws Exception {
294 | 		if (keys == null) {
295 | 			keys = new HashSet<>();
296 | 		}
297 | 
298 | 		if (nfaOperatorState == null) {
299 | 			nfaOperatorState = getPartitionedState(
300 | 					new ValueStateDescriptor<NFA<IN>>(
301 | 						NFA_OPERATOR_STATE_NAME,
302 | 						new NFA.Serializer<IN>(),
303 | 						null));
304 | 		}
305 | 
306 | //AbstractStreamOperator line273
307 | protected <S extends State> S getPartitionedState(StateDescriptor<S, ?> stateDescriptor) throws Exception {
308 | 		return getStateBackend().getPartitionedState(null, VoidSerializer.INSTANCE, stateDescriptor);
309 | 	}
310 |   
311 | //AbstractStateBackend line205 
312 | //具体的 kvState 由子类具体实现来决定
313 | public <N, S extends State> S getPartitionedState(final N namespace, final TypeSerializer<N> namespaceSerializer, final StateDescriptor<S, ?> stateDescriptor) throws Exception {
314 | 
315 | 		if (keySerializer == null) {
316 | 			throw new RuntimeException("State key serializer has not been configured in the config. " +
317 | 					"This operation cannot use partitioned state.");
318 | 		}
319 | 		
320 | 		if (!stateDescriptor.isSerializerInitialized()) {
321 | 			stateDescriptor.initializeSerializerUnlessSet(new ExecutionConfig());
322 | 		}
323 | //获取 KvState 后，用户经过一番更新...下面是快照的过程
324 | // StateBackend 的创建
325 | //AbstractStreamOperator line114 
326 | try {
327 | 			TypeSerializer<Object> keySerializer = config.getStateKeySerializer(getUserCodeClassloader());
328 | 			// if the keySerializer is null we still need to create the state backend
329 | 			// for the non-partitioned state features it provides, such as the state output streams
330 | 			String operatorIdentifier = getClass().getSimpleName() + "_" + config.getVertexID() + "_" + runtimeContext.getIndexOfThisSubtask();
331 | 			stateBackend = container.createStateBackend(operatorIdentifier, keySerializer);
332 | 		} catch (Exception e) {
333 | 			throw new RuntimeException("Could not initialize state backend. ", e);
334 | 		}
335 | //StreamTask line102
336 | public AbstractStateBackend createStateBackend(String operatorIdentifier, TypeSerializer<?> keySerializer) throws Exception {
337 | 		AbstractStateBackend stateBackend = configuration.getStateBackend(userClassLoader);
338 | 
339 | 		if (stateBackend != null) {
340 | 			// backend has been configured on the environment
341 | 			LOG.info("Using user-defined state backend: " + stateBackend);
342 | 		}
343 | //快照入口
344 | //AbstractStreamOperator line179
345 | public StreamTaskState snapshotOperatorState(long checkpointId, long timestamp) throws Exception {
346 | 		// here, we deal with key/value state snapshots
347 | 		
348 | 		StreamTaskState state = new StreamTaskState();
349 | 
350 | 		if (stateBackend != null) {
351 | 			HashMap<String, KvStateSnapshot<?, ?, ?, ?, ?>> partitionedSnapshots =
352 | 				stateBackend.snapshotPartitionedState(checkpointId, timestamp);
353 | 			if (partitionedSnapshots != null) {
354 | 				state.setKvStates(partitionedSnapshots);
355 | 			}
356 | 		}
357 | 		return state;
358 | 	} 
359 | ```
360 | 
361 | 上面的快照执行结束后，用户会获取 KvStateSnapshot 抽象，对于 FsState 来说，其内部封装了文件句柄以及序列化元数据等信息，同时提供了恢复快照的接口，其抽象关系如下：
362 | 
363 | ![fs-snapshort-extend.png](fs-snapshort-extend.png)
364 | 
365 | flink  进一步将每个 task 的每个 operator 快照后获取的 KvStateSnapshot 封装成 StreamTaskState，并最终获取一个 StreamTaskState List【对应一个 task 的一组 operators】，封装成 StreamTaskStateList，随后通知 JobManager 的 CheckpointCoordinator：
366 | 
367 | ```java
368 | //RuntimeEnvironment line260
369 | AcknowledgeCheckpoint message = new AcknowledgeCheckpoint(
370 | 				jobId,
371 | 				executionId,
372 | 				checkpointId,
373 | 				serializedState,
374 | 				stateSize);
375 | 
376 | 		jobManager.tell(message);
377 | ```
378 | 
379 | JobManager  再将这些句柄的数据再快照到本地和zk，具体见 JobManager 基本组件。恢复的过程是逆向的，暂时就不分析了，有耐心的用户可以自行查看源码!


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/fs-snapshort-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/fs-snapshort-extend.png


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/state-backend-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/state-backend-extend.png


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/state-describtor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/state-describtor.png


--------------------------------------------------------------------------------
/flink/flink-watermark-checkpoint/value-state-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink-watermark-checkpoint/value-state-extend.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/.DS_Store


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/flink-cluster-start-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/flink-cluster-start-flow.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/flink-datastream-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/flink-datastream-extend.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/flink-job-graph-create.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/flink-job-graph-create.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/flink-on-yarn-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/flink-on-yarn-arch.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/flink基本组件和逻辑计划生成.md:
--------------------------------------------------------------------------------
  1 | # Flink 基本组件和逻辑计划生成
  2 | 
  3 | ## 概要和背景
  4 | 
  5 | *flink*是一个被誉为 *the 4th G* 的计算框架，不同的框架特性及其代表项目列表如下：
  6 | 
  7 | | 第一代              | 第二代                       | 第三代                                      | 第四代                                      |
  8 | | ---------------- | ------------------------- | ---------------------------------------- | ---------------------------------------- |
  9 | | Batch            | **Batch** **Interactive** | **Batch** **Interactive** **Near-Real-Time** **Iterative-processing** | **Hybrid** **Interactive** **Real-Time-Streaming** **Native-Iterative-processing** |
 10 | |                  | DAG Dataflows             | RDD                                      | Cyclic Dataflows                         |
 11 | | Hadoop MapReduce | TEZ                       | Spark                                    | Flink                                    |
 12 | 
 13 | 本文主要介绍*flink*的核心组件以及逻辑计划的生成过程
 14 | 
 15 | *参考代码分支 flink-1.1.2*
 16 | 
 17 | ## 核心组件介绍
 18 | 
 19 | *这里只介绍 on yarn 模式下的组件*
 20 | 
 21 | *flink* 的 on yarn 模式支持两种不同的类型：
 22 | 
 23 | 1. 单作业单集群
 24 | 2. 多作业单集群
 25 | 
 26 | 首先介绍 *单作业单集群* 的架构，单作业单集群下一个正常的 *flink* 程序会拥有以下组件
 27 | 
 28 | ---
 29 | 
 30 | job Cli: 非 detached 模式下的客户端进程，用以获取 yarn Application Master 的运行状态并将日志输出到终端
 31 | 
 32 | JobManager[JM]: 负责作业的运行时计划 ExecutionGraph 的生成、物理计划生成和作业调度
 33 | 
 34 | TaskManager[TM]: 负责被分发 task 的执行、心跳/状态上报、资源管理
 35 | 
 36 | ---
 37 | 
 38 | 整体的架构大致如下图所示：
 39 | 
 40 | ![flink on yarn](flink-on-yarn-arch.png)
 41 | 
 42 | 下面将以一次 Job 的提交过程描述 *flink* 的各组件的作用及协同
 43 | 
 44 | ### 作业提交流程分析
 45 | 
 46 | 单作业单集群模式下，一个作业会启动一个 JM，并依据用户的参数传递启动相应数量的 TM，每个 TM 运行在 yarn 的一个 container 中，
 47 | 
 48 | 一个通常的 flink on yarn 提交命令：`./bin/flink run -m yarn-cluster -yn 2 -j flink-demo-1.0.0-with-dependencies.jar -ytm 1024 -yst 4 -yjm 1024 --yarnname flink_demo_waimai_e`。*flink* 在收到这样一条命令后会首先通过 Cli 获取 flink 的配置，并解析命令行参数。
 49 | 
 50 | #### 配置加载
 51 | 
 52 | `CliFrontend.java` 是 flink 提交作业的入口
 53 | 
 54 | ```java
 55 | //CliFrontend line144
 56 | public CliFrontend() throws Exception {
 57 |    this(getConfigurationDirectoryFromEnv());
 58 | }
 59 | ```
 60 | 
 61 | 这里会尝试加载 conf 文件夹下的所有 yaml 文件，配置文件的命名并没有强制限制
 62 | 
 63 | #### 参数解析
 64 | 
 65 | 解析命令行参数的第一步是路由用户的命令，然后交由`run`方法去处理
 66 | 
 67 | ```java
 68 | //CliFrontend line993
 69 | try {
 70 |     return SecurityUtils.runSecured(new SecurityUtils.FlinkSecuredRunner<Integer>() {
 71 | 	    @Override
 72 | 	    public Integer run() throws Exception {
 73 | 	        return CliFrontend.this.run(params);
 74 | 		});
 75 | 	}
 76 | 	catch (Exception e) {
 77 | 		return handleError(e);
 78 | 	}
 79 | ```
 80 | 
 81 | 接下来是程序参数设置过程，*flink* 将 jar包路径和参数配置封装成了 `PackagedProgram` 
 82 | 
 83 | ```java
 84 | //CliFrontend line223
 85 | PackagedProgram program;
 86 | try {
 87 |    LOG.info("Building program from JAR file");
 88 |    program = buildProgram(options);
 89 | }
 90 | catch (Throwable t) {
 91 |    return handleError(t);
 92 | }
 93 | ```
 94 | 
 95 | #### flink集群的构建
 96 | 
 97 | ##### 集群类型的解析
 98 | 
 99 | 获取参数后下一步就是集群的构建和部署，flink 通过 两个不同的 `CustomCommandLine ` 来实现不同集群模式的解析，分别是 `FlinkYarnSessionCli`和 `DefaultCLI` 【吐槽一下 flink 类名的命名规范】解析命令行参数
100 | 
101 | ```java
102 | //CliFrontend line125
103 | static {
104 |    /** command line interface of the YARN session, with a special initialization here
105 |     *  to prefix all options with y/yarn. */
106 |    loadCustomCommandLine("org.apache.flink.yarn.cli.FlinkYarnSessionCli", "y", "yarn");
107 |    customCommandLine.add(new DefaultCLI());
108 | }
109 | ...
110 | //line882 这里将决定Cli的类型
111 | CustomCommandLine<?> activeCommandLine = getActiveCustomCommandLine(options.getCommandLine());
112 | ```
113 | 
114 | 那么什么时候解析成 Yarn Cluster 什么时候解析成 Standalone 呢？由于`FlinkYarnSessionCli`被优先添加到`customCommandLine`,所以会先触发下面这段逻辑
115 | 
116 | ```java
117 | //FlinkYarnSessionCli line469
118 | @Override
119 | public boolean isActive(CommandLine commandLine, Configuration configuration) {
120 |    String jobManagerOption = commandLine.getOptionValue(ADDRESS_OPTION.getOpt(), null);
121 |    boolean yarnJobManager = ID.equals(jobManagerOption);
122 |    boolean yarnAppId = commandLine.hasOption(APPLICATION_ID.getOpt());
123 |    return yarnJobManager || yarnAppId || loadYarnPropertiesFile(commandLine, configuration) != null;
124 | }
125 | ```
126 | 
127 | 从上面可以看出如果用户通过 `-m` 参数指定了 yarn 集群标识【如上文命令中的 `-m yarn-cluster`】，或者传入了 `application id`，或者配置了 yarn properties 文件，则启动 yarn cluster 模式，否则是 Standalone 模式的集群
128 | 
129 | ##### 集群部署
130 | 
131 | flink通过`YarnClusterDescriptor`来描述yarn集群的部署配置，具体对应的配置文件为`flink-conf.yaml`，通过下面这段逻辑触发集群部署：
132 | 
133 | ```java
134 | //AbstractYarnClusterDescriptor line372
135 | /**
136 |  * This method will block until the ApplicationMaster/JobManager have been
137 |  * deployed on YARN.
138 |  */
139 | protected YarnClusterClient deployInternal() throws Exception {
140 | ```
141 | 
142 | 大致列下过程：
143 | 
144 | - check yarn 集群队列资源是否满足请求
145 | - 设置 AM Context、启动命令、submission context
146 | - 如果开启高可用模式【通过反射调用 submission context 的两个方法修改属性】 keepContainersMethod    attemptFailuresValidityIntervalMethod 【和 Hadoop 的版本有关】第一个属性表示应用重试时是否保留 AM container，第二个属性表示 指定 间隔时间之内应用允许失败重启的次数
147 | - 上传 用户 jar、flink-conf.yaml、lib 目录下所有的 jar 包、logback log4j配置文件 到 HDFS
148 | - 通过 yarn client submit am context
149 | - 将yarn client 及相关配置封装成 YarnClusterClient 返回
150 | 
151 | 真正在 AM 中运行的主类是 `YarnApplicationMasterRunner`，它的 `run`方法做了如下工作：
152 | 
153 | -  启动JobManager ActorSystem
154 | -  启动 flink ui
155 | -  启动`YarnFlinkResourceManager`来负责与yarn的ResourceManager交互，管理yarn资源
156 | -  启动 actor System supervise 进程
157 | 
158 | 到这里 JobManager 已经启动起来，那么 TaskManager 是什么时候启动的呢？
159 | 
160 | 在 `YarnFlinkResourceManager`启动的时候会预先执行一段逻辑【Akka actor的preStart方法】：
161 | 
162 | ```java
163 | @Override
164 | public void preStart() {
165 |     try {
166 |         // we start our leader retrieval service to make sure we get informed
167 |         // about JobManager leader changes
168 |         leaderRetriever.start(new LeaderRetrievalListener() {
169 | 
170 | 		    @Override
171 | 		    public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
172 | 		        self().tell(
173 | 						new NewLeaderAvailable(leaderAddress, leaderSessionID),
174 | 						ActorRef.noSender());
175 | 		    }
176 | ```
177 | 
178 | 这段逻辑会先尝试获取 JobManager 的地址并给自己发送一个路由消息`NewLeaderAvailable`，然后`YarnFlinkResourceManager`会把自己注册到 `JobManager` 中，接着`JobManager`会发送一个回调命令：
179 | 
180 | ```scala
181 | //JobManager line358
182 | sender ! decorateMessage(new RegisterResourceManagerSuccessful(self, taskManagerResources))
183 | ```
184 | 
185 | 接着会触发这样一段逻辑：
186 | 
187 | ```java
188 | //FlinkResourceManager line555
189 | private void checkWorkersPool() {
190 |    int numWorkersPending = getNumWorkerRequestsPending();
191 |    int numWorkersPendingRegistration = getNumWorkersPendingRegistration();
192 | 
193 |    // sanity checks
194 |    Preconditions.checkState(numWorkersPending >= 0,
195 |       "Number of pending workers should never be below 0.");
196 |    Preconditions.checkState(numWorkersPendingRegistration >= 0,
197 |       "Number of pending workers pending registration should never be below 0.");
198 | 
199 |    // see how many workers we want, and whether we have enough
200 |    int allAvailableAndPending = startedWorkers.size() +
201 |       numWorkersPending + numWorkersPendingRegistration;
202 | 
203 |    int missing = designatedPoolSize - allAvailableAndPending;
204 | 
205 |    if (missing > 0) {
206 |       requestNewWorkers(missing);
207 |    }
208 | }
209 | ```
210 | 
211 | 将所有的 TM 启动起来，这样一个 flink 集群便构建出来了。下面附图解释下这个流程：
212 | 
213 | ![flink-cluster-start-flow](flink-cluster-start-flow.png)
214 | 
215 | 1. flink cli 解析本地环境配置，启动 `ApplicationMaster`
216 | 2. 在 `ApplicationMaster` 中启动 `JobManager`
217 | 3. 在 `ApplicationMaster` 中启动`YarnFlinkResourceManager`
218 | 4. `YarnFlinkResourceManager`给`JobManager`发送注册信息
219 | 5. `YarnFlinkResourceManager`注册成功后，`JobManager`给`YarnFlinkResourceManager`发送注册成功信息
220 | 6. `YarnFlinkResourceManager`知道自己注册成功后向`ResourceManager`申请和`TaskManager`数量对等的 container
221 | 7. 在container中启动`TaskManager`
222 | 8. `TaskManager`将自己注册到`JobManager`中
223 | 
224 | *接下来便是程序的提交和运行*
225 | 
226 | 程序在`CliFrontend`中被提交后，会触发这样一段逻辑
227 | 
228 | ```java
229 | //ClusterClient 304
230 | 	public JobSubmissionResult run(PackagedProgram prog, int parallelism)
231 | 			throws ProgramInvocationException
232 | 	{
233 | 		Thread.currentThread().setContextClassLoader(prog.getUserCodeClassLoader());
234 | 		...
235 | 		else if (prog.isUsingInteractiveMode()) {
236 | 			LOG.info("Starting program in interactive mode");
237 | 			ContextEnvironmentFactory factory = new ContextEnvironmentFactory(this, prog.getAllLibraries(),
238 | 					prog.getClasspaths(), prog.getUserCodeClassLoader(), parallelism, isDetached(),
239 | 					prog.getSavepointPath());
240 | 			ContextEnvironment.setAsContext(factory);
241 | 
242 | 			try {
243 | 				// invoke main method
244 | 				prog.invokeInteractiveModeForExecution();
245 | 				...
246 | 			}
247 | 			finally {
248 | 				ContextEnvironment.unsetContext();
249 | 			}
250 | 		}
251 | 		else {
252 | 			throw new RuntimeException("PackagedProgram does not have a valid invocation mode.");
253 | 		}
254 | 	}
255 | ```
256 | 
257 | 注意到有一段`prog.invokeInteractiveModeForExecution()`，这是客户端生成初步逻辑计划的核心逻辑，下面将详细介绍
258 | 
259 | ### 客户端逻辑计划
260 | 
261 | 上面提到`prog.invokeInteractiveModeForExecution()`这段逻辑会触发客户端逻辑计划的生成，那么是怎样一个过程呢？其实这里只是调用了用户jar包的主函数，真正的触发生成过程由用户代码的执行来完成。例如用户写了这样一段 flink 代码：
262 | 
263 | ```java
264 | object FlinkDemo extends App with Logging{
265 |   override def main(args: Array[String]): Unit ={
266 |     val properties = new Properties
267 |     properties.setProperty("bootstrap.servers", DemoConfig.kafkaBrokerList)
268 | 
269 |  properties.setProperty("zookeeper.connect","host01:2181,host02:2181,host03:2181/kafka08")
270 |     properties.setProperty("group.id", "flink-demo-waimai-e")
271 | 
272 |     val env = StreamExecutionEnvironment.getExecutionEnvironment
273 |     env.enableCheckpointing(5000L, CheckpointingMode.EXACTLY_ONCE) //checkpoint every 5 seconds.
274 |     val stream = env.addSource(new FlinkKafkaConsumer08[String]("log.waimai_e", new SimpleStringSchema, properties)).setParallelism(2)
275 |     val counts = stream.name("log.waimai_e").map(toPoiIdTuple(_)).filter(_._2 != null)
276 |       .keyBy(0)
277 |       .timeWindow(Time.seconds(5))
278 |       .sum(1)
279 | 
280 |     counts.addSink(sendToKafka(_))
281 |     env.execute()
282 |   }
283 | ```
284 | 
285 | 注意到这样一段`val env = StreamExecutionEnvironment.getExecutionEnvironment`，这段代码会获取客户端的环境配置，它首先会转到这样一段逻辑：
286 | 
287 | ```java
288 | //StreamExecutionEnvironment 1256
289 | public static StreamExecutionEnvironment getExecutionEnvironment() {
290 |    if (contextEnvironmentFactory != null) {
291 |       return contextEnvironmentFactory.createExecutionEnvironment();
292 |    }
293 | 
294 |    // because the streaming project depends on "flink-clients" (and not the other way around)
295 |    // we currently need to intercept the data set environment and create a dependent stream env.
296 |    // this should be fixed once we rework the project dependencies
297 |    
298 |    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
299 | ```
300 | 
301 | `ExecutionEnvironment.getExecutionEnvironment();`获取环境的逻辑如下：
302 | 
303 | ```java
304 | //ExecutionEnvironment line1137
305 | public static ExecutionEnvironment getExecutionEnvironment() {
306 |    return contextEnvironmentFactory == null ? 
307 |          createLocalEnvironment() : contextEnvironmentFactory.createExecutionEnvironment();
308 | }
309 | ```
310 | 
311 | 这里的`contextEnvironmentFactory`是一个静态成员，早在`ContextEnvironment.setAsContext(factory)`已经触发过初始化了，其中包含了如下的环境信息:
312 | 
313 | ```java
314 | //ContextEnvironmentFactory line51
315 | public ContextEnvironmentFactory(ClusterClient client, List<URL> jarFilesToAttach,
316 |       List<URL> classpathsToAttach, ClassLoader userCodeClassLoader, int defaultParallelism,
317 |       boolean isDetached, String savepointPath)
318 | {
319 |    this.client = client;
320 |    this.jarFilesToAttach = jarFilesToAttach;
321 |    this.classpathsToAttach = classpathsToAttach;
322 |    this.userCodeClassLoader = userCodeClassLoader;
323 |    this.defaultParallelism = defaultParallelism;
324 |    this.isDetached = isDetached;
325 |    this.savepointPath = savepointPath;
326 | }
327 | ```
328 | 
329 | 其中的 client 就是上面生成的 `YarnClusterClient`，其它的意思较明显，就不多做解释了。
330 | 
331 | 用户在执行`val env = StreamExecutionEnvironment.getExecutionEnvironment`这样一段逻辑后会得到一个`StreamContextEnvironment`，其中封装了 streaming 的一些执行配置 【buffer time out等】，另外保存了上面提到的 `ContextEnvironment` 的引用。
332 | 
333 | 到这里关于 streaming 需要的执行环境信息已经设置完成。
334 | 
335 | #### 初步逻辑计划 StreamGraph 的生成
336 | 
337 | 接下来用户代码执行到`val stream = env.addSource(new FlinkKafkaConsumer08`，这段逻辑实际会生成一个`DataStream`抽象，`DataStream`是flink关于streaming抽象的最核心抽象，后续所有的算子转换都会在`DataStream`上来完成，上面的`addSource`操作会触发下面这段逻辑:
338 | 
339 | ```java
340 | public <OUT> DataStreamSource<OUT> addSource(SourceFunction<OUT> function, String sourceName, TypeInformation<OUT> typeInfo) {
341 | 
342 |    if(typeInfo == null) {
343 |       if (function instanceof ResultTypeQueryable) {
344 |          typeInfo = ((ResultTypeQueryable<OUT>) function).getProducedType();
345 |       } else {
346 |          try {
347 |             typeInfo = TypeExtractor.createTypeInfo(
348 |                   SourceFunction.class,
349 |                   function.getClass(), 0, null, null);
350 |          } catch (final InvalidTypesException e) {
351 |             typeInfo = (TypeInformation<OUT>) new MissingTypeInfo(sourceName, e);
352 |          }
353 |       }
354 |    }
355 | 
356 |    boolean isParallel = function instanceof ParallelSourceFunction;
357 | 
358 |    clean(function);
359 |    StreamSource<OUT, ?> sourceOperator;
360 |    if (function instanceof StoppableFunction) {
361 |       sourceOperator = new StoppableStreamSource<>(cast2StoppableSourceFunction(function));
362 |    } else {
363 |       sourceOperator = new StreamSource<>(function);
364 |    }
365 | 
366 |    return new DataStreamSource<>(this, typeInfo, sourceOperator, isParallel, sourceName);
367 | }
368 | ```
369 | 
370 | 简要总结下上面的逻辑：
371 | 
372 | - 获取数据源 source 的 output 信息 TypeInformation
373 | - 生成 StreamSource sourceOperator
374 | - 生成 DataStreamSource【封装了 sourceOperator】，并返回
375 | - 将 StreamTransformation 添加到算子列表 transformations 中【只有 转换 transform 操作才会添加算子，其它都只是暂时做了 transformation 的叠加封装】
376 | - 后续会在 DataStream 上做操作
377 | 
378 | 该输出`DataStreamSource`继承自`SingleOutputStreamOperator`具体的继承关系如下：
379 | 
380 | ![flink-datastream-extend](flink-datastream-extend.png)
381 | 
382 | 而生成的 StreamSource operator 走的是另一套继承接口：
383 | 
384 | ![stream-operator-extend.png](stream-operator-extend.png)
385 | 
386 | DataStreamSource 是一个 DataStream **数据流**抽象，StreamSource 是一个 StreamOperator **算子**抽象，在 flink 中一个 DataStream 封装了一次数据流转换，一个 StreamOperator 封装了一个函数接口，比如 map、reduce、keyBy等。*关于算子的介绍会另起一节：flink算子的生命周期*
387 | 
388 | 可以看到在 DataStream 上可以进行一系列的操作(map filter 等)，来看一个常规操作比如 map 会发生什么：
389 | 
390 | ```java
391 | //DataStream line503
392 | public <R> SingleOutputStreamOperator<R> map(MapFunction<T, R> mapper) {
393 | 
394 |    TypeInformation<R> outType = TypeExtractor.getMapReturnTypes(clean(mapper), getType(),
395 |          Utils.getCallLocationName(), true);
396 | 
397 |    return transform("Map", outType, new StreamMap<>(clean(mapper)));
398 | }
399 | ```
400 | 
401 | 一个map操作会触发一次 transform，那么transform做了什么工作呢？
402 | 
403 | ```java
404 | //DataStream line1020
405 | @PublicEvolving
406 | public <R> SingleOutputStreamOperator<R> transform(String operatorName, TypeInformation<R> outTypeInfo, OneInputStreamOperator<T, R> operator) {
407 | 
408 |    // read the output type of the input Transform to coax out errors about MissingTypeInfo
409 |    transformation.getOutputType();
410 | 
411 |    OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
412 |          this.transformation,
413 |          operatorName,
414 |          operator,
415 |          outTypeInfo,
416 |          environment.getParallelism());
417 | 
418 |    @SuppressWarnings({ "unchecked", "rawtypes" })
419 |    SingleOutputStreamOperator<R> returnStream = new SingleOutputStreamOperator(environment, resultTransform);
420 | 
421 |    getExecutionEnvironment().addOperator(resultTransform);
422 | 
423 |    return returnStream;
424 | }
425 | ```
426 | 
427 | 这一步生成了一个 `StreamTransformation`并以此作为成员变量封装成另一个 DataStream 返回，`StreamTransformation`是 flink关于数据流转换的核心抽象，只有需要 transform 的流才会生成新的DataStream 算子，后面会详细解释，注意上面有这一行`getExecutionEnvironment().addOperator(resultTransform)`flink会将transformation维护起来：
428 | 
429 | ```java
430 | //StreamExecutionEnvironment line 1237
431 | @Internal
432 | public void addOperator(StreamTransformation<?> transformation) {
433 |    Preconditions.checkNotNull(transformation, "transformation must not be null.");
434 |    this.transformations.add(transformation);
435 | }
436 | ```
437 | 
438 | 所以，用户的一连串操作 map join等实际上在 DataStream 上做了转换，并且flink将这些 `StreamTransformation` 维护起来，一直到最后，用户执行 `env.execute()`这样一段逻辑，StreamGraph 的构建才算真正开始...
439 | 
440 | 用户在执行 `env.execute()` 时会触发这样一段逻辑：
441 | 
442 | ```java
443 | //StreamContextEnvironment line51   
444 | public JobExecutionResult execute(String jobName) throws Exception {
445 |       Preconditions.checkNotNull("Streaming Job name should not be null.");
446 | 
447 |       StreamGraph streamGraph = this.getStreamGraph();
448 |       streamGraph.setJobName(jobName);
449 | 
450 |       transformations.clear();
451 | 
452 |       // execute the programs
453 |       if (ctx instanceof DetachedEnvironment) {
454 |          LOG.warn("Job was executed in detached mode, the results will be available on completion.");
455 |          ((DetachedEnvironment) ctx).setDetachedPlan(streamGraph);
456 |          return DetachedEnvironment.DetachedJobExecutionResult.INSTANCE;
457 |       } else {
458 |          return ctx.getClient().runBlocking(streamGraph, ctx.getJars(), ctx.getClasspaths(), ctx.getUserCodeClassLoader(), ctx.getSavepointPath());
459 |       }
460 |    }
461 | }
462 | ```
463 | 
464 | 这段代码做了两件事情：
465 | 
466 | - 首先使用 `StreamGraphGenerator` 产生 StreamGraph
467 | - 使用 Client 运行 stream graph
468 | 
469 | 那么` StreamGraphGenerator` 做了哪些操作呢？
470 | 
471 | `StreamGraphGenerator` 会依据添加算子时保存的 transformations 信息生成 job graph 中的节点，并创建节点连接，分流操作 如 union,select,split 不会添加边，只会创建虚拟节点或在上游节点添加 selector
472 | 
473 | 这里会将 StreamTransformation 转换为 StreamNode，StreamNode 保存了算子的信息【会另外介绍】，如下图所示
474 | 
475 | <img src="./transformation-to-node.png" width="535" height="300" alt="transformation-to-node.png" align=center />
476 | 
477 | 到这里由 `StreamNode` 构成的 DAG 图 `StreamGraph`就生成了
478 | 
479 | 不过 在提交给 client 的时候，flink 会做进一步的优化:
480 | 
481 |  `StreamGraph` 将进一步转换为 `JobGraph`，这一步工作由 `StreamingJobGraphGenerator` 来完成，为什么要做这一步转换呢？主要因为有可以 chain 的算子，这里进一步将 StreamNode 转换为 JobVertex，主要工作是将可以 chain 的算子合并【这一步优化是默认打开的】，并设置资源，重试策略等，最终生成可以提交给 JobManager 的 JobGraph
482 | 
483 | #### 优化的逻辑计划 JobGraph 的生成
484 | 
485 | ```java
486 | //StreamingJobGraphGenerator line181
487 | private List<StreamEdge> createChain(
488 |       Integer startNodeId,
489 |       Integer currentNodeId,
490 |       Map<Integer, byte[]> hashes) {
491 | 
492 |    if (!builtVertices.contains(startNodeId)) {
493 | 
494 |       List<StreamEdge> transitiveOutEdges = new ArrayList<StreamEdge>();
495 | 
496 |       List<StreamEdge> chainableOutputs = new ArrayList<StreamEdge>();
497 |       List<StreamEdge> nonChainableOutputs = new ArrayList<StreamEdge>();
498 | 
499 |       for (StreamEdge outEdge : streamGraph.getStreamNode(currentNodeId).getOutEdges()) {
500 |          if (isChainable(outEdge)) {
501 |             chainableOutputs.add(outEdge);
502 |          } else {
503 |             nonChainableOutputs.add(outEdge);
504 |          }
505 |       }
506 | 
507 |       for (StreamEdge chainable : chainableOutputs) {
508 |          transitiveOutEdges.addAll(createChain(startNodeId, chainable.getTargetId(), hashes));
509 |       }
510 |      ...
511 | ```
512 | 
513 | 上面的方法是算子 chain 的核心操作，简要概括下：
514 | 
515 | - 如果从此 start node 开始未生成过 JobVertex，则执行 chain逻辑，由于是递归操作，会先深度优先遍历，将源节点开始到第一个不可 chain 的 StreamNode 之间的算子做 chain 操作【先算叶子节点的 chain，依次往根节点计算】
516 | - line 207 遇到不可 chain 的边，开始深度遍历生成 JobVertex
517 | - line 216 将 StreamNode 的输入输出配置，包括序列化配置等设置到上面的 StreamConfig 中，并在 vertexConfigs 中保存起来，如果是 新生成的 JobVertex，其对应的 StreamConfig 会以 start node id 为 key 进行保存
518 | - transitiveOutEdges 保存的该节点下游所有的 non chain_able  edges，最终的方法会返回此数据结构
519 | - 连接 start node 和所有的 transitiveOutEdges 【在输入 JobVertex 创建 IntermediateDataSet，partition类型为 pipeline，生成 JobEdge】
520 | - 如果是新生成JobVertex，继续设置config，包括 chain start，所有物理输出，及直接逻辑输出、chained config等
521 | - 如果不是新生成 JobVertex，直接chain configs
522 | 
523 | 这里总结下JobGraph的构建过程，见下图:
524 | 
525 | ![flink-job-graph-create](flink-job-graph-create.png)
526 | 
527 | 大致过程总结如下：
528 | 
529 | - 由`DataStream`上的操作生成`StreamTransformation`列表
530 | - 从`StreamTransformation`的生成关系创建`StreamNode`和`StreamEdge`
531 | - 做算子chain，合并成 `JobVertex`，并生成 `JobEdge`
532 | 
533 | 一个 JobVertex 代表一个逻辑计划的节点，就是 DAG 图上的顶点，有点类似于 Storm 的 bolt 或 spout，生成一个 JobVertex 的逻辑如下：
534 | 
535 | ```java
536 | //StreamingJobGraphGenerator line258
537 | private StreamConfig createJobVertex(
538 |       Integer streamNodeId,
539 |       Map<Integer, byte[]> hashes) {
540 | 
541 |    JobVertex jobVertex;
542 |    ...
543 |    JobVertexID jobVertexId = new JobVertexID(hash);
544 | 
545 |    if (streamNode.getInputFormat() != null) {
546 |       jobVertex = new InputFormatVertex(
547 |             chainedNames.get(streamNodeId),
548 |             jobVertexId);
549 |       TaskConfig taskConfig = new TaskConfig(jobVertex.getConfiguration());
550 |       taskConfig.setStubWrapper(new UserCodeObjectWrapper<Object>(streamNode.getInputFormat()));
551 |    } else {
552 |       jobVertex = new JobVertex(
553 |             chainedNames.get(streamNodeId),
554 |             jobVertexId);
555 |    }
556 | 
557 |    jobVertex.setInvokableClass(streamNode.getJobVertexClass());
558 | 
559 |    ...
560 | 
561 |    return new StreamConfig(jobVertex.getConfiguration());
562 | }
563 | ```
564 | 
565 | *这里有两段逻辑值得注意，第一是数据源节点的判断，第二是运行时执行类 InvokableClass 的设置*
566 | 
567 | `streamNode.getInputFormat()`是判断是否是数据源节点的逻辑，如果是数据源节点，这里会将用户代码【这里为 InputFormat.class 的子类】设置进 JobVertex 的配置中，并在 JobManager 执行提交作业命令的时候做初始化，会在 Flink 物理计划生成一节介绍。
568 | 
569 | `jobVertex.setInvokableClass`是设置运行时的执行类，通过这个类再调用用户定义的 operator，是 flink task 中真正被执行的类，具体会在 flink-task-runtime 一节中详细介绍。
570 | 
571 | 至此 JobGraph 生成，并扔给 JobManager 执行😝


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: about
3 | date: 2016-12-03 11:59:30
4 | ---
5 | 


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/stream-operator-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/stream-operator-extend.png


--------------------------------------------------------------------------------
/flink/flink基本组件和JobGraph的生成/transformation-to-node.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink基本组件和JobGraph的生成/transformation-to-node.png


--------------------------------------------------------------------------------
/flink/flink对用户代码异常处理/flink用户异常处理.md:
--------------------------------------------------------------------------------
  1 | # flink 对用户代码异常的处理
  2 | 
  3 | ## 前言
  4 | 
  5 | flink 的架构在 flink 基本组件一节已经介绍过，其中的 TaskManager 负责监护 task 的执行，对于每个 task，flink 都会启动一个线程去执行，那么当用户的代码抛出异常时，flink 的处理逻辑是什么呢？
  6 | 
  7 | ## 异常后的组件通信
  8 | 
  9 | flink 的 task 的 Runnable 类是 Task.java，我们观察到它的 `run()` 方法整个被一个大的 try catch 包住，我们重点关注 catch 用户异常之后的部分：
 10 | 
 11 | ```java
 12 | //Task line612
 13 | 		catch (Throwable t) {
 14 | 
 15 | 			// ----------------------------------------------------------------
 16 | 			// the execution failed. either the invokable code properly failed, or
 17 | 			// an exception was thrown as a side effect of cancelling
 18 | 			// ----------------------------------------------------------------
 19 | 
 20 | 			try {
 21 | 				// transition into our final state. we should be either in DEPLOYING, RUNNING, CANCELING, or FAILED
 22 | 				// loop for multiple retries during concurrent state changes via calls to cancel() or
 23 | 				// to failExternally()
 24 | 				while (true) {
 25 | 					ExecutionState current = this.executionState;
 26 | ```
 27 | 
 28 | 简单总结其逻辑：
 29 | 
 30 | - 如果当前的执行状态是 `ExecutionState.RUNNING` 或者 `ExecutionState.DEPLOYING`，表明是从正常运行到异常状态的过渡，这时候判断是否是主动 Cancel 执行，如果是，执行 StreamTask 的 cancel 方法， 并通知观察者它的状态已变成：`ExecutionState.CANCELED`；如果不是主动 Cancel，表明是用户异常触发，这时候同样执行 StreamTask 的 cancel 方法，然后通知观察者它的状态变成：`ExecutionState.FAILED`，这里的 cancel 方法留给 flink 内部的算子来实现，对于普通 task ，会停止消费上游数据，对于 source task，会停止发送源数据
 31 | 
 32 | - 对于用户异常来说，通知观察者的状态应该为 `ExecutionState.FAILED`，*我们下面详细分析*
 33 | 
 34 | - finally 的部分会释放掉这个 task 占有的所有资源，包括线程池、输入 InputGate 及 写出 ResultPartition 占用的全部 BufferPool、缓存的 jar 包等，最后通知 TaskManager 这个 Job 的 这个 task 已经执行结束：
 35 | 
 36 |   `notifyFinalState()`
 37 | 
 38 | 
 39 | - 如果异常逻辑发生了任何其它异常，说明是 TaskManager 相关环境发生问题，这个时候会杀死 TaskManager
 40 | 
 41 | ### 通知TaskManager
 42 | 
 43 | 上面提到，finally 的最后阶段会通知 TaskManager，我们来梳理逻辑：
 44 | 
 45 | ```java
 46 | //TaskManager line444
 47 | // removes the task from the TaskManager and frees all its resources
 48 |         case TaskInFinalState(executionID) =>
 49 |           unregisterTaskAndNotifyFinalState(executionID)
 50 |           
 51 | //TaskManager line1228
 52 | private def unregisterTaskAndNotifyFinalState(executionID: ExecutionAttemptID): Unit = {
 53 | 
 54 |     val task = runningTasks.remove(executionID)
 55 |     if (task != null) {
 56 | 
 57 |       // the task must be in a terminal state
 58 |       if (!task.getExecutionState.isTerminal) {
 59 |         try {
 60 |           task.failExternally(new Exception("Task is being removed from TaskManager"))
 61 |         } catch {
 62 |           case e: Exception => log.error("Could not properly fail task", e)
 63 |         }
 64 |       }
 65 |       
 66 | //TaskManager line1251
 67 |      self ! decorateMessage(
 68 |         UpdateTaskExecutionState(
 69 |           new TaskExecutionState(
 70 |             task.getJobID,
 71 |             task.getExecutionId,
 72 |             task.getExecutionState,
 73 |             task.getFailureCause,
 74 |             accumulators)
 75 |         )
 76 |       )
 77 |        
 78 | //ExecutionGraph line1189
 79 | 				case FAILED:
 80 | 					attempt.markFailed(state.getError(userClassLoader));
 81 | 					return true;
 82 | 
 83 | //Execution line658
 84 | 	void markFinished(Map<AccumulatorRegistry.Metric, Accumulator<?, ?>> flinkAccumulators, Map<String, Accumulator<?, ?>> userAccumulators) {
 85 | 
 86 | 		// this call usually comes during RUNNING, but may also come while still in deploying (very fast tasks!)
 87 | 		while (true) {
 88 | 			ExecutionState current = this.state;
 89 | 
 90 | 			if (current == RUNNING || current == DEPLOYING) {
 91 | 
 92 | 				if (transitionState(current, FINISHED)) {
 93 | 					try {
 94 | 
 95 | //Execution line991
 96 | 			try {
 97 | 				vertex.notifyStateTransition(attemptId, targetState, error);
 98 | 			}
 99 | 			catch (Throwable t) {
100 | 				LOG.error("Error while notifying execution graph of execution state transition.", t);
101 | 			}
102 |                       
103 | //ExecutionGraph line1291
104 | 	void notifyExecutionChange(JobVertexID vertexId, int subtask, ExecutionAttemptID executionID, ExecutionState
105 | 							newExecutionState, Throwable error)
106 | 	{
107 |      //...
108 |         // see what this means for us. currently, the first FAILED state means -> FAILED
109 | 		if (newExecutionState == ExecutionState.FAILED) {
110 | 			fail(error);
111 | 		}
112 | 
113 | //ExecutionGraph line845     
114 | 	public void fail(Throwable t) {
115 | 		while (true) {
116 | 			JobStatus current = state;
117 | 			// stay in these states
118 | 			if (current == JobStatus.FAILING ||
119 | 				current == JobStatus.SUSPENDED ||
120 | 				current.isGloballyTerminalState()) {
121 | 				return;
122 | 			} else if (current == JobStatus.RESTARTING && transitionState(current, JobStatus.FAILED, t)) {
123 | 				synchronized (progressLock) {
124 | 					postRunCleanup();
125 | 					progressLock.notifyAll();
126 | 
127 | 					LOG.info("Job {} failed during restart.", getJobID());
128 | 					return;
129 | 				}
130 | 			} else if (transitionState(current, JobStatus.FAILING, t)) {
131 | 				this.failureCause = t;
132 | 
133 | 				if (!verticesInCreationOrder.isEmpty()) {
134 | 					// cancel all. what is failed will not cancel but stay failed
135 | 					for (ExecutionJobVertex ejv : verticesInCreationOrder) {
136 | 						ejv.cancel();
137 | 					}
138 | 				} else {
139 | 					// set the state of the job to failed
140 | 					transitionState(JobStatus.FAILING, JobStatus.FAILED, t);
141 | 				}
142 | 
143 | 				return;
144 | 			}
145 | 
146 | 			// no need to treat other states
147 | 		}
148 | 	}
149 | ```
150 | 
151 | 总结其逻辑：
152 | 
153 | - 在一些合法性 check 之后，TaskManager 会给自己发送一条路由消息：`UpdateTaskExecutionState`，TaskManager 继而将这条消息转发给 JobManager
154 | - JobManager 会标志 Job 状态为 FAILING 并通知 JobCli，并且立即停止所有 task 的执行，这时候 CheckpointCoordinator 在执行 checkpoint 的时候感知到 task 失败状态会立即返回，停止 checkpoint
155 | 
156 | 
157 | 
158 | ## 异常后的资源释放
159 | 
160 | 主要包括以下资源：
161 | 
162 | - 网络资源：InputGate 和 ResultPartition 的内存占用
163 | - 其他内存：通过 MemoryManager 申请的资源
164 | - 缓存资源：lib 包和其他缓存
165 | - 线程池：Task 内部持有


--------------------------------------------------------------------------------
/flink/flink物理计划生成/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/.DS_Store


--------------------------------------------------------------------------------
/flink/flink物理计划生成/execution-many-one.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/execution-many-one.png


--------------------------------------------------------------------------------
/flink/flink物理计划生成/execution-one-many.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/execution-one-many.png


--------------------------------------------------------------------------------
/flink/flink物理计划生成/execution-vertex-one-to-one.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/execution-vertex-one-to-one.png


--------------------------------------------------------------------------------
/flink/flink物理计划生成/flink-job-vertex-to-execution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/flink-job-vertex-to-execution.png


--------------------------------------------------------------------------------
/flink/flink物理计划生成/flink物理计划生成.md:
--------------------------------------------------------------------------------
  1 | # Flink执行计划生成
  2 | 
  3 | ## 前言
  4 | 
  5 | 上一节讲到业务代码`StreamExecutionEnvironment.execute()`会触发job的客户端逻辑计划`JobGraph` 的生成，之后是客户端与`JobManager`的交互过程
  6 | 
  7 | ```java
  8 | //ClusterClient line388
  9 | public JobExecutionResult run(JobGraph jobGraph, ClassLoader classLoader) throws ProgramInvocationException {
 10 | 
 11 |    waitForClusterToBeReady();
 12 | 
 13 |    final LeaderRetrievalService leaderRetrievalService;
 14 |    try {
 15 |       leaderRetrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(flinkConfig);
 16 |    } catch (Exception e) {
 17 |       throw new ProgramInvocationException("Could not create the leader retrieval service", e);
 18 |    }
 19 | 
 20 |    try {
 21 |       logAndSysout("Submitting job with JobID: " + jobGraph.getJobID() + ". Waiting for job completion.");
 22 |       this.lastJobExecutionResult = JobClient.submitJobAndWait(actorSystemLoader.get(),
 23 |          leaderRetrievalService, jobGraph, timeout, printStatusDuringExecution, classLoader);
 24 |       return this.lastJobExecutionResult;
 25 |    } catch (JobExecutionException e) {
 26 |       throw new ProgramInvocationException("The program execution failed: " + e.getMessage(), e);
 27 |    }
 28 | }
 29 | ```
 30 | 其中`leaderRetrievalService = LeaderRetrievalUtils.createLeaderRetrievalService(flinkConfig);`是启动获取 leader JobManager 的服务，flink 支持 JobManager HA，需要通过 `LeaderRetrievalService` 获取当前的 leader JobManager，稍微介绍下这个服务：
 31 | 
 32 | ## JobManager Leader 选举
 33 | 
 34 | 先来看获取 `LeaderRetrievalService`的逻辑：
 35 | 
 36 | ```java
 37 | //LeaderRetrievalUtils line61
 38 | public static LeaderRetrievalService createLeaderRetrievalService(Configuration configuration)
 39 |    throws Exception {
 40 | 
 41 |    RecoveryMode recoveryMode = getRecoveryMode(configuration);
 42 | 
 43 |    switch (recoveryMode) {
 44 |       case STANDALONE:
 45 |          return StandaloneUtils.createLeaderRetrievalService(configuration);
 46 |       case ZOOKEEPER:
 47 |          return ZooKeeperUtils.createLeaderRetrievalService(configuration);
 48 |       default:
 49 |          throw new Exception("Recovery mode " + recoveryMode + " is not supported.");
 50 |    }
 51 | }
 52 | ```
 53 | 
 54 | 首先 flink 会依据配置获取 `RecoveryMode`，`RecoveryMode`一共两种：*STANDALONE*和*ZOOKEEPER*。如果用户配置的是*STANDALONE*，会直接去配置中获取`JobManager`的地址，这里主要介绍`ZOOKEEPER`模式下的`JobManager`leader的发现过程：
 55 | 
 56 | ```java
 57 | //ZooKeeperUtils line141
 58 | public static ZooKeeperLeaderRetrievalService createLeaderRetrievalService(
 59 |       Configuration configuration) throws Exception {
 60 |    CuratorFramework client = startCuratorFramework(configuration);
 61 |    String leaderPath = configuration.getString(ConfigConstants.ZOOKEEPER_LEADER_PATH,
 62 |          ConfigConstants.DEFAULT_ZOOKEEPER_LEADER_PATH);
 63 | 
 64 |    return new ZooKeeperLeaderRetrievalService(client, leaderPath);
 65 | }
 66 | 
 67 | ...
 68 | //ZooKeeperLeaderRetrievalService line103
 69 | 	public void nodeChanged() throws Exception {
 70 | 		try {
 71 | 			LOG.debug("Leader node has changed.");
 72 | 
 73 | 			ChildData childData = cache.getCurrentData();
 74 | 
 75 | 			String leaderAddress;
 76 | 			UUID leaderSessionID;
 77 | 
 78 | 			if (childData == null) {
 79 | 				leaderAddress = null;
 80 | 				leaderSessionID = null;
 81 | 			} else {
 82 | 				byte[] data = childData.getData();
 83 | 
 84 | 				if (data == null || data.length == 0) {
 85 | 					leaderAddress = null;
 86 | 					leaderSessionID = null;
 87 | 				} else {
 88 | 					ByteArrayInputStream bais = new ByteArrayInputStream(data);
 89 | 					ObjectInputStream ois = new ObjectInputStream(bais);
 90 | 
 91 | 					leaderAddress = ois.readUTF();
 92 | 					leaderSessionID = (UUID) ois.readObject();
 93 | ```
 94 | 
 95 | 这里 flink 会首先尝试连接 zookeeper，利用 zookeeper的leader选举服务发现leader节点的地址和当前的 sessionid，session id的作用介绍`JobManager`的时候会详细说明
 96 | 
 97 | ## 客户端JobGraph的提交
 98 | 
 99 | 客户端的`JobGraph`生成之后，通过上面的`LeaderRetrievalService`获取`JobManager`的地址，接下来就是将`JobGraph`提交给`JobManager`去执行。flink 的核心进程通信是通过 Akka 来完成的，`JobManager`、`TaskManager`都是一个 Akka system，所以这里的提交首先需要生成一个客户端actor与`JobManager`交互，然后执行rpc命令，具体见：
100 | 
101 | ```java
102 | //JobClient line98
103 | public static JobExecutionResult submitJobAndWait(
104 |       ActorSystem actorSystem,
105 |       LeaderRetrievalService leaderRetrievalService,
106 |       JobGraph jobGraph,
107 |       FiniteDuration timeout,
108 |       boolean sysoutLogUpdates,
109 |       ClassLoader classLoader) throws JobExecutionException {
110 | 
111 |    ...
112 | 
113 |    // for this job, we create a proxy JobClientActor that deals with all communication with
114 |    // the JobManager. It forwards the job submission, checks the success/failure responses, logs
115 |    // update messages, watches for disconnect between client and JobManager, ...
116 | 
117 |    Props jobClientActorProps = JobClientActor.createJobClientActorProps(
118 |       leaderRetrievalService,
119 |       timeout,
120 |       sysoutLogUpdates);
121 | 
122 |    ActorRef jobClientActor = actorSystem.actorOf(jobClientActorProps);
123 |    
124 |    // first block handles errors while waiting for the result
125 |    Object answer;
126 |    try {
127 |       Future<Object> future = Patterns.ask(jobClientActor,
128 |             new JobClientMessages.SubmitJobAndWait(jobGraph),
129 |             new Timeout(AkkaUtils.INF_TIMEOUT()));
130 |       
131 |       answer = Await.result(future, AkkaUtils.INF_TIMEOUT());
132 |    }
133 |    ...
134 | ```
135 | 
136 | 在`JobClientActor`启动之前会启动`LeaderRetrievalService`，`LeaderRetrievalService`启动之后会通知它的 Listener `JobClientActor` 获取 `JobManager` 的地址和当前 session id。之后经过消息路由跳转到提交的核心逻辑：
137 | 
138 | ```java
139 | //JobClientActor line354
140 | private void tryToSubmitJob(final JobGraph jobGraph) {
141 |    this.jobGraph = jobGraph;
142 | 
143 |    if (isConnected()) {
144 |       LOG.info("Sending message to JobManager {} to submit job {} ({}) and wait for progress",
145 |          jobManager.path().toString(), jobGraph.getName(), jobGraph.getJobID());
146 | 
147 |       Futures.future(new Callable<Object>() {
148 |          @Override
149 |          public Object call() throws Exception {
150 |             ActorGateway jobManagerGateway = new AkkaActorGateway(jobManager, leaderSessionID);
151 | 
152 |             LOG.info("Upload jar files to job manager {}.", jobManager.path());
153 | 
154 |             try {
155 |                jobGraph.uploadUserJars(jobManagerGateway, timeout);
156 | ```
157 | 
158 | 上面的代码有所省略😝。
159 | 
160 | 总结下上面的过程：
161 | 
162 | ![jobclient-to-jobmanager](jobclient-to-jobmanager.png)
163 | 
164 | - 启动`JobClientActor`用来和`JobManager`交互
165 | - 启动`LeaderRetrievalService`获取`JobManager`的地址
166 | - 上传用户 jar 包
167 | - 提交 SubmitJob 命令
168 | 
169 | ## JobManager执行计划生成
170 | 
171 | `JobManager`负责接收 flink 的作业，调度 task，收集 job 的状态、管理 TaskManagers。被实现为一个 akka actor。
172 | 
173 | 客户端上传完 jar 包和`JobGraph`，flink 会进一步解析封装成运行时的执行计划`ExecutionGraph`，`JobManager`的构造器在初始化的时候传入了很多组件，这里简单列举下功能方便后面的逻辑展开，具体的细节将会在下一节讲解。
174 | 
175 | - `BlobServer`：实现了 BLOB server，其会监听收到的 requests，并会创建 目录结构存储 BLOBS 【持久化】或者临时性的缓存它们
176 | - `InstanceManager`：TaskManager在`flink`框架内部被叫做`Instance`，flink通过`InstanceManager`管理 flink 集群中当前所有活跃的 TaskManager，包括接收心跳，通知 InstanceListener Instance 的生成与死亡，一个典型的 `InstanceListener` 为 flink 的 Scheduler
177 | - `BlobLibraryCacheManager`：flink job 的 jar 包存储服务，使用上面的 BlobServer 完成。
178 | - `MemoryArchivist`备案已提交的flink作业，包括`JobGraph`、`ExecutionGraph`等
179 | - ​
180 | - `ZooKeeperCompletedCheckpointStore`：负责持久化 job 的 checkpoint 信息，一个 job 可以持久化多个 checkpoint，但只有最新的会被使用，具体方式为先在文件系统中持久化一份，再将文件句柄更新到 zk，并在 zk上依次递增节点路径号，zk 上保存了最近的 10 次 checkpoint
181 | - SavepointStore：flink 的状态存储，负责存储算子内部定义的状态，与 checkpoint 稍有区别，后者由 flink 框架来维护
182 | 
183 | *为了对`JobManager`中所起的 actors 服务有所了解，这里简单介绍下`JobManager`的启动过程*
184 | 
185 | 简单分析得知`line2049: runJobManager`是JobManager启动的入口，在获取`JobManager`启动的主机和端口后，便开始启动 actor system，web ui以及其他 actors：
186 | 
187 | ```java
188 | //JobManager line2008
189 | def runJobManager(
190 |     configuration: Configuration,
191 |     executionMode: JobManagerMode,
192 |     listeningAddress: String,
193 |     listeningPort: Int)
194 |   : Unit = {
195 | 
196 |   val (jobManagerSystem, _, _, webMonitorOption, _) = startActorSystemAndJobManagerActors(
197 |     configuration,
198 |     executionMode,
199 |     listeningAddress,
200 |     listeningPort,
201 |     classOf[JobManager],
202 |     classOf[MemoryArchivist],
203 |     Option(classOf[StandaloneResourceManager])
204 |   )
205 | 
206 |   // block until everything is shut down
207 |   jobManagerSystem.awaitTermination()
208 | ```
209 | 
210 | 具体的启动逻辑在`startActorSystemAndJobManagerActors`方法中：
211 | 
212 | ```java
213 | //JobManager line2150
214 | def startActorSystemAndJobManagerActors(
215 |     configuration: Configuration,
216 |     executionMode: JobManagerMode,
217 |     listeningAddress: String,
218 |     listeningPort: Int,
219 |     jobManagerClass: Class[_ <: JobManager],
220 |     archiveClass: Class[_ <: MemoryArchivist],
221 |     resourceManagerClass: Option[Class[_ <: FlinkResourceManager[_]]])
222 |   : (ActorSystem, ActorRef, ActorRef, Option[WebMonitor], Option[ActorRef]) = {
223 |   ...
224 | ```
225 | 
226 | 简单列举下逻辑：
227 | 
228 | - JobManager 程序的主入口，由 ApplicationMasterBase 发起
229 | - line 2174 使用 Json 配置 Akka 并生成 ActorSystem
230 | - line 2197 初始化 ZooKeeperLeaderRetrievalService，JobManager在启动的时候会以 LeaderRetrievalListener 的身份将自己注册进来，该 service 负责监听最新的 leader 信息，当发生改变时 通知所有 listener【所有的 JobManager】
231 | - line 2220 启动 YarnJobManager 和 MemoryArchivist actors【这里并没有启动】
232 | - line2268 启动【flink基本组件和JobGraph的生成一节中提到的】FlinkResourceManager
233 | - line 2620 createJobManagerComponents 获取以上两个组件必要的配置，并初始化相关服务 具体见【 flink JobManager 中所起的服务】这里在初始化相关组件后会初始化 JobManager，akka actorOf 方法传入的属性为构造器中参数，重载 preStart 和 postStop 方法会在 actor 启动和关闭后 相继执行，JobManager 会在这两个方法中启动和停止这些服务
234 | 
235 | 到这里一个完整的`JobManager` actor 便启动起来了😜
236 | 
237 | 既然是 actor ，那么他的核心逻辑一定是各种消息的路由和处理：
238 | 
239 | ```java
240 | //JobManager line304
241 | override def handleMessage: Receive = {
242 | 
243 |   case GrantLeadership(newLeaderSessionID) =>
244 |     log.info(s"JobManager $getAddress was granted leadership with leader session ID " +
245 |       s"$newLeaderSessionID.")
246 | 
247 |     leaderSessionID = newLeaderSessionID
248 | ```
249 | 
250 | 介绍下这里比较重要的几种消息：
251 | 
252 | - 处理消息的核心方法
253 | - GrantLeadership 获得leader授权，将自身被分发到的 session id 写到 zookeeper，并恢复所有的 jobs.
254 | - RevokeLeadership 剥夺leader授权，打断清空所有的 job 信息，但是保留作业缓存，注销所有的 TaskManagers. 
255 | - RegisterTaskManager 注册 TaskManager，如果之前已经注册过，则只给对应的 Instance 发送消息，否则启动注册逻辑：在 InstanceManager 中注册该 Instance 的信息，并告知 Instance BlobLibraryCacheManager 的端口【供下载 lib 包用】，同时使用 watch 监听 task manager 的存活
256 | - SubmitJob 提交 jobGraph
257 | 
258 | ### 执行计划 ExecutionGraph 的生成
259 | 
260 | flink 的运行时执行计划为 ExecutionGraph，ExecutionGraph 对应之前的 JobGraph，一个 ExecutionGraph 包含多个 ExecutionJobVertex 节点，对应 JobGraph 的 JobVertex，每个 ExecutionJobVertex 节点的并发子 task 对应一个 ExecutionVertex，每个 ExecutionVertex 的一次 attempt 执行被抽象为一次 Execution，具体如下图所示：
261 | 
262 | ![flink-job-vertex-to-execution.png](flink-job-vertex-to-execution.png)
263 | 
264 | *下面会对每个抽象做详细的介绍*
265 | 
266 | ExecutionGraph 的创建是在 JobManager 接收 SubmitJob 命令后开始的，这条消息会被路由到方法：
267 | 
268 | ```java
269 | //JobManager line1048
270 | private def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = false): Unit = {
271 |   if (jobGraph == null) {
272 |     jobInfo.client ! decorateMessage(JobResultFailure(
273 |       new SerializedThrowable(
274 |         new JobSubmissionException(null, "JobGraph must not be null.")
275 |       )
276 |     ))
277 |   }
278 | ```
279 | 
280 | 其逻辑总结如下：
281 | 
282 | - 提交作业
283 | - 具体的组件交互过程 Client.java line169 runBlocking -> JobClient.java line102 submitJobAndWait -> JobClientActor.java line 337 tryToSubmitJob  这里会先上传 jars 到 JobManager 的 BlobServer，然后发起提交命令
284 | - line1068: 设置用户lib包，使用  LibraryCacheManager book job 的jar包，由于之前包已上传，这会创建jobId 和 jars 以及class paths 的对应关系
285 | - line1114: 将 JobGraph 转换为 ExecutionGraph 逻辑计划转化为物理计划【后者维护 data flow 的协调执行、连接、计算中间结果】具体见章节： flink runtime
286 | - line 1178 ExecutionJobVertex 在此处生成，通过 JobGraph 依照数据源顺序获取下游 JobVertex，具体算法如下：
287 | 
288 | ![job-graph-node-sort.png](job-graph-node-sort.png)
289 | 
290 | flink排序节点的顺序：
291 | 
292 | - 数据源节点
293 | - 只有一个上游的节点
294 | - sink节点
295 | 
296 | *例如上图的两个拓扑结构，左边节点排序完的顺序为： 1 2 3 4 5 右边的节点排序完的顺序为：1 2 3 5 4 6*
297 | 
298 | 那么 flink 为什么要将 JobGraph 转换为 ExecutionGraph ，并且排序这些节点呢？ExecutionGraph 代表了运行时的执行计划，包括 task 的并发、连接、中间结果的维护等，排序的目的是给 task 的部署设置先后顺序，想来也是很自然的。我们来看一下 ExecutionGraph 的构造器就能了解个大概：
299 | 
300 | ```java
301 | public ExecutionGraph(
302 |       ExecutionContext executionContext,
303 |       JobID jobId,
304 |       String jobName,
305 |       Configuration jobConfig,
306 |       SerializedValue<ExecutionConfig> serializedConfig,
307 |       FiniteDuration timeout,
308 |       RestartStrategy restartStrategy,
309 |       List<BlobKey> requiredJarFiles,
310 |       List<URL> requiredClasspaths,
311 |       ClassLoader userClassLoader,
312 |       MetricGroup metricGroup) {
313 | 
314 |    ...
315 | 
316 |    this.executionContext = executionContext;
317 | 
318 |    this.jobID = jobId;
319 |    this.jobName = jobName;
320 |    this.jobConfiguration = jobConfig;
321 |    this.userClassLoader = userClassLoader;
322 | 
323 |    this.tasks = new ConcurrentHashMap<JobVertexID, ExecutionJobVertex>();
324 |    this.intermediateResults = new ConcurrentHashMap<IntermediateDataSetID, IntermediateResult>();
325 |    this.verticesInCreationOrder = new ArrayList<ExecutionJobVertex>();
326 |    this.currentExecutions = new ConcurrentHashMap<ExecutionAttemptID, Execution>();
327 | 
328 |    this.jobStatusListenerActors  = new CopyOnWriteArrayList<ActorGateway>();
329 |    this.executionListenerActors = new CopyOnWriteArrayList<ActorGateway>();
330 | 
331 |    this.stateTimestamps = new long[JobStatus.values().length];
332 |    this.stateTimestamps[JobStatus.CREATED.ordinal()] = System.currentTimeMillis();
333 | 
334 |    this.requiredJarFiles = requiredJarFiles;
335 |    this.requiredClasspaths = requiredClasspaths;
336 | 
337 |    this.serializedExecutionConfig = checkNotNull(serializedConfig);
338 | 
339 |    this.timeout = timeout;
340 | 
341 |    this.restartStrategy = restartStrategy;
342 | 
343 |    metricGroup.gauge(RESTARTING_TIME_METRIC_NAME, new RestartTimeGauge());
344 | }
345 | ```
346 | 
347 | 从构造器可以看出，ExecutionGraph 会维护当前的逻辑计划信息【就是有哪些task要执行】、中间结果生成信息，当前正在运行的 task，负责 job 和 task 状态切换的通知等。
348 | 
349 | #### 执行计划节点 ExecutionJobVertex 的生成
350 | 
351 | attachJobGraph 是 ExecutionGraph 构造图结构的核心方法，而其中最关键的逻辑是执行节点 ExecutionJobVertex 的创建，下面详细分析下其创建过程和核心功能：
352 | 
353 | ```java
354 | //ExecutionJobVertex line95
355 | public ExecutionJobVertex(ExecutionGraph graph, JobVertex jobVertex,
356 |                   int defaultParallelism, FiniteDuration timeout, long createTimestamp)
357 |       throws JobException
358 | {
359 |    ...
360 |    this.graph = graph;
361 |    this.jobVertex = jobVertex;
362 |    
363 |    int vertexParallelism = jobVertex.getParallelism();
364 |    int numTaskVertices = vertexParallelism > 0 ? vertexParallelism : defaultParallelism;
365 |    
366 |    this.parallelism = numTaskVertices;
367 |    this.taskVertices = new ExecutionVertex[numTaskVertices];
368 |    
369 |    this.inputs = new ArrayList<IntermediateResult>(jobVertex.getInputs().size());
370 |    
371 |    // take the sharing group
372 |    this.slotSharingGroup = jobVertex.getSlotSharingGroup();
373 |    this.coLocationGroup = jobVertex.getCoLocationGroup();
374 |    ...
375 |    
376 |    // create the intermediate results
377 |    this.producedDataSets = new IntermediateResult[jobVertex.getNumberOfProducedIntermediateDataSets()];
378 | 
379 |    for (int i = 0; i < jobVertex.getProducedDataSets().size(); i++) {
380 |       final IntermediateDataSet result = jobVertex.getProducedDataSets().get(i);
381 | 
382 |       this.producedDataSets[i] = new IntermediateResult(
383 |             result.getId(),
384 |             this,
385 |             numTaskVertices,
386 |             result.getResultType(),
387 |             result.getEagerlyDeployConsumers());
388 |    }
389 | 
390 |    // create all task vertices
391 |    for (int i = 0; i < numTaskVertices; i++) {
392 |       ExecutionVertex vertex = new ExecutionVertex(this, i, this.producedDataSets, timeout, createTimestamp);
393 |       this.taskVertices[i] = vertex;
394 |    }
395 |    ...
396 |    
397 |    // set up the input splits, if the vertex has any
398 |    try {
399 |       @SuppressWarnings("unchecked")
400 |       InputSplitSource<InputSplit> splitSource = (InputSplitSource<InputSplit>) jobVertex.getInputSplitSource();
401 |       
402 |       if (splitSource != null) {
403 |          inputSplits = splitSource.createInputSplits(numTaskVertices);
404 |          
405 |          if (inputSplits != null) {
406 |             if (splitSource instanceof StrictlyLocalAssignment) {
407 |                inputSplitsPerSubtask = computeLocalInputSplitsPerTask(inputSplits);
408 |                splitAssigner = new PredeterminedInputSplitAssigner(inputSplitsPerSubtask);
409 |             } else {
410 |                splitAssigner = splitSource.getInputSplitAssigner(inputSplits);
411 |             }
412 |          }
413 |       }
414 |       else {
415 |          inputSplits = null;
416 |       }
417 |    }
418 |    catch (Throwable t) {
419 |       throw new JobException("Creating the input splits caused an error: " + t.getMessage(), t);
420 |    }
421 |    
422 |    finishedSubtasks = new boolean[parallelism];
423 | }
424 | ```
425 | 
426 | 简要介绍下其构建逻辑：
427 | 
428 | - 依据对应的 JobVertex 的并发生成对应个数的 ExecutionVertex，一个 ExecutionVertex 代表一个 ExecutionJobVertex 的并发子 task
429 | - 设置 SlotSharingGroup 和 CoLocationGroup，这两个组件是 flink 运行时任务调度的核心抽象，会约束 flink 调度 task 的策略，在 flink 任务调度算法 一节会详细介绍
430 | - 将原来 JobVertex 的中间结果 IntermediateDataSet 转化为 IntermediateResult，后者在前者的基础上加入了 当前正在运行的 producer 信息，是真正关于运行时中间数据的抽象
431 | - 如果对应的 job 节点是数据源节点，会获取其 InputSplitSource，InputSplitSource 控制了数据源并发子 task 和生产的 InputSplit 的对应关系，一个 InputSplit 代表一个数据源分片，对于 flink streaming 来说，InputSplitSource 就是一个 InputFormat，对应一个输入源 task 
432 | - 这里的 InputSplitSource 是在什么时候设置进去的呢？见`JobManager line1163 vertex.initializeOnMaster(userCodeLoader)`以及`StreamingJobGraphGenerator.java line 278 createDataSourceVertex `
433 | 
434 | #### 执行计划节点 ExecutionJobVertex 的连接
435 | 
436 | 构建完节点后通过连接生成执行计划 DAG【见ExecutionGraph attachJobGraph 方法】，connectToPredecessors 是连接执行节点的核心逻辑：
437 | 
438 | ```java
439 | //ExecutionJobGraph line237
440 | public void connectToPredecessors(Map<IntermediateDataSetID, IntermediateResult> intermediateDataSets) throws JobException {
441 |    
442 |    List<JobEdge> inputs = jobVertex.getInputs();
443 |    
444 |    ...
445 |    
446 |    for (int num = 0; num < inputs.size(); num++) {
447 |       JobEdge edge = inputs.get(num);
448 |       
449 |       ...
450 |       
451 |       // fetch the intermediate result via ID. if it does not exist, then it either has not been created, or the order
452 |       // in which this method is called for the job vertices is not a topological order
453 |       IntermediateResult ires = intermediateDataSets.get(edge.getSourceId());
454 |       if (ires == null) {
455 |          throw new JobException("Cannot connect this job graph to the previous graph. No previous intermediate result found for ID "
456 |                + edge.getSourceId());
457 |       }
458 |       
459 |       this.inputs.add(ires);
460 |       
461 |       int consumerIndex = ires.registerConsumer();
462 |       
463 |       for (int i = 0; i < parallelism; i++) {
464 |          ExecutionVertex ev = taskVertices[i];
465 |          ev.connectSource(num, ires, edge, consumerIndex);
466 |       }
467 |    }
468 | }
469 | ```
470 | 
471 | 简要概括逻辑如下：
472 | 
473 | - 设置输入 IntermediateResult
474 | - 将自己注册到  IntermediateResult，目前一个 IntermediateResult 只支持一个 消费 ExecutionJobVertex 节点
475 | - 设置并发子 task ExecutionVertex 和中间结果 IntermediateResult 的连接关系，通过 ExecutionVertex 的 connectSource 方法设置 ExecutionVertex 的连接策略，策略一共两种：POINTWISE 和 ALL_TO_ALL，前者上游 partition 与下游 consumers 之间是一对多关系，后者是 all to all 关系，这里会将 ExecutionEdge 创建出来并添加 consumer 为此 edge【partition在 new ExecutionVertex时创建出来，由 ExecutionVertex 构造器可知一个 ExecutionVertex 生产一个 partition，partition number 就是 sub task index】
476 | 
477 | #### 执行节点子任务 ExecutionVertex
478 | 
479 | 先看一下 ExecutionVertex 的创建过程：
480 | 
481 | ```java
482 | public ExecutionVertex(
483 |       ExecutionJobVertex jobVertex,
484 |       int subTaskIndex,
485 |       IntermediateResult[] producedDataSets,
486 |       FiniteDuration timeout,
487 |       long createTimestamp) {
488 |    this.jobVertex = jobVertex;
489 |    this.subTaskIndex = subTaskIndex;
490 | 
491 |    this.resultPartitions = new LinkedHashMap<IntermediateResultPartitionID, IntermediateResultPartition>(producedDataSets.length, 1);
492 | 
493 |    for (IntermediateResult result : producedDataSets) {
494 |       IntermediateResultPartition irp = new IntermediateResultPartition(result, this, subTaskIndex);
495 |       result.setPartition(subTaskIndex, irp);
496 | 
497 |       resultPartitions.put(irp.getPartitionId(), irp);
498 |    }
499 | 
500 |    this.inputEdges = new ExecutionEdge[jobVertex.getJobVertex().getInputs().size()][];
501 | 
502 |    this.priorExecutions = new CopyOnWriteArrayList<Execution>();
503 | 
504 |    this.currentExecution = new Execution(
505 |       getExecutionGraph().getExecutionContext(),
506 |       this,
507 |       0,
508 |       createTimestamp,
509 |       timeout);
510 | 
511 |    // create a co-location scheduling hint, if necessary
512 |    CoLocationGroup clg = jobVertex.getCoLocationGroup();
513 |    if (clg != null) {
514 |       this.locationConstraint = clg.getLocationConstraint(subTaskIndex);
515 |    }
516 |    else {
517 |       this.locationConstraint = null;
518 |    }
519 | 
520 |    this.timeout = timeout;
521 | }
522 | ```
523 | 
524 | 逻辑总结如下：
525 | 
526 | - 依据对应的 ExecutionJobVertex 生成的中间数据集 IntermediateResult 的个数生成一定个数的 partition，这里是一个 IntermediateResult 输出一个 partition
527 | - 生成 Execution
528 | - 配置资源相关
529 | 
530 | 下面重点介绍下其连接上游 ExecutionVertex 的过程：
531 | 
532 | connectSource 是连接的核心逻辑，逻辑如下:
533 | 
534 | ```java
535 | //ExecutionVertex line250
536 | public void connectSource(int inputNumber, IntermediateResult source, JobEdge edge, int consumerNumber) {
537 | 
538 |    final DistributionPattern pattern = edge.getDistributionPattern();
539 |    final IntermediateResultPartition[] sourcePartitions = source.getPartitions();
540 | 
541 |    ExecutionEdge[] edges;
542 | 
543 |    switch (pattern) {
544 |       case POINTWISE:
545 |          edges = connectPointwise(sourcePartitions, inputNumber);
546 |          break;
547 | 
548 |       case ALL_TO_ALL:
549 |          edges = connectAllToAll(sourcePartitions, inputNumber);
550 |          break;
551 | 
552 |       default:
553 |          throw new RuntimeException("Unrecognized distribution pattern.");
554 | 
555 |    }
556 | 
557 |    this.inputEdges[inputNumber] = edges;
558 | 
559 |    // add the consumers to the source
560 |    // for now (until the receiver initiated handshake is in place), we need to register the 
561 |    // edges as the execution graph
562 |    for (ExecutionEdge ee : edges) {
563 |       ee.getSource().addConsumer(ee, consumerNumber);
564 |    }
565 | }
566 | ```
567 | 
568 | 逻辑总结如下：
569 | 
570 | - 获取 JobEdge 的数据分发策略：如果非 shuffle 操作就是 DistributionPattern.POINTWISE，否则是 DistributionPattern.ALL_TO_ALL，具体见代码：
571 | 
572 | ```java
573 | //StreamingJobGraphGenerator line370
574 | StreamPartitioner<?> partitioner = edge.getPartitioner();
575 | if (partitioner instanceof ForwardPartitioner) {
576 |    downStreamVertex.connectNewDataSetAsInput(
577 |       headVertex,
578 |       DistributionPattern.POINTWISE,
579 |       ResultPartitionType.PIPELINED,
580 |       true);
581 | } else if (partitioner instanceof RescalePartitioner){
582 |    downStreamVertex.connectNewDataSetAsInput(
583 |       headVertex,
584 |       DistributionPattern.POINTWISE,
585 |       ResultPartitionType.PIPELINED,
586 |       true);
587 | } else {
588 |    downStreamVertex.connectNewDataSetAsInput(
589 |          headVertex,
590 |          DistributionPattern.ALL_TO_ALL,
591 |          ResultPartitionType.PIPELINED,
592 |          true);
593 | }
594 | ```
595 | 
596 | - 按照不同的分发策略连接上游
597 | 
598 | DistributionPattern.ALL_TO_ALL 就是简单的全连接，这里就不介绍了，只介绍DistributionPattern.POINTWISE 策略。
599 | 
600 | 该策略连接 execution vertex 与上游的 partitions，会先获取上游的 partition 数与 此 ExecutionJobVertex 的并发度，如果两者并发度相等，则是 一对一 连接：
601 | 
602 | ![execution-vertex-one-to-one.png](execution-vertex-one-to-one.png)
603 | 
604 | 如果 partition 数小于 并发数 ，子 task 只会连接一个上游 partition，具体关系如下图：
605 | 
606 | ![execution-one-many.png](execution-one-many.png)
607 | 
608 | 如果 partition 数大于并发数，子 task 会连接多个上游 partition，具体见下图：
609 | 
610 | ![execution-many-one.png](execution-many-one.png)
611 | 
612 | 到这里运行时执行计划 ExecutionGraph 的生成就介绍完了😄下节将先介绍 JobManager 的核心组件


--------------------------------------------------------------------------------
/flink/flink物理计划生成/job-graph-node-sort.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/job-graph-node-sort.png


--------------------------------------------------------------------------------
/flink/flink物理计划生成/jobclient-to-jobmanager.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink物理计划生成/jobclient-to-jobmanager.png


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/.DS_Store


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/flink-operator-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/flink-operator-extend.png


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/flink算子生命周期.md:
--------------------------------------------------------------------------------
  1 | # Flink算子的生命周期
  2 | 
  3 | ## 前言
  4 | 
  5 | 前面已经介绍了 flink 的逻辑计划、物理计划等相关信息，本文将重点介绍 flink 的 operator 以及运行时的 task，后续会介绍 flink task 的调度算法
  6 | 
  7 | ## 算子
  8 | 
  9 | ### 什么是一个算子
 10 | 
 11 | flink 中的一个 operator 代表一个最顶级的 api 接口，拿 streaming 来说就是，在 DataStream 上做诸如 map/reduce/keyBy 等操作均会生成一个算子
 12 | 
 13 | ### 算子的生成
 14 | 
 15 | 先来看 operator 的继承关系:
 16 | 
 17 | ![flink-operator-extend.png](flink-operator-extend.png) 对于 Streaming 来说所有的算子都继承自 StreamOperator，StreamOperator 中定义了一系列的生命周期方法，同时也定义了 snapshot 的接口，AbstractStreamOperator 定义了基本的设置和生命周期方法，AbstractUdfStreamOperator 定义了用户自定义函数的生命周期和快照策略，这些接口的调用时机会在下面一一阐述😄。
 18 | 
 19 | 算子的生成触发于对 DataStream 的操作上，比如 map addSink等。
 20 | 
 21 | ### 算子 chain
 22 | 
 23 | 在 **flink 基本组件和逻辑计划生成一节** 我们介绍了 JobGraph 的生成过程，其中 JobGraph 的生成最大的意义在于做了一些算子的 chain 优化，那么什么样的节点可以被 chain 呢？如下图：
 24 | 
 25 | ![op-chian-chianable.png](op-chian-chianable.png)
 26 | 
 27 | 一些必须要经过 shuffle 的节点是 chain 或者 节点可达 的边界，非常类似于 Spark Streaming 中对于 Stage 的划分，上图中 keyBy 这样的 groupBy 操作就是划分是否可被 chain 的边界
 28 | 
 29 | 在 StreamingJobGraphGenerator 的 createChain 方法中为每个 StreamNode 生成了一个 StreamConfig，并且对于可以生成 JobVertex 的节点[ *chain 的起始节点* ]设置了如下属性：
 30 | 
 31 | ```java
 32 | //StreamingJobGraphGenerator line212
 33 | if (currentNodeId.equals(startNodeId)) {
 34 | 
 35 |    config.setChainStart();
 36 |    config.setChainIndex(0);
 37 |    config.setOutEdgesInOrder(transitiveOutEdges);
 38 |    config.setOutEdges(streamGraph.getStreamNode(currentNodeId).getOutEdges());
 39 | 
 40 |    for (StreamEdge edge : transitiveOutEdges) {
 41 |       connect(startNodeId, edge);
 42 |    }
 43 | 
 44 |    config.setTransitiveChainedTaskConfigs(chainedConfigs.get(startNodeId));
 45 | 
 46 | }
 47 | ```
 48 | 
 49 | 上面的逻辑概括如下：
 50 | 
 51 | - 标志本节点为 chain 的起始位置
 52 | - 设置 chain 的索引
 53 | - 设置可达输出边，就是与下游 JobVertex 直接连接的 StreamEdge
 54 | - 设置自身的直接输出边 StreamEdge
 55 | - 将本 JobVertex 与下游的 JobVertex 连接起来
 56 | - 将被 chained 的可达的下游 StreamNode 的配置一同设置进本 JobVertex 的配置中，后面 task 运行时会用到
 57 | 
 58 | 连接的逻辑如下：
 59 | 
 60 | ```java
 61 | //StreamingJobGraphGenerator line357
 62 | private void connect(Integer headOfChain, StreamEdge edge) {
 63 | 
 64 |    physicalEdgesInOrder.add(edge);
 65 | 
 66 |    Integer downStreamvertexID = edge.getTargetId();
 67 | 
 68 |    JobVertex headVertex = jobVertices.get(headOfChain);
 69 |    JobVertex downStreamVertex = jobVertices.get(downStreamvertexID);
 70 | 
 71 |    StreamConfig downStreamConfig = new StreamConfig(downStreamVertex.getConfiguration());
 72 | 
 73 |    downStreamConfig.setNumberOfInputs(downStreamConfig.getNumberOfInputs() + 1);
 74 | 
 75 |    StreamPartitioner<?> partitioner = edge.getPartitioner();
 76 |    if (partitioner instanceof ForwardPartitioner) {
 77 |       downStreamVertex.connectNewDataSetAsInput(
 78 |          headVertex,
 79 |          DistributionPattern.POINTWISE,
 80 |          ResultPartitionType.PIPELINED,
 81 |          true);
 82 |    } else if (partitioner instanceof RescalePartitioner){
 83 |       downStreamVertex.connectNewDataSetAsInput(
 84 |          headVertex,
 85 |          DistributionPattern.POINTWISE,
 86 |          ResultPartitionType.PIPELINED,
 87 |          true);
 88 |    } else {
 89 |       downStreamVertex.connectNewDataSetAsInput(
 90 |             headVertex,
 91 |             DistributionPattern.ALL_TO_ALL,
 92 |             ResultPartitionType.PIPELINED,
 93 |             true);
 94 |    }
 95 | 
 96 |    if (LOG.isDebugEnabled()) {
 97 |       LOG.debug("CONNECTED: {} - {} -> {}", partitioner.getClass().getSimpleName(),
 98 |             headOfChain, downStreamvertexID);
 99 |    }
100 | }
101 | ```
102 | 
103 | 概括下逻辑：
104 | 
105 | * 获取要连接的两个 JobVertex 对象
106 | * 设置下游 JobVertex 的输入 partition 算法，如果是 forward 或 rescale 的话为 POINTWISE，否则为全连接，也就是 shuffle，POINTWISE 的连接算法在 **flink 物理计划生成** 一节已经介绍，这里不再赘述
107 | 
108 | *以上只是客户端生成逻辑计划时的算子 chain，在运行时算子的 chain 被封装成了一个单独的对象 OperatorChain，里面在原有的基础上将 operators 的操作封装起来并且确定了下游的输出入口*
109 | 
110 | 来看 OperatorChain 的核心实现
111 | 
112 | 首先总结下构造器的功能:
113 | 
114 | * 获取可达的 chain 的 StreamNode 配置
115 | * 为直接可达的输出 StreamEdge 分别创建一个 Output，这里为 RecordWriterOutput
116 | * 创建chain的入口
117 | * 如果创建有任何失败，释放掉 RecordWriterOutput 占用的资源，主要是内存 buffer，后面章节会介绍
118 | 
119 | 这里的关键是算子 chain 的创建过程，见下图创建过程：
120 | 
121 | ![op-chain-internal.png](op-chain-internal.png)
122 | 
123 | 上图中 S 节点的下游 A/B/C 是可以与 S Chain 在一起的，D/E 是必须经过网络传输的节点，一个 OperatorChain 封装了图中的节点 S/A/B/C，也就是说上图可以被看做如下所示：
124 | 
125 | ![operator-chain-simple.png](operator-chain-simple.png)
126 | 
127 | OperatorChain 中有两个关键的方法：`createOutputCollector` 和 `createChainedOperator`，前者负责获取一个 StreamNode 的输出Output，后者负责创建 StreamNode 对应的 chain 算子，两者相互调用形成递归，如上面的创建过程图，具体的流程如下：
128 | 
129 | * 创建 S 的所有网络输出 RecordWriterOutput，这里会为 D 和 E 分别创建一个
130 | * 由于从 A 开始的节点对于 S 都是可被 chain 的，会从 C 开始递归创建
131 | * 先获取 C 的输出，这里为对应 D 的 RecordWriterOutput
132 | * 拿到 C 对应的 StreamOperator 并将 运行时的 StreamTask 和 Output 设置进去
133 | * 将 StreamOperator 封装成 ChainingOutput 并作为 Output 传给 B
134 | * B 将重复 C 的过程，直到 S/A/B/C 全部被创建
135 | 
136 | *那么 S 发射一条消息后的处理流程是如何呢？*
137 | 
138 | S 在调用 `processElement` 方法时会调用 `output.collect`，这里的 output 为 A 对应的 ChainingOutput，ChainingOutput 的 collect 调用了对应的算子 `StreamOperator A` 的 `processElement` 方法，这里又会调用 B 的 ChainingOutput 的 collect 方法，以此类推。这样便实现了可 chain 算子的本地处理，最终经由网络输出 RecordWriterOutput 发送到下游节点
139 | 
140 | ### 算子的运行
141 | 
142 | flink 算子的运行牵涉到两个关键类 `Task.java` 和 `StreamTask.java`，Task 是直接受 TaskManager 管理和调度的，而 Task 又会调用 StreamTask，StreamTask 中封装了算子的处理逻辑
143 | 
144 | **我们先来看 StreamTask**
145 | 
146 | StreamTask 的 JavaDoc 上描述了其生命周期:
147 | 
148 | ```java
149 | *  -- restoreState() -> restores state of all operators in the chain
150 | *  
151 | *  -- invoke()
152 | *        |
153 | *        +----> Create basic utils (config, etc) and load the chain of operators
154 | *        +----> operators.setup()
155 | *        +----> task specific init()
156 | *        +----> open-operators()
157 | *        +----> run()
158 | *        +----> close-operators()
159 | *        +----> dispose-operators()
160 | *        +----> common cleanup
161 | *        +----> task specific cleanup()
162 | ```
163 | 
164 | StreamTask 运行之初会尝试恢复算子的 State 快照，然后由 Task 调用其 invoke 方法
165 | 
166 | 下面重点分析一下其 invoke 方法的实现
167 | 
168 | - 获取 headOperator，这里的 headOperator 在 StreamingJobGraphGenerator line 210 `setVertexConfig(currentNodeId, config, chainableOutputs, nonChainableOutputs);`设置，对应上面算子 chain 中的 S 节点
169 | - 创建 operatorChain 并设置为 headOperator 的 Output
170 | - `init()`
171 | - `restoreState`
172 | - 执行 operatorChain 中所有 operator 的 open 方法
173 | - `run()`
174 | - 执行 operatorChain 中所有 operator 的 close 方法
175 | - 执行资源回收及 `cleanup()`，最主要的目的是回收内存 buffer
176 | 
177 | StreamTask 中还有关于 Checkpoint 和 StateBackend 的核心逻辑，这里先不介绍，会另开一篇😄
178 | 
179 | 我们来看 StreamTask 的实现类之一 OneInputStreamTask ，便可以知道 `init()` 和 `run()` 分别都做了什么：
180 | 
181 | **init方法**：
182 | 
183 | - 获取算子对应的输入序列化器 TypeSerializer
184 | - 获取输入数据 InputGate[]，InputGate 是 flink 网络传输的核心抽象之一，其在内部封装了消息的接收和内存的管理，后面介绍 flink 网络栈的时候会详细介绍，这里只要了解从 InputGate 可以拿到上游传送过来的数据就可以了
185 | - 初始化 StreamInputProcessor
186 | - 设置一些 metrics 及 累加器
187 | 
188 | StreamInputProcessor 是 StreamTask 内部用来处理 Record 的组件，里面封装了外部 IO 逻辑【*内存不够时将 buffer 吐到磁盘上*】以及 时间对齐逻辑【*Watermark*】，这两个将会合并一节在下一章介绍^_^
189 | 
190 | **run方法**:
191 | 
192 | - 从 StreamInputProcessor 中处理一条记录
193 | - check 是否有异常
194 | 
195 | **真正的运行时类 Task**
196 | 
197 |  *这里我们会详细的介绍下 Task 的核心逻辑*
198 | 
199 | Task 代表一个 TaskManager 中所起的并行 子任务，执行封装的 flink 算子并运行，提供以下服务：消费输入data、生产 IntermediateResultPartition [ *flink关于中间结果的抽象* ]、与 JobManager 交互
200 | 
201 | JobManager 分发 Task 时最初是抽象成了一个描述类 TaskDeploymentDescriptor，TaskManager 在收到对应的 RPC 请求后会将 Task 初始化后将线程拉起，TaskDeploymentDescriptor 是提供 task 信息的核心抽象：
202 | 
203 | - ResultPartitions：task 输出的 partition 数[ *通常和 JobVertex 的下游节点数对应*  ]
204 | - InputGates：task 的输入中间结果 partition
205 | - operator-state：算子的状态句柄，由 TaskManager 上报给 JobManager，并统一维护
206 | - jar-files
207 | - class-paths
208 | 
209 | 构造器的一些组件我们会在介绍 TaskManager 的时候再详述
210 | 
211 | 其核心的运行方法 run()逻辑总结如下：
212 | 
213 | line408: run
214 | 
215 | - 核心的运行逻辑
216 | - line429: 遇到错误后通知 TaskManager
217 | - line469: 从 NetworkEnvironment 中申请 BufferPool，包括 InputGate 的接收 pool 以及 task 的每个 ResultPartition 的输出 pool，申请的资源数[ *num of Buffer* ] 由 input channels 和 ResultSubPartition 数决定
218 | 
219 | 关于网络管理[ 输入和输出 ] NetworkEnvironment，内存管理 MemoryManager 会分别开章节介绍
220 | 
221 | 那么 StreamTask 是如何在 Task 中被实例化，又是如何被调用的呢？
222 | 
223 | ```java
224 | //line 418
225 | invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass);
226 | //一系列初始化操作 ...
227 | //line 584
228 | invokable.invoke();
229 | ```
230 | 
231 | 上面的 invokable 就是 StreamTask，StreamTask  的继承关系:
232 | 
233 | ![stream-task-extend.png](stream-task-extend.png)
234 | 
235 | 那么具体是什么时候被 set 进去作为属性的呢？
236 | 
237 | 在 StreamNode 生成的时候有这样一段逻辑:
238 | 
239 | ```java
240 | public <IN, OUT> void addOperator(
241 |       Integer vertexID,
242 |       String slotSharingGroup,
243 |       StreamOperator<OUT> operatorObject,
244 |       TypeInformation<IN> inTypeInfo,
245 |       TypeInformation<OUT> outTypeInfo,
246 |       String operatorName) {
247 | 
248 |    if (operatorObject instanceof StoppableStreamSource) {
249 |       addNode(vertexID, slotSharingGroup, StoppableSourceStreamTask.class, operatorObject, operatorName);
250 |    } else if (operatorObject instanceof StreamSource) {
251 |       addNode(vertexID, slotSharingGroup, SourceStreamTask.class, operatorObject, operatorName);
252 |    } else {
253 |       addNode(vertexID, slotSharingGroup, OneInputStreamTask.class, operatorObject, operatorName);
254 |    }
255 | ```
256 | 
257 | 将 OneInputStreamTask 等 StreamTask 设置到 StreamNode 的节点属性中，同时在 JobVertex 的节点构造时也会做一次初始化:
258 | 
259 | ```java
260 | jobVertex.setInvokableClass(streamNode.getJobVertexClass());
261 | ```
262 | 
263 | 在 TaskDeploymentDescriptor 实例化的时候会获取 jobVertex 中的属性，见`ExecutionVertex line673`
264 | 
265 | #### 算子初始化
266 | 
267 | 那么算子是什么时候被初始化的呢？这就需要梳理下 StreamTask 的 `init()` 方法的处理时机，上面已经分析过 `init()` 方法会在 StreamTask 的 `invoke()` 方法中被调用，那么 `invoke()` 方法又是何时被调用的呢？这就涉及到另外一个重要的类 Task.java，Task 才是运行时真正直接被 TaskManager 实例化和调用的类，上面已经分析过 Task 的 run 方法，它是 TaskManager 收到 rpc 命令后拉起来的，具体的细节会另起一章【flink 任务分发】介绍
268 | 
269 | #### 算子销毁
270 | 
271 | StreamTask 在执行完 invoke 方法之后[*意味着流程正常结束或者有异常打断*]，会执行下面这段逻辑:
272 | 
273 | ```java
274 | /**
275 |  * Execute the operator-specific {@link StreamOperator#dispose()} method in each
276 |  * of the operators in the chain of this {@link StreamTask}. </b> Disposing happens
277 |  * from <b>tail to head</b> operator in the chain.
278 |  */
279 | private void tryDisposeAllOperators() throws Exception {
280 |    for (StreamOperator<?> operator : operatorChain.getAllOperators()) {
281 |       if (operator != null) {
282 |          operator.dispose();
283 |       }
284 |    }
285 | }
286 | ```
287 | 
288 | 所以，算子中有任何 hook 函数或者必须执行的销毁工作可以写在 dispose 方法里，这段逻辑是 flink 保证一定可以执行到的


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/op-chain-internal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/op-chain-internal.png


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/op-chian-chianable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/op-chian-chianable.png


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/operator-chain-simple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/operator-chain-simple.png


--------------------------------------------------------------------------------
/flink/flink算子的生命周期/stream-task-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink算子的生命周期/stream-task-extend.png


--------------------------------------------------------------------------------
/flink/flink网络栈/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink网络栈/.DS_Store


--------------------------------------------------------------------------------
/flink/flink网络栈/flink-network-dataflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink网络栈/flink-network-dataflow.png


--------------------------------------------------------------------------------
/flink/flink网络栈/flink网络栈.md:
--------------------------------------------------------------------------------
  1 | # Flink网络栈
  2 | 
  3 | 本章节主要介绍 flink 的网络交互，包括每个 task 的输入输出管理，内存分配和释放等，因为涉及到内存申请，这里会介绍 flink 的内存管理
  4 | 
  5 | *我们会从 flink 网络相关的核心抽象开始介绍*
  6 | 
  7 | ## IntermediateResult
  8 | 
  9 | 代表一个 Job Vertex 的中间执行结果，由于同一个 Job vertex 可能有多个线程并发执行，这里的 IntermediateResult 对应一个 Job Edge 的输出下游结果集，一个 IntermediateResult 包含多个 IntermediateResultPartition，一个 IntermediateResultPartition 对应一个并行任务 ExecutionVertex 的输出结果，如下图所示：
 10 | 
 11 | ![intermediate-result.png](intermediate-result.png) *对应关系如下*：
 12 | 
 13 | - IntermediateDataSet [JobGraph] <-> IntermediateResult [ExecutionGraph]  代表一个 dataset
 14 | - IntermediateResultPartition [ExecutionGraph] 代表一个并行任务的输出 partition 
 15 | - ResultPartition 代表一个  IntermediateResultPartition 的生产者 ExecutionVertex 的输出 副本，可输出多个副本
 16 | - 一个 ResultPartition 可能包含多个 ResultSubPartition【依据下游消费 task 的数目】
 17 | 
 18 | 
 19 | 
 20 | *下面介绍数据的消费*
 21 | 
 22 | 
 23 | 
 24 | ## InputGate
 25 | 
 26 | InputGate 是 flink 关于 task 的一个输入源的抽象，一个 InputGate 代表一个上游数据源，对应**一个**上游中间结果的一个或多个 partition
 27 | 
 28 | flink 依据中间结果的 producer 【*生产task*】的并发度来生成相应个数的 partition，每个 producer task 生产一个 partition，为了优化并发读写，flink 依据中间结果的消费者 task 的并发度进一步将每个 partition 划分为多个 subPartition，假设有如下这样的一个简单地生产消费模型：
 29 | 
 30 | ![sub-partition.png](sub-partition.png)
 31 | 
 32 | 上图的模型包含了一个生产者任务 map() 和消费者任务 reduce()，两者的并行度都是 2，由于有两个 producer task，这时候 flink 会将中间结果划分为两个 partition，同时由于有 2 个 consumer task，每个 partition 被进一步划分为两个 sub partition
 33 | 
 34 | 对于每个 consumer task，flink 会维护一个消费的中间数据集和 task 本身的对应关系，这就是 InputGate
 35 | 
 36 | 每个 InputGate 会维护消费一个中间结果的每个partition中的一个 subPartition，这样的一个 subPartition 的消费需要建立一个 InputChannel
 37 | 
 38 | ps: 目前 flink 的一个中间结果 partition 仅支持一个 consumer 节点【JobVertex】
 39 | 
 40 | ### InputGate的创建
 41 | 
 42 | 在上一节 Flink 算子的生命周期中，我们介绍了 Flink 的 Instance 直接执行类 Task.java，在其构造器中有这样一段逻辑：
 43 | 
 44 | ```java
 45 | // Consumed intermediate result partitions
 46 | 		this.inputGates = new SingleInputGate[consumedPartitions.size()];
 47 | 		this.inputGatesById = new HashMap<IntermediateDataSetID, SingleInputGate>();
 48 | 
 49 | 		for (int i = 0; i < this.inputGates.length; i++) {
 50 | 			SingleInputGate gate = SingleInputGate.create(
 51 | 					taskNameWithSubtaskAndId, jobId, executionId, consumedPartitions.get(i), networkEnvironment, 
 52 | 					metricGroup.getIOMetricGroup());
 53 | 
 54 | 			this.inputGates[i] = gate;
 55 | 			inputGatesById.put(gate.getConsumedResultId(), gate);
 56 | 		}
 57 | ```
 58 | 
 59 | 这里直接用的 SingleInputGate，而 SingleInputGate 又是如何去建立和上游 ResultSubPartition 的连接关系的呢？SingleInputGate 在初始化的时候利用了关键的参数 InputGateDeploymentDescriptor，其中包含了 partition 的一些关键信息，具体见 `SingleInputGate line 502`，这里列一下初始化过程：
 60 | 
 61 | - 从 InputGateDeploymentDescriptor 中获取 IntermediateDataSetID、consumedSubpartitionIndex
 62 | 
 63 |   、InputChannelDeploymentDescriptor
 64 | 
 65 | - IntermediateDataSetID 告知了消费的数据集，consumedSubpartitionIndex 告知了每个 ResultPartition 要消费的 ResultSubPartition 的索引，InputChannelDeploymentDescriptor 中告知了每个要消费的 ResultPartition 的 id 和 location 信息
 66 | 
 67 | 这样 InputGate 便知道从哪里消费 partition 以及 partition 的连接信息【本地 or 远程】
 68 | 
 69 | ### 注册 BufferPool
 70 | 
 71 | 在 NetworkEnvironment 注册 Task 的时候会为每个 task 的每个 InputGate 申请一个 BufferPool：
 72 | 
 73 | ```java
 74 | //NetworkEnvironment line323
 75 | for (SingleInputGate gate : inputGates) {
 76 | 				BufferPool bufferPool = null;
 77 | 
 78 | 				try {
 79 | 					bufferPool = networkBufferPool.createBufferPool(gate.getNumberOfInputChannels(), false);
 80 | 					gate.setBufferPool(bufferPool);
 81 | 				}
 82 | 				catch (Throwable t) {
 83 | 					if (bufferPool != null) {
 84 | 						bufferPool.lazyDestroy();
 85 | 					}
 86 | ```
 87 | 
 88 | BufferPool 的最小大小为 input channel 的个数！
 89 | 
 90 | ### InputChannel
 91 | 
 92 | 和上游的一个 ResultSubPartition 的连接叫做一个 InputChannel，InputChannel 一共分为三种: LocalInputChannel、RemoteInputChannel、UnknownInputChannel，分别表示一个本地连接，远程连接以及需要运行时确立的连接
 93 | 
 94 | ### InputGateDeploymentDescriptor
 95 | 
 96 | 上面我们提到，SingleInputGate 在初始化的时候用到的关键信息主要由 InputGateDeploymentDescriptor 来提供，那么 InputGateDeploymentDescriptor 是如何生成的呢？规则是什么？
 97 | 
 98 | 在 Task 实例化的时候有一个关键的描述对象参数：TaskDeploymentDescriptor，其中包含了 task 运行时需要的 jar 包路径，配置等重要信息，其中一个关键信息便是：InputGateDeploymentDescriptor，而 Task 是 TaskManager 负责启动的，TaskManager 接收了 JobManager 的 RPC 命令，提交作业，具体的生成首先会牵涉到 Flink 的调度，这里会简单介绍下，细节会另开一节专门介绍
 99 | 
100 | 前面章节中，我们介绍运行时，一个并发 Task 被抽象为 ExecutionVertex，而一次执行被抽象为 Execution，在 Execution 中有负责获取资源和调度的接口：
101 | 
102 | ```java
103 | //Execution.java line265
104 | public boolean scheduleForExecution(Scheduler scheduler, boolean queued) throws NoResourceAvailableException
105 | ```
106 | 
107 | Execution 通过 Flink 的核心调度器 [ 目前也是唯一一个 ] 先获取一个 Slot [ SimpleSlot/ flink 对资源的抽象，在调度章节会详细介绍 ]，后将 Task 部署到该 Slot 上，而 TaskDeploymentDescriptor 就在部署前被创建：
108 | 
109 | ```java
110 | //Execution.java line 370
111 | final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor(
112 | 	attemptId,
113 | 	slot,
114 | 	operatorState,
115 | 	operatorKvState,
116 | 	attemptNumber);
117 | ```
118 | 
119 | 具体的规则见 ExecutionVertex `line 636: createDeploymentDescriptor`，这里简单分析下：
120 | 
121 | - 依据要输出的 IntermediateResultPartition 生成对应的 ResultPartition 实例
122 | - 对每个上游的 JobVertex 生成一个 InputChannelDeploymentDescriptor 数组用来实例化一个 InputGateDeploymentDescriptor
123 | - 获取额外配置
124 | 
125 | #### InputGateDeploymentDescriptor的生成规则
126 | 
127 | - InputGateDeploymentDescriptor line 87
128 | - 对于每个上游输入边 ExecutionEdge 获取其生产者 ExecutionVertex
129 | - 获取 ExecutionVertex 的执行状态，如果是 RUNNING 或 FINISHED，判断生产者的 slot 和本 task 即将部署的 slot 是否在一个 Instance，生成相应的 Location 抽象： ResultPartitionLocation
130 | - 如果不是  RUNNING 或 FINISHED 状态，生成未知 Location
131 | - 一个 ExecutionEdge 实例化一个 InputGateDeploymentDescriptor ，一个 InputGateDeploymentDescriptor 会同时消费上游每个生产者 ExecutionVertex 的固定 index 的 sub partition
132 | 
133 | 
134 | ## InputProcessor
135 | 
136 | 对于每个 task，flink 都会生成一个 InputProcessor，具体就是 StreamInputProcessor。StreamInputProcessor 干了两件事情：
137 | 
138 | - 消息解序列化并交由算子处理
139 | - 追踪 Watermark Event 并分发时间对齐事件
140 | 
141 | *时间 对齐会专门起一章讲解，这里只介绍 消息序列化 ^_^*
142 | 
143 | Flink 将消息的序列化抽象为两个模块：
144 | 
145 | - 内存申请和流数据读取：由 SpillingAdaptiveSpanningRecordDeserializer 负责，从名字就可以大致了解其会动态吐磁盘【当内存不够的时候】
146 | - 解序列化：由 NonReusingDeserializationDelegate 负责
147 | 
148 | 
149 | NonReusingDeserializationDelegate 包含一个 serializer，依据是否需要发射水印区分为 MultiplexingStreamRecordSerializer 和 StreamRecordSerializer，这两个 serializer 只是在 TypeSerializer 上做了一层封装，这里就不做介绍了【TypeSerializer 是类型系统的重要组成，如果有时间的话我会单独开一章节介绍 flink 的类型系统】
150 | 
151 | 这里重点讲一下流数据的读取部分，也就是  SpillingAdaptiveSpanningRecordDeserializer
152 | ### SpillingAdaptiveSpanningRecordDeserializer
153 | *每个 channel 都有一个 SpillingAdaptiveSpanningRecordDeserializer*
154 | 
155 | 前面提到，SpillingAdaptiveSpanningRecordDeserializer 主要负责流数据的读取，同时通过 NonReusingDeserializationDelegate 来解序列化，从而获得可以发送给算子处理的 StreamElement。SpillingAdaptiveSpanningRecordDeserializer 内部有两个实现类：NonSpanningWrapper 和 SpanningWrapper，前者将数据存储进内存，后者存储前者存不下的部分内存数据以及将超量数据吐到磁盘。我们来解释下其核心方法：
156 | 
157 | #### 添加 Buffer
158 | 
159 | 添加 Buffer 会首先 check spanningWrapper 中是否已有数据，如果有，说明 nonSpanningWrapper 中的数据已满，会继续走 spanningWrapper 添加数据，否则走 nonSpanningWrapper 添加数据。
160 | 
161 | ```java
162 | @Override
163 | 	public void setNextMemorySegment(MemorySegment segment, int numBytes) throws IOException {
164 | 		// check if some spanning record deserialization is pending
165 | 		if (this.spanningWrapper.getNumGatheredBytes() > 0) {
166 | 			this.spanningWrapper.addNextChunkFromMemorySegment(segment, numBytes);
167 | 		}
168 | 		else {
169 | 			this.nonSpanningWrapper.initializeFromMemorySegment(segment, 0, numBytes);
170 | 		}
171 | 	}
172 | ```
173 | 
174 | #### 读取 Record
175 | 
176 | ```java
177 | //SpillingAdaptiveSpanningRecordDeserializer line98
178 | @Override
179 | 	public DeserializationResult getNextRecord(T target) throws IOException {
180 | 		// always check the non-spanning wrapper first.
181 | 		// this should be the majority of the cases for small records
182 | 		// for large records, this portion of the work is very small in comparison anyways
183 | 		
184 | 		int nonSpanningRemaining = this.nonSpanningWrapper.remaining();
185 | 		
186 | 		// check if we can get a full length;
187 | 		if (nonSpanningRemaining >= 4) {
188 | 			int len = this.nonSpanningWrapper.readInt();
189 | ```
190 | 
191 | 简单总结：
192 | 
193 | - 先判断 nonSpanningWrapper 中有没有记录，如果有优先读取并返回解序列化的类型，其中有两个特例，一是 Record 长度过大，超出了 nonSpanningWrapper 中的 Buffer 的剩余内存，这时候将 nonSpanningWrapper 的数据托管给 spanningWrapper，二是 record 的长度数据不全，这时候也是采取托管给 spanningWrapper 的策略
194 | - 如果 nonSpanningWrapper  没有记录，判断 spanningWrapper 是否有完全记录，有的话读取，并将 spanningWrapper 中的不完全 buffer 托管给 nonSpanningWrapper
195 | - 除了个别超大的 record ，对于流式计算来说，大部分走的都应该是 nonSpanningWrapper，之所以设计了 spanningWrapper，主要是为了适应超大的 Record
196 | 
197 | ### 处理输入
198 | 
199 | 我们回到 InputProcessor
200 | 
201 | 上面提到了 nonSpanningWrapper 或 spanningWrapper 都是从 buffer 中读取数据的，那么这个 buffer 是如何写入的呢？
202 | 
203 | 答案是通过 BarrierHandler 写入
204 | 
205 | 这里涉及到了 flink 的 checkpoint 机制，会在下一节详细介绍，这里只介绍 Buffer 内存的获取
206 | 
207 | 处理输入的核心逻辑：
208 | 
209 | ```java
210 | //StreamInputProcessor line134
211 | public boolean processInput(OneInputStreamOperator<IN, ?> streamOperator, final Object lock) throws Exception {
212 | 		if (isFinished) {
213 | 			return false;
214 | 		}
215 | 		if (numRecordsIn == null) {
216 | 			numRecordsIn = streamOperator.getMetricGroup().counter("numRecordsIn");
217 | 		}
218 | ```
219 | 
220 | 这里简要概括：
221 | 
222 | - 第一次进方法的时候，currentRecordDeserializer 为空，通过 barrierHandler 来协调当前读取的 channel 索引和获取数据流 Buffer
223 | - 以后 currentRecordDeserializer 的消费以 Buffer 为单位，Buffer 消费完会转换下一个 channel 继续消费
224 | 
225 | *这里的关键是 barrierHandler BufferOrEvent 事件的获取，我们向上追溯*
226 | 
227 | 依据消费语义的不同，barrierHandler 有两种：
228 | 
229 | - BarrierBuffer 对应 exactly_once 语义【有 checkpoint】
230 | - BarrierTracker 对应 at_least_once 语义
231 | 
232 | *我们先来看 BarrierBuffer*
233 | 
234 | #### BarrierBuffer
235 | 
236 | ##### BufferSpiller
237 | 
238 | BufferSpiller 消费 buffer 和 event 事件流并吐到磁盘文件，在写一定数量的流数据后，可以执行 rollOver 操作：将磁盘数据以 sequence 的方式暴露出来，同时创建新的磁盘文件以备下次读写。由于读写操作间隔很短，文件本身也不是很大，大部分读写操作都会在 ms 级别完成。
239 | 
240 | ##### 获取 BufferOrEvent
241 | 
242 | *以下简称 BufferOrEvent 为 boe*
243 | 
244 | 核心逻辑在 getNextNonBlocked方法中:
245 | 
246 | ```java
247 | //BarrierBuffer line103
248 | public BufferOrEvent getNextNonBlocked() throws IOException, InterruptedException {
249 | 		while (true) {
250 | 			// process buffered BufferOrEvents before grabbing new ones
251 | 			BufferOrEvent next;
252 | 			if (currentBuffered == null) {
253 | 				next = inputGate.getNextBufferOrEvent();
254 | 			}
255 | ```
256 | 
257 | 简单总结：
258 | 
259 | - 如果当前没有堆积的 boe，直接从 InputGate 中获取，否则从缓存中获取【通过BufferSpiller缓存的数据】
260 | - 如果是从堆积中获取 boe，并且来自一个被 block 的 channel，再次将 boe 通过 BufferSpiller 写入缓存
261 | - 如果是该 boe 是消息 Buffer，返回；如果是 Checkpoint Event，处理 Barrier 事件
262 | - 如果获取到某个 channel 的 Partition 消费结束事件或获取不到消息，取消 block 所有的 channel，并继续读取缓存中的数据，如果最后缓存中的数据也消费完了，返回 null
263 | 
264 | *那么处理 barrier 的逻辑是怎样的呢？*
265 | 
266 | ##### 处理 Barrier
267 | 
268 | 处理 barrier 的核心逻辑在 processBarrier 方法中:
269 | 
270 | ```java
271 | //BarrierBuffer line161
272 | private void processBarrier(CheckpointBarrier receivedBarrier, int channelIndex) throws IOException {
273 | 		final long barrierId = receivedBarrier.getId();
274 | 
275 | 		if (numBarriersReceived > 0) {
276 | 			// subsequent barrier of a checkpoint.
277 | 			if (barrierId == currentCheckpointId) {
278 | 				// regular case
279 | 				onBarrier(channelIndex);
280 | 			}
281 | ```
282 | 
283 | 简单总结：
284 | 
285 | - 如果当前的 Barrier 事件的 id 与当前 checkpointId 相等，则 block 该 boe 的源 channel，并增加 numBarriersReceived 计数
286 | - 如果 Barrier 事件的 id 大于 checkpointId，取消 block 所有的 channels，并更新当前 checkpointId 为该 id 并且阻断对应的 channel
287 | - 如果 numBarriersReceived 加上 已经关闭的 channel 数等于 channel 的总数，进行 checkpoint，并取消 block 所有的 channel
288 | 
289 | 这里补张图说明数据的消费过程:
290 | 
291 | ![flink-network-dataflow.png](flink-network-dataflow.png)
292 | 
293 | 在一轮 barrier 过程中，flink 接收每个 channel 的 barrier event，获取其 barrier id 与此轮之前最大的 id：checkpoint id 作比较：如果相等，则 block 对应的 channel，被 block 的 channel 在此轮中的数据会通过 BufferSpiller 吐到磁盘【大部分情况是 page cache】；如果大于 checkpoint id，会提升 checkpoint id 为此 id，并取消 block 所有的 channel，直接进入下一轮 barrier；如果小于，直接丢弃此事件；如果此轮所有的 channel 都发送了一致的 id，则以此 id 进行 checkpoint，并取消所有的 channel block
294 | 
295 | 值得注意的是，每次取消所有的 channel block 都会将 BufferSpiller 中的数据暴露成 buffer sequence 并加入队列中，下次获取记录时会优先取 sequence 中的数据
296 | 
297 | 列举几种典型的数据流向：
298 | 
299 | - input-gate -> operator：消费顺畅，一轮无任何阻塞
300 | - input-gate -> BufferSpiller -> current-queue -> operator：相对于其它 channel，消费过快，此轮被吐到磁盘，下一轮再消费
301 | - current-queue -> operator：相对于其它 channel，消费过慢，被吐到磁盘，下一轮再消费
302 | 
303 | 
304 | 
305 | 
306 | *下面介绍数据的生产*
307 | 
308 | 
309 | 
310 | ## RecordWriterOutput
311 | 
312 | 在 flink 算子的生命周期一节我们介绍过：算子通过 RecordWriterOutput 将处理过的 record 写出去，下面我们来分析其行为
313 | 
314 | RecordWriterOutput 实现了接口 Output，具备以下功能接口：
315 | 
316 | - collect：发射 record
317 | - emitWatermark：广播 watermark 到每个输出 channel
318 | - broadcastEvent：广播 Barrier Event 到每个输出 channel
319 | 
320 | 它的内部维护了一个序列化器和真正的 RecordWriter【这里是 StreamRecordWriter】，我们来稍微分析下它们
321 | 
322 | ### SpanningRecordSerializer
323 | 
324 | 实现比较简单，将输入的 record 序列化并连同长度信息一同写入内存 Buffer
325 | 
326 | ### RecordWriter
327 | 
328 | recordWriter 维护了运行时的 ResultPartitionWriter【真正写 ResultSubPartition】，将 record 序列化进内存 Buffer，来看它的构造器：
329 | 
330 | - 传入 ResultPartitionWriter
331 | - 传入 channelSelector【不同的 partition 方式有不同的 selector，如 forward、shuffle 等】
332 | - 为每一个写 channel 实例化一个 SpanningRecordSerializer
333 | 
334 | *下面介绍其核心接口*
335 | 
336 | #### 发射 record
337 | 
338 | 对应方法 emit，核心逻辑概括：
339 | 
340 | - 通过  channelSelector 选择要输出的 channel id
341 | - 找到对应的 SpanningRecordSerializer 并通过其序列化
342 | - 有三种情况：record 小于 Buffer 大小、record 等于 Buffer 大小、record 大于【准确的说应该加上长度信息的4个字节】，分别对应三种不同的序列化返回结果，这里有一个细节需要特别注意，写 record 的 Buffer 是写死的 128 bytes 长度，长度不够时会 resize 其大小，所以消息是不会丢失的
343 | - 如果以上的前两种情况满足意味着消息被完整序列化并写入内存 Buffer，此时通过 ResultPartitionWriter 将包含序列化数据的 Buffer 写出去
344 | - 同时为 SpanningRecordSerializer 向 NetworkEnvironment 申请的 BufferPool 再次申请新 Buffer【见 TaskManager 基本组件】
345 | 
346 | ###### 生产端反压
347 | 
348 | 细心的童鞋会发现，在每个发送的接口里，用来解析序列化消息的内存 Buffer 申请都走了这样一个逻辑 `writer.getBufferProvider().requestBufferBlocking()`
349 | 
350 | 这段逻辑是阻塞操作，对应每个 ResultPartition 来说，其 BufferPool 是有限大小的，当内存不足时会阻塞，并停止继续解析消息！
351 | 
352 | #### 广播 Barrier 事件
353 | 
354 | 逻辑与发射 Record 类似，稍有不同：
355 | 
356 | - 对于每个要广播的 channel，如果有残留在内存 Buffer 的数据，先通过 ResultPartitionWriter 写出去，再写 Barrier 事件
357 | - 申请内存逻辑类似
358 | 
359 | ### ResultPartitionWriter
360 | 
361 | 上面介绍过 ResultPartitionWriter 是执行了序列化后 Record 的写操作，其实它是消息的路由器，具体写是通过 ResultPartition 来完成的
362 | 
363 | ### ResultPartition
364 | 
365 | *先翻译下官方的 java doc：*
366 | 
367 | 一个 ResultPartition 是单个 task 的输出结果，在运行时，它是逻辑概念 IntermediateResultPartition 的一部分。ResultPartition 是一个 Buffer 的集合，并且以 ResultSubpartition 的形式进行组织，而 ResultSubPartition 的个数等同于下游 consumer task 的个数，具体写入到哪个 ResultSubpartition 由 DistributionPattern 【上面说的 channel selector】来决定。
368 | 
369 | 关于生命周期：一个 ResultPartition 有三个阶段：
370 | 
371 | - Produce
372 | - Consume
373 | - Release
374 | 
375 | 关于 consumer tasks 的延迟部署和更新：
376 | 
377 | - 依据 ResultPartition 的类型：PIPELINED 和 BLOCKING 对应两种部署模式，分别对应流式计算和批处理
378 | - 对于 PIPELINED 类型，consumer tasks 在 ResultPartition 产生第一个结果 Buffer 的时候会立即部署
379 | - 对于 BLOCKING 类型，consumer tasks 在 ResultPartition 结束后才会部署
380 | 
381 | 观察 `line162  ` 可知如果是 PIPELINED 类型，ResultSubPartition 的类型为：PipelinedSubpartition
382 | 
383 | 我们来分析下它的核心接口：
384 | 
385 | #### 注册 BufferPool
386 | 
387 | 代码见 `line187 registerBufferPool` 方法，此 BufferPool 为 TaskManager 在运行 task 时 通过 NetworkEnvironment 注册进来的。如果是 BLOCKING 类型，这里会注册 BufferPool 的 owner 为此 ResultPartition，NetworkEnvironment 如果出现内存不足的情况，会通过 ResultPartition 的引用，以 Sub Partition 为单位将内存释放掉【见 TaskManager 基本组件 NetworkEnvironment】。
388 | 
389 | 既然注册 BufferPool 是以 task 为单位的，那么每个 task 会拥有几个 BufferPool， BufferPool 的大小又是多少呢？在 NetworkEnvironment 中有这样一段逻辑：
390 | 
391 | ```java
392 | //NetworkEnvironment line297
393 | try {
394 | 					bufferPool = networkBufferPool.createBufferPool(partition.getNumberOfSubpartitions(), false);
395 | 					partition.registerBufferPool(bufferPool);
396 | 
397 | 					partitionManager.registerResultPartition(partition);
398 | 				}
399 | ```
400 | 
401 | 其中的 `partition.getNumberOfSubpartitions()` 是 ResultPartition 的 sub partition 的个数，我们可以看到，一个 task 拥有的 BufferPool 个数等于它产生的 ResultPartition 的个数，一个 ResultPartition 对应一个 BufferPool，而每个 BufferPool 的最小大小等于它的 sub partition 的个数【consumer task 的个数】。为什么说是最小大小，见 TaskManager 基本组件 NetworkEnvironment。
402 | 
403 | #### 写入 Buffer
404 | 
405 | ```java
406 | // ResultPartition line244
407 | public void add(Buffer buffer, int subpartitionIndex) throws IOException {
408 | 		boolean success = false;
409 | 
410 | 		try {
411 | 			checkInProduceState();
412 | ```
413 | 
414 | 简单总结：
415 | 
416 | - 找到对应的 ResultSubPartition 并写入 Buffer
417 | - 如果是第一次写入，并且是 PIPELINED 类型，通知 JobManager 可以部署下游 consumer，具体细节将会在 调度一章专门讲解
418 | - 如果写入失败，回收内存 Buffer
419 | 
420 | #### ResultSubpartition
421 | 
422 | 这里我们只介绍 PipelinedSubpartition，实现比较简单，就不具体介绍了，有一点需要注意，就是 NotificationListener
423 | 
424 | 每次 ResultSubpartition 的状态有变动【被写入，被释放，写入完成等】都会通知 NotificationListener，通知完之后就会销毁 NotificationListener 的注册
425 | 
426 | *典型的我们详细分析下 add Buffer 后通过 NotificationListener 触发的一系列行为*
427 | 
428 | PipelinedSubPartition 在添加一个 Buffer 后，如果 NotificationListener 不为 null，那么会通知 NotificationListener，NotificationListener 共有两个实现类 LocalInputChannel 和 SequenceNumberingSubpartitionView，分别对应本地传输和网络传输。
429 | 
430 | #### 与 InputChannel 的交互
431 | 
432 | 在 consumer task 向 SingleInputGate 请求 Buffer 时，会先触发 requestPartitions 操作，不同的 channel 会触发不同的行为:
433 | 
434 | - 对于 LocalInputChannel 来说就是获取 ResultSubPartitionView 并将自己注册到对应的 ResultSubPartition 中
435 | - 对于 RemoteInputChannel 会触发 netty 连接并开始请求队列数据
436 | 
437 | *我们先来看 LocalInputChannel*
438 | 
439 | ##### LocalInputChannel
440 | 
441 | LocalInputChannel 在收到通知后，发生以下行为：
442 | 
443 | - check ResultSubPartitionView 中有无数据，如果有的话通知 SingleInputGate，并将自己【LocalInputChannel】加入到 SingleInputGate 的有数据 channel 队列中
444 | - 如果没有数据，将自己重新注册到对应的 ResultSubPartition 中
445 | 
446 | *再来看SequenceNumberingSubpartitionView，不过在这之前先介绍 RemoteInputChannel*
447 | 
448 | ##### RemoteInputChannel
449 | 
450 | 上面提到 RemoteInputChannel 在 SingleInputGate 第一次执行获取 Buffer 的逻辑时，会触发 RemoteInputChannel 的网络连接，具体逻辑如下：
451 | 
452 | ```java
453 | //RemoteInputChannel line114
454 | @Override
455 | void requestSubpartition(int subpartitionIndex) throws IOException, InterruptedException {
456 | 		if (partitionRequestClient == null) {
457 | 			// Create a client and request the partition
458 | 			partitionRequestClient = connectionManager
459 | 					.createPartitionRequestClient(connectionId);
460 | 
461 | 			partitionRequestClient.requestSubpartition(partitionId, subpartitionIndex, this, 0);
462 | 		}
463 | 	}
464 | ```
465 | 
466 | 这一段先会创建 netty 连接，然后会请求 netty 数据，下面我们详细分析 flink 关于 netty 连接的抽象
467 | 
468 | ## Netty 连接
469 | 
470 | flink 采用 netty 作为底层的网络传输框架，各个 Instance 之间会依照消费依赖建立一个 Netty 连接，然后通过 NetworkEnvironment 管理数据的消费，本人对 Netty 研究的不是很精，这里只站在 flink 的角度梳理一些重点的组件
471 | 
472 | ### ConnectionManager
473 | 
474 | flink 通过 ConnectionManager 来创建和管理一个 Instance 上的 netty 连接【client】，它有两个实现：LocalConnectionManager 和 NettyConnectionManager，这里只介绍后者
475 | 
476 | #### NettyConnectionManager
477 | 
478 | ##### PartitionRequestQueue
479 | 
480 | 对于 netty 来说，这是 server 端 netty handler 的 adapter，除了基本的事件处理如注册 channel 外还定义了用户可以手动触发的两个事件：
481 | 
482 | - 取消 channel 事件：会取消对应的 channel 的队列【用来读取对应 subPartition 的临时队列】，并标志该 channel 已被取消
483 | - 入队事件：当开始消费某个 sub partition 时会触发此事件，会将该队列 queue 加入到已有的 队列中
484 | 
485 | ###### SequenceNumberingSubpartitionView
486 | 
487 | 上面说的对应每个 sub partition 的读取 queue 被抽象为 SequenceNumberingSubpartitionView，它维护了两个成员 ResultSubpartitionView 和 一个不断递增的 buffer 序列编号，之前在介绍 ResultPartition 时介绍过，当往 ResultSubPartition 中写数据时会通知对应的 NotificationListener 【如果有的话】，其中对应 RemoteInputChannel 的便是 SequenceNumberingSubpartitionView，它在收到通知后会触发 入队 用户事件，将自己加入到已有队列中，表明自己是一个有数据队列
488 | 
489 | ###### 数据发送
490 | 
491 | 每当 channel 可写，就会触发 netty 的 channelWritabilityChanged 接口调用，触发数据发送，我们来看下数据发送的核心逻辑：
492 | 
493 | ```java
494 | //PartitionRequestQueue line134
495 | @Override
496 | 	public void channelWritabilityChanged(ChannelHandlerContext ctx) throws Exception {
497 | 		writeAndFlushNextMessageIfPossible(ctx.channel());
498 | 	}
499 | 
500 | 	private void writeAndFlushNextMessageIfPossible(final Channel channel) throws IOException {
501 | 		if (fatalError) {
502 | 			return;
503 | 		}
504 | 
505 | 		Buffer buffer = null;
506 | 
507 | 		try {
508 | 			if (channel.isWritable()) {
509 | 				while (true) {
510 | 					if (currentPartitionQueue == null && (currentPartitionQueue = queue.poll()) == null) {
511 | 						return;
512 | 					}
513 | 
514 | ```
515 | 
516 | 总结其逻辑：
517 | 
518 | - 如果当前没有可写数据的队列，直接返回
519 | - 如果某个 queue 有数据，会一直读取该队列的数据发送到下游，直到把数据读完后将该队列注册到 对应的 ResultSubPartition 中，等待下次有数据时再次入队
520 | - 如果某个 ResultSubPartition 被 release 【批处理内存不足时】，标识其被 release
521 | - 如果收到某个 ResultSubPartition 处理完成的事件，释放对应 ResultSubPartition 所有的资源【将对应的 ResultSubPartition 资源释放】并给出相应通知
522 | 
523 | ##### PartitionRequestServerHandler
524 | 
525 | Netty Server 端的 handler，负责做一些初始化和派发事件的功能
526 | 
527 | ###### 消息路由
528 | 
529 | 该 handler 负责路由以下消息 NettyMessage：
530 | 
531 | - PartitionRequest：将对应的 ResultSubPartition 封装成 SequenceNumberingSubpartitionView 入队
532 | - TaskEventRequest：向对应的 ResultPartitionWriter 派发 task event
533 | - CancelPartition
534 | - CloseRequest
535 | 
536 | ##### PartitionRequestProtocol
537 | 
538 | flink 的 netty 消息 协议，主要是客户端和 Server 端 handlers 的获取
539 | 
540 | ##### NettyBufferPool
541 | 
542 | 配置每个 netty chunk 为 16M
543 | 
544 | ### PartitionRequestClient
545 | 
546 | 通过 NettyConnectionManager 可以创建 PartitionRequestClient，它封装了原生的 NettyClient，通过 PartitionRequestClientFactory 来创建
547 | 
548 | #### PartitionRequestClientFactory 
549 | 
550 | 主要用来缓存已获得的 NettyChannel，这里就不具体介绍了，有兴趣的童鞋可以自行研究
551 | 
552 | #### PartitionRequestClientHandler
553 | 
554 | Netty Client 端的 Client Handler，负责将消息 decode 到 RemoteInputChannel，在 PartitionRequestClient 发起 PartitionRequest 请求消息的时候会将对应的 RemoteInputChannel 加入到 PartitionRequestClientHandler 备份起来。
555 | 
556 | ##### Channel 消息读
557 | 
558 | ###### 消费端反压
559 | 
560 | 为了防止读取消息时内存不够，读速度快于生产，这里有两个有意思的 runnable，BufferListenerTask 和 StagedMessagesHandlerTask，它们均由 netty io 线程负责执行，当消息顺畅读取，内存充足时，当然是 decode 消息，并将消息加入到对应的 RemoteInputChannel buffer 队列中，但是内存不足时，上面的两个 task 就会扮演重要角色！
561 | 
562 | - 如果之前有消息堆积，将消息直接写入堆积队列
563 | - BufferListenerTask：当第一次出现读内存不足时，会将对应的 Buffer 托管给 BufferListenerTask，BufferListenerTask 将自己注册到 SingleInputGate 的 BufferPool 中，同时停止 channel 的自动读取，当 BufferPool 有内存，会通知 BufferListenerTask 并解析消息
564 | - 这时候有一个意外，就是收到通知后仍然没有获取到内存 Buffer，这表明 BufferPool 已经被销毁【批处理内存不够时】，这时候启动 StagedMessagesHandlerTask，一直循环解析堆积队列里的数据
565 | 
566 | 这里涉及到了 backpressure，流式计算里的反压技术，flink 采取的是逐级反压，并且这两个 task 扮演了消费端的 反压角色
567 | 
568 | 
569 | 
570 | 这里补一张图，说明 flink 的消费与生产之间的关系：
571 | 
572 | 
573 | 
574 | ![netty-client-server.png](netty-client-server.png)
575 | 
576 | 
577 | 
578 | 这里链接一个 flink 官方的 wiki介绍了 flink task 的数据流交换细节，比较经典，这里偷个懒，就不翻译了，有时间一定补上^_^
579 | 
580 | >  [flink data exchange between tasks](https://cwiki.apache.org/confluence/display/FLINK/Data+exchange+between+tasks)
581 | 
582 | 


--------------------------------------------------------------------------------
/flink/flink网络栈/intermediate-result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink网络栈/intermediate-result.png


--------------------------------------------------------------------------------
/flink/flink网络栈/netty-client-server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink网络栈/netty-client-server.png


--------------------------------------------------------------------------------
/flink/flink网络栈/sub-partition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/flink网络栈/sub-partition.png


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/.DS_Store


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/blob-server-cache-store.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/blob-server-cache-store.png


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/blob-server-contact.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/blob-server-contact.png


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/blob-server-store-dirctory-tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/blob-server-store-dirctory-tree.png


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/blob-service-extends-arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/blob-service-extends-arch.png


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/jobmanager基本组件.md:
--------------------------------------------------------------------------------
  1 | # JobManager中的基本组件
  2 | 
  3 | ## 前言
  4 | 
  5 | JobManager 是 flink 集群的中控节点，类似于 Apache Storm 的 Nimbus 以及 Apache Spark 的 Driver 的角色，它负责作业的调度、jar 包管理、checkpoint 的协调和发起等，为了后续章节的开展，本文将介绍 flink JobManager 中所部署的一些服务。
  6 | 
  7 | 
  8 | 
  9 | ## BlobServer
 10 | 
 11 | flink 用来管理二进制大文件的服务，flink JobManager 中启动的 BLOB Server 负责监听请求并派发线程去处理。更进一步，它将负责创建对应的目录结构去存储这些 BLOBs 或者只是临时性地缓存。背后支持的文件系统：本地磁盘
 12 | 
 13 | 来看它的构造器：
 14 | 
 15 | - 第一步获取 RecoveryMode，一共两种 STANDALONE 和 ZOOKEEPER，后者是有 JobManager leader 选举的高可用模式
 16 | - 获取文件系统存储的根目录，可配置，默认是从系统环境变量 `System.getProperty("java.io.tmpdir")` 中获取，其实就是本地磁盘存储
 17 | - 初始化 *恢复存储* 模块 BlobStore，STANDALONE 模式下为 VoidBlobStore，VoidBlobStore 是一个空实现；不会有任何持久化操作；ZOOKEEPER 模式下为 FileSystemBlobStore，FileSystemBlobStore 内部封装了磁盘文件的管理，包括添加、删除、拷贝等，BlobStore 会备份 BlobServer 的本地存储，主要用于恢复模式下的作业磁盘状态恢复用
 18 | - 启动 ServerSocket
 19 | - 启动 BlobServer 服务线程
 20 | 
 21 | ### BlobServer 和 BlobStore
 22 | 
 23 | BlobStore 是 BlobServer 的组件之一，BlobStore 主要负责 BlobServer 本地存储的恢复【JobManager 重启】，这里只介绍 FileSystemBlobStore，FileSystemBlobStore 依据配置的不同支持两种文件系统存储：HDFS 和 本地文件系统
 24 | 
 25 | BlobServer 和  FileSystemBlobStore 的存储目录结构如下图所示：
 26 | 
 27 | ![blob-server-store-dirctory-tree.png](blob-server-store-dirctory-tree.png)
 28 | 
 29 | *下面以一次客户端连接请求的发起介绍两者的协同*
 30 | 
 31 | 来看 BlobServer 的核心 `run` 方法:
 32 | 
 33 | ```java
 34 | //BlobServer line230
 35 | public void run() {
 36 |    try {
 37 |       while (!this.shutdownRequested.get()) {
 38 |          BlobServerConnection conn = new BlobServerConnection(serverSocket.accept(), this);
 39 |          try {
 40 |             synchronized (activeConnections) {
 41 |                while (activeConnections.size() >= maxConnections) {
 42 |                   activeConnections.wait(2000);
 43 |                }
 44 |                activeConnections.add(conn);
 45 |             }
 46 | 
 47 |             conn.start();
 48 |             conn = null;
 49 |          }
 50 |          finally {
 51 |             if (conn != null) {
 52 |                conn.close();
 53 |                synchronized (activeConnections) {
 54 |                   activeConnections.remove(conn);
 55 |                }
 56 |             }
 57 |          }
 58 |       }
 59 |    }
 60 | ```
 61 | 
 62 | 简要概括下逻辑：
 63 | 
 64 | - 当服务端收到一次存储的 request 时，会首先封装成对象 BlobServerConnection，并执行其 `start()` 方法
 65 | - BlobServerConnection 本身也是一个 Thread，封装了具体的存储逻辑
 66 | - 会接收 3 种客户端请求：PUT/GET/DELETE，具体见：
 67 | 
 68 | ```java
 69 | //BlobServerConnection line111
 70 | switch (operation) {
 71 | case PUT_OPERATION:
 72 |    put(inputStream, outputStream, buffer);
 73 |    break;
 74 | case GET_OPERATION:
 75 |    get(inputStream, outputStream, buffer);
 76 |    break;
 77 | case DELETE_OPERATION:
 78 |    delete(inputStream, outputStream, buffer);
 79 |    break;
 80 | default:
 81 |    throw new IOException("Unknown operation " + operation);
 82 | }
 83 | ```
 84 | 
 85 | *这里重点介绍下 PUT 操作*
 86 | 
 87 | - 获取本次存储操作是否带 JobID
 88 | - 在 BlobServer 的本地 incoming 文件夹中生成临时文件：temp-[auto increment integer]
 89 | - 读取将要存储的字节长度
 90 | - 读取该长度字节存储到临时文件 temp-[auto increment integer]
 91 | - 如果带 JobID，会将临时文件移动到 JobID 对应的存储目录，并将该存储文件在 BlobStore 的对应 JobID恢复目录中备份，写 OK 消息到 Socket Client 端，最终生成的路径和文件： job-id/blob_[base64 encode key]
 92 | - 如果不带 JobID，则依据传递的消息字节数组生成一个 key：BlobKey，并存储在 cache 文件夹下，同时在 BlobStore 的 cache 文件夹下做备份，将 OK 消息和 BlobKey 写回 Socket Client，最终生成的路径和文件：cache/blob_[unique hex string]
 93 | 
 94 | ### BlobServer 交互协议
 95 | 
 96 | 与 BlobServer 通信的消息协议包括四段：操作类型【PUT/GET/DELETE】、存储类型【是否带 JobID】、内容长度、内容，如下图所示：
 97 | 
 98 | ![blob-server-contact.png](blob-server-contact.png)
 99 | 
100 | *到这里 BlobServer 就介绍完了*
101 | 
102 | ## InstanceManager
103 | 
104 | flink 用来追踪当前存活的 TaskManager 的管理组件，实现比较简单，这里只简单罗列下其功能：
105 | 
106 | - 记录（book）在 JobManager 中注册的所有 TaskManager
107 | - 负责更新从 TaskManager 中上报的心跳及 metrics 信息
108 | - 通知 InstanceListener TaskManager 的增加与死亡
109 | 
110 | 
111 | ## BlobLibraryCacheManager
112 | 
113 | flink job 的 jar 包存储服务，使用上面的 BlobServer 完成，一个 JVM 里只会存在一个 BlobLibraryCacheManager，BlobLibraryCacheManager 负责管理 BlobService【这里为BlobServer】 中存储的 jars，并存储运行时 task 对 BlobService 中 jars 的引用计数，会清理不被任何 task 使用的 jars。
114 | 
115 | *BlobCache 负责 jars 的下载，介绍 TaskManager 的时候会详细介绍*
116 | 
117 | BlobLibraryCacheManager 与 BlobService 交互，而 BlobService 负责具体的文件管理，其具体实现有两个：BlobServer 和 BlobCache，具体见下图：
118 | 
119 | ![blob-service-extends-arch.png](blob-service-extends-arch.png)
120 | 
121 | BlobServer 前面已经介绍过了，那么 BlobCache 的功能是什么呢？
122 | 
123 | 来看 BlobCache 的构造器：
124 | 
125 | ```java
126 | //BlobCache line60
127 | public BlobCache(InetSocketAddress serverAddress, Configuration configuration) {
128 |    if (serverAddress == null || configuration == null) {
129 |       throw new NullPointerException();
130 |    }
131 | 
132 |    this.serverAddress = serverAddress;
133 | 
134 |    // configure and create the storage directory
135 |    String storageDirectory = configuration.getString(ConfigConstants.BLOB_STORAGE_DIRECTORY_KEY, null);
136 |    this.storageDir = BlobUtils.initStorageDirectory(storageDirectory);
137 |    LOG.info("Created BLOB cache storage directory " + storageDir);
138 | ```
139 | 
140 | 这里传入的 serverAddress 其实是 BlobServer 的服务端口，在 TaskManager 中可以看到：
141 | 
142 | ```scala
143 | // start a blob service, if a blob server is specified TaskManager line940
144 | if (blobPort > 0) {
145 |   val jmHost = jobManager.path.address.host.getOrElse("localhost")
146 |   val address = new InetSocketAddress(jmHost, blobPort)
147 | 
148 |   log.info(s"Determined BLOB server address to be $address. Starting BLOB cache.")
149 | 
150 |   try {
151 |     val blobcache = new BlobCache(address, config.configuration)
152 |     blobService = Option(blobcache)
153 |     libraryCacheManager = Some(new BlobLibraryCacheManager(blobcache, config.cleanupInterval))
154 |   }
155 | ```
156 | 
157 | 来看 BlobCache 的核心服务方法：
158 | 
159 | ```java
160 | //BlobCache line97
161 | public URL getURL(final BlobKey requiredBlob) throws IOException {
162 |    if (requiredBlob == null) {
163 |       throw new IllegalArgumentException("BLOB key cannot be null.");
164 |    }
165 | 
166 |    final File localJarFile = BlobUtils.getStorageLocation(storageDir, requiredBlob);
167 | 
168 |    if (!localJarFile.exists()) {
169 | 
170 |       final byte[] buf = new byte[BlobServerProtocol.BUFFER_SIZE];
171 | 
172 |       // loop over retries
173 |       int attempt = 0;
174 |       while (true) {
175 | 
176 |          if (attempt == 0) {
177 |             LOG.info("Downloading {} from {}", requiredBlob, serverAddress);
178 |          } else {
179 |             LOG.info("Downloading {} from {} (retry {})", requiredBlob, serverAddress, attempt);
180 |          }
181 | 
182 |          try {
183 |             BlobClient bc = null;
184 |             InputStream is = null;
185 |             OutputStream os = null;
186 | 
187 |             try {
188 |                bc = new BlobClient(serverAddress);
189 |                is = bc.get(requiredBlob);
190 |                os = new FileOutputStream(localJarFile);
191 | 
192 |                while (true) {
193 |                   final int read = is.read(buf);
194 |                   if (read < 0) {
195 |                      break;
196 |                   }
197 |                   os.write(buf, 0, read);
198 |                }
199 | 
200 |                // we do explicitly not use a finally block, because we want the closing
201 |                // in the regular case to throw exceptions and cause the writing to fail.
202 |                // But, the closing on exception should not throw further exceptions and
203 |                // let us keep the root exception
204 |                os.close();
205 |                os = null;
206 |                is.close();
207 |                is = null;
208 |                bc.close();
209 |                bc = null;
210 | 
211 |                // success, we finished
212 |                break;
213 | ```
214 | 
215 | 简要概括下其逻辑：
216 | 
217 | - 先从本地磁盘中获取，如果存在，直接返回
218 | - 如果没有，生成 BlobClient 与 BlobServer 交互，并拉取文件到本地缓存，然后返回本地缓存的文件句柄
219 | 
220 | 从这里我们可以看到 BlobCache 是 TaskManager 操作本地文件的工具，它负责从 JobManager 中的 BlobServer 同步所需的文件【jar包等】，而 BlobServer 和 BlobCache 的文件管理的入口，统一由对应 JVM 中的 BlobLibraryCacheManager 来控制【没有任务使用的 jar 定期清除等】。
221 | 
222 | task 拉取 jar包文件的过程如下：
223 | 
224 | ![blob-server-cache-store.png](blob-server-cache-store.png)
225 | 
226 | 
227 | 
228 | ## ZooKeeperCompletedCheckpointStore
229 | 
230 | flink 做 checkpoint 【有关 checkpoint 会另起一节介绍】存储的组件，负责存储已完成的 Checkpoint ，实现了接口 CompletedCheckpointStore，StandaloneCompletedCheckpointStore 和 ZooKeeperCompletedCheckpointStore 都实现了 CompletedCheckpointStore 接口，前者只在内存里存储 checkpoint，这里只介绍 ZooKeeperCompletedCheckpointStore 的实现。
231 | 
232 | ZooKeeperCompletedCheckpointStore 存储 checkpoint 的基本思路：
233 | 
234 | - 先在本地磁盘持久化指定数量的 checkpoint
235 | - 将文件句柄更新到 ZK 的特定节点下
236 | - 滑动更新 zk 的节点存储
237 | - 在恢复的时候只取最近一次的更新值
238 | 
239 | 先来看下  ZooKeeperCompletedCheckpointStore 用来和 ZK 存储交互的组件：ZooKeeperStateHandleStore，来看它的核心添加 state 的方法：
240 | 
241 | ```java
242 | //ZooKeeperStateHandleStore line117
243 | public StateHandle<T> add(
244 |       String pathInZooKeeper,
245 |       T state,
246 |       CreateMode createMode) throws Exception {
247 |    checkNotNull(pathInZooKeeper, "Path in ZooKeeper");
248 |    checkNotNull(state, "State");
249 | 
250 |    StateHandle<T> stateHandle = storage.store(state);
251 | 
252 |    boolean success = false;
253 | 
254 |    try {
255 |       // Serialize the state handle. This writes the state to the backend.
256 |       byte[] serializedStateHandle = InstantiationUtil.serializeObject(stateHandle);
257 | 
258 |       // Write state handle (not the actual state) to ZooKeeper. This is expected to be
259 |       // smaller than the state itself. This level of indirection makes sure that data in
260 |       // ZooKeeper is small, because ZooKeeper is designed for data in the KB range, but
261 |       // the state can be larger.
262 |       client.create().withMode(createMode).forPath(pathInZooKeeper, serializedStateHandle);
263 | 
264 |       success = true;
265 | 
266 |       return stateHandle;
267 |    }
268 |    finally {
269 |       if (!success) {
270 |          // Cleanup the state handle if it was not written to ZooKeeper.
271 |          if (stateHandle != null) {
272 |             stateHandle.discardState();
273 |          }
274 |       }
275 |    }
276 | }
277 | ```
278 | 
279 | 简要概括其逻辑：
280 | 
281 | - 使用 StateStorageHelper 存储 state，ZK 模式下为 FileSystemStateStorageHelper，方式为直接存储到配置的文件系统目录（可以是本地磁盘，HA 场景下通常是 HDFS 等分布式文件系统）
282 | - 将 state 的句柄对象 StateHandle 序列化并持久化到 ZK 的节点
283 | 
284 | 其在 zk 上的存储路径如下图所示：
285 | 
286 | <img src="zk-state-handle-storage.png" width="300" height="300" alt="zk-state-handle-storage.png" align="center" />
287 | 
288 | 现在来看 ZooKeeperCompletedCheckpointStore 的核心功能：添加 checkpoint 和 从 checkpoint 做 recovery
289 | 
290 | ### 添加 checkpoint
291 | 
292 | ```java
293 | //ZooKeeperCompletedCheckpointStore line190
294 | public void addCheckpoint(CompletedCheckpoint checkpoint) throws Exception {
295 |    checkNotNull(checkpoint, "Checkpoint");
296 | 
297 |    // First add the new one. If it fails, we don't want to loose existing data.
298 |    String path = String.format("/%s", checkpoint.getCheckpointID());
299 | 
300 |    final StateHandle<CompletedCheckpoint> stateHandle = checkpointsInZooKeeper.add(path, checkpoint);
301 | 
302 |    checkpointStateHandles.addLast(new Tuple2<>(stateHandle, path));
303 | 
304 |    // Everything worked, let's remove a previous checkpoint if necessary.
305 |    if (checkpointStateHandles.size() > maxNumberOfCheckpointsToRetain) {
306 |       removeFromZooKeeperAndDiscardCheckpoint(checkpointStateHandles.removeFirst());
307 |    }
308 | 
309 |    LOG.debug("Added {} to {}.", checkpoint, path);
310 | }
311 | ```
312 | 
313 | 简要概括其逻辑：
314 | 
315 | - 在本地磁盘存储该 checkpoint 的内容并返回句柄对象：StateHandle
316 | - 以 checkpoint id 在 zk 上新建一个 node，并存储对应的序列化后的 StateHandle
317 | - 检查存储的 checkpoint 个数是否超过限制，如果超过，删除本地磁盘及zk上最旧的数据
318 | - 如果添加失败，已有的 checkpoint 数据不会受影响，这里 flink 想最大化保留作业的 checkpoint
319 | 
320 | ### 从 checkpoint 中恢复
321 | 
322 | ```java
323 | //ZooKeeperCompletedCheckpointStore line137
324 | public void recover() throws Exception {
325 |    LOG.info("Recovering checkpoints from ZooKeeper.");
326 | 
327 |    // Clear local handles in order to prevent duplicates on
328 |    // recovery. The local handles should reflect the state
329 |    // of ZooKeeper.
330 |    checkpointStateHandles.clear();
331 | 
332 |    // Get all there is first
333 |    List<Tuple2<StateHandle<CompletedCheckpoint>, String>> initialCheckpoints;
334 |    while (true) {
335 |       try {
336 |          initialCheckpoints = checkpointsInZooKeeper.getAllSortedByName();
337 |          break;
338 |       }
339 |       catch (ConcurrentModificationException e) {
340 |          LOG.warn("Concurrent modification while reading from ZooKeeper. Retrying.");
341 |       }
342 |    }
343 | 
344 |    int numberOfInitialCheckpoints = initialCheckpoints.size();
345 | 
346 |    LOG.info("Found {} checkpoints in ZooKeeper.", numberOfInitialCheckpoints);
347 | 
348 |    if (numberOfInitialCheckpoints > 0) {
349 |       // Take the last one. This is the latest checkpoints, because path names are strictly
350 |       // increasing (checkpoint ID).
351 |       Tuple2<StateHandle<CompletedCheckpoint>, String> latest = initialCheckpoints
352 |             .get(numberOfInitialCheckpoints - 1);
353 | 
354 |       CompletedCheckpoint latestCheckpoint = latest.f0.getState(userClassLoader);
355 | 
356 |       checkpointStateHandles.add(latest);
357 | 
358 |       LOG.info("Initialized with {}. Removing all older checkpoints.", latestCheckpoint);
359 | 
360 |       for (int i = 0; i < numberOfInitialCheckpoints - 1; i++) {
361 |          try {
362 |             removeFromZooKeeperAndDiscardCheckpoint(initialCheckpoints.get(i));
363 |          }
364 |          catch (Exception e) {
365 |             LOG.error("Failed to discard checkpoint", e);
366 |          }
367 |       }
368 |    }
369 | }
370 | ```
371 | 
372 | 简要概括其逻辑：
373 | 
374 | - 清除内存中维护的 StateHandle 句柄对象列表
375 | - 从 ZK 上拉取作业对应的所有的 checkpoint StateHandle 节点，并排序【从小到大】
376 | - 获取最新的一次快照并从本地磁盘恢复 checkpoint
377 | - 删除其余所有的 checkpoint 信息【ZK 和本地磁盘】
378 | 
379 | ZooKeeperCompletedCheckpointStore 由 ZooKeeperCheckpointRecoveryFactory 负责实例化，一个 Job 会实例化一个 ZooKeeperCompletedCheckpointStore 负责快照。这里存储的只是个节点快照的句柄，并不是真正的状态数据。
380 | 
381 | 具体的启动流程见 JobManager
382 | 
383 | `line1208 val completedCheckpoints = checkpointRecoveryFactory.createCheckpointStore(jobId, userCodeLoader)` 
384 | 
385 | `line1238 executionGraph.enableSnapshotCheckpointing`
386 | 
387 | 到这里 JobManager 的核心组件基本就介绍结束了😄


--------------------------------------------------------------------------------
/flink/jobmanager基本组件/zk-state-handle-storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/jobmanager基本组件/zk-state-handle-storage.png


--------------------------------------------------------------------------------
/flink/taskmanager基本组件/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/taskmanager基本组件/.DS_Store


--------------------------------------------------------------------------------
/flink/taskmanager基本组件/file-channel-entend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/taskmanager基本组件/file-channel-entend.png


--------------------------------------------------------------------------------
/flink/taskmanager基本组件/io-manager-async.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/taskmanager基本组件/io-manager-async.png


--------------------------------------------------------------------------------
/flink/taskmanager基本组件/memory-segment-extend.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/taskmanager基本组件/memory-segment-extend.png


--------------------------------------------------------------------------------
/flink/taskmanager基本组件/taskmanager基本组件.md:
--------------------------------------------------------------------------------
  1 | # TaskManager基本组件
  2 | 
  3 | TaskManager 在 Flink 中也被叫做一个 Instance，统一管理该物理节点上的所有 Flink job 的 task 的运行，它的功能包括了 task 的启动销毁、内存管理、磁盘IO、网络传输管理等，本章将一一介绍这些功能，方便后续章节的展开
  4 | 
  5 | ## MemoryManager
  6 | 
  7 | *先来翻译一下类的 JavaDoc ^_^*
  8 | 
  9 | MemoryManager 统一管理了 flink 的内存使用，内存被划分为相同大小的 segment，通过申请不同数量的 segment 来分配不同大小的内存
 10 | 
 11 | 这里支持两种内存：on-heap 内存和 off-heap 内存，通过参数可以控制分配内存的种类
 12 | 
 13 | MemoryManager 管理内存也分两种模式：预分配和按需分配。预分配模式下，内存在启动时就会分好，这就会意味着不会发生 OOM 异常，释放的内存会重新归还 MemoryManager 的内存池；按需模式下，MemoryManager 仅仅追踪内存的使用【做记录】，释放内存不会归还 MemoryManager 的内存池，而是通过托管给 JVM 的垃圾回收来最终释放，这样便可能会发生 OOM
 14 | 
 15 | *下面我们就来分析下 MemoryManager 的实现细节*
 16 | 
 17 | ### MemorySegment
 18 | 
 19 | 上面已经提到，MemoryManager 以 segment 为单位来实现内存的分配和管理，在 flink 中一个 segment 被抽象为 MemorySegment，MemorySegment 为抽象类，定义了基本的 put/get 方法，以及 swap、compare 等工具方法，同时维护了一个偏移量：BYTE_ARRAY_BASE_OFFSET，这个偏移量为 byte[] 对象在内存中的基本偏移量，后续通过 `sun.misc.Unsafe` 直接操纵内存就是基于这个偏移量来完成，这个类定义的实现方法屏蔽了内存的种类【堆和非堆】，当其成员变量 heapMemory 不为空时就是堆内存，此时的 address 就是 BYTE_ARRAY_BASE_OFFSET；而 heapMemory 为 null 时代表非堆内存，此时的 address 是内存中的绝对地址。
 20 | 
 21 | MemorySegment 有两个实现类：HeapMemorySegment 和 HybridMemorySegment，分别代表堆内存 segment 和 非堆内存 segment，具体的继承关系如下：
 22 | 
 23 | ![memory-segment-extend.png](memory-segment-extend.png)
 24 | 
 25 | HeapMemorySegment 和 HybridMemorySegment 中都分别定义了工厂类来实例化对象实例。
 26 | 
 27 | ### MemoryPool
 28 | 
 29 | MemoryPool 是 MemoryManager 用来统一管理资源的组件，具体又分为 HeapMemoryPool 和 HybridOffHeapMemoryPool，前者管理堆内存，后者管理非堆内存。
 30 | 
 31 | *先来看HeapMemoryPool*
 32 | 
 33 | ```java
 34 | //MemoryManager.java line 616
 35 | @Override
 36 | 		HeapMemorySegment allocateNewSegment(Object owner) {
 37 | 			return HeapMemorySegment.FACTORY.allocateUnpooledSegment(segmentSize, owner);
 38 | 		}
 39 | 
 40 | 		@Override
 41 | 		HeapMemorySegment requestSegmentFromPool(Object owner) {
 42 | 			byte[] buf = availableMemory.remove();
 43 | 			return  HeapMemorySegment.FACTORY.wrapPooledHeapMemory(buf, owner);
 44 | 		}
 45 | 
 46 | 		@Override
 47 | 		void returnSegmentToPool(MemorySegment segment) {
 48 | 			if (segment.getClass() == HeapMemorySegment.class) {
 49 | 				HeapMemorySegment heapSegment = (HeapMemorySegment) segment;
 50 | 				availableMemory.add(heapSegment.getArray());
 51 | 				heapSegment.free();
 52 | 			}
 53 | 			else {
 54 | 				throw new IllegalArgumentException("Memory segment is not a " + HeapMemorySegment.class.getSimpleName());
 55 | 			}
 56 | 		}
 57 | ```
 58 | 
 59 | 简单总结：
 60 | 
 61 | - allocateNewSegment 走的是 on demand 模式，通过 new byte[] 从堆上分配内存
 62 | - requestSegmentFromPool 走的是 pre allocate 模式，通过复用已有的堆对象
 63 | 
 64 | HybridOffHeapMemoryPool 的接口与其类似，不过分配内存走的是 `ByteBuffer.allocateDirect(segmentSize);` 直接分配了物理内存，也就是非堆内存
 65 | 
 66 | ## IOManager
 67 | 
 68 | flink 通过 IOManager 来控制磁盘 IO 的过程，提供同步和异步两种写模式【其实只有异步】，具体的读写方式又分为 block、buffer、bulk 三种方式；用户可以指定 IO 的文件目录集合，IOManager 会以 round-robin 的方式写不同目录的不同文件。
 69 | 
 70 | IOManager 提供两种方式枚举新的 IO 文件：
 71 | 
 72 | - 直接 round-robin 文件夹并生成文件，每个新文件的命名 pattern 为 random_hex_string.channel，最终对应的目录结构是：
 73 | 
 74 |   path1/random_hex_string1.channel
 75 | 
 76 |   path2/random_hex_string2.channel
 77 | 
 78 |   path3/random_hex_string3.channel
 79 | 
 80 | - 采取 Enumerator 的模式，每个 Enumerator 也是类似如上一种方式进行 round-robin，不过 Enumerator 会维护一个固定的本地命名前缀、一个本地计数器、一个全局计数器，命名前缀用于区分不同的 Enumerator 写的文件，本地计数器用于 Enumerator 自身的文件命名递增，全局计数器用于 round-robin 文件夹，最终的目录结构是：
 81 | 
 82 |   path1/prefix.local_counter1.channel
 83 | 
 84 |   path2/prefix.local_counter2.channel
 85 | 
 86 |   path3/prefix.local_counter3.channel
 87 | 
 88 | flink 又进一步将一个文件的 IO 抽象成了 FileIOChannel，通过 FileIOChannel 封装了底层的文件读写，具体的继承关系如下：
 89 | 
 90 | ![file-channel-entend.png](file-channel-entend.png)
 91 | 
 92 | IOManager 的唯一实现类：IOManagerAsync 为每个临时文件夹【用户初始化的时候指定】维护了一个读线程和写线程，并且每个读写线程内部会维护一个请求队列: RequestQueue，上面的 FileIOChannel 通过将读写请求加入到对应的 RequestQueue 中来实现文件读写，具体的线程模型如下：
 93 | 
 94 | ![io-manager-async.png](io-manager-async.png)
 95 | 
 96 | ps: 默认的临时文件夹目录由 JVM 系统属性 `java.io.tmpdir` 指定
 97 | 
 98 | ## NetworkEnvironment
 99 | 
100 | NetworkEnvironment 是每个 Instance 的网络 IO 组件，包含了追踪中间结果和数据交换的数据结构。它的构造器会统一将配置的内存先分配出来，抽象成 NetworkBufferPool 统一管理内存的申请和释放。
101 | 
102 | ### BufferPool
103 | 
104 | 从 MemoryManager 的介绍中我们讲到 flink 是以 MemorySegment 为单位来管理内存的，而一个 MemorySegment 又被叫做一个 Buffer。BufferPool 是管理 Buffer 的工具。Buffer 的申请统一交给 NetworkBufferPool，具体的管理交给 LocalBufferPool。
105 | 
106 | #### LocalBufferPool
107 | 
108 | 我们来看 LocalBufferPool 的关键接口，以了解具体都有哪些方式来管理 Buffer 😄。
109 | 
110 | ##### 申请 Buffer
111 | 
112 | ```java
113 | //LocalBufferPool line136
114 | private Buffer requestBuffer(boolean isBlocking) throws InterruptedException, IOException {
115 | 		synchronized (availableMemorySegments) {
116 | 			returnExcessMemorySegments();
117 | 
118 | 			boolean askToRecycle = owner != null;
119 | //...
120 | ```
121 | 
122 | 总结其逻辑：
123 | 
124 | - 申请 Buffer
125 | - 释放超量申请的 Buffer
126 | - 向 NetworkBufferPool 申请 Buffer
127 | - 如果此 LocalBufferPool 有 owner【ResultPartition】，向 ResultPartition 请求释放内存，这里又会下发到 ResultPartition 的 subPartition，释放是以 subPartition 的全部内存为单位，会将内存中的数据吐到磁盘上或者不释放【依据配置的不同】
128 | 
129 | ##### 回收 Buffer
130 | 
131 | ```java
132 | //LocalBufferPool line175 
133 | public void recycle(MemorySegment segment) {
134 | 		synchronized (availableMemorySegments) {
135 | 			if (isDestroyed || numberOfRequestedMemorySegments > currentPoolSize) {
136 | 				returnMemorySegment(segment);
137 | 			}
138 | ```
139 | 
140 | 简单的总结：
141 | 
142 | - 如果此 LocalBufferPool 已销毁或超量使用，将 Buffer 归还给 NetworkBufferPool
143 | - 否则如果注册了 EventListener ，通知每个 listener 这个 Buffer 被回收
144 | - 如果没有注册，将这个 Buffer 重新标记为可使用【加入到待申请队列】
145 | 
146 | ##### 调整 Buffer 大小
147 | 
148 | ```java
149 | //LocalBufferPool line237
150 | public void setNumBuffers(int numBuffers) throws IOException {
151 | 		synchronized (availableMemorySegments) {
152 | 			checkArgument(numBuffers >= numberOfRequiredMemorySegments, "Buffer pool needs at least " + numberOfRequiredMemorySegments + " buffers, but tried to set to " + numBuffers + ".");
153 | 
154 | 			currentPoolSize = numBuffers;
155 | ```
156 | 
157 | 简单总结：
158 | 
159 | - 归还超量使用的内存给 NetworkBufferPool
160 | - 如果还是超量使用，调用 owner 的释放接口【以 ResultSubPartition 为单位释放】
161 | 
162 | #### NetworkBufferPool
163 | 
164 | 上面已经提到，NetworkBufferPool 统一管理了网络栈的内存，LocalBufferPool 只是管理 Buffer 的方式，具体的申请和释放还是要走 NetworkBufferPool 的接口。值得注意的是，NetworkBufferPool 在实例化的时候就将初始的固定大小的内存分配出来了【不管是堆还是非堆】。我们来看它的关键接口：
165 | 
166 | ##### 创建 LocalBufferPool
167 | 
168 | ```java
169 | //NetworkBufferPool line184
170 | @Override
171 | 	public BufferPool createBufferPool(int numRequiredBuffers, boolean isFixedSize) throws IOException {
172 | 		// It is necessary to use a separate lock from the one used for buffer
173 | 		// requests to ensure deadlock freedom for failure cases.
174 | 		synchronized (factoryLock) {
175 | 			if (isDestroyed) {
176 | 				throw new IllegalStateException("Network buffer pool has already been destroyed.");
177 | 			}
178 | ```
179 | 
180 | 简单总结：
181 | 
182 | - 做一些状态记录，包括整体使用的 Buffer 数、可动态调整大小的 BufferPool 等
183 | - 对于可动态调整的 BufferPool，重新调整可用内存，调整方式为 round-robin
184 | 
185 | ##### 销毁 LocalBufferPool
186 | 
187 | ```java
188 | //NetworkBufferPool line227
189 | @Override
190 | 	public void destroyBufferPool(BufferPool bufferPool) {
191 | 		if (!(bufferPool instanceof LocalBufferPool)) {
192 | 			throw new IllegalArgumentException("bufferPool is no LocalBufferPool");
193 | 		}
194 | ```
195 | 
196 | 
197 | 
198 | 简单总结：
199 | 
200 | - 消除状态记录
201 | - 对于可动态调整的 BufferPool，重新调整可用内存，调整方式为 round-robin
202 | 
203 | 


--------------------------------------------------------------------------------
/flink/简历-陈玉兆.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/danny0405/flink-source-code-analysis/1e082c6c3a0235f52554ed64190e66ce50b60da6/flink/简历-陈玉兆.pdf


--------------------------------------------------------------------------------