├── .gitignore ├── source-code-reading ├── v0.7.71-nightly │ ├── 7_view.md │ ├── assets │ │ ├── image.png │ │ ├── 1658305358904-image.png │ │ ├── 1658305391305-image.png │ │ ├── 1658305438488-image.png │ │ ├── 1661135526502-image.png │ │ ├── 1661140385795-image.png │ │ ├── 1661143930884-image.png │ │ ├── 1661145672312-image.png │ │ ├── 1661263886543-image.png │ │ ├── 1661265316769-image.png │ │ ├── 1661265322060-image.png │ │ ├── 1661265830054-image.png │ │ └── 1661265911577-image.png │ ├── README.md │ ├── 6_join.md │ ├── 4_expression.md │ ├── 3_distributed_query.md │ ├── 8_storage.md │ ├── 5_aggregation.md │ └── 1_services.md └── v0.8.177-nightly │ ├── assets │ ├── image.png │ ├── 1673421080755-image.png │ └── 1673429218678-image.png │ ├── README.md │ └── 1_scheduler.md ├── README.md └── official-docs └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/7_view.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/image.png -------------------------------------------------------------------------------- /source-code-reading/v0.8.177-nightly/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.8.177-nightly/assets/image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1658305358904-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1658305358904-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1658305391305-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1658305391305-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1658305438488-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1658305438488-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661135526502-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661135526502-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661140385795-image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661140385795-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661143930884-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661143930884-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661145672312-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661145672312-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661263886543-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661263886543-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661265316769-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661265316769-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661265322060-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661265322060-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661265830054-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661265830054-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/assets/1661265911577-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.7.71-nightly/assets/1661265911577-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.8.177-nightly/assets/1673421080755-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.8.177-nightly/assets/1673421080755-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.8.177-nightly/assets/1673429218678-image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonLi-cn/databend-comment/HEAD/source-code-reading/v0.8.177-nightly/assets/1673429218678-image.png -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/README.md: 
-------------------------------------------------------------------------------- 1 | # source code version 2 | 3 | * tag: v0.7.71-nightly 4 | * commit: 2d0e289a88dfe19a7fa11437c71842e0e3883229 5 | * [source code](https://github.com/datafuselabs/databend/tree/v0.7.71-nightly) 6 | -------------------------------------------------------------------------------- /source-code-reading/v0.8.177-nightly/README.md: -------------------------------------------------------------------------------- 1 | # source code version 2 | 3 | * tag: v0.8.176-patch1 4 | * commit: 025af70f01f5306e53f747a9a55d22f0281f514e 5 | * [source code](https://github.com/datafuselabs/databend/tree/v0.8.176-patch1) 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # databend-comment 2 | 3 | databend source reading notes. 4 | 5 | ## Databend 6 | 7 | * [home](https://databend.rs/doc) 8 | * [github](https://github.com/datafuselabs/databend) 9 | * [bilibili](https://space.bilibili.com/275673537) 10 | * [虎哥的博客](https://bohutang.me/tags/databend/) 11 | * [Databend 内幕大揭秘](https://psiace.github.io/databend-internals/docs/getting-started/introduction/) 12 | -------------------------------------------------------------------------------- /official-docs/README.md: -------------------------------------------------------------------------------- 1 | ## Databend 源码阅读系列 2 | 3 | * [《Databend 源码阅读系列(一): 开篇》](https://mp.weixin.qq.com/s/5snnRuOLdCdWEktRFcwwug) 4 | * [《Databend 源码阅读系列(二):Query server 启动,Session 管理及请求处理》](https://mp.weixin.qq.com/s?__biz=Mzg4NzYzMzk1Mw==&mid=2247488004&idx=1&sn=1de114314011d82bfad8a0320d8aac08&chksm=cf86334af8f1ba5c339497999ec4a9b0d1088cf06631a53827a2b08856b47d91c1ef1f65c368&scene=178&cur_album_id=2515308497635557377#rd) 5 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/6_join.md: -------------------------------------------------------------------------------- 1 | * [六 Join](#六-join) 2 | * [Build Pipeline](#build-pipeline) 3 | * [V2](#v2) 4 | * [PhysicalPlan](#physicalplan) 5 | * [HashJoin](#hashjoin) 6 | * [JoinType](#jointype) 7 | * [SelectInterpreterV2](#selectinterpreterv2) 8 | * [PipelineBuilder](#pipelinebuilder) 9 | * [JoinHashTable](#joinhashtable) 10 | * [SinkBuildHashTable](#sinkbuildhashtable) 11 | * [TransformHashJoinProbe](#transformhashjoinprobe) 12 | 13 | ## 六 Join 14 | 15 | ### Build Pipeline 16 | 17 | #### V2 18 | 19 | ##### PhysicalPlan 20 | 21 | ```rust 22 | // file: src/query/service/src/sql/executor/physical_plan.rs 23 | pub enum PhysicalPlan { 24 | TableScan(TableScan), 25 | Filter(Filter), 26 | Project(Project), 27 | EvalScalar(EvalScalar), 28 | AggregatePartial(AggregatePartial), 29 | AggregateFinal(AggregateFinal), 30 | Sort(Sort), 31 | Limit(Limit), 32 | HashJoin(HashJoin), 33 | Exchange(Exchange), 34 | UnionAll(UnionAll), 35 | 36 | /// Synthesized by fragmenter 37 | ExchangeSource(ExchangeSource), 38 | ExchangeSink(ExchangeSink), 39 | } 40 | ``` 41 | 42 | ##### HashJoin 43 | 44 | ```rust 45 | // file: src/query/service/src/sql/executor/physical_plan.rs 46 | pub struct HashJoin { 47 | pub build: Box, 48 | pub probe: Box, 49 | pub build_keys: Vec, 50 | pub probe_keys: Vec, 51 | pub other_conditions: Vec, // 目前好像没用上 52 | pub join_type: JoinType, 53 | pub marker_index: Option, 54 | pub from_correlated_subquery: bool, 55 | } 56 | ``` 57 | 58 | ##### JoinType 59 | 60 | ```rust 61 | // file: 
src/query/service/src/sql/planner/plans/logical_join.rs 62 | pub enum JoinType { 63 | Inner, 64 | Left, 65 | Right, 66 | Full, 67 | Semi, 68 | Anti, 69 | Cross, 70 | /// Mark Join is a special case of join that is used to process Any subquery and correlated Exists subquery. 71 | Mark, 72 | /// Single Join is a special kind of join that is used to process correlated scalar subquery. 73 | Single, 74 | } 75 | ``` 76 | 77 | ##### SelectInterpreterV2 78 | 79 | 入口,调用流程: 80 | 81 | ```rust 82 | // file: src/query/service/src/interpreters/interpreter_select_v2.rs 83 | SelectInterpreterV2::execute() -> Self::build_pipeline() -> PhysicalPlanBuilder::new() 84 | -> PhysicalPlanBuilder::build() // build physical_plan 85 | -> PipelineBuilder::create() 86 | -> PipelineBuilder::finalize(physical_plan) // build pipeline 87 | ``` 88 | 89 | ##### PipelineBuilder 90 | 91 | 负责构造Pipeline 92 | 93 | ```rust 94 | // file: src/query/service/src/sql/executor/pipeline_builder.rs 95 | pub struct PipelineBuilder { 96 | ctx: Arc, 97 | main_pipeline: Pipeline, 98 | pub pipelines: Vec, 99 | } 100 | // func: finalize() 101 | pub fn finalize(mut self, plan: &PhysicalPlan) -> Result { 102 | self.build_pipeline(plan)?; // 构造 Pipeline,把产生的Pipeline放入self.pipelines和self.pipelines 103 | 104 | for source_pipeline in &self.pipelines { 105 | if !source_pipeline.is_complete_pipeline()? { 106 | return Err(ErrorCode::IllegalPipelineState( 107 | "Source pipeline must be complete pipeline.", 108 | )); 109 | } 110 | } 111 | 112 | Ok(PipelineBuildResult { 113 | main_pipeline: self.main_pipeline, 114 | sources_pipelines: self.pipelines, 115 | }) 116 | } 117 | 118 | // func: build_pipeline() 119 | fn build_pipeline(&mut self, plan: &PhysicalPlan) -> Result<()> { 120 | ... 121 | PhysicalPlan::Limit(limit) => self.build_limit(limit), 122 | PhysicalPlan::HashJoin(join) => self.build_join(join), 123 | ... 
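// the remaining PhysicalPlan variants (TableScan, Filter, EvalScalar, Sort, Exchange, ...) dispatch
// to their own build_xxx methods in the same way; this excerpt keeps only the Limit and HashJoin arms.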
124 | } 125 | 126 | // func: build_join() ⭐️ 127 | fn build_join(&mut self, join: &HashJoin) -> Result<()> { 128 | let state = self.build_join_state(join)?; // 创建HashMap 129 | self.expand_build_side_pipeline(&join.build, state.clone())?; // 创建Build表的pipeline 130 | self.build_join_probe(join, state) // 创建probe的pipeline 131 | } 132 | 133 | // func: build_join_state() 134 | // 创建HashMap 135 | fn build_join_state(&mut self, join: &HashJoin) -> Result> { 136 | JoinHashTable::create_join_state( 137 | self.ctx.clone(), 138 | &join.build_keys, 139 | join.build.output_schema()?, 140 | HashJoinDesc::create(join)?, 141 | ) 142 | } 143 | 144 | // func: expand_build_side_pipeline() 145 | fn expand_build_side_pipeline( 146 | &mut self, 147 | build: &PhysicalPlan, 148 | join_state: Arc, 149 | ) -> Result<()> { 150 | let build_side_context = QueryContext::create_from(self.ctx.clone()); 151 | let build_side_builder = PipelineBuilder::create(build_side_context); 152 | let mut build_res = build_side_builder.finalize(build)?; 153 | 154 | assert!(build_res.main_pipeline.is_pulling_pipeline()?); 155 | let mut sink_pipeline_builder = SinkPipeBuilder::create(); 156 | for _index in 0..build_res.main_pipeline.output_len() { 157 | let input_port = InputPort::create(); 158 | sink_pipeline_builder.add_sink( 159 | input_port.clone(), 160 | Sinker::::create( 161 | input_port, 162 | SinkBuildHashTable::try_create(join_state.clone())?, 163 | ), 164 | ); 165 | } 166 | 167 | build_res 168 | .main_pipeline 169 | .add_pipe(sink_pipeline_builder.finalize()); // build side pipeline add sink 170 | 171 | self.pipelines.push(build_res.main_pipeline); // 把build side main pipeline加入self.pipelines 172 | self.pipelines 173 | .extend(build_res.sources_pipelines.into_iter()); // 把build side sources_pipelines加入self.pipelines 174 | Ok(()) 175 | } 176 | 177 | // func: build_join_probe() 178 | // 创建probe side pipeline 179 | fn build_join_probe(&mut self, join: &HashJoin, state: Arc) -> Result<()> { 180 | self.build_pipeline(&join.probe)?; 181 | 182 | self.main_pipeline.add_transform(|input, output| { // 把join probe tranform加入self.main_pipeline 183 | Ok(TransformHashJoinProbe::create( 184 | self.ctx.clone(), 185 | input, 186 | output, 187 | state.clone(), 188 | join.output_schema()?, 189 | )) 190 | })?; 191 | 192 | if join.join_type == JoinType::Mark { 193 | self.main_pipeline.resize(1)?; 194 | self.main_pipeline.add_transform(|input, output| { 195 | TransformMarkJoin::try_create( 196 | input, 197 | output, 198 | MarkJoinCompactor::create(state.clone()), 199 | ) 200 | })?; 201 | } 202 | 203 | Ok(()) 204 | } 205 | ``` 206 | 207 | ##### JoinHashTable 208 | 209 | ```rust 210 | // file: src/query/service/src/pipelines/processors/transforms/hash_join/join_hash_table.rs 211 | pub struct JoinHashTable { 212 | pub(crate) ctx: Arc, 213 | /// Reference count 214 | ref_count: Mutex, 215 | is_finished: Mutex, 216 | /// A shared big hash table stores all the rows from build side 217 | pub(crate) hash_table: RwLock, 218 | pub(crate) row_space: RowSpace, 219 | pub(crate) hash_join_desc: HashJoinDesc, 220 | pub(crate) row_ptrs: RwLock>, 221 | finished_notify: Arc, 222 | } 223 | ``` 224 | 225 | ##### SinkBuildHashTable 226 | 227 | ```rust 228 | // file: src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs 229 | pub struct SinkBuildHashTable { 230 | join_state: Arc, 231 | } 232 | 233 | // func: try_create() 234 | pub fn try_create(join_state: Arc) -> Result { 235 | join_state.attach()?; 236 | Ok(Self { join_state }) 237 | } 238 | // 
implement Sink 239 | impl Sink for SinkBuildHashTable { 240 | const NAME: &'static str = "BuildHashTable"; 241 | 242 | fn on_finish(&mut self) -> Result<()> { 243 | self.join_state.detach() 244 | } 245 | 246 | fn consume(&mut self, data_block: DataBlock) -> Result<()> { 247 | self.join_state.build(data_block) 248 | } 249 | } 250 | ``` 251 | 252 | ##### TransformHashJoinProbe 253 | 254 | ```rust 255 | // file: src/query/service/src/pipelines/processors/transforms/transform_hash_join.rs 256 | pub struct TransformHashJoinProbe { 257 | input_data: Option, 258 | output_data_blocks: VecDeque, 259 | 260 | input_port: Arc, 261 | output_port: Arc, 262 | step: HashJoinStep, 263 | join_state: Arc, 264 | probe_state: ProbeState, 265 | } 266 | 267 | // func: probe() 268 | fn probe(&mut self, block: &DataBlock) -> Result<()> { 269 | self.probe_state.clear(); 270 | self.output_data_blocks 271 | .extend(self.join_state.probe(block, &mut self.probe_state)?); // 调用JoinHashTable的probe函数 272 | Ok(()) 273 | } 274 | 275 | impl Processor for TransformHashJoinProbe { 276 | // func: event() 277 | // 输出驱动 278 | fn event(&mut self) -> Result { 279 | match self.step { 280 | HashJoinStep::Build => Ok(Event::Async), 281 | HashJoinStep::Probe => { 282 | // 如果output_port结束,则结束input_port,同时返回Event::Finished 283 | if self.output_port.is_finished() { 284 | self.input_port.finish(); 285 | return Ok(Event::Finished); 286 | } 287 | // 如果output_port当前不能接受数据,则告知input_port当前不需要数据,同时返回Event::NeedConsume, 288 | // 调度器会执行下游算子 289 | if !self.output_port.can_push() { 290 | self.input_port.set_not_need_data(); 291 | return Ok(Event::NeedConsume); 292 | } 293 | // 如果当前算子的输出缓存不为空,则把缓存中的数据push到output_port,同时返回Event::NeedConsume 294 | if !self.output_data_blocks.is_empty() { 295 | let data = self.output_data_blocks.pop_front().unwrap(); 296 | self.output_port.push_data(Ok(data)); 297 | return Ok(Event::NeedConsume); 298 | } 299 | // 如果当前输入缓存不为空,则返回Event::Sync,说明需要处理当前的输入数据。 300 | if self.input_data.is_some() { 301 | return Ok(Event::Sync); 302 | } 303 | // 如果input_port有数据,则从中pull数据,并放到self.input_data,返回Event::Sync,等待下一步处理 304 | if self.input_port.has_data() { 305 | let data = self.input_port.pull_data().unwrap()?; 306 | self.input_data = Some(data); 307 | return Ok(Event::Sync); 308 | } 309 | // 如果input_port结束了,则结束output_port,并返回Event::Finished 310 | if self.input_port.is_finished() { 311 | self.output_port.finish(); 312 | return Ok(Event::Finished); 313 | } 314 | // 设置input_port需要数据,从而其对应的上游的output_port调用can_push时,就会返回true. 
315 | self.input_port.set_need_data(); 316 | Ok(Event::NeedData) 317 | } 318 | } 319 | } 320 | // func: process() 321 | // 处理数据,这里主要是进行probe 322 | fn process(&mut self) -> Result<()> { 323 | match self.step { 324 | HashJoinStep::Build => Ok(()), 325 | HashJoinStep::Probe => { 326 | if let Some(data) = self.input_data.take() { 327 | self.probe(&data)?; 328 | } 329 | Ok(()) 330 | } 331 | } 332 | } 333 | // func: async_process() 334 | // 等待build表构建完毕 335 | async fn async_process(&mut self) -> Result<()> { 336 | if let HashJoinStep::Build = &self.step { 337 | self.join_state.wait_finish().await?; 338 | self.step = HashJoinStep::Probe; 339 | } 340 | 341 | Ok(()) 342 | } 343 | } 344 | ``` 345 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/4_expression.md: -------------------------------------------------------------------------------- 1 | * [四 Expression](#四-expression) 2 | * [V2](#v2) 3 | * [Expr](#expr) 4 | * [ScalarBinder](#scalarbinder) 5 | * [Scalar](#scalar) 6 | * [ExpressionBuilder](#expressionbuilder) 7 | * [Expression](#expression) 8 | * [ExpressionExecutor](#expressionexecutor) 9 | * [ExpressionChain](#expressionchain) 10 | * [ExpressionAction](#expressionaction) 11 | * [Function](#function) 12 | * [FunctionFactory](#functionfactory) 13 | 14 | ## 四 Expression 15 | 16 | #### V2 17 | 18 | ```rust 19 | let mut planner = Planner::new(ctx.clone()); 20 | let (plan_node, _) = planner.plan_sql(sql).await?; 21 | 22 | // 从sql创建逻辑计划的流程 23 | Planner::plan_sql(sql: &str) -> Plan // query/src/sql/planner/mod.rs 24 | -> tokenize_sql(&str) -> Token // common/ast/src/parser/mod.rs 25 | parse_sql(Token) -> Vec // common/ast/src/parser/mod.rs 26 | -> statements(Token) -> Vec // common/ast/src/parser/statement.rs 27 | -> query(Token) -> Query // common/ast/src/parser/query.rs 28 | -> select_target(Token) -> SelectTarget // common/ast/src/parser/query.rs 29 | -> expr(Token) -> Expr // common/ast/src/parser/expr.rs 30 | -> subexpr() // common/ast/src/parser/expr.rs 31 | Binder::bind(Statement) -> Plan // query/src/sql/planner/mod.rs 32 | -> bind_statement(Statement) -> Plan // query/src/sql/planner/binder/mod.rs 33 | -> bind_query(Query) -> SExpr // query/src/sql/planner/binder/select.rs 34 | -> bind_select_stmt(SelectStmt, OrderByExpr) -> SExpr // query/src/sql/planner/binder/select.rs 35 | -> bind_where(Expr, SExpr) -> SExpr // query/src/sql/planner/binder/select.rs 36 | -> ScalarBinder::bind(Expr) -> Scalar // query/src/sql/planner/binder/select.rs 37 | Scalar -> FilterPlan impl Operator // query/src/sql/planner/binder/select.rs 38 | SExpr::create_unary(Plan) -> SExpr // query/src/sql/optimizer/s_expr.rs 39 | SubqueryRewriter::rewrite(SExpr) -> SExpr // query/src/sql/planner/binder/subquery.rs 40 | SExpr -> Plan::Query // query/src/sql/planner/binder/subquery.rs 41 | ``` 42 | 43 | ```shell 44 | str -> Token -> ExprElement -> WithSpan -> Expr -> Scalar | Expression -> ExpressionChain -> ExpressionAction 45 | ExprParser ScalarBinder | ExpressionBuilder 46 | Before build pipeline | Building pipeline 47 | ``` 48 | 49 | ##### Expr 50 | 51 | AST中的表达式 52 | 53 | ```rust 54 | // file: common/ast/src/ast/expr.rs 55 | pub enum Expr<'a> { 56 | /// Column reference, with indirection like `table.column` 57 | ColumnRef { 58 | span: &'a [Token<'a>], 59 | database: Option>, 60 | table: Option>, 61 | column: Identifier<'a>, 62 | }, 63 | ... 64 | 65 | /// `BETWEEN ... 
AND ...` 66 | Between { 67 | span: &'a [Token<'a>], 68 | expr: Box>, 69 | low: Box>, 70 | high: Box>, 71 | not: bool, 72 | }, 73 | /// Binary operation 74 | BinaryOp { 75 | span: &'a [Token<'a>], 76 | op: BinaryOperator, 77 | left: Box>, 78 | right: Box>, 79 | }, 80 | ... 81 | } 82 | 83 | pub enum Literal { 84 | Integer(u64), 85 | Float(f64), 86 | BigInt { lit: String, is_hex: bool }, 87 | // Quoted string literal value 88 | String(String), 89 | Boolean(bool), 90 | CurrentTimestamp, 91 | Null, 92 | } 93 | 94 | #[derive(Debug, Clone, PartialEq)] 95 | pub enum TypeName { 96 | Boolean, 97 | UInt8, 98 | UInt16, 99 | UInt32, 100 | UInt64, 101 | Int8, 102 | Int16, 103 | Int32, 104 | Int64, 105 | Float32, 106 | Float64, 107 | Date, 108 | DateTime { precision: Option }, 109 | Timestamp, 110 | String, 111 | Array { item_type: Option> }, 112 | Object, 113 | Variant, 114 | } 115 | 116 | pub enum BinaryOperator { 117 | Plus, 118 | Minus, 119 | Multiply, 120 | Div, 121 | Divide, 122 | Modulo, 123 | StringConcat, 124 | // `>` operator 125 | Gt, 126 | // `<` operator 127 | Lt, 128 | ... 129 | } 130 | ``` 131 | 132 | ##### ScalarBinder 133 | 134 | Expr -> Scalar 135 | 136 | ```rust 137 | // file: query/src/sql/planner/binder/scalar.rs 138 | pub async fn bind(&mut self, expr: &Expr<'a>) -> Result<(Scalar, DataTypeImpl)> { 139 | let mut type_checker = 140 | TypeChecker::new(self.bind_context, self.ctx.clone(), self.metadata.clone()); 141 | type_checker.resolve(expr, None).await 142 | } 143 | ``` 144 | 145 | ##### Scalar 146 | 147 | ```rust 148 | // file: query/src/sql/planner/plans/scalar.rs 149 | pub enum Scalar { 150 | BoundColumnRef(BoundColumnRef), 151 | ConstantExpr(ConstantExpr), 152 | AndExpr(AndExpr), 153 | OrExpr(OrExpr), 154 | ComparisonExpr(ComparisonExpr), 155 | AggregateFunction(AggregateFunction), 156 | FunctionCall(FunctionCall), 157 | // TODO(leiysky): maybe we don't need this variant any more 158 | // after making functions static typed? 159 | Cast(CastExpr), 160 | SubqueryExpr(SubqueryExpr), 161 | } 162 | ``` 163 | 164 | ##### ExpressionBuilder 165 | 166 | Scalar -> Expression 167 | 168 | ```rust 169 | // file: query/src/sql/exec/expression_builder.rs 170 | // 在PipelineBuilder中被用于创建Expression 171 | pub struct ExpressionBuilder { 172 | metadata: MetadataRef, 173 | } 174 | // func: create(...) -> ExpressionBuilder 175 | // 创建ExpressionBuilder 176 | pub fn create(metadata: MetadataRef) -> Self { 177 | ExpressionBuilder { metadata } 178 | } 179 | // func: build(...) -> Expression 180 | // 生成Expression 181 | pub fn build(&self, scalar: &Scalar) -> Result { 182 | match scalar { 183 | Scalar::BoundColumnRef(BoundColumnRef { column }) => { 184 | self.build_column_ref(column.index) 185 | } 186 | Scalar::ConstantExpr(ConstantExpr { value, data_type }) => { 187 | self.build_literal(value, data_type) 188 | } 189 | ... 190 | } 191 | } 192 | ``` 193 | 194 | ##### Expression 195 | 196 | ```rust 197 | // file: common/planners/src/plan_expression.rs 198 | #[derive(serde::Serialize, serde::Deserialize, Clone, PartialEq)] 199 | pub enum Expression { 200 | /// An expression with a alias name. 201 | Alias(String, Box), 202 | 203 | /// Column name. 204 | Column(String), 205 | /// Qualified column name. 206 | QualifiedColumn(Vec), 207 | 208 | /// Constant value. 209 | /// Note: When literal represents a column, its column_name will not be None 210 | Literal { 211 | value: DataValue, 212 | column_name: Option, 213 | 214 | // Logic data_type for this literal 215 | data_type: DataTypeImpl, 216 | }, 217 | 218 | ... 
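// remaining Expression variants are elided here; among them is Expression::BinaryExpression,
// which ExpressionChain::add_expr matches on further below to build an ActionFunction.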
219 | } 220 | ``` 221 | 222 | ##### ExpressionExecutor 223 | 224 | ```rust 225 | // file: query/src/pipelines/transforms/transform_expression_executor.rs 226 | // 表达式执行器,在TransformFilter和TransformHaving等地方会使用 227 | pub struct ExpressionExecutor { 228 | // description of this executor 229 | description: String, 230 | _input_schema: DataSchemaRef, 231 | output_schema: DataSchemaRef, 232 | chain: Arc, 233 | // whether to perform alias action in executor 234 | alias_project: bool, 235 | ctx: Arc, 236 | } 237 | // func: try_create() -> ExpressionExecutor 238 | // 创建ExpressionExecutor 239 | ... 240 | let chain = ExpressionChain::try_create(input_schema.clone(), &exprs)?; // 创建ExpressionChain 241 | // func: execute(...) -> DataBlock 242 | pub fn execute(&self, block: &DataBlock) -> Result { 243 | ... 244 | 245 | let mut column_map: HashMap<&str, ColumnWithField> = HashMap::new(); // 存放参与运算的列,以及缓存临时查询结果 246 | 247 | let mut alias_map: HashMap<&str, &ColumnWithField> = HashMap::new(); // 如果alias_project等于true,则用于存储查询结果列 248 | 249 | // supported a + 1 as b, a + 1 as c 250 | // supported a + 1 as a, a as b 251 | // !currently not supported a+1 as c, b+1 as c 252 | let mut alias_action_map: HashMap<&str, Vec<&str>> = HashMap::new(); // 记录alias的映射 253 | 254 | for f in block.schema().fields().iter() { 255 | let column = 256 | ColumnWithField::new(block.try_column_by_name(f.name())?.clone(), f.clone()); 257 | column_map.insert(f.name(), column); 258 | } 259 | 260 | let rows = block.num_rows(); // 当前DataBlock行数 261 | for action in self.chain.actions.iter() { 262 | if let ExpressionAction::Alias(alias) = action { 263 | if let Some(v) = alias_action_map.get_mut(alias.arg_name.as_str()) { 264 | v.push(alias.name.as_str()); 265 | } else { 266 | alias_action_map.insert(alias.arg_name.as_str(), vec![alias.name.as_str()]); 267 | } 268 | } 269 | 270 | if column_map.contains_key(action.column_name()) { // 如果缓存中已经包含这个action的结果,则跳过 271 | continue; 272 | } 273 | 274 | match action { 275 | ExpressionAction::Input(input) => { 276 | let column = block.try_column_by_name(&input.name)?.clone(); 277 | let column = ColumnWithField::new( 278 | column, 279 | block.schema().field_with_name(&input.name)?.clone(), 280 | ); 281 | column_map.insert(input.name.as_str(), column); 282 | } 283 | ExpressionAction::Function(f) => { 284 | let column_with_field = self.execute_function(&mut column_map, f, rows)?; 285 | column_map.insert(f.name.as_str(), column_with_field); 286 | } 287 | ExpressionAction::Constant(constant) => { 288 | let column = constant 289 | .data_type 290 | .create_constant_column(&constant.value, rows)?; 291 | 292 | let column = ColumnWithField::new( 293 | column, 294 | DataField::new(constant.name.as_str(), constant.data_type.clone()), 295 | ); 296 | 297 | column_map.insert(constant.name.as_str(), column); 298 | } 299 | _ => {} 300 | } 301 | } 302 | 303 | // 如果是alias,把结果放入alias_map 304 | if self.alias_project { 305 | for (k, v) in alias_action_map.iter() { 306 | let column = column_map.get(k).ok_or_else(|| { 307 | ErrorCode::LogicalError("Arguments must be prepared before alias transform") 308 | })?; 309 | 310 | for name in v.iter() { 311 | match alias_map.insert(name, column) { 312 | Some(_) => Err(ErrorCode::UnImplement(format!( 313 | "Duplicate alias name :{}", 314 | name 315 | ))), 316 | _ => Ok(()), 317 | }?; 318 | } 319 | } 320 | } 321 | 322 | let mut project_columns = Vec::with_capacity(self.output_schema.fields().len()); // 提取计算结果 323 | for f in self.output_schema.fields() { 324 | let column = match 
alias_map.get(f.name().as_str()) { // 优先从alias_map中获取结果 325 | Some(data_column) => data_column, 326 | None => column_map.get(f.name().as_str()).ok_or_else(|| { // 从column_map中获取结果 327 | ErrorCode::LogicalError(format!( 328 | "Projection column: {} not exists in {:?}, there are bugs!", 329 | f.name(), 330 | column_map.keys() 331 | )) 332 | })?, 333 | }; 334 | project_columns.push(column.column().clone()); 335 | } 336 | // projection to remove unused columns 337 | Ok(DataBlock::create( 338 | self.output_schema.clone(), 339 | project_columns, 340 | )) 341 | } 342 | // func: execute_function() -> ColumnWithField 343 | // 执行Function 344 | fn execute_function( 345 | &self, 346 | column_map: &mut HashMap<&str, ColumnWithField>, 347 | f: &ActionFunction, 348 | rows: usize, 349 | ) -> Result { 350 | // check if it's cached 351 | let mut arg_columns = Vec::with_capacity(f.arg_names.len()); 352 | 353 | for arg in f.arg_names.iter() { 354 | let column = column_map.get(arg.as_str()).cloned().ok_or_else(|| { 355 | ErrorCode::LogicalError("Arguments must be prepared before function transform") 356 | })?; 357 | arg_columns.push(column); 358 | } 359 | 360 | let tz = self.ctx.get_settings().get_timezone()?; 361 | let tz = String::from_utf8(tz).map_err(|_| { 362 | ErrorCode::LogicalError("Timezone has beeen checked and should be valid.") 363 | })?; 364 | let tz = tz.parse::().map_err(|_| { 365 | ErrorCode::InvalidTimezone("Timezone has been checked and should be valid") 366 | })?; 367 | let func_ctx = FunctionContext { tz }; 368 | let column = f.func.eval(func_ctx, &arg_columns, rows)?; // 执行funcion 369 | Ok(ColumnWithField::new( 370 | column, 371 | DataField::new(&f.name, f.return_type.clone()), 372 | )) 373 | } 374 | ``` 375 | 376 | ##### ExpressionChain 377 | 378 | ```rust 379 | // file: common/planners/src/plan_expression_chain.rs 380 | pub struct ExpressionChain { 381 | // input schema 382 | pub schema: DataSchemaRef, 383 | pub actions: Vec, 384 | } 385 | // func: try_create(...) -> ExpressionChain 386 | // 创建ExpressionChain 387 | pub fn try_create(schema: DataSchemaRef, exprs: &[Expression]) -> Result { 388 | let mut chain = Self { 389 | schema, 390 | actions: vec![], 391 | }; 392 | 393 | for expr in exprs { 394 | chain.recursion_add_expr(expr)?; 395 | } 396 | 397 | Ok(chain) 398 | } 399 | // func: recursion_add_expr(...) 400 | // 递归填加表达式 401 | fn recursion_add_expr(&mut self, expr: &Expression) -> Result<()> { 402 | struct ExpressionActionVisitor(*mut ExpressionChain); 403 | 404 | impl ExpressionVisitor for ExpressionActionVisitor { 405 | fn pre_visit(self, _expr: &Expression) -> Result> { 406 | Ok(Recursion::Continue(self)) 407 | } 408 | 409 | fn post_visit(self, expr: &Expression) -> Result { 410 | unsafe { 411 | (*self.0).add_expr(expr)?; 412 | Ok(self) 413 | } 414 | } 415 | } 416 | 417 | ExpressionActionVisitor(self).visit(expr)?; 418 | Ok(()) 419 | } 420 | // func: add_expr() 421 | // 往self.actions中填加Action 422 | fn add_expr(&mut self, expr: &Expression) -> Result<()> { 423 | ... 
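// (most match arms elided) the BinaryExpression arm below is representative: it resolves the
// operator name to a Function via FunctionFactory and records the call as an ActionFunction.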
424 | Expression::BinaryExpression { op, left, right } => { 425 | let arg_types = vec![ 426 | left.to_data_type(&self.schema)?, 427 | right.to_data_type(&self.schema)?, 428 | ]; 429 | 430 | let arg_types2: Vec<&DataTypeImpl> = arg_types.iter().collect(); 431 | let func = FunctionFactory::instance().get(op, &arg_types2)?; // 根据name获取Function 432 | let return_type = func.return_type(); 433 | 434 | let function = ActionFunction { 435 | name: expr.column_name(), 436 | func_name: op.clone(), 437 | func, 438 | arg_names: vec![left.column_name(), right.column_name()], 439 | arg_types, 440 | return_type, 441 | }; 442 | 443 | self.actions.push(ExpressionAction::Function(function)); 444 | } 445 | ... 446 | } 447 | ``` 448 | 449 | SQL: 450 | 451 | `SELECT number as c0 FROM numbers_mt(10000) where number = 10 limit 10` 452 | 453 | FilterTransform的ExpressionChain为: 454 | 455 | ```rust 456 | [ 457 | Constant(ActionConstant { name: "10", value: 10, data_type: UInt8(UInt8) }), 458 | Input(ActionInput { name: "number", return_type: UInt64(UInt64) }), 459 | Function(ActionFunction { name: "(number = 10)", func_name: "=", return_type: Boolean(Boolean), arg_names: ["number", "10"], arg_types: [UInt64(UInt64), UInt8(UInt8)] }) 460 | ] 461 | ``` 462 | 463 | ProjectionTransform的ExpressionChain为: 464 | 465 | ```rust 466 | [ 467 | Input(ActionInput { name: "number", return_type: UInt64(UInt64) }), 468 | Alias(ActionAlias { name: "c0", arg_name: "number", arg_type: UInt64(UInt64) }) 469 | ] 470 | ``` 471 | 472 | ##### ExpressionAction 473 | 474 | ```rust 475 | // file: common/planners/src/plan_expression_action.rs 476 | // Expression类型,细节见上述示例 477 | pub enum ExpressionAction { 478 | /// Column which must be in input. 479 | Input(ActionInput), 480 | /// Constant column with known value. 481 | Constant(ActionConstant), 482 | Alias(ActionAlias), 483 | Function(ActionFunction), 484 | } 485 | 486 | #[derive(Debug, Clone)] 487 | pub struct ActionInput { 488 | pub name: String, // 输入输出列名 489 | pub return_type: DataTypeImpl, 490 | } 491 | 492 | #[derive(Debug, Clone)] 493 | pub struct ActionConstant { 494 | pub name: String, // 输入输出列名 495 | pub value: DataValue, 496 | pub data_type: DataTypeImpl, 497 | } 498 | 499 | #[derive(Debug, Clone)] 500 | pub struct ActionAlias { 501 | pub name: String, // 输出列名 502 | pub arg_name: String, // 输入列名 503 | pub arg_type: DataTypeImpl, 504 | } 505 | 506 | #[derive(Clone)] 507 | pub struct ActionFunction { 508 | pub name: String, // 输出列名 509 | pub func_name: String, // 函数名,eg: = 510 | pub return_type: DataTypeImpl, // 输出类型 511 | pub func: Box, // 函数 512 | 513 | // for functions 514 | pub arg_names: Vec, 515 | pub arg_types: Vec, 516 | } 517 | 518 | ``` 519 | 520 | ##### Function 521 | 522 | ```rust 523 | // file: common/functions/src/scalars/function.rs 524 | pub trait Function: fmt::Display + Sync + Send + DynClone { 525 | /// Returns the name of the function, should be unique. 526 | fn name(&self) -> &str; 527 | 528 | /// Calculate the monotonicity from arguments' monotonicity information. 529 | /// The input should be argument's monotonicity. For binary function it should be an 530 | /// array of left expression's monotonicity and right expression's monotonicity. 531 | /// For unary function, the input should be an array of the only argument's monotonicity. 532 | /// The returned monotonicity should have 'left' and 'right' fields None -- the boundary 533 | /// calculation relies on the function.eval method. 
534 | fn get_monotonicity(&self, _args: &[Monotonicity]) -> Result { 535 | Ok(Monotonicity::default()) 536 | } 537 | 538 | /// The method returns the return_type of this function. 539 | fn return_type(&self) -> DataTypeImpl; 540 | 541 | /// Evaluate the function, e.g. run/execute the function. 542 | fn eval( 543 | &self, 544 | _func_ctx: FunctionContext, 545 | _columns: &ColumnsWithField, 546 | _input_rows: usize, 547 | ) -> Result; 548 | 549 | /// If all args are constant column, then we just return the constant result 550 | /// TODO, we should cache the constant result inside the context for better performance 551 | fn passthrough_constant(&self) -> bool { 552 | true 553 | } 554 | } 555 | ``` 556 | 557 | ##### FunctionFactory 558 | 559 | ```rust 560 | // file: common/functions/src/scalars/function_factory.rs 561 | pub struct FunctionFactory { 562 | case_insensitive_desc: HashMap, 563 | } 564 | 565 | static FUNCTION_FACTORY: Lazy> = Lazy::new(|| { 566 | let mut function_factory = FunctionFactory::create(); 567 | 568 | ArithmeticFunction::register(&mut function_factory); 569 | CommonFunction::register(&mut function_factory); 570 | ToCastFunction::register(&mut function_factory); 571 | TupleClassFunction::register(&mut function_factory); 572 | ComparisonFunction::register(&mut function_factory); 573 | ContextFunction::register(&mut function_factory); 574 | SemiStructuredFunction::register(&mut function_factory); 575 | StringFunction::register(&mut function_factory); 576 | HashesFunction::register(&mut function_factory); 577 | ConditionalFunction::register(&mut function_factory); 578 | LogicFunction::register(&mut function_factory); 579 | DateFunction::register(&mut function_factory); 580 | OtherFunction::register(&mut function_factory); 581 | UUIDFunction::register(&mut function_factory); 582 | MathsFunction::register(&mut function_factory); 583 | 584 | Arc::new(function_factory) 585 | }); 586 | // func: instance() -> FunctionFactory 587 | // 获取单例 588 | pub fn instance() -> &'static FunctionFactory { 589 | FUNCTION_FACTORY.as_ref() 590 | } 591 | // func: register(...) 592 | // 注册函数 593 | pub fn register(&mut self, name: &str, desc: FunctionDescription) { 594 | let case_insensitive_desc = &mut self.case_insensitive_desc; 595 | case_insensitive_desc.insert(name.to_lowercase(), desc); 596 | } 597 | // func: get(...) -> Function 598 | // 根据函数名获取函数,在ExpressionChain::add_expr()函数中会调用,构造ActionFunction 599 | pub fn get(&self, name: impl AsRef, args: &[&DataTypeImpl]) -> Result> { 600 | let origin_name = name.as_ref(); 601 | let lowercase_name = origin_name.to_lowercase(); 602 | 603 | let desc = self 604 | .case_insensitive_desc 605 | .get(&lowercase_name) 606 | .ok_or_else(|| { 607 | // TODO(Winter): we should write similar function names into error message if function name is not found. 
608 | ErrorCode::UnknownFunction(format!("Unsupported Function: {}", origin_name)) 609 | })?; 610 | 611 | FunctionAdapter::try_create(desc, origin_name, args) 612 | } 613 | ``` 614 | -------------------------------------------------------------------------------- /source-code-reading/v0.8.177-nightly/1_scheduler.md: -------------------------------------------------------------------------------- 1 | * [分布式查询调度器](#分布式查询调度器) 2 | 3 | * [1 构造分布式Pipeline](#1-构造分布式pipeline) 4 | * [生成PlanFragment](#生成planfragment) 5 | * [Fragmenter](#fragmenter) 6 | * [fn build_fragment()](#fn-build_fragment) 7 | * [fn replace_exchange()](#fn-replace_exchange) 8 | * [fn get_exchange()⭐️](#fn-get_exchange) 9 | * [PlanFragment](#planfragment) 10 | * [FragmentType](#fragmenttype) 11 | * [DataExchange](#dataexchange) 12 | * [生成QueryFragmentsActions](#生成queryfragmentsactions) 13 | * [QueryFragmentsActions](#queryfragmentsactions) 14 | * [QueryFragmentActions](#queryfragmentactions) 15 | * [QueryFragmentAction](#queryfragmentaction) 16 | * [FragmentPayload](#fragmentpayload) 17 | * [PlanFragment的get_actions函数](#planfragment的get_actions函数) 18 | * [生成PipelineBuildResult](#生成pipelinebuildresult) 19 | * [DataExchangeManager](#dataexchangemanager) 20 | * [fn commit_actions() ⭐️](#fn-commit_actions-) 21 | * [QueryFragmentsActions](#queryfragmentsactions) 22 | * [fn get_init_nodes_channel_packets()](#fn-get_init_nodes_channel_packets) 23 | * [fn fragments_connections()](#fn-fragments_connections) 24 | * [fn get_query_fragments_plan_packets()](#fn-get_query_fragments_plan_packets) 25 | * [fn get_execute_partial_query_packets()](#fn-get_execute_partial_query_packets) 26 | * [QueryCoordinator](#querycoordinator) 27 | * [InitNodesChannelPacket](#initnodeschannelpacket) 28 | * [fn commit()](#fn-commit) 29 | * [2 分布式查询的执行](#2-分布式查询的执行) 30 | * [3 节点间的RPC通信](#3-节点间的rpc通信) 31 | * [创建Connection](#创建connection) 32 | * [fn create_client()](#fn-create_client) 33 | * [ConnectionFactory](#connectionfactory) 34 | * [FlightClient](#flightclient) 35 | * [DatabendQueryFlightService](#databendqueryflightservice) 36 | 37 | ## 分布式查询调度器 38 | 39 | 两个之前阅读过这块的代码(详见[分布式查询](../v0.7.71-nightly/3_distributed_query.md)),但是不够全面,而且Databend的更新频率太快,所以打算重新阅读新的代码,顺便记录一下。 40 | 41 | 相关代码路径:schedulers,rpc 42 | 43 | ```shell 44 | schedulers 45 | ├── fragments // 将PhysicalPlan转换为分布式PlanFragment,用于构造分布式Pipeline 46 | │   ├── fragmenter.rs 47 | │   ├── mod.rs 48 | │   ├── plan_fragment.rs // 分布式查询计划 49 | │   ├── query_fragment_actions.rs // 带节点信息的分布式查询计划,并用于生成FlightClient与FlightService间通信的各种packet 50 | │   └── query_fragment_actions_display.rs 51 | ├── mod.rs 52 | └── scheduler.rs // 构造分布式Pipeline 53 | 2 directories, 7 files 54 | 55 | rpc 56 | ├── exchange 57 | │   ├── data_exchange.rs 58 | │   ├── exchange_manager.rs // 维护Exchange任务⭐️ 59 | │   ├── exchange_params.rs 60 | │   ├── exchange_sink.rs 61 | │   ├── exchange_sink_merge.rs 62 | │   ├── exchange_sink_shuffle.rs 63 | │   ├── exchange_transform.rs 64 | │   ├── exchange_transform_source.rs 65 | │   ├── mod.rs 66 | │   ├── statistics_receiver.rs 67 | │   └── statistics_sender.rs 68 | ├── flight_actions.rs 69 | ├── flight_client.rs 70 | ├── flight_scatter.rs 71 | ├── flight_scatter_broadcast.rs 72 | ├── flight_scatter_hash.rs 73 | ├── flight_service.rs 74 | ├── mod.rs 75 | ├── packets 76 | │   ├── mod.rs 77 | │   ├── packet.rs 78 | │   ├── packet_data.rs 79 | │   ├── packet_data_precommit.rs 80 | │   ├── packet_data_progressinfo.rs 81 | │   ├── packet_execute.rs // ExecutePartialQueryPacket,启动查询任务 82 | │   ├── 
packet_executor.rs 83 | │   ├── packet_fragment.rs // FragmentPlanPacket,发布查询任务 84 | │   └── packet_publisher.rs // InitNodesChannelPacket,用于创建计算节点间Channels 85 | └── request_builder.rs 86 | 3 directories, 28 files 87 | ``` 88 | 89 | ### 1 构造分布式Pipeline 90 | 91 | * 生成PlanFragment:分布式查询计划。 92 | * 生成QueryFragmentsActions:带节点信息的分布式查询计划。 93 | * 生成PipelineBuildResult:分布式Pipeline。 94 | 95 | ```rust 96 | file: src/query/service/src/schedulers/scheduler.rs 97 | 98 | /// Build distributed pipeline via fragment and actions. 99 | pub async fn build_distributed_pipeline( 100 | ctx: &Arc, 101 | plan: &PhysicalPlan, 102 | ) -> Result { 103 | // 1 生成PlanFragment 104 | let fragmenter = Fragmenter::try_create(ctx.clone())?; 105 | let root_fragment = fragmenter.build_fragment(plan)?; 106 | 107 | // 2 生成QueryFragmentsActions 108 | let mut fragments_actions = QueryFragmentsActions::create(ctx.clone()); 109 | root_fragment.get_actions(ctx.clone(), &mut fragments_actions)?; 110 | 111 | // 3 生成PipelineBuildResult 112 | let exchange_manager = ctx.get_exchange_manager(); 113 | 114 | let mut build_res = exchange_manager 115 | .commit_actions(ctx.clone(), fragments_actions) 116 | .await?; 117 | 118 | let settings = ctx.get_settings(); 119 | build_res.set_max_threads(settings.get_max_threads()? as usize); 120 | Ok(build_res) 121 | } 122 | ``` 123 | 124 | #### 生成PlanFragment 125 | 126 | ```rust 127 | let fragmenter = Fragmenter::try_create(ctx.clone())?; 128 | let root_fragment = fragmenter.build_fragment(plan)?; 129 | ``` 130 | 131 | ##### Fragmenter 132 | 133 | ```rust 134 | file: src/query/service/src/schedulers/fragments/fragmenter.rs 135 | /// Visitor to split a `PhysicalPlan` into fragments. 136 | pub struct Fragmenter { 137 | ctx: Arc, 138 | fragments: Vec, 139 | query_id: String, 140 | 141 | /// A state to track if is visiting a source pipeline. 142 | visiting_source_pipeline: bool, 143 | } 144 | 145 | ``` 146 | 147 | ###### fn build_fragment() 148 | 149 | 该函数通过遍历PhysicalPlan Tree,根据PhysicalPlan::Exchange来划分PlanFragment。 150 | 151 | Fragmenter实现了`PhysicalPlanReplacer` trait的几个functions,包含`replace_table_scan`、`replace_hash_join`和`replace_exchange`。 152 | 153 | ###### fn replace_exchange() 154 | 155 | 该函数最最重要的功能就是生成PlanFragment,步骤如下: 156 | 157 | * 生成新的input:遍历plan.input,得到新的input paln; 158 | * 生成新的plan:PhysicalPlan::ExchangeSink 159 | * ```rust 160 | let plan = PhysicalPlan::ExchangeSink(ExchangeSink { 161 | input: Box::new(input), 162 | schema: input_schema.clone(), 163 | kind: plan.kind.clone(), 164 | keys: plan.keys.clone(), 165 | 166 | destinations: Self::get_executors(self.ctx.clone()), 167 | query_id: self.query_id.clone(), 168 | 169 | // We will connect the fragments later, so we just 170 | // set the fragment id to a invalid value here. 
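// (the real id is filled in later by Self::resolve_fragment_connection, see the steps below)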
171 | destination_fragment_id: usize::MAX, 172 | }); 173 | ``` 174 | * 生成fragment_type:根据self.visiting_source_pipeline的标记来判断要生成的PlanFragment的类型,FragmentType::Source或者FragmentType::Intermediate; 175 | * 生成exchange:Option\<DataExchange\> ⭐️ 176 | * ```rust 177 | let exchange = Self::get_exchange( 178 | self.ctx.clone(), 179 | &plan, 180 | self.fragments 181 | .iter() 182 | .all(|fragment| !matches!(&fragment.exchange, Some(DataExchange::Merge(_)))), 183 | )?; 184 | ``` 185 | * 生成fragment_id:self.ctx.get_fragment_id(); 186 | * 生成fragment: 187 | * ```rust 188 | let mut fragment = PlanFragment { 189 | plan, 190 | fragment_type, 191 | 192 | fragment_id: source_fragment_id, 193 | exchange, 194 | query_id: self.query_id.clone(), 195 | 196 | source_fragments: self.fragments.drain(..).collect(), 197 | }; 198 | ``` 199 | * 给fragment.source_fragments的ExchangeSink设置fragment.fragment_id:`Self::resolve_fragment_connection(&mut source_fragment);` 200 | * 将上述产生的fragment放入self.fragments; 201 | * 返回PhysicalPlan::ExchangeSource 202 | * ```rust 203 | Ok(PhysicalPlan::ExchangeSource(ExchangeSource { 204 | schema: input_schema, 205 | query_id: self.query_id.clone(), 206 | 207 | source_fragment_id, 208 | })) 209 | ``` 210 | 
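To make the splitting step above concrete, here is a minimal, self-contained sketch of the same idea. It is only an illustration, not Databend code: `MiniPlan`, `Fragment` and `split` are invented names, and the real `Fragmenter` walks a `PhysicalPlan` via the `PhysicalPlanReplacer` trait. The shape of the transformation is the same: every Exchange becomes a fragment boundary, the child side is wrapped in an ExchangeSink, and the parent side keeps an ExchangeSource that records the new fragment id.

```rust
// Hypothetical, simplified model of Fragmenter::build_fragment / replace_exchange.
#[derive(Debug)]
enum MiniPlan {
    Scan(String),
    Filter(Box<MiniPlan>),
    Exchange(Box<MiniPlan>),
    ExchangeSource { source_fragment_id: usize },
    ExchangeSink { input: Box<MiniPlan>, destination_fragment_id: usize },
}

#[derive(Debug)]
struct Fragment {
    id: usize,
    plan: MiniPlan,
}

// Post-order walk: fragment the input first, then replace every Exchange by an
// ExchangeSource and push the subtree below it (wrapped in an ExchangeSink) as a
// new fragment. The sink's destination id starts as usize::MAX, mirroring the
// placeholder above, and would be resolved once the parent fragment is known.
fn split(plan: MiniPlan, next_id: &mut usize, fragments: &mut Vec<Fragment>) -> MiniPlan {
    match plan {
        MiniPlan::Exchange(input) => {
            let input = split(*input, next_id, fragments);
            let id = *next_id;
            *next_id += 1;
            fragments.push(Fragment {
                id,
                plan: MiniPlan::ExchangeSink {
                    input: Box::new(input),
                    destination_fragment_id: usize::MAX,
                },
            });
            MiniPlan::ExchangeSource { source_fragment_id: id }
        }
        MiniPlan::Filter(input) => MiniPlan::Filter(Box::new(split(*input, next_id, fragments))),
        other => other, // Scan / ExchangeSource / ExchangeSink: nothing to split
    }
}

fn main() {
    // Filter(Exchange(Scan)) splits into a Source fragment ending in an ExchangeSink
    // and a root fragment whose Filter reads from the matching ExchangeSource.
    let plan = MiniPlan::Filter(Box::new(MiniPlan::Exchange(Box::new(MiniPlan::Scan("t".into())))));
    let (mut next_id, mut fragments) = (0, Vec::new());
    let root = split(plan, &mut next_id, &mut fragments);
    fragments.push(Fragment { id: next_id, plan: root });
    println!("{fragments:#?}");
}
```

Running it prints the Source fragment (rooted at the ExchangeSink) followed by the root fragment, which is roughly the relationship that `source_fragments` and `resolve_fragment_connection` above maintain.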
211 | ###### fn get_exchange()⭐️ 212 | 213 | ```rust 214 | pub fn get_exchange( 215 | ctx: Arc<QueryContext>, 216 | plan: &PhysicalPlan, 217 | from_multiple_nodes: bool, 218 | ) -> Result<Option<DataExchange>> { 219 | match plan { 220 | PhysicalPlan::ExchangeSink(plan) => match plan.kind { 221 | FragmentKind::Normal => Ok(Some(ShuffleDataExchange::create( 222 | Self::get_executors(ctx), // 设置下游节点为当前集群的所有节点 223 | plan.keys.clone(), 224 | ))), 225 | FragmentKind::Merge => { 226 | Ok(Some(MergeExchange::create(Self::get_local_executor(ctx)))) // 设置为当前节点 227 | } 228 | FragmentKind::Expansive => Ok(Some(BroadcastExchange::create( 229 | from_multiple_nodes, 230 | Self::get_executors(ctx), // 设置为所有节点 231 | ))), 232 | _ => Ok(None), 233 | }, 234 | _ => Ok(None), 235 | } 236 | } 237 | ``` 238 | 239 | ##### PlanFragment 240 | 241 | 分布式的查询计划 242 | 243 | ```rust 244 | file: src/query/service/src/schedulers/fragments/plan_fragment.rs 245 | pub struct PlanFragment { 246 | pub plan: PhysicalPlan, 247 | pub fragment_type: FragmentType, 248 | pub fragment_id: usize, 249 | pub exchange: Option<DataExchange>, 250 | pub query_id: String, 251 | 252 | // The fragments to ask data from. 253 | pub source_fragments: Vec<PlanFragment>, 254 | } 255 | ``` 256 | 257 | ##### FragmentType 258 | 259 | ```rust 260 | file: src/query/service/src/schedulers/fragments/plan_fragment.rs 261 | pub enum FragmentType { 262 | /// Root fragment of a query plan 263 | Root, 264 | 265 | /// Intermediate fragment of a query plan, 266 | /// doesn't contain any `TableScan` operator. 267 | Intermediate, 268 | 269 | /// Leaf fragment of a query plan, which contains 270 | /// a `TableScan` operator. 271 | Source, 272 | } 273 | ``` 274 | 275 | ##### DataExchange 276 | 277 | ```rust 278 | // file: src/query/service/src/api/rpc/exchange/data_exchange.rs 279 | pub enum DataExchange { 280 | Merge(MergeExchange), 281 | Broadcast(BroadcastExchange), 282 | ShuffleDataExchange(ShuffleDataExchange), 283 | } 284 | ``` 285 | 286 | #### 生成QueryFragmentsActions 287 | 288 | ```rust 289 | let mut fragments_actions = QueryFragmentsActions::create(ctx.clone()); 290 | root_fragment.get_actions(ctx.clone(), &mut fragments_actions)?; 291 | ``` 292 | 293 | ##### QueryFragmentsActions 294 | 295 | ##### QueryFragmentActions 296 | 297 | ##### QueryFragmentAction 298 | 299 | ##### FragmentPayload 300 | 301 | ```rust 302 | file: src/query/service/src/schedulers/fragments/query_fragment_actions.rs 303 | /// 多个PlanFragment的分布式任务集合 304 | pub struct QueryFragmentsActions { 305 | ctx: Arc<QueryContext>, 306 | pub fragments_actions: Vec<QueryFragmentActions>, 307 | } 308 | 309 | /// 来自于一个PlanFragment的多个分布式任务 310 | pub struct QueryFragmentActions { 311 | pub fragment_id: usize, 312 | pub exchange_actions: bool, 313 | pub data_exchange: Option<DataExchange>, 314 | pub fragment_actions: Vec<QueryFragmentAction>, 315 | } 316 | 317 | /// 属于一个节点的查询任务 318 | pub struct QueryFragmentAction { 319 | pub payload: FragmentPayload, 320 | pub executor: String, 321 | } 322 | 323 | pub enum FragmentPayload { 324 | Plan(PhysicalPlan), 325 | } 326 | ``` 327 | 328 | ##### PlanFragment的get_actions函数 329 | 330 | * 后序遍历PlanFragment Tree,给每个PlanFragment生成一个QueryFragmentActions,并放入QueryFragmentsActions; 331 | * 给每个PlanFragment配置相应的计算节点(一个或多个),每个节点生成一个QueryFragmentAction,并放入QueryFragmentActions中。 332 | * 配置一个节点的情况包含: 333 | * fragment_type为FragmentType::Root的; 334 | * fragment_type为FragmentType::Intermediate,且任意一个source_fragments包含DataExchange::Merge的; 335 | * 配置多个节点的情况包含: 336 | * fragment_type为FragmentType::Intermediate,且所有source_fragments不包含DataExchange::Merge的; 337 | * fragment_type为FragmentType::Source的,这里会调用`redistribute_source_fragment`函数对Source任务的Partitions进行一个reshuffle操作,并返回一个QueryFragmentActions,即多个QueryFragmentAction。 338 | 339 | #### 生成PipelineBuildResult 340 | 341 | ```rust 342 | let exchange_manager = ctx.get_exchange_manager(); 343 | 344 | let mut build_res = exchange_manager 345 | .commit_actions(ctx.clone(), fragments_actions) 346 | .await?; 347 | 348 | let settings = ctx.get_settings(); 349 | build_res.set_max_threads(settings.get_max_threads()? as usize); 350 | Ok(build_res) 351 | ``` 352 | 353 | ##### DataExchangeManager 354 | 355 | 全局单例 356 | 357 | ```rust 358 | // file: src/query/service/src/api/rpc/exchange/exchange_manager.rs 359 | pub struct DataExchangeManager { 360 | queries_coordinator: ReentrantMutex>>, 361 | } 362 | ``` 363 | 364 | ![image.png](./assets/image.png) 365 | 366 | ###### fn commit_actions() ⭐️ 367 | 368 | // TODO 画个流程图 369 | 370 | 在Scheduler中调用,用于往远程DatabendQueryFlightService发送请求。 371 | 372 | 这个函数在代码中有注释,这里再简单说明一下细节。 373 | 374 | * 初始化集群节点间的Channels,这里会往远程节点发送rpc请求; 375 | * ```rust 376 | actions 377 | .get_init_nodes_channel_packets()? 
// 获取Vec<InitNodesChannelPacket> 378 | .commit(conf.as_ref(), timeout) // 调用InitNodesChannelPacket的commit函数,这里创建了节点间的rpc channel ⭐️ 379 | .await?; 380 | ``` 381 | * 创建本地和远程Executor的QueryFragmentsPlanPacket 382 | * ```rust 383 | let (local_query_fragments_plan_packet, query_fragments_plan_packets) = 384 | actions.get_query_fragments_plan_packets()?; 385 | ``` 386 | * 提交上述query_fragments_plan_packets到远程Executor 387 | * ```rust 388 | query_fragments_plan_packets 389 | .commit(conf.as_ref(), timeout) // 调用QueryFragmentsPlanPacket的commit函数 390 | .await?; 391 | ``` 392 | * 提交上述local_query_fragments_plan_packet到本地Executor 393 | * ```rust 394 | self.init_query_fragments_plan(&ctx, &local_query_fragments_plan_packet)?; 395 | ``` 396 | * 创建root QueryFragmentActions的PipelineBuildResult 397 | * ```rust 398 | let build_res = self.get_root_pipeline(ctx, root_actions)?; ⭐️ 399 | ``` 400 | * 提交执行查询的请求 401 | * ```rust 402 | actions 403 | .get_execute_partial_query_packets()? 404 | .commit(conf.as_ref(), timeout) 405 | .await?; 406 | ``` 407 | * 返回Pipeline 408 | * ```rust 409 | Ok(build_res) 410 | ``` 411 | 412 | ##### QueryFragmentsActions 413 | 414 | ###### fn get_init_nodes_channel_packets() 415 | 416 | 生成集群节点间的Connection关系,返回`Vec<InitNodesChannelPacket>` 417 | 418 | * 初始化local_id和nodes_info: 419 | * ```rust 420 | let local_id = &self.ctx.get_cluster().local_id; // 获取当前节点id 421 | let nodes_info = Self::nodes_info(&self.ctx); // 获取集群所有节点信息 422 | ``` 423 | * 获取connections_info:`self.fragments_connections()`,返回`Map<String, Map<String, Vec<usize>>>` 424 | * 初始化init_nodes_channel_packets:`let mut init_nodes_channel_packets = Vec::with_capacity(connections_info.len());` 425 | * 遍历connections_info:`for (executor, fragments_connections) in &connections_info` 426 | * 如果nodes_info中不包含executor,则报错; 427 | * 获取executor_node_info:`let executor_node_info = &nodes_info[executor];` 428 | * 初始化connections_info:`let mut connections_info = Vec::with_capacity(fragments_connections.len());` 429 | * 遍历fragments_connections:`for (target, fragments) in fragments_connections ` 430 | * 如果nodes_info中不包含target,则报错; 431 | * 若包含,则创建ConnectionInfo,并放入connections_info中: 432 | * ```rust 433 | connections_info.push(ConnectionInfo { 434 | target: nodes_info[target].clone(), 435 | fragments: fragments.iter().cloned().unique().collect::<Vec<_>>(), 436 | create_request_channel: &executor_node_info.id == local_id 437 | || target == local_id, 438 | }); 439 | ``` 440 | * 把connections_info放入init_nodes_channel_packets 441 | * ```rust 442 | init_nodes_channel_packets.push(InitNodesChannelPacket::create( 443 | self.ctx.get_id(), // query id 444 | executor_node_info.clone(), // source节点信息 445 | connections_info, // target节点信息 446 | )); 447 | ``` 448 | * 返回Ok(init_nodes_channel_packets) 449 | 450 | ###### fn fragments_connections() 451 | 452 | 返回 Map\<String, Map\<String, Vec\<usize\>\>\> 453 | 454 | We can exchange data on one connection, so let's plan how to use the least connections to complete the query. 
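Before walking through the steps, here is a small self-contained sketch of the bookkeeping this function does (illustrative only: `plan_connections` and its tuple input are invented for the example, they are not Databend APIs). All `(source executor, destination executor, fragment id)` edges are folded into one nested map keyed by the source node, so a pair of nodes needs only a single connection no matter how many fragments exchange data between them:

```rust
use std::collections::HashMap;

// Each edge: (source executor, destination executor, fragment id).
// The result groups fragment ids per (source -> target) pair, so one network
// connection per node pair can carry the data of all fragments between them.
fn plan_connections(edges: &[(&str, &str, usize)]) -> HashMap<String, HashMap<String, Vec<usize>>> {
    let mut source_target_fragments: HashMap<String, HashMap<String, Vec<usize>>> = HashMap::new();
    for (source, destination, fragment_id) in edges {
        if source == destination {
            continue; // data stays on the same node, no connection needed
        }
        source_target_fragments
            .entry(source.to_string())
            .or_default()
            .entry(destination.to_string())
            .or_default()
            .push(*fragment_id);
    }
    source_target_fragments
}

fn main() {
    // Fragment 1 shuffles between node1 and node2; fragment 2 merges into node1.
    let edges = [
        ("node1", "node2", 1),
        ("node2", "node1", 1),
        ("node2", "node1", 2),
        ("node1", "node1", 2), // same node: skipped
    ];
    println!("{:#?}", plan_connections(&edges));
}
```

The nested-map shape is the same as the `source_target_fragments` map built in the steps below.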
455 | 456 | 过程如下: 457 | 458 | * 初始化返回结果`let mut source_target_fragments = HashMap::<String, HashMap<String, Vec<usize>>>::new();` 459 | * 遍历self.fragments_actions,只处理包含DataExchange的QueryFragmentActions; 460 | * 生成fragment_id:获取当前QueryFragmentActions的fragment_id; 461 | * 生成destinations:获取下游的计算节点id,`exchange.get_destinations()`,这些id是在Fragmenter的`get_exchange()`函数中设置的; 462 | * 遍历当前QueryFragmentActions包含的每个QueryFragmentAction; 463 | * 获取source:当前Action所属的节点; 464 | * 遍历上述destinations, 465 | * 如果source与destination相同,则跳过; 466 | * 否则,把`(source, (destination, fragment_id))`放入source_target_fragments。 467 | 468 | ###### fn get_query_fragments_plan_packets() 469 | 470 | 返回:`(QueryFragmentsPlanPacket, Vec<QueryFragmentsPlanPacket>)` 471 | 472 | 步骤如下: 473 | 474 | * 获取nodes_info:集群节点信息`let nodes_info = Self::nodes_info(&self.ctx);` 475 | * 获取fragments_packets:每个节点的FragmentPlanPacket,`let mut fragments_packets = self.get_executors_fragments();` 476 | * 初始化query_fragments_plan_packets:`let mut query_fragments_plan_packets = Vec::with_capacity(fragments_packets.len());` 477 | * 生成local_query_fragments_plan_packet:本地节点的QueryFragmentsPlanPacket 478 | * ```rust 479 | let local_query_fragments_plan_packet = QueryFragmentsPlanPacket::create( 480 | self.ctx.get_id(), 481 | cluster.local_id.clone(), 482 | fragments_packets.remove(&cluster.local_id).unwrap(), // 并移除 483 | nodes_info.clone(), 484 | cluster.local_id(), 485 | ); 486 | ``` 487 | * 遍历剩余的fragments_packets,并为每个executor创建QueryFragmentsPlanPacket 488 | * ```rust 489 | for (executor, fragments) in fragments_packets.into_iter() { 490 | let query_id = self.ctx.get_id(); 491 | let executors_info = nodes_info.clone(); 492 | 493 | query_fragments_plan_packets.push(QueryFragmentsPlanPacket::create( 494 | query_id, 495 | executor, 496 | fragments, 497 | executors_info, 498 | cluster.local_id(), 499 | )); 500 | } 501 | ``` 502 | * 返回 503 | * ```rust 504 | Ok(( 505 | local_query_fragments_plan_packet, 506 | query_fragments_plan_packets, 507 | )) 508 | ``` 509 | 510 | ###### fn get_execute_partial_query_packets() 511 | 512 | 返回:`Vec<ExecutePartialQueryPacket>` 513 | 514 | * 获取nodes_info:集群节点信息`let nodes_info = Self::nodes_info(&self.ctx);` 515 | * 初始化execute_partial_query_packets:`let mut execute_partial_query_packets = Vec::with_capacity(nodes_info.len());` 516 | * 遍历上述nodes_info,并创建每个node的Packet 517 | * ```rust 518 | for node_id in nodes_info.keys() { 519 | execute_partial_query_packets.push(ExecutePartialQueryPacket::create( 520 | self.ctx.get_id(), 521 | node_id.to_owned(), 522 | nodes_info.clone(), 523 | )); 524 | } 525 | ``` 526 | * 返回`Ok(execute_partial_query_packets)` 527 | 528 | ##### QueryCoordinator 529 | 530 | ![image.png](./assets/1673421080755-image.png) 531 | 532 | ##### InitNodesChannelPacket 533 | 534 | 用于初始化查询节点间的Channel 535 | 536 | ```rust 537 | // file: src/query/service/src/api/rpc/packets/packet_publisher.rs 538 | pub struct InitNodesChannelPacket { 539 | pub query_id: String, 540 | pub executor: Arc<NodeInfo>, 541 | pub connections_info: Vec<ConnectionInfo>, 542 | } 543 | 544 | pub struct ConnectionInfo { 545 | pub target: Arc<NodeInfo>, 546 | pub fragments: Vec<usize>, 547 | pub create_request_channel: bool, 548 | } 549 | ``` 550 | 551 | ###### fn commit() 552 | 553 | ```rust 554 | impl Packet for InitNodesChannelPacket { 555 | async fn commit(&self, config: &Config, timeout: u64) -> Result<()> { 556 | let executor_info = &self.executor; 557 | let mut conn = create_client(config, &executor_info.flight_address).await?; // 创建FlightClient,详见第3节 558 | let action = FlightAction::InitNodesChannel(InitNodesChannel { 559 | init_nodes_channel_packet: self.clone(), 560 | }); 561 | 
conn.execute_action(action, timeout).await // 调用FlightClient的execute_action()发送InitNodes请求⭐️ 562 | } 563 | } 564 | ``` 565 | 566 | ### 2 分布式查询的执行 567 | 568 | ### 3 节点间的RPC通信 569 | 570 | #### 创建Connection 571 | 572 | ##### fn create_client() 573 | 574 | ```rust 575 | // file: src/query/service/src/api/rpc/packets/packet.rs 576 | pub async fn create_client(config: &Config, address: &str) -> Result { 577 | match config.tls_query_cli_enabled() { 578 | true => Ok(FlightClient::new(FlightServiceClient::new( 579 | ConnectionFactory::create_rpc_channel( 580 | address.to_owned(), 581 | None, 582 | Some(config.query.to_rpc_client_tls_config()), 583 | ) 584 | .await?, 585 | ))), 586 | false => Ok(FlightClient::new(FlightServiceClient::new( 587 | ConnectionFactory::create_rpc_channel(address.to_owned(), None, None).await?, 588 | ))), 589 | } 590 | } 591 | ``` 592 | 593 | ##### ConnectionFactory 594 | 595 | ![image.png](./assets/1673429218678-image.png) 596 | 597 | ##### FlightClient 598 | 599 | 与DatabendQueryFlightService通信 600 | 601 | ```rust 602 | // file: src/query/service/src/api/rpc/flight_client.rs 603 | pub struct FlightClient { 604 | inner: FlightServiceClient, 605 | } 606 | 607 | ``` 608 | 609 | ##### DatabendQueryFlightService 610 | 611 | ```rust 612 | async fn do_action(&self, request: Request) -> Response { 613 | common_tracing::extract_remote_span_as_parent(&request); 614 | 615 | let action = request.into_inner(); 616 | let flight_action: FlightAction = action.try_into()?; 617 | 618 | let action_result = match &flight_action { 619 | FlightAction::InitQueryFragmentsPlan(init_query_fragments_plan) => { 620 | let session = SessionManager::instance() 621 | .create_session(SessionType::FlightRPC) 622 | .await?; 623 | let ctx = session.create_query_context().await?; 624 | DataExchangeManager::instance() 625 | .init_query_fragments_plan(&ctx, &init_query_fragments_plan.executor_packet)?; 626 | 627 | FlightResult { body: vec![] } 628 | } 629 | FlightAction::InitNodesChannel(init_nodes_channel) => { 630 | let publisher_packet = &init_nodes_channel.init_nodes_channel_packet; 631 | DataExchangeManager::instance() 632 | .init_nodes_channel(publisher_packet) 633 | .await?; 634 | 635 | FlightResult { body: vec![] } 636 | } 637 | FlightAction::ExecutePartialQuery(query_id) => { 638 | DataExchangeManager::instance().execute_partial_query(query_id)?; 639 | 640 | FlightResult { body: vec![] } 641 | } 642 | }; 643 | ``` 644 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/3_distributed_query.md: -------------------------------------------------------------------------------- 1 | * [三 分布式查询](#三-分布式查询) 2 | * [1 执行入口](#1-执行入口) 3 | * [SelectInterpreter](#selectinterpreter) 4 | * [2 创建分布式逻辑计划](#2-创建分布式逻辑计划) 5 | * [Optimizers](#optimizers) 6 | * [ScattersOptimizer](#scattersoptimizer) 7 | * [3 分布式调度与执行](#3-分布式调度与执行) 8 | * [fn schedule_query](#fn-schedule_query) 9 | * [调度](#调度) 10 | * [PlanScheduler](#planscheduler) 11 | * [Tasks](#tasks) 12 | * [PipelineBuilder](#pipelinebuilder) 13 | * [执行](#执行) 14 | * [RemoteTransform](#remotetransform) 15 | * [FlightAction](#flightaction) 16 | * [FlightTicket](#flightticket) 17 | * [FlightClient](#flightclient) 18 | * [DatabendQueryFlightService](#databendqueryflightservice) 19 | 20 | ## 三 分布式查询 21 | 22 | ### 1 执行入口 23 | 24 | ##### SelectInterpreter 25 | 26 | ```rust 27 | func: execute(...) 
-> Result 28 | // 执行查询 29 | async fn execute( 30 | &self, 31 | _input_stream: Option, 32 | ) -> Result { 33 | let settings = self.ctx.get_settings(); 34 | 35 | if settings.get_enable_new_processor_framework()? != 0 && self.ctx.get_cluster().is_empty() 36 | { 37 | ... 38 | } 39 | let optimized_plan = self.rewrite_plan()?; // 1. 这里会生成分布式查询计划 40 | plan_schedulers::schedule_query(&self.ctx, &optimized_plan).await // 2. 分布式调度 41 | } 42 | // func: rewrite_plan() -> Result 43 | // 优化逻辑查询计划,通过ScattersOptimizer生成分布式逻辑查询计划 44 | fn rewrite_plan(&self) -> Result { 45 | plan_schedulers::apply_plan_rewrite( 46 | Optimizers::create(self.ctx.clone()), 47 | &self.select.input, 48 | ) 49 | } 50 | ``` 51 | 52 | ### 2 创建分布式逻辑计划 53 | 54 | ##### Optimizers 55 | 56 | ```rust 57 | // file: query/src/optimizers/optimizer.rs 58 | pub struct Optimizers { 59 | inner: Vec>, 60 | } 61 | // func: create(...) 62 | // 创建优化规则集合 63 | pub fn create(ctx: Arc) -> Self { 64 | let mut optimizers = Self::without_scatters(ctx.clone()); 65 | optimizers 66 | .inner 67 | .push(Box::new(ScattersOptimizer::create(ctx))); // 产生分布式计划的优化规则 68 | optimizers 69 | } 70 | // func: without_scatters(...) 71 | pub fn without_scatters(ctx: Arc) -> Self { 72 | Optimizers { 73 | inner: vec![ 74 | Box::new(ConstantFoldingOptimizer::create(ctx.clone())), 75 | Box::new(ExprTransformOptimizer::create(ctx.clone())), 76 | Box::new(TopNPushDownOptimizer::create(ctx.clone())), 77 | Box::new(StatisticsExactOptimizer::create(ctx)), 78 | ], 79 | } 80 | } 81 | ``` 82 | 83 | ##### ScattersOptimizer 84 | 85 | ```rust 86 | // file: query/src/optimizers/optimizer_scatters.rs 87 | pub struct ScattersOptimizer { 88 | ctx: Arc, 89 | } 90 | // func: optimize(...) -> Result 91 | // 优化逻辑 92 | fn optimize(&mut self, plan: &PlanNode) -> Result { 93 | if self.ctx.get_cluster().is_empty() { 94 | // Standalone mode. 95 | return Ok(plan.clone()); 96 | } 97 | 98 | let mut optimizer_impl = ScattersOptimizerImpl::create(self.ctx.clone()); 99 | let rewrite_plan = optimizer_impl.rewrite_plan_node(plan)?; 100 | 101 | // We need to converge at the end 102 | match optimizer_impl.running_mode { 103 | RunningMode::Standalone => Ok(rewrite_plan), 104 | RunningMode::Cluster => Ok(PlanNode::Stage(StagePlan { // 创建StagePlan 105 | kind: StageKind::Convergent, 106 | scatters_expr: Expression::create_literal(DataValue::UInt64(0)), 107 | input: Arc::new(rewrite_plan), 108 | })), 109 | } 110 | } 111 | 112 | enum RunningMode { 113 | Standalone, 114 | Cluster, 115 | } 116 | 117 | struct ScattersOptimizerImpl { 118 | ctx: Arc, 119 | running_mode: RunningMode, // 表示当前重写的PlanNode应该在单节点或者分布式运行 120 | before_group_by_schema: Option, 121 | 122 | // temporary node 123 | input: Option>, 124 | } 125 | // func: create(...) 
-> ScattersOptimizerImpl 126 | // 创建ScattersOptimizerImpl 127 | pub fn create(ctx: Arc) -> ScattersOptimizerImpl { 128 | ScattersOptimizerImpl { 129 | ctx, 130 | running_mode: RunningMode::Standalone, // 初始化为Standalone 131 | before_group_by_schema: None, 132 | input: None, 133 | } 134 | } 135 | // impl trait: 136 | // 这里的逻辑比较简单,之间看源码即可,这里只列出函数定义 137 | impl PlanRewriter for ScattersOptimizerImpl { 138 | fn rewrite_subquery_plan(&mut self, subquery_plan: &PlanNode) -> Result { 139 | let subquery_ctx = QueryContext::create_from(self.ctx.clone()); 140 | let mut subquery_optimizer = ScattersOptimizerImpl::create(subquery_ctx); 141 | let rewritten_subquery = subquery_optimizer.rewrite_plan_node(subquery_plan)?; 142 | 143 | match (&self.running_mode, &subquery_optimizer.running_mode) { 144 | (RunningMode::Standalone, RunningMode::Standalone) => Ok(rewritten_subquery), 145 | // 填加PlanNode::Stage 146 | (RunningMode::Standalone, RunningMode::Cluster) => { 147 | Ok(Self::convergent_shuffle_stage(rewritten_subquery)?) 148 | } 149 | // 填加PlanNode::Broadcast 150 | (RunningMode::Cluster, RunningMode::Standalone) => { 151 | Ok(PlanNode::Broadcast(BroadcastPlan { 152 | input: Arc::new(rewritten_subquery), 153 | })) 154 | } 155 | // 填加PlanNode::Broadcast 156 | (RunningMode::Cluster, RunningMode::Cluster) => { 157 | Ok(PlanNode::Broadcast(BroadcastPlan { 158 | input: Arc::new(rewritten_subquery), 159 | })) 160 | } 161 | } 162 | } 163 | fn rewrite_aggregate_partial(&mut self, plan: &AggregatorPartialPlan) -> Result; 164 | fn rewrite_aggregate_final(&mut self, plan: &AggregatorFinalPlan) -> Result; 165 | fn rewrite_sort(&mut self, plan: &SortPlan) -> Result; 166 | fn rewrite_limit(&mut self, plan: &LimitPlan) -> Result; 167 | fn rewrite_limit_by(&mut self, plan: &LimitByPlan) -> Result; 168 | fn rewrite_read_data_source(&mut self, plan: &ReadDataSourcePlan) -> Result; 169 | } 170 | // 会创建PlanNode::Stage的函数 171 | // func: convergent_shuffle_stage_builder(...) -> PlanBuilder 172 | // 创建收敛于一个节点的StagePlan,cluster_sort()、cluster_limit()、cluster_limit_by()都会调用此函数,即 n -> 1 173 | fn convergent_shuffle_stage_builder(input: Arc) -> PlanBuilder { 174 | PlanBuilder::from(&PlanNode::Stage(StagePlan { 175 | kind: StageKind::Convergent, 176 | scatters_expr: Expression::create_literal(DataValue::UInt64(0)), 177 | input, 178 | })) 179 | } 180 | // func: convergent_shuffle_stage_builder(...) -> Result 181 | // 同convergent_shuffle_stage_builder(),cluster_aggregate_without_key()和rewrite_subquery_plan()在(Standalone,Cluster)时会调用此函数 182 | fn convergent_shuffle_stage(input: PlanNode) -> Result { 183 | Ok(PlanNode::Stage(StagePlan { 184 | kind: StageKind::Convergent, 185 | scatters_expr: Expression::create_literal(DataValue::UInt64(0)), 186 | input: Arc::new(input), 187 | })) 188 | } 189 | // func: normal_shuffle_stage(...) 
-> Result 190 | // 创建普通shuffle stage,cluster_aggregate_with_key()会调用此函数,即 n -> m,比如Hash shuffle 191 | fn normal_shuffle_stage(key: impl Into, input: PlanNode) -> Result { 192 | let scatters_expr = Expression::ScalarFunction { 193 | op: String::from("sipHash"), 194 | args: vec![Expression::Column(key.into())], 195 | }; 196 | 197 | Ok(PlanNode::Stage(StagePlan { 198 | scatters_expr, 199 | kind: StageKind::Normal, 200 | input: Arc::new(input), 201 | })) 202 | } 203 | 204 | ``` 205 | 206 | ### 3 分布式调度与执行 207 | 208 | ##### fn schedule_query 209 | 210 | ```rust 211 | // file: query/src/interpreters/plan_schedulers/plan_scheduler_query.rs 212 | pub async fn schedule_query( 213 | ctx: &Arc, 214 | plan: &PlanNode, 215 | ) -> Result { 216 | let scheduler = PlanScheduler::try_create(ctx.clone())?; // 创建PlanScheduler 217 | let scheduled_tasks = scheduler.reschedule(plan)?; // 生成Tasks 218 | let remote_stage_actions = scheduled_tasks.get_tasks()?; // 获取每个节点和其对应的FlightAction 219 | 220 | let config = ctx.get_config(); 221 | let cluster = ctx.get_cluster(); 222 | let timeout = ctx.get_settings().get_flight_client_timeout()?; 223 | let mut scheduled = Scheduled::new(); 224 | for (node, action) in remote_stage_actions { // 执行远程节点任务 225 | let mut flight_client = cluster.create_node_conn(&node.id, &config).await?; // 创建链接 226 | let executing_action = flight_client.execute_action(action.clone(), timeout); // 在各节点执行action 227 | 228 | executing_action.await?; 229 | scheduled.insert(node.id.clone(), node.clone()); 230 | } 231 | 232 | let pipeline_builder = PipelineBuilder::create(ctx.clone()); 233 | let mut in_local_pipeline = pipeline_builder.build(&scheduled_tasks.get_local_task())?; // 创建本地节点pipeline,这里会创建RemoteTransform,从其它节点获取数据 234 | 235 | match in_local_pipeline.execute().await { // 执行本地节点任务 236 | Ok(stream) => Ok(ScheduledStream::create(ctx.clone(), scheduled, stream)), 237 | Err(error) => { 238 | plan_schedulers::handle_error(ctx, scheduled, timeout).await; 239 | Err(error) 240 | } 241 | } 242 | } 243 | ``` 244 | 245 | #### 调度 246 | 247 | ##### PlanScheduler 248 | 249 | ```rust 250 | // file: query/src/interpreters/plan_schedulers/plan_scheduler.rs 251 | pub struct PlanScheduler { 252 | stage_id: String, 253 | cluster_nodes: Vec, // 集群节点名称 254 | 255 | local_pos: usize, // 本地节点名称的序号 256 | nodes_plan: Vec, // 记录每个节点临时的PlanNode,在Visit的时候对其进行update,并生成Task。 257 | running_mode: RunningMode, // 执行模式,单机或集群 258 | query_context: Arc, 259 | subqueries_expressions: Vec, 260 | } 261 | // func: reschedule(...) -> Result 262 | // 根据PlanNode生成Tasks 263 | pub fn reschedule(mut self, plan: &PlanNode) -> Result { 264 | let context = self.query_context.clone(); 265 | let cluster = context.get_cluster(); 266 | let mut tasks = Tasks::create(context); // 初始化Tasks 267 | 268 | match cluster.is_empty() { 269 | true => tasks.finalize(plan), // 无其它节点 270 | false => { 271 | self.visit_plan_node(plan, &mut tasks)?; 272 | tasks.finalize(&self.nodes_plan[self.local_pos]) 273 | } 274 | } 275 | } 276 | // func: visit_plan_node(...) 
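// note: visit_plan_node() is a depth-first dispatcher: each PlanNode variant is forwarded
// to its visit_* method, children are visited first, and only the Stage and Broadcast
// variants cut the plan into per-node tasks by emitting FlightActions
// (see visit_stage() / visit_broadcast() below).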
277 | fn visit_plan_node(&mut self, node: &PlanNode, tasks: &mut Tasks) -> Result<()> { 278 | match node { 279 | PlanNode::AggregatorPartial(plan) => self.visit_aggr_part(plan, tasks), 280 | PlanNode::AggregatorFinal(plan) => self.visit_aggr_final(plan, tasks), 281 | PlanNode::Empty(plan) => self.visit_empty(plan, tasks), 282 | PlanNode::Projection(plan) => self.visit_projection(plan, tasks), 283 | PlanNode::Filter(plan) => self.visit_filter(plan, tasks), 284 | PlanNode::Sort(plan) => self.visit_sort(plan, tasks), 285 | PlanNode::Limit(plan) => self.visit_limit(plan, tasks), 286 | PlanNode::LimitBy(plan) => self.visit_limit_by(plan, tasks), 287 | PlanNode::ReadSource(plan) => self.visit_data_source(plan, tasks), 288 | PlanNode::Sink(plan) => self.visit_sink(plan, tasks), 289 | PlanNode::Select(plan) => self.visit_select(plan, tasks), 290 | PlanNode::Stage(plan) => self.visit_stage(plan, tasks), // 这里会创建FlightAction 291 | PlanNode::Broadcast(plan) => self.visit_broadcast(plan, tasks), // 这里会创建FlightAction 292 | PlanNode::Having(plan) => self.visit_having(plan, tasks), 293 | PlanNode::Expression(plan) => self.visit_expression(plan, tasks), 294 | PlanNode::SubQueryExpression(plan) => self.visit_subqueries_set(plan, tasks), 295 | _ => Err(ErrorCode::UnImplement("")), 296 | } 297 | } 298 | // func: visit_limit(...) 299 | fn visit_limit(&mut self, plan: &LimitPlan, tasks: &mut Tasks) -> Result<()> { 300 | self.visit_plan_node(plan.input.as_ref(), tasks)?; // 先遍历孩子节点 301 | match self.running_mode { 302 | RunningMode::Cluster => self.visit_cluster_limit(plan), // 集群模式 303 | RunningMode::Standalone => self.visit_local_limit(plan), // 单机模式 304 | }; 305 | Ok(()) 306 | } 307 | // func: visit_local_limit(...) 308 | fn visit_local_limit(&mut self, plan: &LimitPlan) { 309 | self.nodes_plan[self.local_pos] = PlanNode::Limit(LimitPlan { 310 | n: plan.n, 311 | offset: plan.offset, 312 | input: Arc::new(self.nodes_plan[self.local_pos].clone()), // 只处理当前节点的PlanNode 313 | }); // 更新节点的PlanNode 314 | } 315 | // func: visit_cluster_limit(...) 316 | fn visit_cluster_limit(&mut self, plan: &LimitPlan) { 317 | for index in 0..self.nodes_plan.len() { // 处理所有节点的PlanNode 318 | self.nodes_plan[index] = PlanNode::Limit(LimitPlan { 319 | n: plan.n, 320 | offset: plan.offset, 321 | input: Arc::new(self.nodes_plan[index].clone()), 322 | }); // 更新节点的PlanNode 323 | } 324 | } 325 | // func: visit_stage(...) 326 | // 处理StagePlan 327 | fn visit_stage(&mut self, stage: &StagePlan, tasks: &mut Tasks) -> Result<()> { 328 | self.visit_plan_node(stage.input.as_ref(), tasks)?; 329 | 330 | // Entering new stage 331 | self.stage_id = uuid::Uuid::new_v4().to_string(); 332 | 333 | match stage.kind { 334 | StageKind::Normal => self.schedule_normal_tasks(stage, tasks), 335 | StageKind::Expansive => self.schedule_expansive_tasks(stage, tasks), 336 | StageKind::Convergent => self.schedule_converge_tasks(stage, tasks), 337 | } 338 | } 339 | // func: schedule_normal_tasks(...) 
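// note: for a Normal (n -> m) stage every node gets a PrepareShuffleAction, and its entry
// in nodes_plan is replaced by a PlanNode::Remote that reads the shuffled streams back,
// so the parent operator consumes remote data on every node.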
340 | fn schedule_normal_tasks(&mut self, stage: &StagePlan, tasks: &mut Tasks) -> Result<()> { 341 | if let RunningMode::Standalone = self.running_mode { // 必须是集群 342 | return Err(ErrorCode::LogicalError( 343 | "Normal stage cannot work on standalone mode", 344 | )); 345 | } 346 | 347 | for index in 0..self.nodes_plan.len() { // 遍历所有节点 348 | let node_name = &self.cluster_nodes[index]; // 获取节点名 349 | let shuffle_action = self.normal_action(stage, &self.nodes_plan[index]); // 创建ShuffleAction 350 | let remote_plan_node = self.normal_remote_plan(node_name, &shuffle_action); // 创建RemotePlan 351 | let shuffle_flight_action = FlightAction::PrepareShuffleAction(shuffle_action); // 创建FlightAction 352 | 353 | tasks.add_task(node_name, shuffle_flight_action); // 加入tasks,启动任务时会用 354 | self.nodes_plan[index] = PlanNode::Remote(remote_plan_node); // 更新节点的PlanNode 355 | } 356 | 357 | Ok(()) 358 | } 359 | // func: schedule_expansive_tasks() 360 | fn schedule_expansive_tasks(&mut self, stage: &StagePlan, tasks: &mut Tasks) -> Result<()> { 361 | if let RunningMode::Cluster = self.running_mode { // 必须是单节点 362 | return Err(ErrorCode::LogicalError( 363 | "Expansive stage cannot work on Cluster mode", 364 | )); 365 | } 366 | 367 | self.running_mode = RunningMode::Cluster; 368 | let node_name = &self.cluster_nodes[self.local_pos]; // 本地节点名称 369 | let shuffle_action = self.expansive_action(stage, &self.nodes_plan[self.local_pos]); // 创建ShuffleAction 370 | tasks.add_task( 371 | node_name, 372 | FlightAction::PrepareShuffleAction(shuffle_action.clone()), 373 | ); // 加入tasks,启动任务时会用 374 | 375 | for index in 0..self.nodes_plan.len() { 376 | let node_name = &self.cluster_nodes[index]; 377 | self.nodes_plan[index] = self.expansive_remote_plan(node_name, &shuffle_action); // 更新节点的PlanNode 378 | } 379 | 380 | Ok(()) 381 | } 382 | // func: schedule_converge_tasks(...) 383 | fn schedule_converge_tasks(&mut self, stage: &StagePlan, tasks: &mut Tasks) -> Result<()> { 384 | if let RunningMode::Standalone = self.running_mode { // 必须是集群 385 | return Err(ErrorCode::LogicalError( 386 | "Converge stage cannot work on standalone mode", 387 | )); 388 | } 389 | 390 | for index in 0..self.nodes_plan.len() { 391 | let node_name = &self.cluster_nodes[index]; // 节点名 392 | let shuffle_action = self.converge_action(stage, &self.nodes_plan[index]); // 创建ShuffleAction 393 | let shuffle_flight_action = FlightAction::PrepareShuffleAction(shuffle_action); // 创建FlightAction 394 | 395 | tasks.add_task(node_name, shuffle_flight_action); // 加入tasks,启动任务时会用 396 | } 397 | 398 | self.running_mode = RunningMode::Standalone; 399 | let node_name = &self.cluster_nodes[self.local_pos]; 400 | let remote_plan_node = self.converge_remote_plan(node_name, stage); 401 | self.nodes_plan[self.local_pos] = PlanNode::Remote(remote_plan_node); // 更新节点的PlanNode 402 | 403 | Ok(()) 404 | } 405 | // func: visit_broadcast(...) 406 | // 处理BroadcastPlan 407 | fn visit_broadcast(&mut self, plan: &BroadcastPlan, tasks: &mut Tasks) -> Result<()> { 408 | self.visit_plan_node(plan.input.as_ref(), tasks)?; 409 | 410 | // Entering new stage 411 | self.stage_id = uuid::Uuid::new_v4().to_string(); 412 | 413 | match self.running_mode { 414 | RunningMode::Cluster => self.visit_cluster_broadcast(tasks), 415 | RunningMode::Standalone => self.visit_local_broadcast(tasks), 416 | }; 417 | 418 | Ok(()) 419 | } 420 | // func: visit_cluster_broadcast(...) 
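// note: in cluster mode every node both publishes its own data via a BroadcastAction and
// is rewritten to a PlanNode::Remote that reads the broadcast data back.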
421 | fn visit_cluster_broadcast(&mut self, tasks: &mut Tasks) { 422 | self.running_mode = RunningMode::Cluster; 423 | for index in 0..self.nodes_plan.len() { 424 | let node_name = &self.cluster_nodes[index]; 425 | let action = self.broadcast_action(&self.nodes_plan[index]); 426 | let remote_plan_node = self.broadcast_remote(node_name, &action); 427 | 428 | tasks.add_task(node_name, FlightAction::BroadcastAction(action)); 429 | self.nodes_plan[index] = PlanNode::Remote(remote_plan_node); 430 | } 431 | } 432 | ``` 433 | 434 | ##### Tasks 435 | 436 | ```rust 437 | // file: query/src/interpreters/plan_schedulers/plan_scheduler.rs 438 | pub struct Tasks { 439 | plan: PlanNode, 440 | context: Arc, 441 | actions: HashMap>, // 一个节点可能有多个FlightAction 442 | } 443 | // func: finalize(...) 444 | // 设置本地节点PlanNode 445 | pub fn finalize(mut self, plan: &PlanNode) -> Result { 446 | self.plan = plan.clone(); 447 | Ok(self) 448 | } 449 | // func: get_tasks() -> Result, FlightAction)>> 450 | // 获取集群各节点对应的FlightAction 451 | pub fn get_tasks(&self) -> Result, FlightAction)>> { 452 | let cluster = self.context.get_cluster(); 453 | 454 | let mut tasks = Vec::new(); 455 | for cluster_node in &cluster.get_nodes() { 456 | if let Some(actions) = self.actions.get(&cluster_node.id) { 457 | for action in actions { 458 | tasks.push((cluster_node.clone(), action.clone())); 459 | } 460 | } 461 | } 462 | 463 | Ok(tasks) 464 | } 465 | // func: add_task(...) ⭐️ 466 | // 加入节点对应的FlightAction,会被visit_stage()、visit_local_broadcast()、visit_cluster_broadcast()函数调用 467 | #[allow(clippy::ptr_arg)] 468 | pub fn add_task(&mut self, node_name: &String, action: FlightAction) { 469 | match self.actions.entry(node_name.to_string()) { 470 | Entry::Occupied(mut entry) => entry.get_mut().push_back(action), // 加入已有的队列中 471 | Entry::Vacant(entry) => { 472 | let mut node_tasks = VecDeque::new(); // 创建新的队列 473 | node_tasks.push_back(action); 474 | entry.insert(node_tasks); 475 | } 476 | }; 477 | } 478 | ``` 479 | 480 | ##### PipelineBuilder 481 | 482 | ```rust 483 | // file: query/src/pipelines/processors/pipeline_builder.rs 484 | // func: build(...) -> Result 485 | pub fn build(mut self, node: &PlanNode) -> Result { 486 | let pipeline = self.visit(node)?; 487 | Ok(pipeline) 488 | } 489 | // func: visit(...) 490 | fn visit(&mut self, node: &PlanNode) -> Result { 491 | match node { 492 | ... 493 | PlanNode::Remote(node) => self.visit_remote(node), 494 | ... 495 | } 496 | } 497 | // func: visit_remote(...) 
-> Result 498 | fn visit_remote(&self, plan: &RemotePlan) -> Result { 499 | let mut pipeline = Pipeline::create(self.ctx.clone()); 500 | 501 | for fetch_node in &plan.fetch_nodes { // 遍历其它节点 502 | let flight_ticket = 503 | FlightTicket::stream(&plan.query_id, &plan.stage_id, &plan.stream_id); // 创建FlightTicket,用于创建Request 504 | 505 | pipeline.add_source(Arc::new(RemoteTransform::try_create( // 创建RemoteTransform,并作为Source加入pipeline 506 | flight_ticket, 507 | self.ctx.clone(), 508 | /* fetch_node_name */ fetch_node.clone(), 509 | /* fetch_stream_schema */ plan.schema.clone(), 510 | )?))?; 511 | } 512 | 513 | Ok(pipeline) 514 | } 515 | ``` 516 | 517 | #### 执行 518 | 519 | ##### RemoteTransform 520 | 521 | ```rust 522 | // file: query/src/pipelines/transforms/transform_remote.rs 523 | pub struct RemoteTransform { 524 | ticket: FlightTicket, 525 | fetch_node_name: String, 526 | schema: DataSchemaRef, 527 | pub ctx: Arc, 528 | } 529 | // func: execute() -> Result 530 | // 执行 531 | async fn execute(&self) -> Result { 532 | tracing::debug!( 533 | "execute, flight_ticket {:?}, node name:{:#}...", 534 | self.ticket, 535 | self.fetch_node_name 536 | ); 537 | 538 | let data_schema = self.schema.clone(); 539 | let timeout = self.ctx.get_settings().get_flight_client_timeout()?; 540 | 541 | let fetch_ticket = self.ticket.clone(); 542 | let mut flight_client = self.flight_client().await?; // 创建Cliet 543 | let fetch_stream = flight_client 544 | .fetch_stream(fetch_ticket, data_schema, timeout) 545 | .await?; // 从远端获取数据流 546 | Ok(Box::pin(self.ctx.try_create_abortable(fetch_stream)?)) 547 | } 548 | // func: flight_client() -> Result 549 | // 创建FlightClient 550 | async fn flight_client(&self) -> Result { 551 | let context = self.ctx.clone(); 552 | let node_name = self.fetch_node_name.clone(); 553 | 554 | let cluster = context.get_cluster(); 555 | cluster 556 | .create_node_conn(&node_name, &self.ctx.get_config()) 557 | .await 558 | } 559 | ``` 560 | 561 | ##### FlightAction 562 | 563 | ```rust 564 | // file: query/src/api/rpc/flight_actions.rs 565 | pub enum FlightAction { 566 | PrepareShuffleAction(ShuffleAction), 567 | BroadcastAction(BroadcastAction), 568 | CancelAction(CancelAction), 569 | } 570 | ``` 571 | 572 | ##### FlightTicket 573 | 574 | 记录节点和query任务信息,从远程节点获取对应的数据流。FlightClient -> (FlightTicket -> Ticket) -> DatabendQueryFlightService 575 | 576 | ```rust 577 | // file: query/src/api/rpc/flight_tickets.rs 578 | pub enum FlightTicket { 579 | StreamTicket(StreamTicket), 580 | } 581 | impl TryInto for Ticket {...} 582 | impl TryInto for FlightTicket {...} 583 | 584 | pub struct StreamTicket { 585 | pub query_id: String, 586 | pub stage_id: String, 587 | pub stream: String, 588 | } 589 | ``` 590 | 591 | ##### FlightClient 592 | 593 | 两个功能: 594 | 595 | * 启动远程节点的查询任务,execute_action(FlightAction) -> do_action(Action) 596 | * 获取远程节点的数据流,fetch_stream(FlightTicket) -> do_get(Ticket) -> Streaming\ -> SendableDataBlockStream 597 | 598 | ```rust 599 | // file: query/src/api/rpc/flight_client.rs 600 | pub struct FlightClient { 601 | inner: FlightServiceClient, 602 | } 603 | // func: new() -> Self 604 | // 创建FlightClient 605 | pub fn new(inner: FlightServiceClient) -> FlightClient { 606 | FlightClient { inner } 607 | } 608 | // func: execute_action() 609 | // 执行查询计划 610 | pub async fn execute_action(&mut self, action: FlightAction, timeout: u64) -> Result<()> { 611 | self.do_action(action, timeout).await?; 612 | Ok(()) 613 | } 614 | async fn do_action(&mut self, action: FlightAction, timeout: u64) -> Result> { 615 
| let action: Action = action.try_into()?; 616 | let action_type = action.r#type.clone(); 617 | let request = Request::new(action); 618 | let mut request = common_tracing::inject_span_to_tonic_request(request); 619 | request.set_timeout(Duration::from_secs(timeout)); 620 | 621 | let response = self.inner.do_action(request).await?; 622 | 623 | match response.into_inner().message().await? { 624 | Some(response) => Ok(response.body), 625 | None => Result::Err(ErrorCode::EmptyDataFromServer(format!( 626 | "Can not receive data from flight server, action: {action_type:?}", 627 | ))), 628 | } 629 | } 630 | // func: fetch_stream() -> Result 631 | // 获取数据流 632 | pub async fn fetch_stream( 633 | &mut self, 634 | ticket: FlightTicket, 635 | schema: DataSchemaRef, 636 | timeout: u64, 637 | ) -> Result { 638 | let ticket = ticket.try_into()?; 639 | let inner = self.do_get(ticket, timeout).await?; 640 | Ok(Box::pin(FlightDataStream::from_remote(schema, inner))) 641 | } 642 | async fn do_get(&mut self, ticket: Ticket, timeout: u64) -> Result> { 643 | let request = Request::new(ticket); 644 | let mut request = common_tracing::inject_span_to_tonic_request(request); 645 | request.set_timeout(Duration::from_secs(timeout)); 646 | 647 | let response = self.inner.do_get(request).await?; 648 | Ok(response.into_inner()) 649 | } 650 | ``` 651 | 652 | ##### DatabendQueryFlightService 653 | 654 | ```rust 655 | // file: query/src/api/rpc/flight_service.rs 656 | pub struct DatabendQueryFlightService { 657 | sessions: Arc, 658 | dispatcher: Arc, 659 | } 660 | ``` 661 | 662 | 详细逻辑见《9 RPC API Service》 663 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/8_storage.md: -------------------------------------------------------------------------------- 1 | * [八 Storage](#八-storage) 2 | * [Init Storeage and Table](#init-storeage-and-table) 3 | * [GlobalServices](#globalservices) 4 | * [Operator](#operator) 5 | * [CatalogManager](#catalogmanager) 6 | * [CatalogManagerHelper](#catalogmanagerhelper) 7 | * [DatabaseCatalog](#databasecatalog) 8 | * [MutableCatalog](#mutablecatalog) 9 | * [StorageFactory](#storagefactory) 10 | * [FuseTable](#fusetable) 11 | * [Create Table](#create-table) 12 | * [Binder](#binder) 13 | * [PhysicalPlanBuilder](#physicalplanbuilder) 14 | * [PipelineBuilder](#pipelinebuilder) 15 | * [QueryContext](#querycontext) 16 | * [DatabaseCatalog](#databasecatalog) 17 | * [MutableCatalog](#mutablecatalog) 18 | * [StorageFactory](#storagefactory) 19 | * [Table trait](#table-trait) 20 | * [FuseTable](#fusetable) 21 | * [FuseTableSource](#fusetablesource) 22 | * [Reference](#reference) 23 | 24 | ## 八 Storage 25 | 26 | ### Init Storeage and Table 27 | 28 | ```rust 29 | // file: src/binaries/query/main.rs 30 | GlobalServices::init(conf.clone()).await?; 31 | ``` 32 | 33 | #### GlobalServices 34 | 35 | ```rust 36 | // src/query/service/src/global_services.rs 37 | pub async fn init(config: Config) -> Result<()> { 38 | let global_services = Arc::new(GlobalServices { 39 | ... 40 | storage_operator: UnsafeCell::new(None), 41 | ... 
42 | }); 43 | // 初始化DataOperator,即Storage 44 | DataOperator::init(&config.storage, global_services.clone()).await?; 45 | // 初始化CatalogManager,即Catalog,依赖Storage 46 | CatalogManager::init(&config, global_services.clone()).await?; 47 | } 48 | 49 | impl SingletonImpl for GlobalServices { 50 | fn get(&self) -> DataOperator { 51 | unsafe { 52 | match &*self.storage_operator.get() { 53 | None => panic!("StorageOperator is not init"), 54 | Some(storage_operator) => storage_operator.clone(), 55 | } 56 | } 57 | } 58 | 59 | fn init(&self, value: DataOperator) -> Result<()> { 60 | unsafe { 61 | *(self.storage_operator.get() as *mut Option) = Some(value); 62 | Ok(()) 63 | } 64 | } 65 | } 66 | ``` 67 | 68 | #### Operator 69 | 70 | ```rust 71 | // src/common/storage/src/operator.rs 72 | 73 | // 单例 74 | static DATA_OPERATOR: OnceCell> = OnceCell::new(); 75 | 76 | // 初始化单例 77 | pub async fn init( 78 | conf: &StorageConfig, 79 | v: Singleton, 80 | ) -> common_exception::Result<()> { 81 | v.init(Self::try_create(conf).await?)?; 82 | 83 | DATA_OPERATOR.set(v).ok(); 84 | Ok(()) 85 | } 86 | 87 | // 获取单例 88 | pub fn instance() -> DataOperator { 89 | match DATA_OPERATOR.get() { 90 | None => panic!("StorageOperator is not init"), 91 | Some(storage_operator) => storage_operator.get(), 92 | } 93 | } 94 | 95 | pub async fn try_create(conf: &StorageConfig) -> common_exception::Result { 96 | Self::try_create_with_storage_params(&conf.params).await 97 | } 98 | 99 | pub async fn try_create_with_storage_params( 100 | sp: &StorageParams, 101 | ) -> common_exception::Result { 102 | let operator = init_operator(sp)?; // 初始化operator 103 | 104 | // OpenDAL will send a real request to underlying storage to check whether it works or not. 105 | // If this check failed, it's highly possible that the users have configured it wrongly. 106 | // 107 | // Make sure the check is called inside GlobalIORuntime to prevent 108 | // IO hang on reuse connection. 109 | let op = operator.clone(); 110 | if let Err(cause) = GlobalIORuntime::instance() 111 | .spawn(async move { op.check().await }) 112 | .await 113 | .expect("join must succeed") 114 | { 115 | return Err(ErrorCode::StorageUnavailable(format!( 116 | "current configured storage is not available: config: {:?}, cause: {cause}", 117 | sp 118 | ))); 119 | } 120 | 121 | Ok(DataOperator { 122 | operator, 123 | params: sp.clone(), 124 | }) 125 | } 126 | 127 | /// init_operator will init an opendal operator based on storage config. 
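/// The backend-specific operator is then wrapped with retry, metrics, logging and
/// tracing layers and routed onto the GlobalIORuntime before being handed out.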
128 | pub fn init_operator(cfg: &StorageParams) -> Result { 129 | let op = match &cfg { 130 | StorageParams::Azblob(cfg) => init_azblob_operator(cfg)?, 131 | StorageParams::Fs(cfg) => init_fs_operator(cfg)?, 132 | StorageParams::Ftp(cfg) => init_ftp_operator(cfg)?, 133 | StorageParams::Gcs(cfg) => init_gcs_operator(cfg)?, 134 | #[cfg(feature = "storage-hdfs")] 135 | StorageParams::Hdfs(cfg) => init_hdfs_operator(cfg)?, 136 | StorageParams::Http(cfg) => init_http_operator(cfg)?, 137 | StorageParams::Ipfs(cfg) => init_ipfs_operator(cfg)?, 138 | StorageParams::Memory => init_memory_operator()?, 139 | StorageParams::Moka(cfg) => init_moka_operator(cfg)?, 140 | StorageParams::Obs(cfg) => init_obs_operator(cfg)?, 141 | StorageParams::S3(cfg) => init_s3_operator(cfg)?, // 初始化s3 142 | StorageParams::Oss(cfg) => init_oss_operator(cfg)?, 143 | }; 144 | 145 | let op = op 146 | // Add retry 147 | .layer(RetryLayer::new(ExponentialBackoff::default().with_jitter())) 148 | // Add metrics 149 | .layer(MetricsLayer) 150 | // Add logging 151 | .layer(LoggingLayer) 152 | // Add tracing 153 | .layer(TracingLayer) 154 | // NOTE 155 | // 156 | // Magic happens here. We will add a layer upon original 157 | // storage operator so that all underlying storage operations 158 | // will send to storage runtime. 159 | .layer(RuntimeLayer::new(GlobalIORuntime::instance().inner())); 160 | 161 | Ok(op) 162 | } 163 | 164 | /// init_s3_operator will init a opendal s3 operator with input s3 config. 165 | fn init_s3_operator(cfg: &StorageS3Config) -> Result { 166 | let mut builder = s3::Builder::default(); 167 | 168 | // Endpoint. 169 | builder.endpoint(&cfg.endpoint_url); 170 | 171 | // Region 172 | builder.region(&cfg.region); 173 | 174 | // Credential. 175 | builder.access_key_id(&cfg.access_key_id); 176 | builder.secret_access_key(&cfg.secret_access_key); 177 | builder.security_token(&cfg.security_token); 178 | builder.role_arn(&cfg.role_arn); 179 | builder.external_id(&cfg.external_id); 180 | 181 | // Bucket. 182 | builder.bucket(&cfg.bucket); 183 | 184 | // Root. 185 | builder.root(&cfg.root); 186 | 187 | // Disable credential loader 188 | if cfg.disable_credential_loader { 189 | builder.disable_credential_loader(); 190 | } 191 | 192 | // Enable virtual host style 193 | if cfg.enable_virtual_host_style { 194 | builder.enable_virtual_host_style(); 195 | } 196 | 197 | Ok(Operator::new(builder.build()?)) 198 | } 199 | ``` 200 | 201 | #### CatalogManager 202 | 203 | ```rust 204 | // file: src/query/catalog/src/catalog.rs 205 | static CATALOG_MANAGER: OnceCell>> = OnceCell::new(); // 全局单例 206 | 207 | pub struct CatalogManager { 208 | pub catalogs: HashMap>, 209 | } 210 | 211 | // file: src/query/service/src/catalogs/catalog_manager.rs 212 | #[async_trait::async_trait] 213 | impl CatalogManagerHelper for CatalogManager { 214 | async fn init(conf: &Config, v: Singleton>) -> Result<()> { 215 | v.init(Self::try_create(conf).await?)?; 216 | CatalogManager::set_instance(v); 217 | Ok(()) 218 | } 219 | 220 | async fn try_create(conf: &Config) -> Result> { 221 | let mut catalog_manager = CatalogManager { 222 | catalogs: HashMap::new(), 223 | }; 224 | 225 | catalog_manager.register_build_in_catalogs(conf).await?; 226 | 227 | ... 
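// note: the built-in "default" catalog registered above is a DatabaseCatalog, which
// layers a read-only ImmutableCatalog on top of a writable MutableCatalog (see below).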
228 | 229 | Ok(Arc::new(catalog_manager)) 230 | } 231 | 232 | async fn register_build_in_catalogs(&mut self, conf: &Config) -> Result<()> { 233 | let default_catalog: Arc = 234 | Arc::new(DatabaseCatalog::try_create_with_config(conf.clone()).await?); // 创建DatabaseCatalog 235 | self.catalogs 236 | .insert(CATALOG_DEFAULT.to_owned(), default_catalog); 237 | Ok(()) 238 | } 239 | } 240 | ``` 241 | 242 | #### CatalogManagerHelper 243 | 244 | ```rust 245 | // file: src/query/service/src/catalogs/catalog_manager.rs 246 | pub trait CatalogManagerHelper { 247 | async fn init(conf: &Config, v: Singleton>) -> Result<()>; 248 | 249 | async fn try_create(conf: &Config) -> Result>; 250 | 251 | async fn register_build_in_catalogs(&mut self, conf: &Config) -> Result<()>; 252 | 253 | #[cfg(feature = "hive")] 254 | fn register_external_catalogs(&mut self, conf: &Config) -> Result<()>; 255 | } 256 | ``` 257 | 258 | #### DatabaseCatalog 259 | 260 | ```rust 261 | // file: src/query/service/src/catalogs/default/database_catalog.rs 262 | pub struct DatabaseCatalog { 263 | /// the upper layer, read only 264 | immutable_catalog: Arc, 265 | /// bottom layer, writing goes here 266 | mutable_catalog: Arc, 267 | /// table function engine factories 268 | table_function_factory: Arc, 269 | } 270 | 271 | pub async fn try_create_with_config(conf: Config) -> Result { 272 | let immutable_catalog = ImmutableCatalog::try_create_with_config(&conf).await?; 273 | let mutable_catalog = MutableCatalog::try_create_with_config(conf).await?; // 创建MutableCatalog 274 | let table_function_factory = TableFunctionFactory::create(); 275 | let res = DatabaseCatalog::create( 276 | Arc::new(immutable_catalog), 277 | Arc::new(mutable_catalog), 278 | Arc::new(table_function_factory), 279 | ); 280 | Ok(res) 281 | } 282 | ``` 283 | 284 | #### MutableCatalog 285 | 286 | ```rust 287 | // file: src/query/service/src/catalogs/default/mutable_catalog.rs 288 | pub struct MutableCatalog { 289 | ctx: CatalogContext, 290 | } 291 | 292 | pub async fn try_create_with_config(conf: Config) -> Result { 293 | ... 294 | // Storage factory. 295 | let storage_factory = StorageFactory::create(conf.clone()); 296 | // Database factory. 297 | let database_factory = DatabaseFactory::create(conf.clone()); 298 | ... 299 | } 300 | ``` 301 | 302 | #### StorageFactory 303 | 304 | ```rust 305 | // file: src/query/storages/factory/src/storage_factory.rs 306 | pub struct StorageFactory { 307 | storages: RwLock>, 308 | } 309 | 310 | pub fn create(conf: Config) -> Self { 311 | ... 312 | // Register FUSE table engine. 313 | creators.insert("FUSE".to_string(), Storage { 314 | creator: Arc::new(FuseTable::try_create), // 创建Fuze Creator 315 | descriptor: Arc::new(FuseTable::description), 316 | }); 317 | ... 
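// note: other table engines are registered into the same map; StorageFactory::get_table()
// (shown later in this section) looks the creator up by the table's engine name.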
318 | } 319 | ``` 320 | 321 | #### FuseTable 322 | 323 | ```rust 324 | // file: src/query/storages/fuse/src/fuse_table.rs 325 | pub struct FuseTable { 326 | pub(crate) table_info: TableInfo, 327 | pub(crate) meta_location_generator: TableMetaLocationGenerator, 328 | 329 | pub(crate) cluster_keys: Vec, 330 | pub(crate) cluster_key_meta: Option, 331 | pub(crate) read_only: bool, 332 | 333 | pub(crate) operator: Operator, 334 | pub(crate) data_metrics: Arc, 335 | } 336 | 337 | // 创建FuseTable,该函数会被注册到StorageFactory中 338 | pub fn try_create(table_info: TableInfo) -> Result> { 339 | let r = Self::do_create(table_info, false)?; 340 | Ok(r) 341 | } 342 | 343 | fn init_operator(table_info: &TableInfo) -> Result { 344 | let operator = match table_info.from_share { 345 | Some(ref from_share) => create_share_table_operator( 346 | ShareTableConfig::share_endpoint_address(), 347 | &table_info.tenant, 348 | &from_share.tenant, 349 | &from_share.share_name, 350 | &table_info.name, 351 | ), 352 | None => { 353 | let storage_params = table_info.meta.storage_params.clone(); 354 | match storage_params { 355 | Some(sp) => init_operator(&sp)?, 356 | None => { 357 | let op = &*(DataOperator::instance()); // 获取DataOperator单例 358 | op.clone() 359 | } 360 | } 361 | } 362 | }; 363 | Ok(operator) 364 | } 365 | 366 | pub fn do_create(table_info: TableInfo, read_only: bool) -> Result> { 367 | let operator = Self::init_operator(&table_info)?; // 获取DataOperator单例 368 | Self::do_create_with_operator(table_info, operator, read_only) 369 | } 370 | 371 | pub fn do_create_with_operator( 372 | table_info: TableInfo, 373 | operator: Operator, 374 | read_only: bool, 375 | ) -> Result> { 376 | let storage_prefix = Self::parse_storage_prefix(&table_info)?; 377 | let cluster_key_meta = table_info.meta.cluster_key(); 378 | let mut cluster_keys = Vec::new(); 379 | if let Some((_, order)) = &cluster_key_meta { 380 | cluster_keys = ExpressionParser::parse_exprs(order)?; 381 | } 382 | let data_metrics = Arc::new(StorageMetrics::default()); 383 | let operator = operator.layer(StorageMetricsLayer::new(data_metrics.clone())); // Create a new layer. 384 | Ok(Box::new(FuseTable { 385 | table_info, 386 | cluster_keys, 387 | cluster_key_meta, 388 | meta_location_generator: TableMetaLocationGenerator::with_prefix(storage_prefix), 389 | read_only, 390 | operator, 391 | data_metrics, 392 | })) 393 | } 394 | ``` 395 | 396 | ### Create Table 397 | 398 | #### Binder 399 | 400 | ```rust 401 | // file: src/query/sql/src/planner/binder/select.rs 402 | pub(super) async fn bind_select_stmt() { 403 | let (mut s_expr, mut from_context) = if stmt.from.is_empty() { 404 | self.bind_one_table(bind_context, stmt).await? 405 | } else { 406 | ... 
407 | self.bind_table_reference(bind_context, &cross_joins) 408 | } 409 | } 410 | ``` 411 | 412 | ```rust 413 | // file: src/query/sql/src/planner/binder/table.rs 414 | pub(super) async fn bind_table_reference( 415 | &mut self, 416 | bind_context: &BindContext, 417 | table_ref: &TableReference<'a>, 418 | ) -> Result<(SExpr, BindContext)> { 419 | // Resolve table with catalog 420 | let table_meta: Arc = self 421 | .resolve_data_source( 422 | tenant.as_str(), 423 | catalog.as_str(), 424 | database.as_str(), 425 | table_name.as_str(), 426 | &navigation_point, 427 | ) 428 | .await?; 429 | match table_meta.engine() { 430 | "VIEW" => { 431 | } 432 | _ => { 433 | let table_index = 434 | self.metadata 435 | .write() 436 | .add_table(catalog, database.clone(), table_meta); // 把Table放入Meta,这里会在创建PhysicalPlan时使用 437 | 438 | let (s_expr, mut bind_context) = 439 | self.bind_base_table(bind_context, database.as_str(), table_index)?; 440 | if let Some(alias) = alias { 441 | bind_context.apply_table_alias(alias, &self.name_resolution_ctx)?; 442 | } 443 | } 444 | } 445 | 446 | } 447 | 448 | async fn resolve_data_source( 449 | &self, 450 | tenant: &str, 451 | catalog_name: &str, 452 | database_name: &str, 453 | table_name: &str, 454 | travel_point: &Option, 455 | ) -> Result> { 456 | // Resolve table with catalog 457 | let catalog = self.catalogs.get_catalog(catalog_name)?; 458 | let mut table_meta = catalog.get_table(tenant, database_name, table_name).await?; // 获取Table 459 | 460 | if let Some(tp) = travel_point { 461 | table_meta = table_meta.navigate_to(tp).await?; 462 | } 463 | Ok(table_meta) 464 | } 465 | ``` 466 | 467 | #### PhysicalPlanBuilder 468 | 469 | ```rust 470 | // file: src/query/sql/src/executor/physical_plan_builder.rs 471 | 472 | pub async fn build(&self, s_expr: &SExpr) -> Result { 473 | debug_assert!(check_physical(s_expr)); 474 | 475 | match s_expr.plan() { 476 | RelOperator::PhysicalScan(scan) => { 477 | ... 
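// note: scan.table_index is the index the Binder stored via metadata.add_table(),
// so the physical builder can resolve it back to the concrete Table instance here.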
478 | let table_entry = metadata.table(scan.table_index); // 从meta中获取table 479 | let table = table_entry.table(); 480 | let table_schema = table.schema(); 481 | 482 | let push_downs = self.push_downs(scan, &table_schema, has_inner_column)?; 483 | 484 | let source = table 485 | .read_plan_with_catalog( 486 | self.ctx.clone(), 487 | table_entry.catalog().to_string(), 488 | Some(push_downs), 489 | ) 490 | .await?; // 创建ReadDataSourcePlan,这里会在创建Pipeline时使用 491 | Ok(PhysicalPlan::TableScan(TableScan { 492 | name_mapping, 493 | source: Box::new(source), 494 | table_index: scan.table_index, 495 | })) 496 | } 497 | } 498 | } 499 | ``` 500 | 501 | #### PipelineBuilder 502 | 503 | ```rust 504 | // file: src/query/service/src/pipelines/pipeline_builder.rs 505 | pub struct PipelineBuilder { 506 | ctx: Arc, 507 | main_pipeline: Pipeline, 508 | pub pipelines: Vec, 509 | } 510 | 511 | fn build_table_scan(&mut self, scan: &TableScan) -> Result<()> { 512 | let table = self.ctx.build_table_from_source_plan(&scan.source)?; // 创建Table,scan.source是在PhysicalPlanBuilder中创建的 513 | self.ctx.try_set_partitions(scan.source.parts.clone())?; // set partition 514 | table.read_data(self.ctx.clone(), &scan.source, &mut self.main_pipeline)?; // 读取数据,详见下节 515 | let schema = scan.source.schema(); 516 | let projections = scan 517 | .name_mapping 518 | .iter() 519 | .map(|(name, _)| schema.index_of(name.as_str())) 520 | .collect::>>()?; 521 | 522 | let func_ctx = self.ctx.try_get_function_context()?; 523 | self.main_pipeline.add_transform(|input, output| { 524 | Ok(CompoundChunkOperator::create( 525 | input, 526 | output, 527 | func_ctx.clone(), 528 | vec![ChunkOperator::Project { 529 | offsets: projections.clone(), 530 | }], 531 | )) 532 | })?; 533 | 534 | self.main_pipeline.add_transform(|input, output| { 535 | Ok(CompoundChunkOperator::create( 536 | input, 537 | output, 538 | func_ctx.clone(), 539 | vec![ChunkOperator::Rename { 540 | output_schema: scan.output_schema()?, 541 | }], 542 | )) 543 | }) 544 | } 545 | ``` 546 | 547 | #### QueryContext 548 | 549 | ```rust 550 | // file: src/query/service/src/sessions/query_ctx.rs 551 | impl TableContext for QueryContext { 552 | /// Build a table instance the plan wants to operate on. 553 | /// 554 | /// A plan just contains raw information about a table or table function. 555 | /// This method builds a `dyn Table`, which provides table specific io methods the plan needs. 556 | fn build_table_from_source_plan(&self, plan: &ReadDataSourcePlan) -> Result> { 557 | match &plan.source_info { 558 | SourceInfo::TableSource(table_info) => { 559 | self.build_table_by_table_info(&plan.catalog, table_info, plan.tbl_args.clone()) // 创建Table 560 | } 561 | SourceInfo::StageSource(stage_info) => { 562 | self.build_external_by_table_info(&plan.catalog, stage_info, plan.tbl_args.clone()) 563 | } 564 | } 565 | } 566 | } 567 | 568 | // Build fuse/system normal table by table info. 569 | fn build_table_by_table_info( 570 | &self, 571 | catalog_name: &str, 572 | table_info: &TableInfo, 573 | table_args: Option>, 574 | ) -> Result> { 575 | let catalog = self.get_catalog(catalog_name)?; 576 | if table_args.is_none() { 577 | catalog.get_table_by_info(table_info) // case1, 578 | } else { 579 | Ok(catalog 580 | .get_table_function(&table_info.name, table_args)? // case2,内置的table,e.g. 
numbers, numbers_mt, fuse_segment等,详见TableFunctionFactory 581 | .as_table()) 582 | } 583 | } 584 | ``` 585 | 586 | #### DatabaseCatalog 587 | 588 | ```rust 589 | // file: src/query/service/src/catalogs/default/database_catalog.rs 590 | fn get_table_by_info(&self, table_info: &TableInfo) -> Result> { 591 | let res = self.immutable_catalog.get_table_by_info(table_info); // 先从immutable_catalog中获取 592 | match res { 593 | Ok(t) => Ok(t), 594 | Err(e) => { 595 | if e.code() == ErrorCode::unknown_table_code() { 596 | self.mutable_catalog.get_table_by_info(table_info) // 再从mutable_catalog中获取 597 | } else { 598 | Err(e) 599 | } 600 | } 601 | } 602 | } 603 | ``` 604 | 605 | #### MutableCatalog 606 | 607 | ```rust 608 | // file: src/query/service/src/catalogs/default/mutable_catalog.rs 609 | 610 | async fn get_table( 611 | &self, 612 | tenant: &str, 613 | db_name: &str, 614 | table_name: &str, 615 | ) -> Result> { 616 | let db = self.get_database(tenant, db_name).await?; 617 | db.get_table(table_name).await 618 | } 619 | 620 | fn get_table_by_info(&self, table_info: &TableInfo) -> Result> { 621 | let storage = self.ctx.storage_factory.clone(); // 获取storage 622 | storage.get_table(table_info) // 获取Table 623 | } 624 | ``` 625 | 626 | #### StorageFactory 627 | 628 | ```rust 629 | // file: src/query/storages/factory/src/storage_factory.rs 630 | pub fn get_table(&self, table_info: &TableInfo) -> Result> { 631 | let engine = table_info.engine().to_uppercase(); 632 | let lock = self.storages.read(); 633 | let factory = lock.get(&engine).ok_or_else(|| { // 获取Engine 634 | ErrorCode::UnknownTableEngine(format!("Unknown table engine {}", engine)) 635 | })?; 636 | 637 | let table: Arc = factory.creator.try_create(table_info.clone())?.into(); // 创建Table 638 | Ok(table) 639 | } 640 | ``` 641 | 642 | #### Table trait 643 | 644 | ```rust 645 | // file: src/query/catalog/src/table.rs 646 | /// Gather partitions to be scanned according to the push_downs 647 | /// 在PhysicalPlanBuilder中被调用,获取要查询的分片 648 | async fn read_partitions( 649 | &self, 650 | ctx: Arc, 651 | push_downs: Option, 652 | ) -> Result<(Statistics, Partitions)> { 653 | let (_, _) = (ctx, push_downs); 654 | Err(ErrorCode::UnImplement(format!( 655 | "read_partitions operation for table {} is not implemented. table engine : {}", 656 | self.name(), 657 | self.get_table_info().meta.engine 658 | ))) 659 | } 660 | 661 | /// Assembly the pipeline of reading data from storage, according to the plan 662 | // 在PipelineBuilder中被调用 663 | fn read_data( 664 | &self, 665 | ctx: Arc, 666 | plan: &ReadDataSourcePlan, 667 | pipeline: &mut Pipeline, 668 | ) -> Result<()> { 669 | let (_, _, _) = (ctx, plan, pipeline); 670 | 671 | Err(ErrorCode::UnImplement(format!( 672 | "read_data operation for table {} is not implemented. 
table engine : {}", 673 | self.name(), 674 | self.get_table_info().meta.engine 675 | ))) 676 | } 677 | ``` 678 | 679 | #### FuseTable 680 | 681 | ```rust 682 | // file: src/query/storages/fuse/src/fuse_table.rs 683 | #[async_trait::async_trait] 684 | impl Table for FuseTable { 685 | #[tracing::instrument(level = "debug", name = "fuse_table_read_partitions", skip(self, ctx), fields(ctx.id = ctx.get_id().as_str()))] 686 | async fn read_partitions( 687 | &self, 688 | ctx: Arc, 689 | push_downs: Option, 690 | ) -> Result<(Statistics, Partitions)> { 691 | self.do_read_partitions(ctx, push_downs).await 692 | } 693 | 694 | #[tracing::instrument(level = "debug", name = "fuse_table_read_data", skip(self, ctx, pipeline), fields(ctx.id = ctx.get_id().as_str()))] 695 | fn read_data( 696 | &self, 697 | ctx: Arc, 698 | plan: &ReadDataSourcePlan, 699 | pipeline: &mut Pipeline, 700 | ) -> Result<()> { 701 | let max_io_requests = ctx.get_settings().get_max_storage_io_requests()? as usize; 702 | self.do_read_data(ctx, plan, pipeline, max_io_requests) 703 | } 704 | } 705 | 706 | impl FuseTable { 707 | #[inline] 708 | pub fn do_read_data( 709 | &self, 710 | ctx: Arc, 711 | plan: &ReadDataSourcePlan, 712 | pipeline: &mut Pipeline, 713 | max_io_requests: usize, 714 | ) -> Result<()> { 715 | ... 716 | // Add source pipe. 717 | pipeline.add_source( 718 | |output| { 719 | FuseTableSource::create( 720 | ctx.clone(), 721 | output, 722 | block_reader.clone(), 723 | prewhere_reader.clone(), 724 | prewhere_filter.clone(), 725 | remain_reader.clone(), 726 | ) 727 | }, 728 | max_io_requests, 729 | )?; 730 | 731 | // Resize pipeline to max threads. 732 | let max_threads = ctx.get_settings().get_max_threads()? as usize; 733 | let resize_to = std::cmp::min(max_threads, max_io_requests); 734 | info!( 735 | "read block pipeline resize from:{} to:{}", 736 | max_io_requests, resize_to 737 | ); 738 | pipeline.resize(resize_to) 739 | } 740 | } 741 | ``` 742 | 743 | #### FuseTableSource 744 | 745 | ```rust 746 | // file: src/query/storages/fuse/src/operations/fuse_source.rs 747 | pub struct FuseTableSource { 748 | state: State, 749 | ctx: Arc, 750 | scan_progress: Arc, 751 | output: Arc, 752 | output_reader: Arc, 753 | 754 | prewhere_reader: Arc, 755 | prewhere_filter: Arc>, 756 | remain_reader: Arc>, 757 | 758 | support_blocking: bool, 759 | } 760 | 761 | impl Processor for FuseTableSource { 762 | fn name(&self) -> String { 763 | "FuseEngineSource".to_string() 764 | } 765 | 766 | fn event(&mut self) -> Result { 767 | ... 768 | } 769 | 770 | fn process(&mut self) -> Result<()> { 771 | ... 
772 | } 773 | } 774 | ``` 775 | 776 | ### Reference 777 | 778 | * [《Databend存储架构总览》](https://mp.weixin.qq.com/s/jXAu3mSmJF80TwK3xeFlcg) 779 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/5_aggregation.md: -------------------------------------------------------------------------------- 1 | * [五 Aggregation](#五-aggregation) 2 | * [Build Pipeline](#build-pipeline) 3 | * [V1](#v1) 4 | * [QueryPipelineBuilder](#querypipelinebuilder) 5 | * [V2](#v2) 6 | * [PipelineBuilder](#pipelinebuilder) 7 | * [关键模块](#关键模块) 8 | * [TransformAggregator](#transformaggregator) 9 | * [AggregatorTransform](#aggregatortransform) 10 | * [Aggregator(trait)](#aggregatortrait) 11 | * [PartialAggregator](#partialaggregator) 12 | * [FinalAggregator](#finalaggregator) 13 | * [HashMethod(trait)](#hashmethodtrait) 14 | * [HashMethodSingleString](#hashmethodsinglestring) 15 | * [PolymorphicKeyHelper(trait)](#polymorphickeyhelpertrait) 16 | * [HashMethodSingleString](#hashmethodsinglestring) 17 | * [AggregatorState(trait)](#aggregatorstatetrait) 18 | * [SerializedKeysAggregatorState](#serializedkeysaggregatorstate) 19 | * [HashTable/HashMapKind/HashTableKind](#hashtablehashmapkindhashtablekind) 20 | * [AggregateFunction(trait)](#aggregatefunctiontrait) 21 | * [AggregateCountFunction](#aggregatecountfunction) 22 | 23 | ## 五 Aggregation 24 | 25 | ### Build Pipeline 26 | 27 | #### V1 28 | 29 | ##### QueryPipelineBuilder 30 | 31 | ```rust 32 | // file: query/src/pipelines/new/pipeline_builder.rs 33 | fn visit_plan_node(&mut self, node: &PlanNode) -> Result<()> { 34 | match node { 35 | ... 36 | PlanNode::AggregatorPartial(n) => self.visit_aggregate_partial(n), 37 | PlanNode::AggregatorFinal(n) => self.visit_aggregate_final(n), 38 | ... 39 | } 40 | } 41 | // func: visit_aggregate_partial(...) 42 | // 创建Partial Aggregator Processor 43 | fn visit_aggregate_partial(&mut self, plan: &AggregatorPartialPlan) -> Result<()> { 44 | self.visit_plan_node(&plan.input)?; 45 | 46 | let aggregator_params = AggregatorParams::try_create( 47 | &plan.aggr_expr, 48 | &plan.group_expr, 49 | &plan.input.schema(), 50 | &plan.schema(), 51 | )?; 52 | self.pipeline 53 | .add_transform(|transform_input_port, transform_output_port| { 54 | TransformAggregator::try_create_partial( // 创建Partial Aggregator Processor 55 | transform_input_port.clone(), 56 | transform_output_port.clone(), 57 | AggregatorTransformParams::try_create( 58 | transform_input_port, 59 | transform_output_port, 60 | &aggregator_params, 61 | )?, 62 | self.ctx.clone(), 63 | ) 64 | }) 65 | } 66 | // func: visit_aggregate_final(...) 
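// note: the final stage first calls pipeline.resize(1) so that all partial aggregation
// states converge into a single processor before they are merged.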
67 | // 创建Final Aggregator Processor 68 | fn visit_aggregate_final(&mut self, plan: &AggregatorFinalPlan) -> Result<()> { 69 | self.visit_plan_node(&plan.input)?; 70 | 71 | self.pipeline.resize(1)?; 72 | let aggregator_params = AggregatorParams::try_create( 73 | &plan.aggr_expr, 74 | &plan.group_expr, 75 | &plan.schema_before_group_by, 76 | &plan.schema, 77 | )?; 78 | self.pipeline 79 | .add_transform(|transform_input_port, transform_output_port| { 80 | TransformAggregator::try_create_final( // 创建Final Aggregator Processor 81 | transform_input_port.clone(), 82 | transform_output_port.clone(), 83 | AggregatorTransformParams::try_create( 84 | transform_input_port, 85 | transform_output_port, 86 | &aggregator_params, 87 | )?, 88 | self.ctx.clone(), 89 | ) 90 | }) 91 | } 92 | 93 | ``` 94 | 95 | #### V2 96 | 97 | ##### PipelineBuilder 98 | 99 | ```rust 100 | // file: query/src/sql/exec/mod.rs 101 | // func: 102 | pub fn build_pipeline( 103 | &mut self, 104 | context: Arc, 105 | s_expr: &SExpr, 106 | pipeline: &mut NewPipeline, 107 | ) -> Result { 108 | let plan = s_expr.plan(); 109 | 110 | match plan { 111 | ... 112 | RelOperator::Aggregate(aggregate) => { 113 | let input_schema = 114 | self.build_pipeline(context.clone(), s_expr.child(0)?, pipeline)?; 115 | self.build_aggregate(context, aggregate, input_schema, pipeline) 116 | } 117 | ... 118 | } 119 | } 120 | // func: 121 | fn build_aggregate( 122 | &mut self, 123 | ctx: Arc, 124 | aggregate: &AggregatePlan, 125 | input_schema: DataSchemaRef, 126 | pipeline: &mut NewPipeline, 127 | ) -> Result { 128 | ... 129 | pipeline.add_transform(|transform_input_port, transform_output_port| { 130 | TransformAggregator::try_create_partial( // 创建Partial Aggregator Processor 131 | transform_input_port.clone(), 132 | transform_output_port.clone(), 133 | AggregatorTransformParams::try_create( 134 | transform_input_port, 135 | transform_output_port, 136 | &partial_aggr_params, 137 | )?, 138 | ctx.clone(), 139 | ) 140 | })?; 141 | ... 142 | pipeline.add_transform(|transform_input_port, transform_output_port| { 143 | TransformAggregator::try_create_final( // 创建Final Aggregator Processor 144 | transform_input_port.clone(), 145 | transform_output_port.clone(), 146 | AggregatorTransformParams::try_create( 147 | transform_input_port, 148 | transform_output_port, 149 | &final_aggr_params, 150 | )?, 151 | ctx.clone(), 152 | ) 153 | })?; 154 | } 155 | ``` 156 | 157 | #### 关键模块 158 | 159 | ##### TransformAggregator 160 | 161 | ```rust 162 | // file: query/src/pipelines/new/processors/transforms/transform_aggregator.rs 163 | // 创建Porcessor的辅助struct 164 | pub struct TransformAggregator; 165 | // func: try_create_partial(...) -> ProcessorPtr 166 | // 根据不同的聚合key类型和是否有聚合函数,创建partial aggregate processor 167 | pub fn try_create_partial( 168 | input_port: Arc, 169 | output_port: Arc, 170 | transform_params: AggregatorTransformParams, 171 | ctx: Arc, 172 | ) -> Result { 173 | ... 174 | match aggregator_params.aggregate_functions.is_empty() { // 无聚合函数 175 | true => match transform_params.method { 176 | HashMethodKind::KeysU8(method) => AggregatorTransform::create( 177 | transform_params.transform_input_port, 178 | transform_params.transform_output_port, 179 | KeysU8PartialAggregator::::create(ctx, method, aggregator_params), // 使用KeysXXPartialAggregator 180 | ), 181 | ... 
182 | } 183 | false => match transform_params.method { // 有聚合函数 184 | HashMethodKind::KeysU8(method) => AggregatorTransform::create( 185 | transform_params.transform_input_port, 186 | transform_params.transform_output_port, 187 | KeysU8PartialAggregator::::create(ctx, method, aggregator_params), // 使用KeysXXPartialAggregator 188 | ), 189 | ... 190 | } 191 | } 192 | // func: try_create_final(...) -> ProcessorPtr 193 | // 根据不同的聚合key类型和是否有聚合函数,创建final aggregate processor 194 | pub fn try_create_final( 195 | input_port: Arc, 196 | output_port: Arc, 197 | transform_params: AggregatorTransformParams, 198 | ctx: Arc, 199 | ) -> Result { 200 | ... 201 | match aggregator_params.aggregate_functions.is_empty() { // 无聚合函数 202 | true => match transform_params.method { 203 | HashMethodKind::KeysU8(method) => AggregatorTransform::create( 204 | transform_params.transform_input_port, 205 | transform_params.transform_output_port, 206 | KeysU8FinalAggregator::::create(ctx, method, aggregator_params)?, // 使用KeysXXFinalAggregator 207 | ), 208 | ... 209 | } 210 | false => match transform_params.method { // 有聚合函数 211 | HashMethodKind::KeysU8(method) => AggregatorTransform::create( 212 | transform_params.transform_input_port, 213 | transform_params.transform_output_port, 214 | KeysU8FinalAggregator::::create(ctx, method, aggregator_params)?, // 使用KeysXXFinalAggregator 215 | ), 216 | ... 217 | } 218 | } 219 | 220 | ``` 221 | 222 | ##### AggregatorTransform 223 | 224 | ```rust 225 | // file: query/src/pipelines/new/processors/transforms/transform_aggregator.rs 226 | enum AggregatorTransform { 227 | ConsumeData(ConsumeState), 228 | Generate(GenerateState), 229 | Finished, 230 | } 231 | // func: create(...) -> ProcessorPtr 232 | // 创建AggregatorTransform 233 | pub fn create( 234 | input_port: Arc, 235 | output_port: Arc, 236 | inner: TAggregator, 237 | ) -> Result { 238 | Ok(ProcessorPtr::create(Box::new(AggregatorTransform::< 239 | TAggregator, 240 | >::ConsumeData( 241 | ConsumeState { 242 | inner, 243 | input_port, 244 | output_port, 245 | input_data_block: None, 246 | }, 247 | )))) 248 | } 249 | // func: consume_event() 250 | // 从上游算子获取数据 251 | fn consume_event(&mut self) -> Result { 252 | if let AggregatorTransform::ConsumeData(state) = self { 253 | if state.input_data_block.is_some() { 254 | return Ok(Event::Sync); 255 | } 256 | 257 | if state.input_port.is_finished() { 258 | let mut temp_state = AggregatorTransform::Finished; 259 | std::mem::swap(self, &mut temp_state); 260 | temp_state = temp_state.convert_to_generate()?; // 上游已经无数据,转换类型ConsumeData -> Generate 261 | std::mem::swap(self, &mut temp_state); 262 | debug_assert!(matches!(temp_state, AggregatorTransform::Finished)); 263 | return Ok(Event::Sync); 264 | } 265 | 266 | return match state.input_port.has_data() { 267 | true => { 268 | state.input_data_block = Some(state.input_port.pull_data().unwrap()?); 269 | Ok(Event::Sync) 270 | } 271 | false => { 272 | state.input_port.set_need_data(); 273 | Ok(Event::NeedData) 274 | } 275 | }; 276 | } 277 | 278 | Err(ErrorCode::LogicalError("It's a bug")) 279 | } 280 | impl Processor for AggregatorTransform { 281 | fn name(&self) -> &'static str { 282 | TAggregator::NAME 283 | } 284 | 285 | fn event(&mut self) -> Result { 286 | match self { 287 | AggregatorTransform::Finished => Ok(Event::Finished), 288 | AggregatorTransform::Generate(_) => self.generate_event(), // 4 产出数据 289 | AggregatorTransform::ConsumeData(_) => self.consume_event(), // 1 获取数据 290 | } 291 | } 292 | 293 | fn process(&mut self) -> Result<()> { 294 
| match self { 295 | AggregatorTransform::Finished => Ok(()), 296 | AggregatorTransform::ConsumeData(state) => state.consume(), // 2 聚合,调用Aggregator的consume() 297 | AggregatorTransform::Generate(state) => state.generate(), // 3 生成聚合结果,调用Aggregator的generate() 298 | } 299 | } 300 | } 301 | 302 | struct ConsumeState { 303 | inner: TAggregator, 304 | input_port: Arc, 305 | output_port: Arc, 306 | input_data_block: Option, 307 | } 308 | 309 | impl ConsumeState { 310 | pub fn consume(&mut self) -> Result<()> { 311 | if let Some(input_data) = self.input_data_block.take() { 312 | self.inner.consume(input_data)?; 313 | } 314 | 315 | Ok(()) 316 | } 317 | } 318 | ``` 319 | 320 | ##### Aggregator(trait) 321 | 322 | 聚合算子,实现的struct有:PartialAggregator和FinalAggregator 323 | 324 | ```rust 325 | // file: query/src/pipelines/new/processors/transforms/transform_aggregator.rs 326 | pub trait Aggregator: Sized + Send { 327 | const NAME: &'static str; 328 | 329 | fn consume(&mut self, data: DataBlock) -> Result<()>; 330 | fn generate(&mut self) -> Result>; 331 | } 332 | ``` 333 | 334 | ##### PartialAggregator 335 | 336 | ```rust 337 | // file: query/src/pipelines/new/processors/transforms/aggregator/aggregator_partial.rs 338 | // 泛型,partial aggregate的执行框架,聚合键的转化、聚合函数的状态等是由Method控制;HAS_AGG标志是否有聚合函数 339 | pub struct PartialAggregator< 340 | const HAS_AGG: bool, 341 | Method: HashMethod + PolymorphicKeysHelper, 342 | > { 343 | is_generated: bool, 344 | states_dropped: bool, 345 | 346 | method: Method, 347 | state: Method::State, 348 | params: Arc, 349 | ctx: Arc, 350 | } 351 | 352 | // 1. 带聚合函数的情况 353 | impl + Send> Aggregator 354 | for PartialAggregator 355 | { 356 | const NAME: &'static str = "PartialAggregator"; 357 | 358 | fn consume(&mut self, block: DataBlock) -> Result<()> { 359 | // 1.1 and 1.2. 360 | let group_columns = Self::group_columns(&self.params.group_columns_name, &block)?; // 获取聚合key的所有列 361 | let group_keys = self.method.build_keys(&group_columns, block.num_rows())?; // 把聚合的列转为HashMap的key 362 | 363 | // 计算使用two level hashmap的阈值 364 | let group_by_two_level_threshold = 365 | self.ctx.get_settings().get_group_by_two_level_threshold()? as usize; 366 | if !self.state.is_two_level() && self.state.len() >= group_by_two_level_threshold { 367 | self.state.convert_to_two_level(); 368 | } 369 | 370 | let places = Self::lookup_state(&self.params, group_keys, &mut self.state); // 获取每个group key的StateAddr 371 | Self::execute(&self.params, &block, &places) // 执行聚合函数 372 | } 373 | 374 | fn generate(&mut self) -> Result> { 375 | self.generate_data() 376 | } 377 | } 378 | // func: lookup_state(...) -> StateAddrs 379 | // 获取每个group key对应的StateAddr,其中StateAddr是一个内存地址 380 | fn lookup_state( 381 | params: &Arc, 382 | keys: Vec>, 383 | state: &mut Method::State, 384 | ) -> StateAddrs { 385 | let mut places = Vec::with_capacity(keys.len()); 386 | 387 | let mut inserted = true; 388 | for key in keys.iter() { 389 | let entity = state.entity(key, &mut inserted); 390 | 391 | match inserted { 392 | true => { // 当前key不存在,需要开辟新的内存 393 | if let Some(place) = state.alloc_layout2(params) { 394 | places.push(place); 395 | entity.set_state_value(place.addr()); 396 | } 397 | } 398 | false => { // 当前key已存在,直接返回内存地址 399 | let place: StateAddr = (*entity.get_state_value()).into(); 400 | places.push(place); 401 | } 402 | } 403 | } 404 | places 405 | } 406 | // func: execute(...) 
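// execute() below drives every aggregate function over the whole block in a single pass:
// for function i it calls accumulate_keys(places, offsets_aggregate_states[i], argument columns, num_rows),
// so row r updates the state stored at places[r] + offsets_aggregate_states[i].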
407 | // 执行聚合函数 408 | fn execute( 409 | params: &Arc, 410 | block: &DataBlock, 411 | places: &StateAddrs, 412 | ) -> Result<()> { 413 | let aggregate_functions = ¶ms.aggregate_functions; 414 | let offsets_aggregate_states = ¶ms.offsets_aggregate_states; 415 | let aggregate_arguments_columns = Self::aggregate_arguments(block, params)?; // 获取每个聚合函数需要的列 416 | 417 | // This can benificial for the case of dereferencing 418 | // This will help improve the performance ~hundreds of megabits per second 419 | let aggr_arg_columns_slice = &aggregate_arguments_columns; 420 | 421 | for index in 0..aggregate_functions.len() { 422 | let rows = block.num_rows(); 423 | let function = &aggregate_functions[index]; 424 | let state_offset = offsets_aggregate_states[index]; 425 | let function_arguments = &aggr_arg_columns_slice[index]; 426 | function.accumulate_keys(places, state_offset, function_arguments, rows)?; 427 | } 428 | 429 | Ok(()) 430 | } 431 | ``` 432 | 433 | ##### FinalAggregator 434 | 435 | ```rust 436 | // file: query/src/pipelines/new/processors/transforms/aggregator/aggregator_final.rs 437 | // 泛型 438 | pub struct FinalAggregator< 439 | const HAS_AGG: bool, 440 | Method: HashMethod + PolymorphicKeysHelper + Send, 441 | > { 442 | is_generated: bool, 443 | states_dropped: bool, 444 | 445 | method: Method, 446 | state: Method::State, 447 | params: Arc, 448 | // used for deserialization only, so we can reuse it during the loop 449 | temp_place: Option, 450 | ctx: Arc, 451 | } 452 | ``` 453 | 454 | ##### HashMethod(trait) 455 | 456 | ```rust 457 | // file: common/datablocks/src/kernels/data_block_group_by_hash.rs 458 | // 负责序列化聚合key,便于放入AggregatorState(HashMap)中 459 | pub trait HashMethod { 460 | type HashKey<'a>: std::cmp::Eq + Hash + Clone + Debug 461 | where Self: 'a; 462 | // func: 463 | // 返回实现的name 464 | fn name(&self) -> String; 465 | // 测试用,group by逻辑的具体实现 466 | fn group_by_get_indices<'a>( 467 | &self, 468 | block: &'a DataBlock, 469 | column_names: &[String], 470 | ) -> Result>> {} 471 | // 测试用,group by入口 472 | fn group_by<'a>( 473 | &self, 474 | block: &'a DataBlock, 475 | column_names: &[String], 476 | ) -> Result>> { 477 | let group_indices = self.group_by_get_indices(block, column_names)?; 478 | ... 
479 | } 480 | // 构建group by key, 在PartialAggregator.consume()中调用 481 | fn build_keys<'a>( 482 | &self, 483 | group_columns: &[&'a ColumnRef], 484 | rows: usize, 485 | ) -> Result>>; 486 | } 487 | ``` 488 | 489 | ![image.png](./assets/1658305438488-image.png) 490 | 491 | ###### HashMethodSingleString 492 | 493 | ```rust 494 | // file: common/datablocks/src/kernels/data_block_group_by_hash.rs 495 | pub struct HashMethodSingleString {} 496 | impl HashMethodSingleString { 497 | #[inline] 498 | pub fn get_key(&self, column: &StringColumn, row: usize) -> Vec { 499 | let v = column.get_data(row); 500 | v.to_owned() 501 | } 502 | 503 | pub fn deserialize_group_columns( 504 | &self, 505 | keys: Vec>, 506 | group_fields: &[DataField], 507 | ) -> Result> { 508 | debug_assert!(!keys.is_empty()); 509 | debug_assert!(group_fields.len() == 1); 510 | let column = StringColumn::new_from_slice(&keys); 511 | Ok(vec![column.arc()]) 512 | } 513 | } 514 | 515 | impl HashMethod for HashMethodSingleString { 516 | type HashKey<'a> = &'a [u8]; 517 | 518 | fn name(&self) -> String { 519 | "SingleString".to_string() 520 | } 521 | 522 | fn build_keys<'a>( 523 | &self, 524 | group_columns: &[&'a ColumnRef], 525 | rows: usize, 526 | ) -> Result> { 527 | debug_assert!(group_columns.len() == 1); 528 | let column = group_columns[0]; 529 | let str_column: &StringColumn = Series::check_get(column)?; 530 | 531 | let mut values = Vec::with_capacity(rows); 532 | for row in 0..rows { 533 | values.push(str_column.get_data(row)); 534 | } 535 | Ok(values) 536 | } 537 | } 538 | 539 | // file: query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs 540 | impl PolymorphicKeysHelper for HashMethodSingleString { 541 | ... 542 | } 543 | ``` 544 | 545 | ##### PolymorphicKeyHelper(trait) 546 | 547 | ```rust 548 | // file: query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs 549 | // 负责创建AggregatorState,遍历聚合的key值,辅助反序列化聚合key用于输出 550 | pub trait PolymorphicKeysHelper { 551 | type State: AggregatorState; 552 | fn aggregate_state(&self) -> Self::State; // 创建State,在创建PartialAggregator和FinalAggregator时调用 553 | 554 | type ColumnBuilder: KeysColumnBuilder<>::Key>; 555 | fn keys_column_builder(&self, capacity: usize) -> Self::ColumnBuilder; // 构造生成聚合结果key的Builder,在PartialAggregator.generate_data()中调用 556 | 557 | type KeysColumnIter: KeysColumnIter<>::Key>; 558 | fn keys_iter_from_column(&self, column: &ColumnRef) -> Result; // 返回聚合key的迭代器,在FinalAggregator.consume()中调用 559 | 560 | type GroupColumnsBuilder: GroupColumnsBuilder<>::Key>; 561 | fn group_columns_builder( // 在FinalAggregator.gennerate()中调用 562 | &self, 563 | capacity: usize, 564 | params: &AggregatorParams, 565 | ) -> Self::GroupColumnsBuilder; 566 | } 567 | ``` 568 | 569 | ![image.png](./assets/1658305358904-image.png) 570 | 571 | ###### HashMethodSingleString 572 | 573 | ```rust 574 | // file: common/datablocks/src/kernels/data_block_group_by_hash.rs 575 | pub struct HashMethodSingleString {} 576 | 577 | impl HashMethod for HashMethodSingleString { 578 | ... 
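    // (elided: identical to the HashMethod impl shown in the HashMethod section above,
    // i.e. name() and build_keys(). HashMethodSingleString therefore implements both traits:
    // HashMethod builds one key per row, while the PolymorphicKeysHelper impl below supplies
    // the hash-table state plus the key/group column builders and iterators.)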
579 | } 580 | 581 | // file: query/src/pipelines/transforms/group_by/aggregator_polymorphic_keys.rs 582 | impl PolymorphicKeysHelper for HashMethodSingleString { 583 | type State = SerializedKeysAggregatorState; 584 | fn aggregate_state(&self) -> Self::State { 585 | SerializedKeysAggregatorState { 586 | keys_area: Bump::new(), 587 | state_area: Bump::new(), 588 | data_state_map: HashMapKind::create_hash_table(), 589 | two_level_flag: false, 590 | } 591 | } 592 | 593 | type ColumnBuilder = SerializedKeysColumnBuilder; 594 | fn keys_column_builder(&self, capacity: usize) -> Self::ColumnBuilder { 595 | SerializedKeysColumnBuilder { 596 | inner_builder: MutableStringColumn::with_capacity(capacity), 597 | } 598 | } 599 | 600 | type KeysColumnIter = SerializedKeysColumnIter; 601 | fn keys_iter_from_column(&self, column: &ColumnRef) -> Result { 602 | SerializedKeysColumnIter::create(Series::check_get::(column)?) 603 | } 604 | 605 | type GroupColumnsBuilder = SingleStringGroupColumnsBuilder; 606 | fn group_columns_builder( 607 | &self, 608 | capacity: usize, 609 | params: &AggregatorParams, 610 | ) -> Self::GroupColumnsBuilder { 611 | SingleStringGroupColumnsBuilder::create(capacity, params) 612 | } 613 | } 614 | ``` 615 | 616 | ##### AggregatorState(trait) 617 | 618 | ```rust 619 | // file: query/src/pipelines/transforms/group_by/aggregator_state.rs 620 | // 封装HashMap 621 | pub trait AggregatorState: Sync + Send { 622 | type Key; 623 | type Entity: StateEntity; 624 | type Iterator: Iterator; 625 | 626 | fn len(&self) -> usize; 627 | 628 | fn iter(&self) -> Self::Iterator; 629 | 630 | fn alloc_place(&self, layout: Layout) -> StateAddr; 631 | 632 | fn alloc_layout(&self, params: &AggregatorParams) -> Option { 633 | params.layout?; 634 | let place: StateAddr = self.alloc_place(params.layout.unwrap()); 635 | 636 | for idx in 0..params.offsets_aggregate_states.len() { 637 | let aggr_state = params.offsets_aggregate_states[idx]; 638 | let aggr_state_place = place.next(aggr_state); 639 | params.aggregate_functions[idx].init_state(aggr_state_place); 640 | } 641 | Some(place) 642 | } 643 | 644 | fn alloc_layout2(&self, params: &NewAggregatorParams) -> Option { 645 | params.layout?; 646 | let place: StateAddr = self.alloc_place(params.layout.unwrap()); 647 | 648 | for idx in 0..params.offsets_aggregate_states.len() { 649 | let aggr_state = params.offsets_aggregate_states[idx]; 650 | let aggr_state_place = place.next(aggr_state); 651 | params.aggregate_functions[idx].init_state(aggr_state_place); 652 | } 653 | Some(place) 654 | } 655 | 656 | fn entity(&mut self, key: &Method::HashKey<'_>, inserted: &mut bool) -> *mut Self::Entity; 657 | 658 | fn entity_by_key(&mut self, key: &Self::Key, inserted: &mut bool) -> *mut Self::Entity; 659 | 660 | fn is_two_level(&self) -> bool { 661 | false 662 | } 663 | 664 | fn convert_to_two_level(&mut self) {} 665 | } 666 | 667 | ``` 668 | 669 | ![image.png](./assets/1658305391305-image.png) 670 | 671 | ###### SerializedKeysAggregatorState 672 | 673 | ```rust 674 | // file: query/src/pipelines/transforms/group_by/aggregator_state.rs 675 | pub struct SerializedKeysAggregatorState { 676 | pub keys_area: Bump, 677 | pub state_area: Bump, 678 | pub data_state_map: HashMapKind, // HashMap 679 | pub two_level_flag: bool, 680 | } 681 | ``` 682 | 683 | ##### HashTable/HashMapKind/HashTableKind 684 | 685 | ```rust 686 | // file: query/src/common/hashtable/mod.rs 687 | pub type HashMap = HashTable, SingleLevelGrower>; 688 | pub type TwoLevelHashMap = 689 | TwoLevelHashTable, 
TwoLevelGrower>; 690 | pub type HashMapIteratorKind = HashTableIteratorKind>; 691 | pub type HashMapKind = 692 | HashTableKind, SingleLevelGrower, TwoLevelGrower>; 693 | ``` 694 | 695 | ```rust 696 | // file: query/src/common/hashtable/two_level_hash_table.rs 697 | pub enum HashTableKind< 698 | Key: HashTableKeyable, 699 | Entity: HashTableEntity, 700 | SingleLevelGrower: HashTableGrower, 701 | TwoLevelGrower: HashTableGrower, 702 | > { 703 | HashTable(HashTable), 704 | TwoLevelHashTable(TwoLevelHashTable), 705 | } 706 | ``` 707 | 708 | ```rust 709 | // file: query/src/common/hashtable/hash_table.rs 710 | pub struct HashTable, Grower: HashTableGrower> { 711 | size: usize, 712 | grower: Grower, 713 | entities: *mut Entity, 714 | entities_raw: *mut u8, 715 | zero_entity: Option<*mut Entity>, 716 | zero_entity_raw: Option<*mut u8>, 717 | 718 | /// Generics hold 719 | generics_hold: PhantomData, 720 | } 721 | ``` 722 | 723 | ##### AggregateFunction(trait) 724 | 725 | ```rust 726 | // rust: common/functions/src/aggregates/aggregate_function.rs 727 | pub trait AggregateFunction: fmt::Display + Sync + Send { 728 | // accumulate is to accumulate the arrays in batch mode 729 | // common used when there is no group by for aggregate function 730 | fn accumulate( 731 | &self, 732 | _place: StateAddr, 733 | _columns: &[ColumnRef], 734 | _validity: Option<&Bitmap>, 735 | _input_rows: usize, 736 | ) -> Result<()>; 737 | 738 | // used when we need to calculate with group keys 739 | fn accumulate_keys( 740 | &self, 741 | places: &[StateAddr], 742 | offset: usize, 743 | columns: &[ColumnRef], 744 | _input_rows: usize, 745 | ) -> Result<()> { 746 | for (row, place) in places.iter().enumerate() { 747 | self.accumulate_row(place.next(offset), columns, row)?; 748 | } 749 | Ok(()) 750 | } 751 | ... 
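    // Layout behind `places` and `offset` (see alloc_layout()/alloc_layout2() above), shown
    // as an illustration only -- the concrete offsets depend on each function's state size:
    //
    //   StateAddr of one group key ->  [ state of fn 0 | state of fn 1 | ... ]
    //                                    offset 0        offsets_aggregate_states[1]
    //
    // accumulate_keys() therefore walks the rows and, for row r, advances the state at
    // places[r] + the offset belonging to this particular function.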
752 | } 753 | ``` 754 | 755 | ###### AggregateCountFunction 756 | 757 | ```rust 758 | // file: common/functions/src/aggregates/aggregate_count.rs 759 | pub struct AggregateCountFunction { 760 | display_name: String, 761 | nullable: bool, 762 | } 763 | impl AggregateFunction for AggregateCountFunction { 764 | fn accumulate( 765 | &self, 766 | place: StateAddr, 767 | _columns: &[ColumnRef], 768 | validity: Option<&Bitmap>, 769 | input_rows: usize, 770 | ) -> Result<()> { 771 | let state = place.get::(); 772 | 773 | let nulls = match validity { 774 | Some(b) => b.null_count(), 775 | None => 0, 776 | }; 777 | 778 | state.count += (input_rows - nulls) as u64; 779 | Ok(()) 780 | } 781 | 782 | fn accumulate_keys( 783 | &self, 784 | places: &[StateAddr], 785 | offset: usize, 786 | columns: &[ColumnRef], 787 | _input_rows: usize, 788 | ) -> Result<()> { 789 | let validity = match columns.len() { 790 | 0 => None, 791 | _ => { 792 | let (_, validity) = columns[0].validity(); 793 | validity 794 | } 795 | }; 796 | 797 | match validity { 798 | Some(v) => { // 存在null 799 | for (valid, place) in v.iter().zip(places.iter()) { 800 | if valid { 801 | let state = place.next(offset).get::(); 802 | state.count += 1; 803 | } 804 | } 805 | } 806 | None => { // 全非null 807 | for place in places { 808 | let state = place.get::(); 809 | state.count += 1; 810 | } 811 | } 812 | } 813 | 814 | Ok(()) 815 | } 816 | } 817 | ``` 818 | -------------------------------------------------------------------------------- /source-code-reading/v0.7.71-nightly/1_services.md: -------------------------------------------------------------------------------- 1 | * [一 子服务概览](#一-子服务概览) 2 | * [1 Config](#1-config) 3 | * [初始化](#初始化) 4 | * [关键模块](#关键模块) 5 | * [Config](#config) 6 | * [QueryConfig](#queryconfig) 7 | * [LogConfig](#logconfig) 8 | * [MetaConfig](#metaconfig) 9 | * [StorageConfig](#storageconfig) 10 | * [HiveCatalogConfig](#hivecatalogconfig) 11 | * [2 Tracing](#2-tracing) 12 | * [3 SessionManager](#3-sessionmanager) 13 | * [初始化](#初始化) 14 | * [关键模块](#关键模块) 15 | * [SessionManager](#sessionmanager) 16 | * [Config](#config) 17 | * [ClusterDiscovery](#clusterdiscovery) 18 | * [CatalogManager](#catalogmanager) 19 | * [HttpQueryManager](#httpquerymanager) 20 | * [CacheManager](#cachemanager) 21 | * [4 MySQL Handler](#4-mysql-handler) 22 | * [启动](#启动) 23 | * [关键模块](#关键模块) 24 | * [MySQLHandler](#mysqlhandler) 25 | * [MySQLConnection](#mysqlconnection) 26 | * [InteractiveWorker/InteractiveWorkerBase](#interactiveworkerinteractiveworkerbase) 27 | * [Session](#session) 28 | * [QueryContextShared](#querycontextshared) 29 | * [QueryContext](#querycontext) 30 | * [5 ClickHouse Handler](#5-clickhouse-handler) 31 | * [6 HTTP Handler](#6-http-handler) 32 | * [7 Metrics API Service](#7-metrics-api-service) 33 | * [启动](#启动) 34 | * [关键模块](#关键模块) 35 | * [MetricService](#metricservice) 36 | * [PROMETHEUS_HANDLE](#prometheus_handle) 37 | * [8 HTTP API Service](#8-http-api-service) 38 | * [启动](#启动) 39 | * [关键模块](#关键模块) 40 | * [HttpService](#httpservice) 41 | * [9 RPC API Service](#9-rpc-api-service) 42 | * [Service启动](#service启动) 43 | * [关键模块](#关键模块) 44 | * [RpcService](#rpcservice) 45 | * [DatabendQueryFlightService](#databendqueryflightservice) 46 | * [DatabendQueryFlightDispatcher ⭐️](#databendqueryflightdispatcher-) 47 | * [FlightScatter](#flightscatter) 48 | * [HashFligthScatter](#hashfligthscatter) 49 | * [BroadcastFlightScatter](#broadcastflightscatter) 50 | * [10 Cluster Register](#10-cluster-register) 51 | * [Woker节点注册](#woker节点注册) 52 | * 
[关键模块](#关键模块) 53 | * [SessionManager](#sessionmanager) 54 | * [ClusterDiscovery/ClusterHeartbeat/Cluster/ClusterMgr](#clusterdiscoveryclusterheartbeatclusterclustermgr) 55 | 56 | ## 一 子服务概览 57 | 58 | 59 | | 模块 | 功能 | 60 | | --------------------- | ----------------------------- | 61 | | Config | 负责加载和管理配置文件 | 62 | | Tracing | 负责日志管理 | 63 | | SessionManager | 负责session管理 | 64 | | MySQL Handler | 负责对外提供MySQL服务 | 65 | | ClickHouse Handler | 负责对外提供ClickHouse服务 | 66 | | HTTP Handler | 负责对外提供HTTP接口服务 | 67 | | Metrics API Service | 负责指标统计(Prometheus) | 68 | | RPC API Service | 负责RPC接口服务和节点间通信 | 69 | | Cluster Register | 负责节点的注册 | 70 | 71 | ### 1 Config 72 | 73 | #### 初始化 74 | 75 | ```rust 76 | // file: query/bin/databend-query.rs 77 | let conf: Config = Config::load()?; 78 | ``` 79 | 80 | #### 关键模块 81 | 82 | ##### Config 83 | 84 | ```rust 85 | // file: query/src/config/inner.rs 86 | pub struct Config { 87 | pub cmd: String, 88 | pub config_file: String, 89 | 90 | // Query engine config. 91 | pub query: QueryConfig, 92 | 93 | pub log: LogConfig, 94 | 95 | // Meta Service config. 96 | pub meta: MetaConfig, 97 | 98 | // Storage backend config. 99 | pub storage: StorageConfig, 100 | 101 | // external catalog config. 102 | // - Later, catalog information SHOULD be kept in KV Service 103 | // - currently only supports HIVE (via hive meta store) 104 | pub catalog: HiveCatalogConfig, 105 | } 106 | // func: load() -> Result 107 | // 加载 108 | pub fn load() -> Result { 109 | let cfg = OuterV0Config::load()?.try_into()?; // 通过try_into()转换到Config 110 | 111 | Ok(cfg) 112 | } 113 | 114 | // file: query/src/config/outer_v0.rs 115 | pub struct Config { 116 | /// Run a command and quit 117 | #[clap(long, default_value_t)] 118 | pub cmd: String, 119 | 120 | #[clap(long, short = 'c', default_value_t)] 121 | pub config_file: String, 122 | 123 | // Query engine config. 124 | #[clap(flatten)] 125 | pub query: QueryConfig, 126 | 127 | #[clap(flatten)] 128 | pub log: LogConfig, 129 | 130 | // Meta Service config. 131 | #[clap(flatten)] 132 | pub meta: MetaConfig, 133 | 134 | // Storage backend config. 135 | #[clap(flatten)] 136 | pub storage: StorageConfig, 137 | 138 | // external catalog config. 139 | // - Later, catalog information SHOULD be kept in KV Service 140 | // - currently only supports HIVE (via hive meta store) 141 | #[clap(flatten)] 142 | pub catalog: HiveCatalogConfig, 143 | } 144 | // func: load() -> Self 145 | // 从文件中加载 146 | /// Load will load config from file, env and args. 147 | /// 148 | /// - Load from file as default. 149 | /// - Load from env, will override config from file. 150 | /// - Load from args as finally override 151 | pub fn load() -> Result { 152 | let arg_conf = Self::parse(); 153 | 154 | let mut builder: serfig::Builder = serfig::Builder::default(); // serfig::Builder是基于serde的分层配置系统 155 | 156 | // Load from config file first. 157 | { 158 | let config_file = if !arg_conf.config_file.is_empty() { 159 | arg_conf.config_file.clone() 160 | } else if let Ok(path) = env::var("CONFIG_FILE") { 161 | path 162 | } else { 163 | "".to_string() 164 | }; 165 | 166 | builder = builder.collect(from_file(Toml, &config_file)); 167 | } 168 | 169 | // Then, load from env. 170 | builder = builder.collect(from_env()); 171 | 172 | // Finally, load from args. 173 | builder = builder.collect(from_self(arg_conf)); 174 | 175 | Ok(builder.build()?) 
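        // Note: layers collected later take precedence, so command-line arguments override
        // environment variables, which in turn override the TOML config file. A hypothetical
        // invocation like `CONFIG_FILE=./databend-query.toml databend-query --mysql-handler-port 3308`
        // (flag name inferred from the field name, not verified) would take the port from the
        // flag and everything else from the file.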
176 | } 177 | ``` 178 | 179 | ##### QueryConfig 180 | 181 | ```rust 182 | // file: query/src/config/inner.rs 183 | pub struct QueryConfig { 184 | /// Tenant id for get the information from the MetaSrv. 185 | pub tenant_id: String, 186 | /// ID for construct the cluster. 187 | pub cluster_id: String, 188 | pub num_cpus: u64, 189 | pub mysql_handler_host: String, 190 | pub mysql_handler_port: u16, 191 | pub max_active_sessions: u64, 192 | pub clickhouse_handler_host: String, 193 | pub clickhouse_handler_port: u16, 194 | pub http_handler_host: String, 195 | pub http_handler_port: u16, 196 | pub http_handler_result_timeout_millis: u64, 197 | pub flight_api_address: String, 198 | pub admin_api_address: String, 199 | pub metric_api_address: String, 200 | pub http_handler_tls_server_cert: String, 201 | pub http_handler_tls_server_key: String, 202 | pub http_handler_tls_server_root_ca_cert: String, 203 | pub api_tls_server_cert: String, 204 | pub api_tls_server_key: String, 205 | pub api_tls_server_root_ca_cert: String, 206 | /// rpc server cert 207 | pub rpc_tls_server_cert: String, 208 | /// key for rpc server cert 209 | pub rpc_tls_server_key: String, 210 | /// Certificate for client to identify query rpc server 211 | pub rpc_tls_query_server_root_ca_cert: String, 212 | pub rpc_tls_query_service_domain_name: String, 213 | /// Table engine memory enabled 214 | pub table_engine_memory_enabled: bool, 215 | /// Database engine github enabled 216 | pub database_engine_github_enabled: bool, 217 | pub wait_timeout_mills: u64, 218 | pub max_query_log_size: usize, 219 | /// Table Cached enabled 220 | pub table_cache_enabled: bool, 221 | /// Max number of cached table snapshot 222 | pub table_cache_snapshot_count: u64, 223 | /// Max number of cached table segment 224 | pub table_cache_segment_count: u64, 225 | /// Max number of cached table block meta 226 | pub table_cache_block_meta_count: u64, 227 | /// Table memory cache size (mb) 228 | pub table_memory_cache_mb_size: u64, 229 | /// Table disk cache folder root 230 | pub table_disk_cache_root: String, 231 | /// Table disk cache size (mb) 232 | pub table_disk_cache_mb_size: u64, 233 | /// If in management mode, only can do some meta level operations(database/table/user/stage etc.) with metasrv. 
234 | pub management_mode: bool, 235 | pub jwt_key_file: String, 236 | } 237 | ``` 238 | 239 | ##### LogConfig 240 | 241 | ```rust 242 | // file: common/tracing/src/config.rs 243 | use common_tracing::Config as LogConfig; 244 | pub struct Config { 245 | pub level: String, 246 | pub dir: String, 247 | pub query_enabled: bool, 248 | } 249 | ``` 250 | 251 | ##### MetaConfig 252 | 253 | ```rust 254 | // file: query/src/config/inner.rs 255 | pub struct MetaConfig { 256 | /// The dir to store persisted meta state for a embedded meta store 257 | pub embedded_dir: String, 258 | /// MetaStore backend address 259 | pub address: String, 260 | pub endpoints: Vec, 261 | /// MetaStore backend user name 262 | pub username: String, 263 | /// MetaStore backend user password 264 | pub password: String, 265 | /// Timeout for each client request, in seconds 266 | pub client_timeout_in_second: u64, 267 | /// Certificate for client to identify meta rpc serve 268 | pub rpc_tls_meta_server_root_ca_cert: String, 269 | pub rpc_tls_meta_service_domain_name: String, 270 | } 271 | ``` 272 | 273 | ##### StorageConfig 274 | 275 | ```rust 276 | // file: common/io/src/configs.rs 277 | pub struct StorageConfig { 278 | pub num_cpus: u64, 279 | 280 | pub params: StorageParams, 281 | } 282 | 283 | pub enum StorageParams { 284 | Azblob(StorageAzblobConfig), 285 | Fs(StorageFsConfig), 286 | #[cfg(feature = "storage-hdfs")] 287 | Hdfs(StorageHdfsConfig), 288 | Memory, 289 | S3(StorageS3Config), 290 | } 291 | 292 | pub struct StorageFsConfig { 293 | pub root: String, 294 | } 295 | 296 | pub struct StorageS3Config { 297 | pub endpoint_url: String, 298 | pub region: String, 299 | pub bucket: String, 300 | pub access_key_id: String, 301 | pub secret_access_key: String, 302 | pub master_key: String, 303 | pub root: String, 304 | } 305 | ``` 306 | 307 | ##### HiveCatalogConfig 308 | 309 | ```rust 310 | // file: query/src/config/inner.rs 311 | pub struct HiveCatalogConfig { 312 | pub meta_store_address: String, 313 | pub protocol: ThriftProtocol, 314 | } 315 | ``` 316 | 317 | ### 2 Tracing 318 | 319 | ```rust 320 | let _guards = init_global_tracing( 321 | app_name.as_str(), 322 | conf.log.dir.as_str(), 323 | conf.log.level.as_str(), 324 | ); 325 | ``` 326 | 327 | ### 3 SessionManager 328 | 329 | #### 初始化 330 | 331 | ```rust 332 | // file: query/bin/databend-query.rs 333 | let session_manager = SessionManager::from_conf(conf.clone()).await?; 334 | let mut shutdown_handle = ShutdownHandle::create(session_manager.clone()); 335 | ``` 336 | 337 | #### 关键模块 338 | 339 | ##### SessionManager 340 | 341 | ```rust 342 | // file: query/src/sessions/session_mgr.rs 343 | pub struct SessionManager { 344 | pub(in crate::sessions) conf: RwLock, 345 | pub(in crate::sessions) discovery: RwLock>, // 集群发现模块 346 | pub(in crate::sessions) catalogs: RwLock>, 347 | pub(in crate::sessions) http_query_manager: Arc, 348 | 349 | pub(in crate::sessions) max_sessions: usize, 350 | pub(in crate::sessions) active_sessions: Arc>>>, 351 | pub(in crate::sessions) storage_cache_manager: RwLock>, 352 | pub(in crate::sessions) query_logger: 353 | RwLock>>, 354 | pub status: Arc>, 355 | storage_operator: RwLock, 356 | storage_runtime: Arc, 357 | _guards: Vec, 358 | 359 | user_api_provider: RwLock>, 360 | role_cache_manager: RwLock>, 361 | // When typ is MySQL, insert into this map, key is id, val is MySQL connection id. 
362 | pub(crate) mysql_conn_map: Arc, String>>>, 363 | pub(in crate::sessions) mysql_basic_conn_id: AtomicU32, 364 | } 365 | ``` 366 | 367 | ##### Config 368 | 369 | ##### ClusterDiscovery 370 | 371 | ##### CatalogManager 372 | 373 | ##### HttpQueryManager 374 | 375 | ##### CacheManager 376 | 377 | ### 4 MySQL Handler 378 | 379 | #### 启动 380 | 381 | ```rust 382 | // file: query/bin/databend-query.rs 383 | // func: main() 384 | // MySQL handler. 385 | { 386 | let hostname = conf.query.mysql_handler_host.clone(); 387 | let listening = format!("{}:{}", hostname, conf.query.mysql_handler_port); // 生成服务地址 388 | let mut handler = MySQLHandler::create(session_manager.clone()); // 创建MySQLHandler 389 | let listening = handler.start(listening.parse()?).await?; // 启动MySQLHandler 390 | shutdown_handle.add_service(handler); // 填加到shutdown_handle 391 | } 392 | ``` 393 | 394 | #### 关键模块 395 | 396 | ##### MySQLHandler 397 | 398 | ```rust 399 | // file: query/src/servers/mysql/mysql_handler.rs 400 | /// 实现了Server trait,并提供了创建MySQL server的方法,包括listener_tcp(), listen_loop(), accept_socket(), reject_session()等 401 | pub struct MySQLHandler { 402 | sessions: Arc, 403 | abort_handle: AbortHandle, 404 | abort_registration: Option, 405 | join_handle: Option>, 406 | } 407 | // func: start(...) 408 | // 启动MySQL server 409 | async fn start(&mut self, listening: SocketAddr) -> Result { 410 | match self.abort_registration.take() { 411 | None => Err(ErrorCode::LogicalError("MySQLHandler already running.")), 412 | Some(registration) => { 413 | let rejected_rt = Arc::new(Runtime::with_worker_threads( 414 | 1, 415 | Some("mysql-handler".to_string()), 416 | )?); // 创建rejected runtime 417 | let (stream, listener) = Self::listener_tcp(listening).await?; // 创建TcpListenerStream 418 | let stream = Abortable::new(stream, registration); 419 | self.join_handle = Some(tokio::spawn(self.listen_loop(stream, rejected_rt))); // 监听客户端的连接请求,然后调用accept_socket()函数 420 | Ok(listener) 421 | } 422 | } 423 | } 424 | // func: accept_socket(...) 425 | // 接受或者拒绝session 426 | fn accept_socket(sessions: Arc, executor: Arc, socket: TcpStream) { 427 | executor.spawn(async move { 428 | match sessions.create_session(SessionType::MySQL).await { // 创建session 429 | Err(error) => Self::reject_session(socket, error).await, // 拒绝 430 | Ok(session) => { 431 | tracing::info!("MySQL connection coming: {:?}", socket.peer_addr()); 432 | if let Err(error) = MySQLConnection::run_on_stream(session, socket) { // 接受 433 | tracing::error!("Unexpected error occurred during query: {:?}", error); 434 | }; 435 | } 436 | } 437 | }); 438 | } 439 | ``` 440 | 441 | ##### MySQLConnection 442 | 443 | ```rust 444 | // query/src/servers/mysql/mysql_session.rs 445 | // func: run_on_stream(...) 
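// Binds a freshly accepted TCP connection to its Session: the tokio TcpStream is first turned
// into a blocking std stream so the session can be attached, converted back to non-blocking,
// and then AsyncMysqlIntermediary (driven by InteractiveWorker) runs on a dedicated
// single-threaded "mysql-query-executor" runtime.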
446 | pub fn run_on_stream(session: SessionRef, stream: TcpStream) -> Result<()> { 447 | let blocking_stream = Self::convert_stream(stream)?; // 转化为标准的TcpStream 448 | MySQLConnection::attach_session(&session, &blocking_stream)?; // attach session 449 | 450 | let non_blocking_stream = TcpStream::from_std(blocking_stream)?; 451 | let query_executor = 452 | Runtime::with_worker_threads(1, Some("mysql-query-executor".to_string()))?; 453 | Thread::spawn(move || { 454 | let join_handle = query_executor.spawn(async move { 455 | let client_addr = non_blocking_stream.peer_addr().unwrap().to_string(); 456 | let interactive_worker = InteractiveWorker::create(session, client_addr); // 创建InteractiveWorker 457 | let opts = IntermediaryOptions { 458 | process_use_statement_on_query: true, 459 | }; 460 | AsyncMysqlIntermediary::run_with_options( // 创建AsyncMysqlIntermediary,这个是一个第三方库,具体实现在InteractiveWorker中 461 | interactive_worker, 462 | non_blocking_stream, 463 | &opts, 464 | ) 465 | .await 466 | }); 467 | let _ = futures::executor::block_on(join_handle); 468 | }); 469 | Ok(()) 470 | } 471 | ``` 472 | 473 | ##### InteractiveWorker/InteractiveWorkerBase 474 | 475 | ```rust 476 | // query/src/servers/mysql/mysql_interactive_worker.rs 477 | /// 实现了AsyncMysqlShim trait,封装了InteractiveWorkerBase 478 | pub struct InteractiveWorker { 479 | session: SessionRef, 480 | base: InteractiveWorkerBase, // 实际的操作在InteractiveWorkerBase中 481 | version: String, 482 | salt: [u8; 20], 483 | client_addr: String, 484 | } 485 | // func: on_query(...) 486 | async fn on_query<'a>( 487 | &'a mut self, 488 | query: &'a str, 489 | writer: QueryResultWriter<'a, W>, 490 | ) -> Result<()> { 491 | if self.session.is_aborting() { 492 | writer.error( 493 | ErrorKind::ER_ABORTING_CONNECTION, 494 | "Aborting this connection. because we are try aborting server.".as_bytes(), 495 | )?; 496 | 497 | return Err(ErrorCode::AbortedSession( 498 | "Aborting this connection. because we are try aborting server.", 499 | )); 500 | } 501 | 502 | let mut writer = DFQueryResultWriter::create(writer); // 创建Response Wirter 503 | 504 | let instant = Instant::now(); 505 | let blocks = self.base.do_query(query).await; // 调用InteractiveWorkerBase的do_query(),获得查询结果blocks 506 | 507 | let format = self 508 | .session 509 | .get_shared_query_context() 510 | .await? 511 | .get_format_settings()?; 512 | let mut write_result = writer.write(blocks, &format); // 将查询结果写到writer 513 | 514 | if let Err(cause) = write_result { 515 | let suffix = format!("(while in query {})", query); 516 | write_result = Err(cause.add_message_back(suffix)); 517 | } 518 | 519 | histogram!( 520 | super::mysql_metrics::METRIC_MYSQL_PROCESSOR_REQUEST_DURATION, 521 | instant.elapsed() 522 | ); 523 | 524 | write_result 525 | } 526 | 527 | /// 实现MySQL server的处理细节 528 | struct InteractiveWorkerBase { 529 | session: SessionRef, // 是在MySQLHandler::accept_socket()中,由SessionManager::create_session()创建的 530 | generic_hold: PhantomData, 531 | } 532 | // func: do_query(...) 
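// Entry point for every statement arriving over the MySQL protocol: it first checks for
// federated (MySQL-compatibility) commands; otherwise it creates a QueryContext, parses the
// SQL with DfParser, chooses between the new Planner/InterpreterFactoryV2 and the old
// PlanParser/InterpreterFactory based on the session settings, and finally runs the
// interpreter through exec_query().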
533 | // 查询入口 534 | async fn do_query(&mut self, query: &str) -> Result<(Vec, String)> { 535 | match self.federated_server_command_check(query) { // 判断是不是Federated query 536 | Some(data_block) => { // 1 Federated查询 537 | tracing::info!("Federated query: {}", query); 538 | if data_block.num_rows() > 0 { 539 | tracing::info!("Federated response: {:?}", data_block); 540 | } 541 | Ok((vec![data_block], String::from(""))) 542 | } 543 | None => { // 2 普通查询 544 | tracing::info!("Normal query: {}", query); 545 | let context = self.session.create_query_context().await?; // 创建QueryContext,这里会获取Shared信息,具体查询Session的详解 546 | context.attach_query_str(query); // 记录query 547 | 548 | let settings = context.get_settings(); 549 | 550 | let (stmts, hints) = 551 | DfParser::parse_sql(query, context.get_current_session().get_type())?; // 对SQL进行词法语法分析,hits是什么??? 552 | 553 | // 创建Interpreter,比如SelectInterpreter或者SelectInterpreterV2等 554 | let interpreter: Result> = 555 | if settings.get_enable_new_processor_framework()? != 0 556 | && context.get_cluster().is_empty() 557 | && settings.get_enable_planner_v2()? != 0 558 | && stmts.get(0).map_or(false, InterpreterFactoryV2::check) 559 | { 560 | let mut planner = Planner::new(context.clone()); // 新的Planner 561 | planner 562 | .plan_sql(query) 563 | .await 564 | .and_then(|v| InterpreterFactoryV2::get(context.clone(), &v.0)) 565 | } else { 566 | let (plan, _) = PlanParser::parse_with_hint(query, context.clone()).await; // 旧的Planner 567 | plan.and_then(|v| InterpreterFactory::get(context.clone(), v)) 568 | }; 569 | 570 | let hint = hints 571 | .iter() 572 | .find(|v| v.error_code.is_some()) 573 | .and_then(|x| x.error_code); 574 | 575 | match (hint, interpreter) { 576 | (None, Ok(interpreter)) => Self::exec_query(interpreter, &context).await, // 执行查询 577 | (Some(code), Ok(interpreter)) => { // Error 1 578 | let res = Self::exec_query(interpreter, &context).await; 579 | match res { 580 | Ok(_) => Err(ErrorCode::UnexpectedError(format!( 581 | "Expected server error code: {} but got: Ok.", 582 | code 583 | ))), 584 | Err(e) => { 585 | if code != e.code() { 586 | return Err(ErrorCode::UnexpectedError(format!( 587 | "Expected server error code: {} but got: Ok.", 588 | code 589 | ))); 590 | } 591 | Ok((vec![DataBlock::empty()], String::from(""))) 592 | } 593 | } 594 | } 595 | (None, Err(e)) => { // Error 2 596 | InterpreterQueryLog::fail_to_start(context, e.clone()).await; 597 | Err(e) 598 | } 599 | (Some(code), Err(e)) => { // Error 3 600 | if code != e.code() { 601 | InterpreterQueryLog::fail_to_start(context, e.clone()).await; 602 | return Err(ErrorCode::UnexpectedError(format!( 603 | "Expected server error code: {} but got: Ok.", 604 | code 605 | ))); 606 | } 607 | Ok((vec![DataBlock::empty()], String::from(""))) 608 | } 609 | } 610 | } 611 | } 612 | } 613 | ``` 614 | 615 | ##### Session 616 | 617 | ```rust 618 | // file: query/src/sessions/session.rs 619 | pub struct Session { 620 | pub(in crate::sessions) id: String, 621 | #[ignore_malloc_size_of = "insignificant"] 622 | pub(in crate::sessions) typ: RwLock, 623 | #[ignore_malloc_size_of = "insignificant"] 624 | pub(in crate::sessions) session_mgr: Arc, 625 | pub(in crate::sessions) ref_count: Arc, 626 | pub(in crate::sessions) session_ctx: Arc, 627 | #[ignore_malloc_size_of = "insignificant"] 628 | session_settings: Settings, 629 | #[ignore_malloc_size_of = "insignificant"] 630 | status: Arc>, 631 | pub(in crate::sessions) mysql_connection_id: Option, 632 | } 633 | // func: create_query_context(...) 
-> Result> 634 | // 创建QueryContext 635 | /// Create a query context for query. 636 | /// For a query, execution environment(e.g cluster) should be immutable. 637 | /// We can bind the environment to the context in create_context method. 638 | pub async fn create_query_context(self: &Arc) -> Result> { 639 | let shared = self.get_shared_query_context().await?; 640 | 641 | Ok(QueryContext::create_from_shared(shared)) 642 | } 643 | // func: get_shared_query_context(...) -> Result> 644 | // 创建QueryContextShared 645 | pub async fn get_shared_query_context(self: &Arc) -> Result> { 646 | let discovery = self.session_mgr.get_cluster_discovery(); 647 | 648 | let session = self.clone(); 649 | let cluster = discovery.discover().await?; // 获取集群信息 650 | let shared = QueryContextShared::try_create(session, cluster).await?; // 创建QueryContextShared 651 | self.session_ctx 652 | .set_query_context_shared(Some(shared.clone())); 653 | Ok(shared) 654 | } 655 | ``` 656 | 657 | ##### QueryContextShared 658 | 659 | ```rust 660 | // file: query/src/sessions/query_ctx_shared.rs 661 | pub struct QueryContextShared { 662 | ... 663 | } 664 | ``` 665 | 666 | ##### QueryContext 667 | 668 | ```rust 669 | // file: query/src/sessions/query_ctx.rs 670 | pub struct QueryContext { 671 | version: String, 672 | statistics: Arc>, 673 | partition_queue: Arc>>, 674 | shared: Arc, 675 | precommit_blocks: Arc>>, 676 | } 677 | ``` 678 | 679 | ### 5 ClickHouse Handler 680 | 681 | ### 6 HTTP Handler 682 | 683 | ### 7 Metrics API Service 684 | 685 | #### 启动 686 | 687 | ```rust 688 | // file: query/bin/databend-query.rs 689 | init_default_metrics_recorder(); // 初始化metrics 690 | 691 | // Metric API service. 692 | { 693 | let address = conf.query.metric_api_address.clone(); 694 | let mut srv = MetricService::create(session_manager.clone()); 695 | let listening = srv.start(address.parse()?).await?; 696 | shutdown_handle.add_service(srv); 697 | tracing::info!("Metric API server listening on {}/metrics", listening); 698 | } 699 | ``` 700 | 701 | #### 关键模块 702 | 703 | ##### MetricService 704 | 705 | ```rust 706 | // file: query/src/metrics/metric_service.rs 707 | pub struct MetricService { 708 | shutdown_handler: HttpShutdownHandler, 709 | } 710 | // func: start() 711 | async fn start(&mut self, listening: SocketAddr) -> Result { 712 | self.start_without_tls(listening).await 713 | } 714 | // func: start_without_tls() 715 | async fn start_without_tls(&mut self, listening: SocketAddr) -> Result { 716 | let prometheus_handle = common_metrics::try_handle().ok_or_else(|| { 717 | ErrorCode::InitPrometheusFailure("Prometheus recorder has not been initialized yet.") 718 | })?; // 创建PrometheusHandle 719 | let app = poem::Route::new() 720 | .at("/metrics", poem::get(metric_handler)) 721 | .data(prometheus_handle); // 注册路由 722 | let addr = self 723 | .shutdown_handler 724 | .start_service(listening, None, app) // 启动服务 725 | .await?; 726 | Ok(addr) 727 | } 728 | // func: metric_handler() -> impl IntoResponse 729 | #[poem::handler] 730 | pub async fn metric_handler(prom_extension: Data<&PrometheusHandle>) -> impl IntoResponse { 731 | prom_extension.0.render() 732 | } 733 | ``` 734 | 735 | ##### PROMETHEUS_HANDLE 736 | 737 | ```rust 738 | // file: common/metrics/src/recorder.rs 739 | static PROMETHEUS_HANDLE: Lazy>>> = 740 | Lazy::new(|| Arc::new(RwLock::new(None))); 741 | 742 | pub const LABEL_KEY_TENANT: &str = "tenant"; 743 | pub const LABEL_KEY_CLUSTER: &str = "cluster_name"; 744 | 745 | pub fn init_default_metrics_recorder() { 746 | static START: Once = 
Once::new(); 747 | START.call_once(init_prometheus_recorder) 748 | } 749 | 750 | /// Init prometheus recorder. 751 | fn init_prometheus_recorder() { 752 | let recorder = PrometheusBuilder::new().build_recorder(); 753 | let mut h = PROMETHEUS_HANDLE.as_ref().write(); 754 | *h = Some(recorder.handle()); 755 | metrics::clear_recorder(); 756 | match metrics::set_boxed_recorder(Box::new(recorder)) { 757 | Ok(_) => (), 758 | Err(err) => tracing::warn!("Install prometheus recorder failed, cause: {}", err), 759 | }; 760 | } 761 | 762 | pub fn try_handle() -> Option { 763 | PROMETHEUS_HANDLE.as_ref().read().clone() 764 | } 765 | ``` 766 | 767 | ### 8 HTTP API Service 768 | 769 | #### 启动 770 | 771 | ```rust 772 | // file: query/bin/databend-query.rs 773 | // HTTP API service. 774 | { 775 | let address = conf.query.admin_api_address.clone(); 776 | let mut srv = HttpService::create(session_manager.clone()); 777 | let listening = srv.start(address.parse()?).await?; 778 | shutdown_handle.add_service(srv); 779 | tracing::info!("HTTP API server listening on {}", listening); 780 | } 781 | ``` 782 | 783 | #### 关键模块 784 | 785 | ##### HttpService 786 | 787 | ```rust 788 | // file: query/src/api/http_service.rs 789 | pub struct HttpService { 790 | sessions: Arc, 791 | shutdown_handler: HttpShutdownHandler, 792 | } 793 | // func: build_router() -> impl Endpoint 794 | fn build_router(&self) -> impl Endpoint { 795 | #[cfg_attr(not(feature = "memory-profiling"), allow(unused_mut))] 796 | let mut route = Route::new() 797 | .at("/v1/health", get(super::http::v1::health::health_handler)) 798 | .at("/v1/config", get(super::http::v1::config::config_handler)) 799 | .at("/v1/logs", get(super::http::v1::logs::logs_handler)) 800 | .at("/v1/status", get(super::http::v1::status::status_handler)) 801 | .at( 802 | "/v1/cluster/list", 803 | get(super::http::v1::cluster::cluster_list_handler), 804 | ) 805 | .at( 806 | "/debug/home", 807 | get(super::http::debug::home::debug_home_handler), 808 | ) 809 | .at( 810 | "/debug/pprof/profile", 811 | get(super::http::debug::pprof::debug_pprof_handler), 812 | ); 813 | 814 | #[cfg(feature = "memory-profiling")] 815 | { 816 | route = route.at( 817 | // to follow the conversions of jepref, we arrange the path in 818 | // this way, so that jeprof could be invoked like: 819 | // `jeprof ./target/debug/databend-query http://localhost:8080/debug/mem` 820 | // and jeprof will translate the above url into sth like: 821 | // "http://localhost:8080/debug/mem/pprof/profile?seconds=30" 822 | "/debug/mem/pprof/profile", 823 | get(super::http::debug::jeprof::debug_jeprof_dump_handler), 824 | ); 825 | }; 826 | route.data(self.sessions.clone()) 827 | } 828 | ``` 829 | 830 | ### 9 RPC API Service 831 | 832 | RpcClient会通过此服务启动Stage和获取Stage的输出数据流。 833 | 834 | #### Service启动 835 | 836 | ```rust 837 | // file: query/bin/databend-query.rs 838 | // RPC API service. 839 | { 840 | let address = conf.query.flight_api_address.clone(); 841 | let mut srv = RpcService::create(session_manager.clone()); 842 | let listening = srv.start(address.parse()?).await?; 843 | shutdown_handle.add_service(srv); 844 | tracing::info!("RPC API server listening on {}", listening); 845 | } 846 | ``` 847 | 848 | #### 关键模块 849 | 850 | ##### RpcService 851 | 852 | ```rust 853 | // file: query/src/api/rpc_service.rs 854 | pub struct RpcService { 855 | pub sessions: Arc, // 会话管理器 856 | pub abort_notify: Arc, 857 | pub dispatcher: Arc, // 数据流调度器,RpcClinet从这里获取查询结果 858 | } 859 | // func: create(...) 
-> Box 860 | // 创建RpcService 861 | pub fn create(sessions: Arc) -> Box { 862 | Box::new(Self { 863 | sessions, 864 | abort_notify: Arc::new(Notify::new()), 865 | dispatcher: Arc::new(DatabendQueryFlightDispatcher::create()), 866 | }) 867 | } 868 | // func: start(...) 869 | async fn start(&mut self, listening: SocketAddr) -> Result { 870 | let (listener_stream, listener_addr) = Self::listener_tcp(listening).await?; 871 | self.start_with_incoming(listener_stream).await?; 872 | Ok(listener_addr) 873 | } 874 | // func: start_with_incoming(...) 875 | pub async fn start_with_incoming(&mut self, listener_stream: TcpListenerStream) -> Result<()> { 876 | let sessions = self.sessions.clone(); 877 | let flight_dispatcher = self.dispatcher.clone(); 878 | let flight_api_service = DatabendQueryFlightService::create(flight_dispatcher, sessions); // 创建DatabendQueryFlightService 879 | let conf = self.sessions.get_conf(); 880 | let builder = Server::builder(); 881 | let mut builder = if conf.tls_rpc_server_enabled() { 882 | tracing::info!("databend query tls rpc enabled"); 883 | builder 884 | .tls_config(Self::server_tls_config(&conf).await.map_err(|e| { 885 | ErrorCode::TLSConfigurationFailure(format!( 886 | "failed to load server tls config: {e}", 887 | )) 888 | })?) 889 | .map_err(|e| { 890 | ErrorCode::TLSConfigurationFailure(format!("failed to invoke tls_config: {e}",)) 891 | })? 892 | } else { 893 | builder 894 | }; 895 | 896 | let server = builder 897 | .add_service(FlightServiceServer::new(flight_api_service)) // 创建FlightServiceServer 898 | .serve_with_incoming_shutdown(listener_stream, self.shutdown_notify()); 899 | 900 | common_base::base::tokio::spawn(server); // 启动FlightServiceServer 901 | Ok(()) 902 | } 903 | ``` 904 | 905 | ##### DatabendQueryFlightService 906 | 907 | ```rust 908 | // file: query/src/api/rpc/flight_service.rs 909 | pub struct DatabendQueryFlightService { 910 | sessions: Arc, 911 | dispatcher: Arc, 912 | } 913 | // func: do_action() 914 | // 启动查询计划 915 | async fn do_action(&self, request: Request) -> Response { 916 | common_tracing::extract_remote_span_as_parent(&request); 917 | 918 | let action = request.into_inner(); 919 | let flight_action: FlightAction = action.try_into()?; 920 | 921 | let action_result = match &flight_action { 922 | FlightAction::CancelAction(action) => { 923 | // We only destroy when session is exist 924 | let session_id = action.query_id.clone(); 925 | if let Some(session) = self.sessions.get_session_by_id(&session_id).await { 926 | // TODO: remove streams 927 | session.force_kill_session(); 928 | } 929 | 930 | FlightResult { body: vec![] } 931 | } 932 | FlightAction::BroadcastAction(action) => { 933 | let session_id = action.query_id.clone(); 934 | let is_aborted = self.dispatcher.is_aborted(); 935 | let session = self 936 | .sessions 937 | .create_rpc_session(session_id, is_aborted) 938 | .await?; 939 | 940 | self.dispatcher 941 | .broadcast_action(session, flight_action) 942 | .await?; 943 | FlightResult { body: vec![] } 944 | } 945 | FlightAction::PrepareShuffleAction(action) => { 946 | let session_id = action.query_id.clone(); 947 | let is_aborted = self.dispatcher.is_aborted(); 948 | let session = self 949 | .sessions 950 | .create_rpc_session(session_id, is_aborted) 951 | .await?; 952 | 953 | self.dispatcher 954 | .shuffle_action(session, flight_action) 955 | .await?; 956 | FlightResult { body: vec![] } 957 | } 958 | }; 959 | 960 | // let action_result = do_flight_action.await?; 961 | Ok(RawResponse::new( 962 | 
Box::pin(tokio_stream::once(Ok(action_result))) as FlightStream, 963 | )) 964 | } 965 | // func: do_get() 966 | // 获取查询结果数据流 967 | async fn do_get(&self, request: Request) -> Response { 968 | common_tracing::extract_remote_span_as_parent(&request); 969 | let ticket: FlightTicket = request.into_inner().try_into()?; 970 | 971 | match ticket { 972 | FlightTicket::StreamTicket(steam_ticket) => { 973 | let (receiver, data_schema) = self.dispatcher.get_stream(&steam_ticket)?; 974 | let arrow_schema = data_schema.to_arrow(); 975 | let ipc_fields = default_ipc_fields(&arrow_schema.fields); 976 | 977 | serialize_schema(&arrow_schema, Some(&ipc_fields)); 978 | 979 | Ok(RawResponse::new( 980 | Box::pin(FlightDataStream::create(receiver, ipc_fields)) 981 | as FlightStream, 982 | )) 983 | } 984 | } 985 | } 986 | ``` 987 | 988 | ##### DatabendQueryFlightDispatcher ⭐️ 989 | 990 | ```rust 991 | // file: query/src/api/rpc/flight_dispatcher.rs 992 | struct StreamInfo { 993 | #[allow(unused)] 994 | schema: DataSchemaRef, 995 | tx: mpsc::Sender>, 996 | rx: mpsc::Receiver>, 997 | } 998 | 999 | pub struct DatabendQueryFlightDispatcher { 1000 | streams: Arc>>, // key = "query_id/stage_id/sink" 1001 | stages_notify: Arc>>>, 1002 | abort: Arc, 1003 | } 1004 | // func: shuffle_action() 1005 | // 执行FlightAction::PrepareShuffleAction 1006 | pub async fn shuffle_action(&self, session: SessionRef, action: FlightAction) -> Result<()> { 1007 | let query_id = action.get_query_id(); 1008 | let stage_id = action.get_stage_id(); 1009 | let action_sinks = action.get_sinks(); 1010 | let data_schema = action.get_plan().schema(); 1011 | self.create_stage_streams(&query_id, &stage_id, &data_schema, &action_sinks); 1012 | 1013 | match action.get_sinks().len() { 1014 | 0 => Err(ErrorCode::LogicalError("")), 1015 | 1 => self.one_sink_action(session, &action).await, // 只有一个sink 1016 | _ => { 1017 | self.action_with_scatter::(session, &action) // 有多个sinks 1018 | .await 1019 | } 1020 | } 1021 | } 1022 | // func: create_stage_streams() 1023 | // 记录Stream 1024 | fn create_stage_streams( 1025 | &self, 1026 | query_id: &str, 1027 | stage_id: &str, 1028 | schema: &DataSchemaRef, 1029 | streams_name: &[String], 1030 | ) { 1031 | let stage_name = format!("{}/{}", query_id, stage_id); 1032 | self.stages_notify 1033 | .write() 1034 | .insert(stage_name.clone(), Arc::new(Notify::new())); 1035 | 1036 | let mut streams = self.streams.write(); 1037 | 1038 | for stream_name in streams_name { 1039 | let (tx, rx) = mpsc::channel(5); // 创建mpsc 1040 | let stream_name = format!("{}/{}", stage_name, stream_name); 1041 | 1042 | streams.insert(stream_name, StreamInfo { 1043 | schema: schema.clone(), 1044 | tx, 1045 | rx, 1046 | }); 1047 | } 1048 | } 1049 | // func: one_sink_action() 1050 | // 处理只有一个sink的情况 1051 | async fn one_sink_action(&self, session: SessionRef, action: &FlightAction) -> Result<()> { 1052 | let query_context = session.create_query_context().await?; 1053 | let action_context = QueryContext::create_from(query_context.clone()); 1054 | let pipeline_builder = PipelineBuilder::create(action_context.clone()); 1055 | 1056 | let query_plan = action.get_plan(); 1057 | action_context.attach_query_plan(&query_plan); 1058 | let mut pipeline = pipeline_builder.build(&query_plan)?; 1059 | 1060 | let action_sinks = action.get_sinks(); 1061 | let action_query_id = action.get_query_id(); 1062 | let action_stage_id = action.get_stage_id(); 1063 | 1064 | assert_eq!(action_sinks.len(), 1); 1065 | let stage_name = format!("{}/{}", action_query_id, 
action_stage_id); 1066 | let stages_notify = self.stages_notify.clone(); 1067 | 1068 | let stream_name = format!("{}/{}", stage_name, action_sinks[0]); // stream_name = "query_id/stage_id/sink" 1069 | let tx_ref = self.streams.read().get(&stream_name).map(|x| x.tx.clone()); // 获取tx,查询结果数据从这里发出 1070 | let tx = tx_ref.ok_or_else(|| ErrorCode::NotFoundStream("Not found stream"))?; 1071 | 1072 | query_context.try_spawn( 1073 | async move { 1074 | let _session = session; 1075 | wait_start(stage_name, stages_notify).await; 1076 | 1077 | match pipeline.execute().await { // 执行pipeline 1078 | Err(error) => { 1079 | tx.send(Err(error)).await.ok(); 1080 | } 1081 | Ok(mut abortable_stream) => { 1082 | while let Some(item) = abortable_stream.next().await { 1083 | if let Err(error) = tx.send(item).await { // 发送结果到mpsc 1084 | tracing::error!( 1085 | "Cannot push data when run_action_without_scatters. {}", 1086 | error 1087 | ); 1088 | break; 1089 | } 1090 | } 1091 | } 1092 | }; 1093 | } 1094 | .instrument(Span::current()), 1095 | )?; 1096 | Ok(()) 1097 | } 1098 | // func: action_with_scatter() 1099 | // 处理多个Sinks到情况 1100 | async fn action_with_scatter( 1101 | &self, 1102 | session: SessionRef, 1103 | action: &FlightAction, 1104 | ) -> Result<()> 1105 | where 1106 | T: FlightScatter + Send + 'static, 1107 | { 1108 | let query_context = session.create_query_context().await?; 1109 | let action_context = QueryContext::create_from(query_context.clone()); 1110 | let pipeline_builder = PipelineBuilder::create(action_context.clone()); 1111 | 1112 | let query_plan = action.get_plan(); 1113 | action_context.attach_query_plan(&query_plan); 1114 | let mut pipeline = pipeline_builder.build(&query_plan)?; // 创建pipeline 1115 | 1116 | let action_query_id = action.get_query_id(); 1117 | let action_stage_id = action.get_stage_id(); 1118 | 1119 | let sinks_tx = { 1120 | let action_sinks = action.get_sinks(); 1121 | 1122 | assert!(action_sinks.len() > 1); 1123 | let mut sinks_tx = Vec::with_capacity(action_sinks.len()); 1124 | 1125 | for sink in &action_sinks { // 遍历sinks 1126 | let stream_name = format!("{}/{}/{}", action_query_id, action_stage_id, sink); 1127 | match self.streams.read().get(&stream_name) { 1128 | Some(stream) => sinks_tx.push(stream.tx.clone()), 1129 | None => { 1130 | return Err(ErrorCode::NotFoundStream(format!( 1131 | "Not found stream {}", 1132 | stream_name 1133 | ))) 1134 | } 1135 | } 1136 | } 1137 | 1138 | Result::Ok(sinks_tx) 1139 | }?; 1140 | 1141 | let stage_name = format!("{}/{}", action_query_id, action_stage_id); 1142 | let stages_notify = self.stages_notify.clone(); 1143 | 1144 | let flight_scatter = T::try_create( 1145 | query_context.clone(), 1146 | action.get_plan().schema(), 1147 | action.get_scatter_expression(), 1148 | action.get_sinks().len(), 1149 | )?; 1150 | 1151 | query_context.try_spawn( 1152 | async move { 1153 | let _session = session; 1154 | wait_start(stage_name, stages_notify).await; 1155 | 1156 | let sinks_tx_ref = &sinks_tx; 1157 | let forward_blocks = async move { 1158 | let mut abortable_stream = pipeline.execute().await?; // 执行 1159 | while let Some(item) = abortable_stream.next().await { 1160 | let forward_blocks = flight_scatter.execute(&item?)?; // 散射 1161 | 1162 | assert_eq!(forward_blocks.len(), sinks_tx_ref.len()); 1163 | 1164 | for (index, forward_block) in forward_blocks.iter().enumerate() { 1165 | let tx: &Sender> = &sinks_tx_ref[index]; 1166 | tx.send(Ok(forward_block.clone())) // 把数据发送到sink对应的tx 1167 | .await 1168 | .map_err_to_code(ErrorCode::LogicalError, 
|| { 1169 | "Cannot push data when run_action" 1170 | })?; 1171 | } 1172 | } 1173 | 1174 | Result::Ok(()) 1175 | }; 1176 | 1177 | if let Err(error) = forward_blocks.await { 1178 | for tx in &sinks_tx { 1179 | if !tx.is_closed() { 1180 | let send_error_message = tx.send(Err(error.clone())); 1181 | let _ignore_send_error = send_error_message.await; 1182 | } 1183 | } 1184 | } 1185 | } 1186 | .instrument(Span::current()), 1187 | )?; 1188 | 1189 | Ok(()) 1190 | } 1191 | ``` 1192 | 1193 | ##### FlightScatter 1194 | 1195 | ```rust 1196 | // file: query/src/api/rpc/flight_scatter.rs 1197 | pub trait FlightScatter: Sized { 1198 | fn try_create( 1199 | ctx: Arc, 1200 | schema: DataSchemaRef, 1201 | expr: Option, 1202 | num: usize, 1203 | ) -> Result; 1204 | 1205 | fn execute(&self, data_block: &DataBlock) -> Result>; 1206 | } 1207 | ``` 1208 | 1209 | ##### HashFligthScatter 1210 | 1211 | ```rust 1212 | // file: query/src/api/rpc/flight_scatter_hash.rs 1213 | pub struct HashFlightScatter { 1214 | scatter_expression_executor: Arc, 1215 | scatter_expression_name: String, 1216 | scattered_size: usize, 1217 | } 1218 | 1219 | // func: 1220 | // 把输入的data_block Hash到多个sinks 1221 | fn execute(&self, data_block: &DataBlock) -> common_exception::Result> { 1222 | let expression_executor = self.scatter_expression_executor.clone(); 1223 | let evaluated_data_block = expression_executor.execute(data_block)?; 1224 | let indices = evaluated_data_block.try_column_by_name(&self.scatter_expression_name)?; 1225 | 1226 | let col: &PrimitiveColumn = Series::check_get(indices)?; 1227 | let indices: Vec = col.iter().map(|c| *c as usize).collect(); 1228 | DataBlock::scatter_block(data_block, &indices, self.scattered_size) 1229 | } 1230 | ``` 1231 | 1232 | ```rust 1233 | // file: common/datablocks/src/kernels/data_block_scatter.rs 1234 | // 散列DataBlock 1235 | pub fn scatter_block( 1236 | block: &DataBlock, 1237 | indices: &[usize], 1238 | scatter_size: usize, 1239 | ) -> Result> { 1240 | let columns_size = block.num_columns(); 1241 | let mut scattered_columns = Vec::with_capacity(scatter_size); 1242 | 1243 | for column_index in 0..columns_size { // 先把每一列打散 1244 | let column = block.column(column_index).scatter(indices, scatter_size); 1245 | scattered_columns.push(column); 1246 | } 1247 | 1248 | let mut scattered_blocks = Vec::with_capacity(scatter_size); 1249 | for index in 0..scatter_size { // 把属于同一个sink的所有列组装成DataBlock 1250 | let mut block_columns = vec![]; 1251 | 1252 | for item in scattered_columns.iter() { 1253 | block_columns.push(item[index].clone()) 1254 | } 1255 | scattered_blocks.push(DataBlock::create(block.schema().clone(), block_columns)); 1256 | } 1257 | 1258 | Ok(scattered_blocks) 1259 | } 1260 | ``` 1261 | 1262 | ##### BroadcastFlightScatter 1263 | 1264 | ```rust 1265 | // file: query/src/api/rpc/flight_scatter_broadcast.rs 1266 | pub struct BroadcastFlightScatter { 1267 | scattered_size: usize, 1268 | } 1269 | // func: execute(...) 1270 | // 广播数据,把输入的data_block广播到下游多个sink 1271 | fn execute(&self, data_block: &DataBlock) -> Result> { 1272 | let mut data_blocks = vec![]; 1273 | for _ in 0..self.scattered_size { 1274 | data_blocks.push(data_block.clone()); 1275 | } 1276 | 1277 | Ok(data_blocks) 1278 | } 1279 | ``` 1280 | 1281 | ### 10 Cluster Register 1282 | 1283 | #### Woker节点注册 1284 | 1285 | ```rust 1286 | // file: query/bin/databend-query.rs 1287 | // func: main() 1288 | // Cluster register. 
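// Fetches the ClusterDiscovery held by SessionManager and registers the local node with the
// meta service; on success register_to_metastore() also starts the heartbeat that keeps the
// registration alive (see ClusterDiscovery below).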
1281 | ### 10 Cluster Register
1282 | 
1283 | #### Worker node registration
1284 | 
1285 | ```rust
1286 | // file: query/bin/databend-query.rs
1287 | // func: main()
1288 | // Cluster register.
1289 | {
1290 |     let cluster_discovery = session_manager.get_cluster_discovery(); // get the ClusterDiscovery from the SessionManager
1291 |     let register_to_metastore = cluster_discovery.register_to_metastore(&conf); // register the local node with the meta service
1292 |     register_to_metastore.await?;
1293 |     // ...
1294 | }
1295 | ```
1296 | 
1297 | #### Key modules
1298 | 
1299 | ##### SessionManager
1300 | 
1301 | ```rust
1302 | // file: query/src/sessions/session_mgr.rs
1303 | /// Session management module; only the cluster-management related code is listed here
1304 | pub struct SessionManager {
1305 |     ...
1306 |     pub(in crate::sessions) discovery: RwLock<Arc<ClusterDiscovery>>,
1307 |     ...
1308 | }
1309 | // func: from_conf(...) -> Result<Arc<SessionManager>>
1310 | // create the ClusterDiscovery
1311 | let discovery = ClusterDiscovery::create_global(conf.clone()).await?; // initialize the ClusterDiscovery
1312 | SessionManager {
1313 |     ...
1314 |     discovery: RwLock::new(discovery),
1315 | }
1316 | // func: get_cluster_discovery()
1317 | // get the ClusterDiscovery
1318 | pub fn get_cluster_discovery(self: &Arc<Self>) -> Arc<ClusterDiscovery> {
1319 |     self.discovery.read().clone() // take the read lock, then clone the Arc
1320 | }
1321 | ```
1322 | 
1323 | ##### ClusterDiscovery/ClusterHeartbeat/Cluster/ClusterMgr
1324 | 
1325 | ```rust
1326 | // file: query/src/clusters/cluster.rs
1327 | /// Service discovery module: maintains the local node's ID and talks to the meta service, covering the current node's heartbeat, node registration, node removal and fetching cluster information
1328 | pub struct ClusterDiscovery {
1329 |     local_id: String, // UUID of the current worker node
1330 |     heartbeat: Mutex<ClusterHeartbeat>, // heartbeat service, used for the heartbeat between this worker and the meta service
1331 |     api_provider: Arc<dyn ClusterApi>, // trait object; can be any type implementing the ClusterApi trait, currently only ClusterMgr
1332 | }
1333 | // func: register_to_metastore(...)
1334 | // register the current node with the meta service
1335 | pub async fn register_to_metastore(self: &Arc<Self>, cfg: &Config) -> Result<()> {
1336 |     let cpus = cfg.query.num_cpus;
1337 |     // TODO: 127.0.0.1 || ::0
1338 |     let address = cfg.query.flight_api_address.clone();
1339 |     let node_info = NodeInfo::create(self.local_id.clone(), cpus, address); // create the node info
1340 | 
1341 |     self.drop_invalid_nodes(&node_info).await?; // remove stale entries left over by this node
1342 |     match self.api_provider.add_node(node_info.clone()).await { // register the current node's info through ClusterMgr
1343 |         Ok(_) => self.start_heartbeat(node_info).await, // if registration succeeds, start the heartbeat
1344 |         Err(cause) => Err(cause.add_message_back("(while cluster api add_node).")),
1345 |     }
1346 | }
1347 | 
1348 | 
1349 | /// Heartbeat module
1350 | struct ClusterHeartbeat {
1351 |     timeout: Duration,
1352 |     shutdown: Arc<AtomicBool>,
1353 |     shutdown_notify: Arc<Notify>,
1354 |     cluster_api: Arc<dyn ClusterApi>,
1355 |     shutdown_handler: Option<JoinHandle<()>>,
1356 | }
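// NOTE: hand-written sketch for this reading note, NOT the actual Databend
// source: a heartbeat loop built from the fields above typically keeps
// refreshing the node's entry in the meta service until shutdown is
// signalled, so the entry only expires (after lift_time, see ClusterMgr
// below) when the node really goes away. All names here are illustrative.
async fn heartbeat_loop_sketch(
    cluster_api: Arc<dyn ClusterApi>,
    node: NodeInfo,
    timeout: Duration,
    shutdown: Arc<AtomicBool>,
    shutdown_notify: Arc<Notify>,
) {
    while !shutdown.load(Ordering::Relaxed) {
        tokio::select! {
            // Wake up periodically and refresh the node's lease in the meta service.
            _ = tokio::time::sleep(timeout) => {
                if let Err(cause) = cluster_api.heartbeat(&node, None).await {
                    tracing::error!("Cluster heartbeat failed: {}", cause);
                }
            }
            // Wake up immediately when shutdown is requested.
            _ = shutdown_notify.notified() => break,
        }
    }
}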
1357 | // func: start(...)
1358 | // start the heartbeat loop
1359 | pub fn start(&mut self, node_info: NodeInfo) {
1360 |     self.shutdown_handler = Some(tokio::spawn(self.heartbeat_loop(node_info))); // spawn a tokio task that runs the loop
1361 | }
1362 | 
1363 | /// Cluster module: represents a cluster
1364 | pub struct Cluster {
1365 |     local_id: String, // ID of the current node
1366 |     nodes: Vec<Arc<NodeInfo>>, // info of every node in the cluster
1367 | }
1368 | // func: create_node_conn(&self, name: &str, config: &Config) -> Result<FlightClient>
1369 | // create a client that the current node uses to reach the node named `name`
1370 | for node in &self.nodes {
1371 |     if node.id == name {
1372 |         return match config.tls_query_cli_enabled() {
1373 |             true => // create a TLS client
1374 |             false => // create a plain client
1375 |         }
1376 |     }
1377 | }
1378 | ```
1379 | 
1380 | ```rust
1381 | // file: common/management/src/cluster/cluster_mgr.rs
1382 | /// Cluster management module: implements the ClusterApi trait and talks to the meta service (heartbeat, plus adding, fetching and removing node info)
1383 | pub struct ClusterMgr {
1384 |     kv_api: Arc<dyn KVApi>, // KVDB
1385 |     lift_time: Duration, // lifetime (TTL) of a node entry in the meta service
1386 |     cluster_prefix: String,
1387 | }
1388 | 
1389 | impl ClusterApi for ClusterMgr {
1390 |     async fn add_node(&self, node: NodeInfo) -> Result<u64> {...}
1391 |     async fn get_nodes(&self) -> Result<Vec<Arc<NodeInfo>>> {...}
1392 |     async fn drop_node(&self, node_id: String, seq: Option<u64>) -> Result<()> {...}
1393 |     async fn heartbeat(&self, node: &NodeInfo, seq: Option<u64>) -> Result<u64> {...}
1394 | }
1395 | ```
1396 | 
--------------------------------------------------------------------------------