├── .DS_Store ├── .gitignore ├── .gitmodules ├── .nvmrc ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets └── scss │ └── _variables_project.scss ├── config.toml ├── content └── zh │ ├── _index.html │ ├── docs │ ├── _index.md │ ├── best-practice │ │ ├── _index.md │ │ ├── async-message-tracing │ │ │ └── _index.md │ │ ├── http-header-case │ │ │ └── _index.md │ │ ├── image │ │ │ ├── eshop-demo-1.jpg │ │ │ ├── eshop-demo.jpg │ │ │ ├── monolith-microserivce.jpg │ │ │ ├── screenshot1.png │ │ │ ├── trace-screenshot-1.png │ │ │ ├── trace-screenshot-2.png │ │ │ ├── trace-screenshot-3.png │ │ │ ├── trace-screenshot-4.png │ │ │ ├── trace-screenshot-5.png │ │ │ ├── trace-screenshot-6.png │ │ │ └── tracing_mental_model.png │ │ ├── internal-redirect │ │ │ └── _index.md │ │ ├── method-level-trcing │ │ │ └── _index.md │ │ └── startup-dependence │ │ │ └── _index.md │ ├── common-problem │ │ ├── _index.md │ │ ├── application-start-fail │ │ │ └── _index.md │ │ ├── duplicate-tls-hosts │ │ │ └── _index.md │ │ ├── external-name-service-highjacks │ │ │ └── _index.md │ │ ├── image │ │ │ ├── envoy-initialize.png │ │ │ ├── externalname.png │ │ │ ├── pilot_total_rejected_configs.png │ │ │ ├── tcp-keepalive-package.png │ │ │ ├── tcp-keepalive-ss-1.png │ │ │ ├── tcp-keepalive-ss-2.png │ │ │ ├── tcp-keepalive-ss-3.png │ │ │ ├── tcp-keepalive-ss-4.png │ │ │ └── tcp-keepalive-ss-5.png │ │ ├── server-speaks-first-protocol │ │ │ └── _index.md │ │ └── tcp-keepalive │ │ │ └── _index.md │ ├── debug-istio │ │ ├── _index.md │ │ └── envoy-log │ │ │ ├── _index.md │ │ │ └── image │ │ │ ├── downstream-upstream.png │ │ │ ├── envoy-model.png │ │ │ └── request-route.png │ └── tcm │ │ └── _index.md │ ├── featured-background.jpeg │ ├── featured-background.jpg │ └── search.md ├── docker-compose.yaml ├── go.mod ├── go.sum ├── layouts └── 404.html ├── netlify.toml └── package.json /.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /public 2 | resources/ 3 | node_modules/ 4 | package-lock.json 5 | .hugo_build.lock -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | 2 | [submodule "themes/docsy"] 3 | path = themes/docsy 4 | url = https://github.com/google/docsy 5 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | lts/* 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. 
Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows 28 | [Google's Open Source Community Guidelines](https://opensource.google.com/conduct/). 29 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM klakegg/hugo:ext-alpine 2 | 3 | RUN apk add git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Istio 运维手册 2 | 3 | [在线阅读](https://istio-operation-bible.aeraki.net/) 4 | -------------------------------------------------------------------------------- /assets/scss/_variables_project.scss: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Add styles or override variables from the theme here. 4 | 5 | */ 6 | 7 | -------------------------------------------------------------------------------- /config.toml: -------------------------------------------------------------------------------- 1 | baseURL = "/" 2 | title = "Istio 运维实战" 3 | 4 | # Language settings 5 | contentDir = "content" 6 | defaultContentLanguage = "zh" 7 | defaultContentLanguageInSubdir = false 8 | # Useful when translating. 9 | enableMissingTranslationPlaceholders = true 10 | 11 | enableRobotsTXT = true 12 | 13 | # Will give values to .Lastmod etc. 
14 | enableGitInfo = true 15 | 16 | # Comment out to enable taxonomies in Docsy 17 | # disableKinds = ["taxonomy", "taxonomyTerm"] 18 | 19 | # You can add your own taxonomies 20 | [taxonomies] 21 | tag = "tags" 22 | category = "categories" 23 | 24 | [params.taxonomy] 25 | # set taxonomyCloud = [] to hide taxonomy clouds 26 | taxonomyCloud = ["tags", "categories"] 27 | 28 | # If used, must have same length as taxonomyCloud 29 | taxonomyCloudTitle = ["Tag Cloud", "Categories"] 30 | 31 | # set taxonomyPageHeader = [] to hide taxonomies on the page headers 32 | taxonomyPageHeader = ["tags", "categories"] 33 | 34 | 35 | # Highlighting config 36 | pygmentsCodeFences = true 37 | pygmentsUseClasses = false 38 | # Use the new Chroma Go highlighter in Hugo. 39 | pygmentsUseClassic = false 40 | #pygmentsOptions = "linenos=table" 41 | # See https://help.farbox.com/pygments.html 42 | pygmentsStyle = "tango" 43 | 44 | # Configure how URLs look like per section. 45 | [permalinks] 46 | blog = "/:section/:year/:month/:day/:slug/" 47 | 48 | ## Configuration for BlackFriday markdown parser: https://github.com/russross/blackfriday 49 | [blackfriday] 50 | plainIDAnchors = true 51 | hrefTargetBlank = true 52 | angledQuotes = false 53 | latexDashes = true 54 | 55 | # Image processing configuration. 56 | [imaging] 57 | resampleFilter = "CatmullRom" 58 | quality = 75 59 | anchor = "smart" 60 | 61 | [services] 62 | [services.googleAnalytics] 63 | # Comment out the next line to disable GA tracking. Also disables the feature described in [params.ui.feedback]. 
64 | id = "UA-00000000-0" 65 | 66 | # Language configuration 67 | 68 | [languages.zh] 69 | title = "Istio 运维实战" 70 | description = "Istio 运维实战" 71 | languageName ="中文" 72 | contentDir = "content/zh" 73 | time_format_default = "2006.02.01" 74 | time_format_blog = "2006.02.01" 75 | 76 | [markup] 77 | [markup.goldmark] 78 | [markup.goldmark.renderer] 79 | unsafe = true 80 | [markup.highlight] 81 | # See a complete list of available styles at https://xyproto.github.io/splash/docs/all.html 82 | style = "tango" 83 | # Uncomment if you want your chosen highlight style used for code blocks without a specified language 84 | # guessSyntax = "true" 85 | 86 | # Everything below this are Site Params 87 | 88 | # Comment out if you don't want the "print entire section" link enabled. 89 | [outputs] 90 | section = ["HTML", "print", "RSS"] 91 | 92 | [params] 93 | copyright = "Aeraki Mesh" 94 | privacy_policy = "https://www.aeraki.net/" 95 | 96 | # First one is picked as the Twitter card image if not set on page. 97 | # images = ["images/project-illustration.png"] 98 | 99 | # Menu title if your navbar has a versions selector to access old versions of your site. 100 | # This menu appears only if you have at least one [params.versions] set. 101 | version_menu = "Releases" 102 | 103 | # Flag used in the "version-banner" partial to decide whether to display a 104 | # banner on every page indicating that this is an archived version of the docs. 105 | # Set this flag to "true" if you want to display the banner. 106 | archived_version = false 107 | 108 | # The version number for the version of the docs represented in this doc set. 109 | # Used in the "version-banner" partial to display a version number for the 110 | # current doc set. 111 | version = "0.0" 112 | 113 | # A link to latest version of the docs. Used in the "version-banner" partial to 114 | # point people to the main doc site. 
115 | url_latest_version = "https://github.com/aeraki-mesh/istio-operation-bible" 116 | 117 | # Repository configuration (URLs for in-page links to opening issues and suggesting changes) 118 | github_repo = "https://github.com/aeraki-mesh/istio-operation-bible" 119 | # An optional link to a related project repo. For example, the sibling repository where your product code lives. 120 | #github_project_repo = "https://github.com/aeraki-mesh/aeraki" 121 | 122 | # Specify a value here if your content directory is not in your repo's root directory 123 | # github_subdir = "" 124 | 125 | # Uncomment this if your GitHub repo does not have "main" as the default branch, 126 | # or specify a new value if you want to reference another branch in your GitHub links 127 | github_branch= "master" 128 | 129 | # Google Custom Search Engine ID. Remove or comment out to disable search. 130 | gcs_engine_id = "d72aa9b2712488cc3" 131 | 132 | # Enable Algolia DocSearch 133 | algolia_docsearch = false 134 | 135 | # Enable Lunr.js offline search 136 | offlineSearch = false 137 | 138 | # Enable syntax highlighting and copy buttons on code blocks with Prism 139 | prism_syntax_highlighting = false 140 | 141 | # User interface configuration 142 | [params.ui] 143 | # Set to true to disable breadcrumb navigation. 144 | breadcrumb_disable = false 145 | # Set to true to disable the About link in the site footer 146 | footer_about_disable = true 147 | # Set to false if you don't want to display a logo (/assets/icons/logo.svg) in the top navbar 148 | navbar_logo = true 149 | # Set to true if you don't want the top navbar to be translucent when over a `block/cover`, like on the homepage. 150 | navbar_translucent_over_cover_disable = false 151 | # Enable to show the side bar menu in its compact state. 
152 | sidebar_menu_compact = true 153 | ul_show = 1 154 | sidebar_menu_foldable = true 155 | sidebar_cache_limit = 100 156 | # Set to true to hide the sidebar search box (the top nav search box will still be displayed if search is enabled) 157 | sidebar_search_disable = false 158 | 159 | # Adds a H2 section titled "Feedback" to the bottom of each doc. The responses are sent to Google Analytics as events. 160 | # This feature depends on [services.googleAnalytics] and will be disabled if "services.googleAnalytics.id" is not set. 161 | # If you want this feature, but occasionally need to remove the "Feedback" section from a single page, 162 | # add "hide_feedback: true" to the page's front matter. 163 | [params.ui.feedback] 164 | enable = false 165 | # The responses that the user sees after clicking "yes" (the page was helpful) or "no" (the page was not helpful). 166 | yes = 'Glad to hear it! Please tell us how we can improve.' 167 | no = 'Sorry to hear that. Please tell us how we can improve.' 168 | 169 | # Adds a reading time to the top of each doc. 170 | # If you want this feature, but occasionally need to remove the Reading time from a single page, 171 | # add "hide_readingtime: true" to the page's front matter 172 | [params.ui.readingtime] 173 | enable = false 174 | 175 | [params.links] 176 | # Developer relevant links. These will show up on right side of footer and in the community page if you have one. 177 | [[params.links.developer]] 178 | name = "GitHub" 179 | url = "https://github.com/aeraki-mesh/istio-operation-bible" 180 | icon = "fab fa-github" 181 | desc = "Development takes place here!" 
182 | [params.plantuml] 183 | enable = true 184 | theme = "default" 185 | 186 | #Set url to plantuml server 187 | #default is http://www.plantuml.com/plantuml/svg/ 188 | svg_image_url = "https://www.plantuml.com/plantuml/svg/" 189 | 190 | # hugo module configuration 191 | 192 | [module] 193 | # uncomment line below for temporary local development of module 194 | # replacements = "github.com/google/docsy -> ../../docsy" 195 | [module.hugoVersion] 196 | extended = true 197 | min = "0.75.0" 198 | [[module.imports]] 199 | path = "github.com/google/docsy" 200 | disable = false 201 | [[module.imports]] 202 | path = "github.com/google/docsy/dependencies" 203 | disable = false 204 | -------------------------------------------------------------------------------- /content/zh/_index.html: -------------------------------------------------------------------------------- 1 | +++ 2 | title = "Istio 运维实战" 3 | linkTitle = "Istio 运维实战" 4 | 5 | +++ 6 | 7 | {{< blocks/cover title="Istio 运维实战" image_anchor="top" height="full" color="orange" >}} 8 |
9 | }}"> 10 | 在线阅读 11 | 12 | 13 | 查看源码 14 | 15 | 19 |
20 | {{< /blocks/cover >}} 21 | 22 | 23 | 86 | -------------------------------------------------------------------------------- /content/zh/docs/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Istio 运维实战" 4 | linkTitle: "Istio 运维实战" 5 | weight: 1 6 | date: 2022-07-05 7 | menu: 8 | main: 9 | weight: 20 10 | pre: 11 | --- 12 | ## 前言 13 | 通过将微服务中原本在 SDK 中实现的应用流量管理、可见性、通信安全等服务治理能力下放到一个专门的“服务网格”基础设施中,Istio 解开了微服务的服务治理需求和业务逻辑之间的代码、编译、部署时机等的耦合,让微服务真正做到了承诺的“按需选择开发语言”,“独立部署升级”等能力,提升了微服务开发和部署的敏捷性,释放了微服务模式的生产力。 14 | 15 | 然而,“服务网格”这一基础设施的引入也给整个微服务的运维技术栈带来了新的挑战。对于运维同学来说,Istio 和 Envoy 的运维存在着较陡的学习曲线。腾讯云 TCM(Tencent Cloud Mesh)团队是业内最早一批接触服务网格技术的人员之一,有着大量 Istio/Envoy 故障排查和运维经验。本电子书记录了腾讯云 TCM 团队从大量实际案例中总结出来的 Istio 运维经验,以及使用 Istio 的最佳实践,希望对大家有所帮助。 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /content/zh/docs/best-practice/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Istio 最佳实践" 3 | linkTitle: "Istio 最佳实践" 4 | weight: 3 5 | description: > 6 | 介绍用户从 Spring Cloud,Dubbo 等传统微服务框架迁移到 Istio 服务网格时的最佳实践 7 | --- -------------------------------------------------------------------------------- /content/zh/docs/best-practice/async-message-tracing/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "在 Istio 中实现异步消息调用跟踪" 3 | linkTitle: "在 Istio 中实现异步消息调用跟踪" 4 | weight: 5 5 | date: 2022-07-06 6 | description: 7 | --- 8 | 9 | 在实际项目中,除了同步调用之外,异步消息也是微服务架构中常见的一种通信方式。在本篇文章中,我将继续利用 eshop demo 程序来探讨如何通过 OpenTracing 将 Kafka 异步消息也纳入到 Istio 的分布式调用跟踪中。 10 | 11 | # eshop 示例程序结构 12 | 13 | 如下图所示,demo 程序中增加了发送和接收 Kafka 消息的代码。eshop 微服务在调用 inventory,billing,delivery 服务后,发送了一个 kafka 消息通知,consumer 接收到通知后调用 notification 服务的 REST 接口向用户发送购买成功的邮件通知。 14 | ![ 'eshop-demo.jpg'](image/eshop-demo-1.jpg) 15 | 16 | # 将 Kafka 消息处理加入调用链跟踪 17 | 18 | ## 植入 Kafka OpenTracing 代码 19 | 首先从 
github 下载代码。 20 | 21 | ```bash 22 | git clone git@github.com:aeraki-framework/method-level-tracing-with-istio.git 23 | ``` 24 | 25 | 可以直接使用该代码,但建议跟随下面的步骤查看相关的代码,以了解各个步骤背后的原理。 26 | 27 | 根目录下分为了 rest-service 和 kafka-consumer 两个目录,rest-service 下包含了各个 REST 服务的代码,kafka-consumer 下是 Kafka 消息消费者的代码。 28 | 29 | 首先需要将 spring kafka 和 OpenTracing kafka 的依赖加入到两个目录下的 pom 文件中。 30 | 31 | ```xml 32 | 33 | org.springframework.kafka 34 | spring-kafka 35 | 36 | 37 | io.opentracing.contrib 38 | opentracing-kafka-client 39 | ${version.opentracing.kafka-client} 40 | 41 | ``` 42 | 43 | 在 rest-service 目录中的 KafkaConfig.java 中配置消息 Producer 端的 OpenTracing Instrument。TracingProducerInterceptor 会在发送 Kafka 消息时生成发送端的 Span。 44 | 45 | ```java 46 | @Bean 47 | public ProducerFactory producerFactory() { 48 | Map configProps = new HashMap<>(); 49 | configProps.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapAddress); 50 | configProps.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class); 51 | configProps.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class); 52 | configProps.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG, TracingProducerInterceptor.class.getName()); 53 | return new DefaultKafkaProducerFactory<>(configProps); 54 | } 55 | ``` 56 | 57 | 在 kafka-consumer 目录中的 KafkaConfig.java 中配置消息 Consumer 端的 OpenTracing Instrument。TracingConsumerInterceptor 会在接收到 Kafka 消息时生成接收端的 Span。 58 | 59 | ```java 60 | @Bean 61 | public ConsumerFactory consumerFactory() { 62 | Map props = new HashMap<>(); 63 | props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapAddress); 64 | props.put(ConsumerConfig.GROUP_ID_CONFIG, groupId); 65 | props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); 66 | props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class); 67 | props.put(ConsumerConfig.INTERCEPTOR_CLASSES_CONFIG, TracingConsumerInterceptor.class.getName()); 68 | return new DefaultKafkaConsumerFactory<>(props); 69 | } 70 | 
``` 71 | 只需要这两步即可完成 Spring 程序的 Kafka OpenTracing 代码植入。下面安装并运行示例程序查看效果。 72 | 73 | ## 安装 Kafka 集群 74 | 75 | 示例程序中使用到了 Kafka 消息,因此我们在 TKE 集群中部署一个简单的 Kafka 实例: 76 | 77 | ```bash 78 | cd method-level-tracing-with-istio 79 | kubectl apply -f k8s/kafka.yaml 80 | ``` 81 | 82 | ## 部署 demo 应用 83 | 84 | 修改 Kubernetes yaml 部署文件 k8s/eshop.yaml,设置 Kafka bootstrap server,以用于 demo 程序连接到 Kafka 集群中。 85 | 86 | ```yml 87 | apiVersion: apps/v1 88 | kind: Deployment 89 | metadata: 90 | name: delivery 91 | ...... 92 | spec: 93 | containers: 94 | - name: eshop 95 | image: aeraki/istio-opentracing-demo:latest 96 | ports: 97 | - containerPort: 8080 98 | env: 99 | .... 100 | //在这里加入 Kafka server 地址 101 | - name: KAFKA_BOOTSTRAP_SERVERS 102 | value: "kafka-service:9092" 103 | 104 | --- 105 | apiVersion: apps/v1 106 | kind: Deployment 107 | metadata: 108 | name: kafka-consumer 109 | ...... 110 | spec: 111 | containers: 112 | - name: kafka-consumer 113 | image: aeraki/istio-opentracing-demo-kafka-consumer:latest 114 | env: 115 | .... 116 | //在这里加入 Kafka server 地址 117 | - name: KAFKA_BOOTSTRAP_SERVERS 118 | value: "kafka-service:9092" 119 | ``` 120 | 121 | 然后部署应用程序,相关的镜像可以直接从 dockerhub 下载,也可以通过源码编译生成。 122 | 123 | ```bash 124 | kubectl apply -f k8s/eshop.yaml 125 | ``` 126 | 127 | 在浏览器中打开地址:http://${INGRESS_EXTERNAL_IP}/checkout ,以触发调用 eshop 示例程序的 REST 接口。然后打开 TCM 的界面查看生成的分布式调用跟踪信息。 128 | ![ 'Screen Shot 2021-04-01 at 2.43.06 PM.png'](image/trace-screenshot-5.png) 129 | 130 | 从图中可以看到,在调用链中增加了两个 Span,分别对应于Kafka消息发送和接收的两个操作。由于Kafka消息的处理是异步的,消息发送端不直接依赖接收端的处理。根据 OpenTracing 对引用关系的定义,From_eshop_topic Span 对 To_eshop_topic Span 的引用关系是 FOLLOWS_FROM 而不是 CHILD_OF 关系。 131 | 132 | # 将调用跟踪上下文从Kafka传递到REST服务 133 | 134 | 现在 eshop 代码中已经加入了 REST 和 Kafka 的 OpenTracing Instrumentation,可以在进行 REST 调用和发送 Kafka 消息时生成调用跟踪信息。但如果需要从 Kafka 的消息消费者的处理方法中调用一个 REST 接口呢? 
135 | 136 | 我们会发现在 eshop 示例程序中,缺省生成的调用链里面并不会把 Kafka 消费者的 Span 和其发起的调用 notification 服务的 REST 请求的 Span 关联在同一个 Trace 中。 137 | 138 | 要分析导致该问题的原因,我们首先需要了解[“Active Span”](https://opentracing.io/docs/overview/scopes-and-threading/)的概念。在 OpenTracing 中,一个线程可以有一个 Active Span,该 Active Span 代表了目前该线程正在执行的工作。在调用 Tracer.buildSpan() 方法创建新的 Span 时,如果 Tracer 目前存在一个 Active Span,则会将该 Active Span 缺省作为新创建的 Span 的 Parent Span。 139 | 140 | Tracer.buildSpan 方法的说明如下: 141 | 142 | ```java 143 | Tracer.SpanBuilder buildSpan(String operationName) 144 | Return a new SpanBuilder for a Span with the given `operationName`. 145 | You can override the operationName later via BaseSpan.setOperationName(String). 146 | 147 | A contrived example: 148 | 149 | 150 | Tracer tracer = ... 151 | 152 | // Note: if there is a `tracer.activeSpan()`, it will be used as the target of an implicit CHILD_OF 153 | // Reference for "workSpan" when `startActive()` is invoked. 154 | // 如果存在 active span,则其创建的新 Span 会隐式地创建一个 CHILD_OF 引用到该 active span 155 | try (ActiveSpan workSpan = tracer.buildSpan("DoWork").startActive()) { 156 | workSpan.setTag("...", "..."); 157 | // etc, etc 158 | } 159 | 160 | // 也可以通过 asChildOf 方法指定新创建的 Span 的 Parent Span 161 | // It's also possible to create Spans manually, bypassing the ActiveSpanSource activation. 
162 | Span http = tracer.buildSpan("HandleHTTPRequest") 163 | .asChildOf(rpcSpanContext) // an explicit parent 164 | .withTag("user_agent", req.UserAgent) 165 | .withTag("lucky_number", 42) 166 | .startManual(); 167 | ``` 168 | 169 | 分析 Kafka OpenTracing Instrumentation 的代码,会发现 TracingConsumerInterceptor 在调用 Kafka 消费者的处理方法之前已经把消费者的 Span 结束了,因此发起 REST 调用时 tracer 没有 active span,不会将 Kafka 消费者的 Span 作为后面 REST 调用的 parent span。 170 | 171 | ```java 172 | public static void buildAndFinishChildSpan(ConsumerRecord record, Tracer tracer, 173 | BiFunction consumerSpanNameProvider) { 174 | SpanContext parentContext = TracingKafkaUtils.extractSpanContext(record.headers(), tracer); 175 | 176 | String consumerOper = 177 | FROM_PREFIX + record.topic(); // <====== It provides better readability in the UI 178 | Tracer.SpanBuilder spanBuilder = tracer 179 | .buildSpan(consumerSpanNameProvider.apply(consumerOper, record)) 180 | .withTag(Tags.SPAN_KIND.getKey(), Tags.SPAN_KIND_CONSUMER); 181 | 182 | if (parentContext != null) { 183 | spanBuilder.addReference(References.FOLLOWS_FROM, parentContext); 184 | } 185 | 186 | Span span = spanBuilder.start(); 187 | SpanDecorator.onResponse(record, span); 188 | 189 | //在调用消费者的处理方法之前,该 Span 已经被结束。 190 | span.finish(); 191 | 192 | // Inject created span context into record headers for extraction by client to continue span chain 193 | //这个 Span 被放到了 Kafka 消息的 header 中 194 | TracingKafkaUtils.inject(span.context(), record.headers(), tracer); 195 | } 196 | ``` 197 | 198 | 此时 TracingConsumerInterceptor 已经将 Kafka 消费者的 Span 放到了 Kafka 消息的 header 中,因此从 Kafka 消息头中取出该 Span,显式地将 Kafka 消费者的 Span 作为 REST 调用的 Parent Span 即可。 199 | 200 | 为 MessageConsumer.java 使用的 RestTemplate 设置一个 TracingKafka2RestTemplateInterceptor。 201 | 202 | ```java 203 | @KafkaListener(topics = "eshop-topic") 204 | public void receiveMessage(ConsumerRecord record) { 205 | restTemplate 206 | .setInterceptors(Collections.singletonList(new TracingKafka2RestTemplateInterceptor(record.headers()))); 
207 | restTemplate.getForEntity("http://notification:8080/sendEmail", String.class); 208 | } 209 | ``` 210 | 211 | TracingKafka2RestTemplateInterceptor 是基于 Spring OpenTracing Instrumentation 的 TracingRestTemplateInterceptor 修改的,将从 Kafka header 中取出的 Span 设置为出向请求的 Span 的 Parent Span。 212 | 213 | ```java 214 | @Override 215 | public ClientHttpResponse intercept(HttpRequest httpRequest, byte[] body, ClientHttpRequestExecution execution) 216 | throws IOException { 217 | ClientHttpResponse httpResponse; 218 | SpanContext parentSpanContext = TracingKafkaUtils.extractSpanContext(headers, tracer); 219 | Span span = tracer.buildSpan(httpRequest.getMethod().toString()).asChildOf(parentSpanContext) 220 | .withTag(Tags.SPAN_KIND.getKey(), Tags.SPAN_KIND_CLIENT).start(); 221 | ...... 222 | } 223 | ``` 224 | 225 | 在浏览器中打开地址:http://${INGRESS_EXTERNAL_IP}/checkout ,以触发调用 eshop 示例程序的 REST 接口。然后打开 TCM 的界面查看生成的分布式调用跟踪信息。 226 | ![ 'WeChatWorkScreenshot_487c2202-4960-48be-b6f6-33fbec457cf8 copy.png'](image/trace-screenshot-5.png) 227 | 228 | 从上图可以看到,调用链中出现了 Kafka 消费者调用 notification 服务的 sendEmail REST 接口的 Span。从图中可以看到,由于调用链经过了 Kafka 消息,sendEmail Span 的时间没有包含在 checkout Span 中。 229 | 230 | # 总结 231 | 232 | Istio 服务网格通过分布式调用跟踪来提高微服务应用的可见性,这需要在应用程序中通过 HTTP header 传递调用跟踪的上下文。对于 Java 应用程序,我们可以使用 OpenTracing Instrumentation 来代替应用编码传递分布式跟踪的相关 http header,以减少对业务代码的影响;我们还可以将方法级的调用跟踪和 Kafka 消息的调用跟踪加入到 Istio 生成的调用跟踪链中,以为应用程序的故障定位提供更为丰富详细的调用跟踪信息。 233 | 234 | # 参考资料 235 | 236 | 1. 
[本文中 eshop 示例程序的源代码](https://github.com/aeraki-framework/method-level-tracing-with-istio) 237 | 238 | -------------------------------------------------------------------------------- /content/zh/docs/best-practice/http-header-case/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "在 Istio 中指定 HTTP Header 大小写" 4 | linkTitle: "在 Istio 中指定 HTTP Header 大小写" 5 | weight: 2 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | # 在 Istio 中指定 HTTP Header 大小写 11 | 12 | ## 问题背景 13 | 14 | Envoy 缺省会把 HTTP Header 的 key 转换为小写,例如有一个 HTTP Header Test-Upper-Case-Header: some-value,经过 Envoy 代理后会变成 test-upper-case-header: some-value。这个在正常情况下没问题,RFC 2616 规范也说明了处理 HTTP Header 应该是大小写不敏感的。 15 | 16 | 部分场景下,业务请求对某些 Header 字段有大小写要求,此时被 Envoy 转换成为小写会导致请求出现问题。 17 | 18 | ## 解决方案 19 | 20 | Envoy 支持几种不同的 Header 规则: 21 | - 全小写(默认规则) 22 | - 首字母大写 23 | 24 | Envoy 1.18 之后新增支持: 25 | - 保留请求原本样式 26 | 27 | 基于以上能力,为了解决 Header 默认改为小写的问题,在 Istio 1.8 及之前可配置成为首字母大写形式,Istio 1.10 及以后可以配置保留 Header 原有样式。 28 | 29 | ## 配置方法 30 | 31 | Istio 1.8 及之前可添加如下 EnvoyFilter 配置: 32 | ```yaml 33 | apiVersion: networking.istio.io/v1alpha3 34 | kind: EnvoyFilter 35 | metadata: 36 | name: http-header-proper-case-words 37 | namespace: istio-system 38 | spec: 39 | configPatches: 40 | - applyTo: CLUSTER 41 | match: 42 | context: SIDECAR_OUTBOUND 43 | cluster: 44 | # 集群名称可通过 ConfigDump 查询 45 | name: "outbound|3000||test2.default.svc.cluster.local" 46 | patch: 47 | operation: MERGE 48 | value: 49 | http_protocol_options: 50 | header_key_format: 51 | proper_case_words: {} 52 | ``` 53 | 在需要依赖大写 Header 的服务对应的集群中添加规则,将 Header 全部转为首字母大写的形式。 54 | 55 | Istio 1.10 及之后可以添加如下 EnvoyFilter 配置: 56 | ```yaml 57 | apiVersion: networking.istio.io/v1alpha3 58 | kind: EnvoyFilter 59 | metadata: 60 | name: http-header-proper-case-words 61 | namespace: istio-system 62 | spec: 63 | configPatches: 64 | # 配置保留发向 upstream 的 request header 大小写 65 | - applyTo: CLUSTER 66 | patch: 67 | operation: 
MERGE 68 | value: 69 | typed_extension_protocol_options: 70 | envoy.extensions.upstreams.http.v3.HttpProtocolOptions: 71 | '@type': type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions 72 | use_downstream_protocol_config: 73 | http_protocol_options: 74 | header_key_format: 75 | stateful_formatter: 76 | name: preserve_case 77 | typed_config: 78 | '@type': type.googleapis.com/envoy.extensions.http.header_formatters.preserve_case.v3.PreserveCaseFormatterConfig 79 | # 配置保留收到的 response header 大小写 80 | - applyTo: NETWORK_FILTER 81 | match: 82 | listener: 83 | filterChain: 84 | filter: 85 | name: envoy.filters.network.http_connection_manager 86 | patch: 87 | operation: MERGE 88 | value: 89 | typed_config: 90 | '@type': type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager 91 | http_protocol_options: 92 | header_key_format: 93 | stateful_formatter: 94 | name: preserve_case 95 | typed_config: 96 | '@type': type.googleapis.com/envoy.extensions.http.header_formatters.preserve_case.v3.PreserveCaseFormatterConfig 97 | 98 | ``` 99 | 通过此配置可以让 Envoy 保持 Header 原有大小写形式。 100 | 101 | Envoy 文档中对此的说明: https://www.envoyproxy.io/docs/envoy/latest/configuration/http/http_conn_man/header_casing#config-http-conn-man-header-casing 102 | -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/eshop-demo-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/eshop-demo-1.jpg -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/eshop-demo.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/eshop-demo.jpg -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/monolith-microserivce.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/monolith-microserivce.jpg -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/screenshot1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/screenshot1.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-1.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-2.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-3.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-4.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-5.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/trace-screenshot-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/trace-screenshot-6.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/image/tracing_mental_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/best-practice/image/tracing_mental_model.png -------------------------------------------------------------------------------- /content/zh/docs/best-practice/internal-redirect/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Envoy 
内部重定向" 4 | linkTitle: "Envoy 内部重定向" 5 | weight: 3 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## Envoy 内部重定向 11 | 12 | Envoy 支持在内部处理 3xx 重定向,捕获可配置的 3xx 重定向响应,合成一个新的请求,将其发送给新路由匹配指定的上游,将重定向的响应作为对原始请求的响应返回。原始请求的 header 和 body 将会发送至新位置。Trailers 尚不支持。 13 | 14 | 内部重定向可以使用路由配置中的 [internal_redirect_policy](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-routeaction-internal-redirect-policy) 字段来配置。 当重定向处理开启,任何来自上游的 3xx 响应,只要匹配到配置的 [redirect_response_codes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-redirect-response-codes) 的响应都将由 Envoy 来处理。 15 | 16 | 如果 Envoy 内部重定向配置了 303 并且接收到了 303 响应,如果原始请求不是 GET 或者 HEAD,Envoy 将使用没有 body 的 GET 处理重定向。如果原始请求是 GET 或者 HEAD,Envoy 将使用原始的 HTTP Method 处理重定向。更多信息请查看 [RFC 7231 Section 6.4.4](https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4) 。 17 | 18 | 要成功地处理重定向,必须通过以下检查: 19 | 20 | 1. 响应码匹配到配置的 [redirect_response_codes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-redirect-response-codes) ,默认是 302, 或者其他的 3xx 状态码(301, 302, 303, 307, 308)。 21 | 2. 拥有一个有效的、完全限定的 URL 的 location 头。 22 | 3. 该请求必须已被 Envoy 完全处理。 23 | 4. 请求必须小于 [per_request_buffer_limit_bytes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-route-per-request-buffer-limit-bytes) 的限制。 24 | 5. [allow_cross_scheme_redirect](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-allow-cross-scheme-redirect) 是 true(默认是 false), 或者下游请求的 scheme 和 location 头一致。 25 | 6. 
给定的下游请求之前处理的内部重定向次数不超过请求或重定向请求命中的路由配置的 [max_internal_redirects](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-max-internal-redirects) 。 26 | 7. 所有 [predicates](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-predicates) 都接受目标路由。 27 | 28 | 任何失败都将导致重定向传递给下游。 29 | 30 | 由于重定向请求可能会在不同的路由之间传递,重定向链中的任何满足以下条件的路由都将导致重定向被传递给下游。 31 | 32 | 1. 没有启用内部重定向 33 | 2. 或者当重定向链命中的路由的 [max_internal_redirects](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-max-internal-redirects) 小于等于重定向链的长度。 34 | 3. 或者路由被 [predicates](https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/route/v3/route_components.proto#envoy-v3-api-field-config-route-v3-internalredirectpolicy-predicates) 拒绝。 35 | 36 | [previous_routes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/previous_routes/v3/previous_routes_config.proto#envoy-v3-api-msg-extensions-internal-redirect-previous-routes-v3-previousroutesconfig) 和 [allow_listed_routes](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/allow_listed_routes/v3/allow_listed_routes_config.proto#envoy-v3-api-msg-extensions-internal-redirect-allow-listed-routes-v3-allowlistedroutesconfig) 这两个 predicates 可以创建一个有向无环图 (DAG) 来定义一个过滤器链,具体来说,allow_listed_routes 定义的有向无环图(DAG)中各个节点的边,而 previous_routes 定义了边的“访问”状态,因此如果需要就可以避免循环。 37 | 38 | 第三个 predicate [safe_cross_scheme](https://www.envoyproxy.io/docs/envoy/latest/api-v3/extensions/internal_redirect/safe_cross_scheme/v3/safe_cross_scheme_config.proto#envoy-v3-api-msg-extensions-internal-redirect-safe-cross-scheme-v3-safecrossschemeconfig) 被用来阻止 HTTP -> HTTPS 的重定向。 39 | 40 | 一旦重定向通过这些检查,发送到原始上游的请求头将被修改为: 41 | 42 | - 将完全限定的原始请求 URL 放到 x-envoy-original-url 头中。 43 | - 使用 
Location 头中的值替换 Authority/Host、Scheme、Path 头。 44 | 45 | 修改后的请求头将选择一个新的路由,通过一个新的过滤器链发送,然后在执行所有常规的 Envoy 请求清理之后发送到上游。 46 | 47 | 请注意,HTTP 连接管理器头清理(例如清除不受信任的标头)仅应用一次。即使原始路由和第二个路由相同,每个路由的头修改也将同时应用于原始路由和第二路由,因此请谨慎配置头修改规则, 以避免重复不必要的请求头值。 48 | 49 | 50 | 一个简单的重定向流如下所示: 51 | 52 | 1. 客户端发送 GET 请求以获取 http://foo.com/bar 53 | 2. 上游 1 发送 302 响应码并携带 “location: http://baz.com/eep” 54 | 3. Envoy 被配置为允许原始路由上重定向,并发送新的 GET 请求到上游 2,携带请求头 “x-envoy-original-url: http://foo.com/bar” 获取 http://baz.com/eep 55 | 4. Envoy 将 http://baz.com/eep 的响应数据代理到客户端,作为对原始请求的响应。 56 | 57 | ## 在 Istio 中通过 EnvoyFilter 开启内部重定向 58 | 59 | ```yaml 60 | apiVersion: networking.istio.io/v1alpha3 61 | kind: EnvoyFilter 62 | metadata: 63 | name: follow-redirects 64 | namespace: istio-system 65 | spec: 66 | workloadSelector: 67 | labels: 68 | app: istio-ingressgateway 69 | configPatches: 70 | - applyTo: HTTP_ROUTE 71 | match: 72 | context: ANY 73 | patch: 74 | operation: MERGE 75 | value: 76 | route: 77 | internal_redirect_policy: 78 | max_internal_redirects: 5 79 | redirect_response_codes: ["302"] 80 | ``` 81 | 82 | ## 测试 83 | 84 | 开启前 85 | 86 | ```bash 87 | curl -i '172.16.0.2/redirect-to?url=http://172.16.0.2/status/200' 88 | 89 | HTTP/1.1 302 Found 90 | server: istio-envoy 91 | date: Fri, 11 Mar 2022 07:20:38 GMT 92 | content-type: text/html; charset=utf-8 93 | content-length: 0 94 | location: http://172.16.0.2/status/200 95 | access-control-allow-origin: * 96 | access-control-allow-credentials: true 97 | x-envoy-upstream-service-time: 1 98 | ``` 99 | 100 | 开启后 101 | 102 | ```bash 103 | curl -i '172.16.0.2/redirect-to?url=http://172.16.0.2/status/200' 104 | 105 | HTTP/1.1 200 OK 106 | server: istio-envoy 107 | date: Fri, 11 Mar 2022 07:21:03 GMT 108 | content-type: text/html; charset=utf-8 109 | access-control-allow-origin: * 110 | access-control-allow-credentials: true 111 | content-length: 0 112 | x-envoy-upstream-service-time: 0 113 | ``` 114 | 115 | 注意 location 需返回完整 URL,下面这种情况不会触发内部重定向 116 | 117 | ```bash 118 | 
curl -i '172.16.0.2/status/302' 119 | 120 | HTTP/1.1 302 Found 121 | server: istio-envoy 122 | date: Fri, 11 Mar 2022 07:30:38 GMT 123 | location: /redirect/1 124 | access-control-allow-origin: * 125 | access-control-allow-credentials: true 126 | content-length: 0 127 | x-envoy-upstream-service-time: 1 128 | ``` 129 | 130 | ## 参考资料 131 | * https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/http/http_connection_management#internal-redirects 132 | * https://cloudnative.to/blog/envoy-http-connection-management/ 133 | * https://github.com/istio/istio/issues/32673 134 | -------------------------------------------------------------------------------- /content/zh/docs/best-practice/method-level-trcing/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "在 Istio 中实现方法级调用跟踪" 4 | linkTitle: "在 Istio 中实现方法级调用跟踪" 5 | weight: 4 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | 本文将通过一个网上商店的示例程序介绍如何利用 Spring 和 OpenTracing 简化应用程序的 Tracing 上下文传递,以及如何在 Istio 提供的进程间调用跟踪基础上实现方法级别的细粒度调用跟踪。 11 | 12 | # 分布式调用跟踪和 OpenTracing 规范 13 | 14 | ## 什么是分布式调用跟踪? 15 | 16 | 相比传统的“巨石”应用,微服务的一个主要变化是将应用中的不同模块拆分为了独立的进程。在微服务架构下,原来进程内的方法调用成为了跨进程的 RPC 调用。相对于单一进程的方法调用,跨进程调用的调试和故障分析是非常困难的,很难用传统的调试器或者日志打印来对分布式调用进行查看和分析。 17 | ![ 'monolith-microserivce.jpg'](../image/monolith-microserivce.jpg) 18 | 如上图所示,一个来自客户端的请求经过了多个微服务进程。如果要对该请求进行分析,则必须将该请求经过的所有服务的相关信息都收集起来并关联在一起,这就是“分布式调用跟踪”。 19 | 20 | ## 什么是 OpenTracing? 
21 | 22 | ### CNCF OpenTracing 项目 23 | 24 | [OpenTracing](https://opentracing.io/)是[CNCF](https://www.cncf.io/)(云原生计算基金会)下的一个项目,其中包含了一套分布式调用跟踪的标准规范,各种语言的 API,编程框架和函数库。OpenTracing 的目的是定义一套分布式调用跟踪的标准,以统一各种分布式调用跟踪的实现。目前已有大量支持 [OpenTracing 规范的 Tracer 实现](https://opentracing.io/docs/supported-tracers/),包括 Jaeger,Skywalking,LightStep 等。在微服务应用中采用 OpenTracing API 实现分布式调用跟踪,可以避免 vendor lock-in,以最小的代价和任意一个兼容 OpenTracing 的基础设施进行对接。 25 | 26 | ### OpenTracing 概念模型 27 | 28 | OpenTracing 的概念模型参见下图: 29 | 30 | ![ 'tracing_mental_model.png'](../image/tracing_mental_model.png) 31 | 图源自 [https://opentracing.io/](https://opentracing.io/) 32 | 如图所示,OpenTracing 中主要包含下述几个概念: 33 | 34 | * Trace: 描述一个分布式系统中的端到端事务,例如来自客户端的一个请求。 35 | * Span:一个具有名称和时间长度的操作,例如一个 REST 调用或者数据库操作等。Span 是分布式调用跟踪的最小跟踪单位,一个 Trace 由多段 Span 组成。 36 | * Span context:分布式调用跟踪的上下文信息,包括 Trace id,Span id 以及其它需要传递到下游服务的内容。一个 OpenTracing 的实现需要将 Span context 通过某种序列化机制(Wire Protocol)在进程边界上进行传递,以将不同进程中的 Span 关联到同一个 Trace 上。这些 Wire Protocol 可以是基于文本的,例如 HTTP header,也可以是二进制协议。 37 | 38 | ### OpenTracing 数据模型 39 | 40 | 一个 Trace 可以看成由多个相互关联的 Span 组成的有向无环图(DAG 图)。下图是一个由 8 个 Span 组成的 Trace: 41 | 42 | ``` 43 | [Span A] ←←←(the root span) 44 | | 45 | +------+------+ 46 | | | 47 | [Span B] [Span C] ←←←(Span C is a `ChildOf` Span A) 48 | | | 49 | [Span D] +---+-------+ 50 | | | 51 | [Span E] [Span F] >>> [Span G] >>> [Span H] 52 | ↑ 53 | ↑ 54 | ↑ 55 | (Span G `FollowsFrom` Span F) 56 | ``` 57 | 上图的 trace 也可以按照时间先后顺序表示如下: 58 | ``` 59 | ––|–––––––|–––––––|–––––––|–––––––|–––––––|–––––––|–––––––|–> time 60 | 61 | [Span A···················································] 62 | [Span B··············································] 63 | [Span D··········································] 64 | [Span C········································] 65 | [Span E·······] [Span F··] [Span G··] [Span H··] 66 | ``` 67 | 68 | Span 的数据结构中包含以下内容: 69 | 70 | * name: Span 所代表的操作名称,例如 REST 接口对应的资源名称。 71 | * Start timestamp: Span 所代表操作的开始时间 72 | * Finish 
timestamp: Span 所代表的操作的的结束时间 73 | * Tags:一系列标签,每个标签由一个 key value 键值对组成。该标签可以是任何有利于调用分析的信息,例如方法名,URL 等。 74 | * SpanContext:用于跨进程边界传递 Span 相关信息,在进行传递时需要结合一种序列化协议(Wire Protocol)使用。 75 | * References:该 Span 引用的其它关联 Span,主要有两种引用关系,Childof 和 FollowsFrom。 76 | * Childof: 最常用的一种引用关系,表示 Parent Span 和 Child Span 之间存在直接的依赖关系。例 RPC 服务端 Span 和 RPC 客户端 Span,或者数据库 SQL 插入 Span 和 ORM Save 动作 Span 之间的关系。 77 | * FollowsFrom:如果 Parent Span 并不依赖 Child Span 的执行结果,则可以用 FollowsFrom 表示。例如网上商店购物付款后会向用户发一个邮件通知,但无论邮件通知是否发送成功,都不影响付款成功的状态,这种情况则适用于用 FollowsFrom 表示。 78 | 79 | ### 跨进程调用信息传播 80 | 81 | SpanContext 是 OpenTracing 中一个让人比较迷惑的概念。在 OpenTracing 的概念模型中提到 SpanContext 用于跨进程边界传递分布式调用的上下文。但实际上 OpenTracing 只定义一个 SpanContext 的抽象接口,该接口封装了分布式调用中一个 Span 的相关上下文内容,包括该 Span 所属的 Trace id,Span id 以及其它需要传递到 downstream 服务的信息。SpanContext 自身并不能实现跨进程的上下文传递,需要由 Tracer(Tracer 是一个遵循 OpenTracing 协议的实现,如 Jaeger,Skywalking 的 Tracer)将 SpanContext 序列化后通过 Wire Protocol 传递到下一个进程中,然后在下一个进程将 SpanContext 反序列化,得到相关的上下文信息,以用于生成 Child Span。 82 | 83 | 为了为各种具体实现提供最大的灵活性,OpenTracing 只是提出了跨进程传递 SpanContext 的要求,并未规定将 SpanContext 进行序列化并在网络中传递的具体实现方式。各个不同的 Tracer 可以根据自己的情况使用不同的 Wire Protocol 来传递 SpanContext。 84 | 85 | 在基于 HTTP 协议的分布式调用中,通常会使用 HTTP Header 来传递 SpanContext 的内容。常见的 Wire Protocol 包含 Zipkin 使用的 [b3 HTTP header](https://github.com/openzipkin/b3-propagation),Jaeger 使用的 [uber-trace-id HTTP Header](https://www.jaegertracing.io/docs/1.7/client-libraries/#trace-span-identity),LightStep 使用的 "x-ot-span-context" HTTP Header 等。Istio/Envoy 支持 b3 header 和 x-ot-span-context header,可以和 Zipkin,Jaeger 及 LightStep 对接。其中 b3 HTTP header 的示例如下: 86 | 87 | ``` 88 | X-B3-TraceId: 80f198ee56343ba864fe8b2a57d3eff7 89 | X-B3-ParentSpanId: 05e3ac9a4f6e3b90 90 | X-B3-SpanId: e457b5a2e4d86bd1 91 | X-B3-Sampled: 1 92 | ``` 93 | 94 | # Istio 对分布式调用跟踪的支持 95 | 96 | Istio/Envoy 为微服务提供了开箱即用的分布式调用跟踪功能。在安装了 Istio 和 Envoy 的微服务系统中,Envoy 会拦截服务的入向和出向请求,为微服务的每个调用请求自动生成调用跟踪数据。通过在服务网格中接入一个分布式跟踪的后端系统,例如 Zipkin 或者 Jaeger,就可以查看一个分布式请求的详细内容,例如该请求经过了哪些服务,调用了哪个 REST 
接口,每个 REST 接口所花费的时间等。 97 | 98 | 需要注意的是,Istio/Envoy 虽然在此过程中完成了大部分工作,但还是要求对应用代码进行少量修改:应用代码中需要将收到的上游 HTTP 请求中的 b3 header 拷贝到其向下游发起的 HTTP 请求的 header 中,以将调用跟踪上下文传递到下游服务。这部分代码不能由 Envoy 代劳,原因是 Envoy 并不清楚其代理的服务中的业务逻辑,无法将入向请求和出向请求按照业务逻辑进行关联。这部分代码量虽然不大,但需要对每一处发起 HTTP 请求的代码都进行修改,非常繁琐而且容易遗漏。当然,可以将发起 HTTP 请求的代码封装为一个代码库来供业务模块使用,来简化该工作。 99 | 100 | 下面以一个简单的网上商店示例程序来展示 Istio 如何提供分布式调用跟踪。该示例程序由 eshop,inventory,billing,delivery 几个微服务组成,结构如下图所示: 101 | ![ 'eshop-demo.jpg'](../image/eshop-demo.jpg) 102 | eshop 微服务接收来自客户端的请求,然后调用 inventory,billing,delivery 这几个后端微服务的 REST 接口来实现用户购买商品的 checkout 业务逻辑。本例的代码可以从 github 下载:https://github.com/aeraki-framework/method-level-tracing-with-istio 103 | 104 | 如下面的代码所示,我们需要在 eshop 微服务的应用代码中传递 b3 HTTP Header。 105 | 106 | ```java 107 | @RequestMapping(value = "/checkout") 108 | public String checkout(@RequestHeader HttpHeaders headers) { 109 | String result = ""; 110 | // Use HTTP GET in this demo. In a real world use case,We should use HTTP POST 111 | // instead. 112 | // The three services are bundled in one jar for simplicity. To make it work, 113 | // define three services in Kubernets. 114 | result += restTemplate.exchange("http://inventory:8080/createOrder", HttpMethod.GET, 115 | new HttpEntity<>(passTracingHeader(headers)), String.class).getBody(); 116 | result += "
"; 117 | result += restTemplate.exchange("http://billing:8080/payment", HttpMethod.GET, 118 | new HttpEntity<>(passTracingHeader(headers)), String.class).getBody(); 119 | result += "
"; 120 | result += restTemplate.exchange("http://delivery:8080/arrangeDelivery", HttpMethod.GET, 121 | new HttpEntity<>(passTracingHeader(headers)), String.class).getBody(); 122 | return result; 123 | } 124 | private HttpHeaders passTracingHeader(HttpHeaders headers) { 125 | HttpHeaders tracingHeaders = new HttpHeaders(); 126 | extractHeader(headers, tracingHeaders, "x-request-id"); 127 | extractHeader(headers, tracingHeaders, "x-b3-traceid"); 128 | extractHeader(headers, tracingHeaders, "x-b3-spanid"); 129 | extractHeader(headers, tracingHeaders, "x-b3-parentspanid"); 130 | extractHeader(headers, tracingHeaders, "x-b3-sampled"); 131 | extractHeader(headers, tracingHeaders, "x-b3-flags"); 132 | extractHeader(headers, tracingHeaders, "x-ot-span-context"); 133 | return tracingHeaders; 134 | } 135 | 136 | ``` 137 | 138 | 下面我们来测试一下 eshop 实例程序。我们可以自己搭建一个 Kubernetes 集群并安装 Istio 以用于测试。这里为了方便,直接使用腾讯云上提供的全托管的服务网格 [TCM](https://console.cloud.tencent.com/tke2/mesh?rid=16),并在创建的 Mesh 中加入了一个容器服务 [TKE](https://console.cloud.tencent.com/tke2/cluster/startUp) 集群来进行测试。 139 | 140 | 在 TKE 集群中部署该程序,查看 Istio 分布式调用跟踪的效果。 141 | 142 | ```bash 143 | git clone git@github.com:aeraki-framework/method-level-tracing-with-istio.git 144 | cd method-level-tracing-with-istio 145 | git checkout without-opentracing 146 | kubectl apply -f k8s/eshop.yaml 147 | ``` 148 | 149 | * 在浏览器中打开地址:http://${INGRESS_EXTERNAL_IP}/checkout ,以触发调用 eshop 示例程序的 REST 接口。 150 | * 在浏览器中打开 TCM 的界面,查看生成的分布式调用跟踪信息。 151 | 152 | TCM 图形界面直观地展示了这次调用的详细信息,可以看到客户端请求从 Ingressgateway 进入到系统中,然后调用了 eshop 微服务的 checkout 接口,checkout 调用有三个 child span,分别对应到 inventory,billing 和 delivery 三个微服务的 REST 接口。 153 | ![ 'Screen Shot 2021-04-01 at 10.32.48 AM.png'](../image/trace-screenshot-1.png) 154 | 155 | # 使用 OpenTracing 来传递分布式跟踪上下文 156 | 157 | OpenTracing 提供了基于 Spring 的代码埋点,因此我们可以使用 OpenTracing Spring 框架来提供 HTTP header 的传递,以避免这部分硬编码工作。在 Spring 中采用 OpenTracing 来传递分布式跟踪上下文非常简单,只需要下述两个步骤: 158 | 159 | * 在 Maven POM 文件中声明相关的依赖,一是对 OpenTracing Spring 
Cloud Starter 的依赖;另外由于 Istio 采用了 Zipkin 的上报接口,我们也需要引入 Zipkin 的相关依赖。 160 | * 在 Spring Application 中声明一个 Tracer bean。如下所示,注意我们需要把 Istio 中的 Zipkin 上报地址设置到 OkHttpSender 中。 161 | 162 | ```java 163 | @Bean 164 | public io.opentracing.Tracer zipkinTracer() { 165 | String zipkinEndpoint = System.getenv("ZIPKIN_ENDPOINT"); 166 | if (zipkinEndpoint == null || zipkinEndpoint == ""){ 167 | zipkinEndpoint = "http://zipkin.istio-system:9411/api/v2/spans"; 168 | } 169 | 170 | OkHttpSender sender = OkHttpSender.create(zipkinEndpoint); 171 | Reporter spanReporter = AsyncReporter.create(sender); 172 | 173 | Tracing braveTracing = Tracing.newBuilder() 174 | .localServiceName("my-service") 175 | .propagationFactory(B3Propagation.FACTORY) 176 | .spanReporter(spanReporter) 177 | .build(); 178 | 179 | Tracing braveTracer = Tracing.newBuilder() 180 | .localServiceName("spring-boot") 181 | .spanReporter(spanReporter) 182 | .propagationFactory(B3Propagation.FACTORY) 183 | .traceId128Bit(true) 184 | .sampler(Sampler.ALWAYS_SAMPLE) 185 | .build(); 186 | return BraveTracer.create(braveTracer); 187 | } 188 | ``` 189 | 190 | 部署采用 OpenTracing 进行 HTTP header 传递的程序版本,其调用跟踪信息如下所示: 191 | ![ 'Screen Shot 2021-04-01 at 11.15.53 AM.png'](../image/trace-screenshot-2.png) 192 | 从上图中可以看到,相比在应用代码中直接传递 HTTP header 的方式,采用 OpenTracing 进行代码埋点后,相同的调用增加了 7 个名称前缀为 spring-boot 的 Span,这 7 个 Span 是由 OpenTracing 的 tracer 生成的。虽然我们并没有在代码中显式创建这些 Span,但 OpenTracing 的代码埋点会自动为每一个 REST 请求生成一个 Span,并根据调用关系关联起来。 193 | 194 | OpenTracing 生成的这些 Span 为我们提供了更详细的分布式调用跟踪信息,从这些信息中可以分析出一个 HTTP 调用从客户端应用代码发起请求,到经过客户端的 Envoy,再到服务端的 Envoy,最后到服务端接收到请求各个步骤的耗时情况。从图中可以看到,Envoy 转发的耗时在 1 毫秒左右,相对于业务代码的处理时长非常短,对这个应用而言,Envoy 的处理和转发对于业务请求的处理效率基本没有影响。 195 | 196 | # 在 Istio 调用跟踪链中加入方法级的调用跟踪信息 197 | 198 | Istio/Envoy 提供了跨服务边界的调用链信息,在大部分情况下,服务粒度的调用链信息对于系统性能和故障分析已经足够。但对于某些服务,需要采用更细粒度的调用信息来进行分析,例如一个 REST 请求内部的业务逻辑和数据库访问分别的耗时情况。在这种情况下,我们需要在服务代码中进行埋点,并将服务代码中上报的调用跟踪数据和 Envoy 生成的调用跟踪数据进行关联,以统一呈现 Envoy 和服务代码中生成的调用数据。 199 | 200 | 在方法中增加调用跟踪的代码是类似的,因此我们用 AOP + 
Annotation 的方式实现,以简化代码。 201 | 首先定义一个 Traced 注解和对应的 AOP 实现逻辑: 202 | 203 | ```java 204 | @Retention(RetentionPolicy.RUNTIME) 205 | @Target(ElementType.METHOD) 206 | @Documented 207 | public @interface Traced { 208 | } 209 | ``` 210 | ```java 211 | @Aspect 212 | @Component 213 | public class TracingAspect { 214 | @Autowired 215 | Tracer tracer; 216 | 217 | @Around("@annotation(com.zhaohuabing.demo.instrument.Traced)") 218 | public Object aroundAdvice(ProceedingJoinPoint jp) throws Throwable { 219 | String class_name = jp.getTarget().getClass().getName(); 220 | String method_name = jp.getSignature().getName(); 221 | Span span = tracer.buildSpan(class_name + "." + method_name).withTag("class", class_name) 222 | .withTag("method", method_name).start(); 223 | Object result = jp.proceed(); 224 | span.finish(); 225 | return result; 226 | } 227 | } 228 | ``` 229 | 230 | 然后在需要进行调用跟踪的方法上加上 Traced 注解: 231 | 232 | ```java 233 | @Component 234 | public class DBAccess { 235 | 236 | @Traced 237 | public void save2db() { 238 | try { 239 | Thread.sleep((long) (Math.random() * 100)); 240 | } catch (InterruptedException e) { 241 | e.printStackTrace(); 242 | } 243 | } 244 | } 245 | ``` 246 | 247 | ```java 248 | @Component 249 | public class BankTransaction { 250 | @Traced 251 | public void transfer() { 252 | try { 253 | Thread.sleep((long) (Math.random() * 100)); 254 | } catch (InterruptedException e) { 255 | e.printStackTrace(); 256 | } 257 | } 258 | } 259 | ``` 260 | 261 | demo 程序的 master branch 已经加入了方法级代码跟踪,可以直接部署。 262 | 263 | ```bash 264 | git checkout master 265 | kubectl apply -f k8s/eshop.yaml 266 | ``` 267 | 268 | 效果如下图所示,可以看到 trace 中增加了 transfer 和 save2db 两个方法级的 Span。 269 | ![ 'Screen Shot 2021-04-01 at 11.04.03 AM.png'](../image/trace-screenshot-3.png) 270 | 可以打开一个方法的 Span,查看详细信息,包括 Java 类名和调用的方法名等,在 AOP 代码中还可以根据需要添加出现异常时的异常堆栈等信息。 271 | ![ 'Screen Shot 2021-04-01 at 11.07.22 AM.png'](../image/trace-screenshot-4.png) 272 | # 总结 273 | 274 | Istio/Envoy 
为微服务应用提供了分布式调用跟踪功能,提高了服务调用的可见性。我们可以使用 OpenTracing 来代替应用硬编码,以传递分布式跟踪的相关 http header;还可以通过 OpenTracing 将方法级的调用信息加入到 Istio/Envoy 缺省提供的调用链跟踪信息中,以提供更细粒度的调用跟踪信息。 275 | 276 | # 下一步 277 | 278 | 除了同步调用之外,异步消息也是微服务架构中常见的一种通信方式。在下一篇文章中,我将继续利用 eshop demo 程序来探讨如何通过 OpenTracing 将 Kafka 异步消息也纳入到 Istio 的分布式调用跟踪中。 279 | 280 | # 参考资料 281 | 282 | 1. [本文中 eshop 示例程序的源代码](https://github.com/aeraki-framework/method-level-tracing-with-istio) 283 | 1. [Opentracing docs](https://opentracing.io/docs/) 284 | 1. [Opentracing specification](https://github.com/opentracing/specification/blob/master/specification.md) 285 | 1. [Opentracing wire protocols](https://github.com/opentracing/specification/blob/master/rfc/trace_identifiers.md) 286 | 1. [Istio Trace context propagation](https://istio.io/docs/tasks/telemetry/distributed-tracing/overview/#trace-context-propagation) 287 | 1. [Zipkin-b3-propagation](https://github.com/apache/incubator-zipkin-b3-propagation) 288 | 1. [OpenTracing Project Deep Dive](https://www.youtube.com/watch?v=ySR_FVNX4bQ&t=184s) -------------------------------------------------------------------------------- /content/zh/docs/best-practice/startup-dependence/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Sidecar 初始化完成后再启动应用程序" 4 | linkTitle: "Sidecar 初始化完成后再启动应用程序" 5 | weight: 1 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 为什么需要配置 Sidecar 和应用程序的启动顺序? 
11 | 在安装了 Sidecar Proxy 的 Pod 中,应用发出的外部网络请求会被 Iptables 规则重定向到 Proxy 中。如果应用发出请求时 Proxy 还未初始化完成,则 Proxy 无法对请求进行正确路由,导致请求失败。该问题导致的故障现象参见 [常见问题-应用程序启动失败/启动时无法访问网络](../common-problem/application-start-fail.md)。 12 | 13 | ## 配置方法 - Istio 1.7 及之后版本 14 | Istio 1.7 及之后的版本中,可以通过下面的方法配置在 Sidecar 初始化完成后再启动应用容器。 15 | 16 | 全局配置: 17 | 18 | 在 istio-system/istio ConfigMap 中将 `holdApplicationUntilProxyStarts` 这个全局配置项设置为 true。 19 | 20 | ```yaml 21 | apiVersion: v1 22 | data: 23 | mesh: |- 24 | defaultConfig: 25 | holdApplicationUntilProxyStarts: true 26 | ``` 27 | 28 | 按 Deployment 配置: 29 | 30 | 如果不希望该配置全局生效,则可以通过下面的 annotation 在 Deployment 级别进行配置。 31 | 32 | ```yaml 33 | template: 34 | metadata: 35 | annotations: 36 | proxy.istio.io/config: '{ "holdApplicationUntilProxyStarts": true }' 37 | ``` 38 | 39 | 实现原理:在开启 `holdApplicationUntilProxyStarts` 选项后,Istio Sidecar Injector Webhook 会在 Pod 中插入下面的 yaml 片段。该 yaml 片段在 Sidecar proxy 的 postStart 生命周期时间中执行了 `pilot-agent wait` 命令。该命令会检测 Proxy 的状态,待 Proxy 初始化完成后再启动 Pod 中的下一个容器。这样,在应用容器启动时,Sidecar proxy 已经完成了配置初始化,可以正确代理应用容器的对外网络请求。 40 | 41 | ```yaml 42 | spec: 43 | containers: 44 | - name: istio-proxy 45 | lifecycle: 46 | postStart: 47 | exec: 48 | command: 49 | - pilot-agent 50 | - wait 51 | ``` 52 | 53 | ## 配置方法 - Istio 1.7 之前的版本 54 | 55 | Istio 1.7 之前的版本没有直接提供配置 Sidecar 和应用容器启动顺序的能力。由于 Istio 新版本中解决了老版本中的很多故障,建议尽量升级到新版本。如果由于特殊原因还要继续使用 Istio 1.7 之前的版本,可以在应用进程启动时判断 Envoy Sidecar 的初始化状态,待其初始化完成后再启动应用进程。 56 | 57 | Envoy 的健康检查接口 localhost:15020/healthz/ready 会在 xDS 配置初始化完成后才返回 200,否则将返回 503,因此可以根据该接口判断 Envoy 的配置初始化状态,待其完成后再启动应用容器。我们可以在应用容器的启动命令中加入调用 Envoy 健康检查的脚本,如下面的配置片段所示。在其他应用中使用时,将 start-awesome-app-cmd 改为容器中的应用启动命令即可。 58 | 59 | ```yaml 60 | apiVersion: apps/v1 61 | kind: Deployment 62 | metadata: 63 | name: awesome-app-deployment 64 | spec: 65 | selector: 66 | matchLabels: 67 | app: awesome-app 68 | replicas: 1 69 | template: 70 | metadata: 71 | labels: 72 | app: awesome-app 73 | spec: 74 | containers: 75 | - name: awesome-app 76 | image: 
awesome-app 77 | ports: 78 | - containerPort: 80 79 | command: ["/bin/bash", "-c"] 80 | args: ["while [[ \"$(curl -s -o /dev/null -w ''%{http_code}'' localhost:15020/healthz/ready)\" != '200' ]]; do echo Waiting for Sidecar;sleep 1; done; echo Sidecar available; start-awesome-app-cmd"] 81 | ``` 82 | 83 | ## 解耦应用服务之间的启动依赖关系 84 | 85 | 以上配置的思路是控制 Pod 中容器的启动顺序,在 Envoy Sidecar 初始化完成后再启动应用容器,以确保应用容器启动时能够通过网络正常访问其他服务。但即使 Pod 中对外的网络访问没有问题,应用容器依赖的其他服务也可能由于尚未启动,或者某些问题而不能在此时正常提供服务。要彻底解决该问题,建议解耦应用服务之间的启动依赖关系,使应用容器的启动不再强依赖其他服务。 86 | 87 | 在一个微服务系统中,原单体应用中的各个业务模块被拆分为多个独立进程(服务)。这些服务的启动顺序是随机的,并且服务之间通过不可靠的网络进行通信。微服务多进程部署、跨进程网络通信的特点决定了服务之间的调用出现异常是一个常见的情况。为了应对微服务的该特点,微服务的一个基本的设计原则是 “design for failure”,即需要以优雅的方式应对可能出现的各种异常情况。当在微服务进程中不能访问一个依赖的外部服务时,需要通过重试、降级、超时、断路等策略对异常进行容错处理,以尽可能保证系统的正常运行。 88 | 89 | Envoy Sidecar 初始化期间网络暂时不能访问的情况只是放大了微服务系统未能正确处理服务依赖的问题,即使解决了 Envoy Sidecar 的依赖顺序,该问题依然存在。假设应用启动时依赖配置中心,配置中心是一个独立的微服务,当一个依赖配置中心的微服务启动时,配置中心有可能尚未启动,或者尚未初始化完成。在这种情况下,如果在代码中没有对该异常情况进行处理,也会导致依赖配置中心的微服务启动失败。在一个更为复杂的系统中,多个微服务进程之间可能存在网状依赖关系,如果没有按照 “design for failure” 的原则对微服务进行容错处理,那么只是将整个系统启动起来就将是一个巨大的挑战。 -------------------------------------------------------------------------------- /content/zh/docs/common-problem/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Istio 常见问题" 3 | linkTitle: "Istio 常见问题" 4 | weight: 2 5 | description: > 6 | 介绍在使用 Istio 过程中可能遇到的一些常见问题的解决方法 7 | --- 8 | -------------------------------------------------------------------------------- /content/zh/docs/common-problem/application-start-fail/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "应用程序启动失败/启动时无法访问网络" 4 | linkTitle: "应用程序启动失败/启动时无法访问网络" 5 | weight: 1 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 故障现象 11 | 12 | 该问题的表现是安装了 Sidecar proxy 的应用在启动后的一小段时间内无法通过网络访问 Pod 外面的服务。应用在启动时通常会从一些外部服务中获取数据,并采用这些数据对自身进行初始化。例如从配置中心读取程序配置,从数据库中初始化程序用户信息等。而安装了 Sidecar proxy
的应用在启动后的一小段时间内网络是不通的。如果应用代码中没有合适的容错和重试逻辑,该问题常常会导致应用启动失败。 13 | 14 | ## 故障原因 15 | 16 | 如下图所示,Envoy 启动后会通过 xDS 协议向 Pilot 请求服务和路由配置信息,Pilot 收到请求后会根据 Envoy 所在的节点(Pod 或者 VM)组装配置信息,包括 Listener、Route、Cluster 等,然后再通过 xDS 协议下发给 Envoy。根据 Mesh 的规模和网络情况,该配置下发过程需要数秒到数十秒的时间。在这段时间内,由于初始化容器已经在 Pod 中创建了 Iptables rule 规则,因此应用向外发送的网络流量会被重定向到 Envoy ,而此时 Envoy 中尚没有对这些网络请求进行处理的监听器和路由规则,无法对此进行处理,导致网络请求失败。(关于 Envoy Sidecar 初始化过程和 Istio 流量管理原理的更多内容,可以参考这篇文章 [Istio流量管理实现机制深度解析](https://zhaohuabing.com/post/2018-09-25-istio-traffic-management-impl-intro/))。 17 | 18 | ![](../image/envoy-initialize.png) 19 | 20 | ## 解决方案 21 | 22 | 参见:[最佳实践-在 Sidecar 初始化完成后再启动应用容器](../best-practice/startup-dependence.md) -------------------------------------------------------------------------------- /content/zh/docs/common-problem/duplicate-tls-hosts/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Gateway TLS hosts 冲突导致配置被拒绝" 4 | linkTitle: "Gateway TLS hosts 冲突导致配置被拒绝" 5 | weight: 3 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 故障现象 11 | 网格中同时存在以下两个 Gateway 12 | ```yaml 13 | apiVersion: networking.istio.io/v1beta1 14 | kind: Gateway 15 | metadata: 16 | name: test1 17 | spec: 18 | selector: 19 | istio: ingressgateway 20 | servers: 21 | - hosts: 22 | - test1.example.com 23 | port: 24 | name: https 25 | number: 443 26 | protocol: HTTPS 27 | tls: 28 | credentialName: example-credential 29 | mode: SIMPLE 30 | --- 31 | apiVersion: networking.istio.io/v1beta1 32 | kind: Gateway 33 | metadata: 34 | name: test2 35 | spec: 36 | selector: 37 | istio: ingressgateway 38 | servers: 39 | - hosts: 40 | - test1.example.com 41 | - test2.example.com 42 | port: 43 | name: https 44 | number: 443 45 | protocol: HTTPS 46 | tls: 47 | credentialName: example-credential 48 | mode: SIMPLE 49 | ``` 50 | 51 | 172.18.0.6 为 ingress gateway Pod IP,请求 https://test1.example.com 正常返回 404 52 | ```bash 53 | curl -i -HHost:test1.example.com --resolve 
"test1.example.com:443:172.18.0.6" --cacert example.com.crt "https://test1.example.com" 54 | HTTP/2 404 55 | date: Mon, 29 Nov 2021 06:59:26 GMT 56 | server: istio-envoy 57 | ``` 58 | 59 | 请求 https://test2.example.com 异常 60 | ```bash 61 | $ curl -HHost:test2.example.com --resolve "test2.example.com:443:172.18.0.6" --cacert example.com.crt "https://test2.example.com" 62 | curl: (35) OpenSSL SSL_connect: Connection reset by peer in connection to test2.example.com:443 63 | ``` 64 | ## 故障原因 65 | 66 | 通过 istiod 监控发现`pilot_total_rejected_configs`指标异常,显示`default/test2`配置被拒绝 67 | ![](../image/pilot_total_rejected_configs.png) 68 | 调整 istiod 日志级别查看被拒绝的原因 69 | ``` 70 | --log_output_level=model:debug 71 | ``` 72 | ``` 73 | 2021-11-29T07:24:21.703924Z debug model skipping server on gateway default/test2, duplicate host names: [test1.example.com] 74 | ``` 75 | 通过日志定位到具体代码位置 76 | ```go 77 | if duplicateHosts := CheckDuplicates(s.Hosts, tlsHostsByPort[resolvedPort]); len(duplicateHosts) != 0 { 78 | log.Debugf("skipping server on gateway %s, duplicate host names: %v", gatewayName, duplicateHosts) 79 | RecordRejectedConfig(gatewayName) 80 | continue 81 | } 82 | ``` 83 | ```go 84 | // CheckDuplicates returns all of the hosts provided that are already known 85 | // If there were no duplicates, all hosts are added to the known hosts. 
86 | func CheckDuplicates(hosts []string, knownHosts sets.Set) []string { 87 | var duplicates []string 88 | for _, h := range hosts { 89 | if knownHosts.Contains(h) { 90 | duplicates = append(duplicates, h) 91 | } 92 | } 93 | // No duplicates found, so we can mark all of these hosts as known 94 | if len(duplicates) == 0 { 95 | for _, h := range hosts { 96 | knownHosts.Insert(h) 97 | } 98 | } 99 | return duplicates 100 | } 101 | ``` 102 | 校验逻辑是每个域名在同一端口上只能配置一次 TLS,我们这里 test1.example.com 在 2 个 Gateway 的 443 端口都配置了 TLS, 103 | 导致其中一个被拒绝,通过监控确认被拒绝的是 test2,test2.example.com 和 test1.example.com 配置在 test2 的同一个 Server,Server 配置被拒绝导致请求异常 104 | 105 | ## 解决方案 106 | 同一个域名不要在多个 Gateway 中的同一端口重复配置 TLS,这里我们删除 test1 后请求恢复正常 107 | ```bash 108 | $ curl -i -HHost:test1.example.com --resolve "test1.example.com:443:172.18.0.6" --cacert example.com.crt "https://test1.example.com" 109 | HTTP/2 404 110 | date: Mon, 29 Nov 2021 07:43:40 GMT 111 | server: istio-envoy 112 | 113 | $ curl -i -HHost:test2.example.com --resolve "test2.example.com:443:172.18.0.6" --cacert example.com.crt "https://test2.example.com" 114 | HTTP/2 404 115 | date: Mon, 29 Nov 2021 07:43:41 GMT 116 | server: istio-envoy 117 | ``` -------------------------------------------------------------------------------- /content/zh/docs/common-problem/external-name-service-highjacks/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "ExternalName Service 劫持了其他服务流量" 4 | linkTitle: "ExternalName Service 劫持了其他服务流量" 5 | weight: 2 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 故障现象 11 | 12 | 如果网格内存在一个 ExternalName 类型 Service, 网格内访问其他外部服务的的某一端口,如果这个端口刚好和该 ExternalName Service 重叠,那么流量会被路由到这个 ExternalName Service 对应的 CDS。 13 | 14 | ## 故障重现 15 | 16 | ### 正常情况 17 | 18 | 在 namespace sample 安装 sleep Pod: 19 | 20 | ``` 21 | kubectl create ns sample 22 | kubectl label ns sample istio-injection=enabled 23 | kubectl -nsample apply -f 
https://raw.githubusercontent.com/istio/istio/1.11.4/samples/sleep/sleep.yaml 24 | ``` 25 | 26 | 通过 sleep 访问外部服务 https://httpbin.org:443, 请求成功: 27 | 28 | ``` 29 | kubectl -nsample exec sleep-74b7c4c84c-22zkq -- curl -I https://httpbin.org 30 | HTTP/2 200 31 | ...... 32 | ``` 33 | 34 | 从 access log 确认流量是从 PassthroughCluster 出去,符合预期: 35 | 36 | ``` 37 | "- - -" 0 - - - "-" 938 5606 1169 - "-" "-" "-" "-" "18.232.227.86:443" PassthroughCluster 172.24.0.10:42434 18.232.227.86:443 172.24.0.10:42432 - - 38 | ``` 39 | 40 | ### 异常情况 41 | 42 | 现在 在 default 下创建一个 ExternalName 类型的 Service, 端口也是 443: 43 | 44 | ``` 45 | kind: Service 46 | apiVersion: v1 47 | metadata: 48 | name: my-externalname 49 | spec: 50 | type: ExternalName 51 | externalName: bing.com 52 | ports: 53 | - port: 443 54 | targetPort: 443 55 | ``` 56 | 57 | 通过 sleep 访问外部服务 https://httpbin.org:443, 请求失败: 58 | 59 | ``` 60 | kubectl -nsample exec sleep-74b7c4c84c-22zkq -- curl -I https://httpbin.org 61 | curl: (60) SSL: no alternative certificate subject name matches target host name 'httpbin.org' 62 | More details here: https://curl.se/docs/sslcerts.html 63 | ...... 
64 | ``` 65 | 66 | 查看 access log, 发现请求外部服务,被错误路由到了 my-externalname 的 ExternalName Service: 67 | 68 | ``` 69 | "- - -" 0 - - - "-" 706 5398 67 - "-" "-" "-" "-" "204.79.197.200:443" outbound|443||my-externalname.default.svc.cluster.local 172.24.0.10:56806 34.192.79.103:443 172.24.0.10:36214 httpbin.org - 70 | ``` 71 | 72 | ## 故障原因 73 | 74 | 通过对比 sleep Pod 前后两次的 xDS, 发现增加了 ExternalName Service 后,xDS 里会多一个 LDS `0.0.0.0_443`, 该 LDS 包括一个`default_filter_chain` 会把该 LDS 中其他 filter chain 没有 match 到的流量,都路由到这个 `default_filter_chain` 中的 Cluster,也就是 `my-externalname` 对应的 CDS: 75 | 76 | 77 | ![](../image/externalname.png) 78 | 79 | 80 | ## 解决方案 81 | 82 | 该问题属于 Istio 实现缺陷,相关 issue: https://github.com/istio/istio/issues/20703 83 | 84 | 目前的解决方案是避免 ExternalName Service 和其他服务端口冲突。 -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/envoy-initialize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/envoy-initialize.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/externalname.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/externalname.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/pilot_total_rejected_configs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/pilot_total_rejected_configs.png 
-------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-package.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-package.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-ss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-ss-1.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-ss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-ss-2.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-ss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-ss-3.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-ss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-ss-4.png 
-------------------------------------------------------------------------------- /content/zh/docs/common-problem/image/tcp-keepalive-ss-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/common-problem/image/tcp-keepalive-ss-5.png -------------------------------------------------------------------------------- /content/zh/docs/common-problem/server-speaks-first-protocol/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "Server Speaks First 协议访问失败" 4 | linkTitle: "Server Speaks First 协议访问失败" 5 | weight: 4 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 故障现象 11 | 12 | Istio 网格开启 allow any 访问模式,在一个注入了 sidecar 的 pod 内,mysql 客户端访问 mysql-ip-1:3306 成功,访问 mysql-ip-2:10000 没有响应: 13 | 14 | ``` 15 | # mysql -h55.135.153.1 -utest -pxxxx -P3306 16 | Welcome to the MariaDB monitor. Commands end with ; or \g. 17 | 18 | # mysql -h55.108.108.2 -utest -pxxxx -P10000 19 | (no response) 20 | ``` 21 | 22 | ## 故障分析 23 | 24 | 查看日志,把 access log 设置为 debug、trace 均没有发现有用信息。 25 | 26 | 分析发现,网格内有一个 http server,也使用了和 mysql-ip-2 相同的端口 10000: 27 | 28 | ``` 29 | apiVersion: v1 30 | kind: Service 31 | metadata: 32 | name: irrelevant-svc 33 | ...... 34 | spec: 35 | ports: 36 | - name: http 37 | nodePort: 31025 38 | port: 10000 # 端口相同 39 | protocol: TCP 40 | targetPort: 8080 41 | ``` 42 | 43 | 我们尝试把该服务端口改成 10001,访问 mysql-ip-2:10000 成功,推测和端口冲突相关: 44 | 45 | ``` 46 | # mysql -h55.108.108.2 -utest -pxxxx -P10000 47 | Welcome to the MariaDB monitor. Commands end with ; or \g. 48 | ``` 49 | 50 | 我们再尝试对 mysql-ip-1 复现故障:在网格内创建了一个包括 3306 端口的 http 服务,mysql 请求无响应,问题复现。 51 | 52 | 另外我们还尝试过,如果把冲突端口的协议定义为 tcp(通过 port name),该问题不存在: 53 | 54 | ``` 55 | apiVersion: v1 56 | kind: Service 57 | metadata: 58 | name: irrelevant-svc 59 | ...... 
60 | spec: 61 | ports: 62 | - name: tcp # 如果是 tcp 则不会出问题 63 | nodePort: 31025 64 | port: 10000 65 | protocol: TCP 66 | targetPort: 8080 67 | ``` 68 | 69 | ## 故障原因 70 | 71 | ### Server Speaks First 72 | 73 | Mysql 协议是一种 **Server Speaks First** 协议,也就是说 client 和 server 完成三次握手后,是 server 会先发起会话, 简要过程: 74 | 75 | ``` 76 | S: 服务端首先会发一个握手包到客户端 77 | C: 客户端向服务端发送认证信息 ( 用户名,密码等 ) 78 | S: 服务端收到认证包后,会检查用户名与密码是否合法,并发送包告知客户端认证信息。 79 | ``` 80 | 81 | 除了 Mysql,常见的 Server Speaks First 协议还包括 SMTP,DNS,MongoDB 等。下面是一个 SMTP 交互流程: 82 | 83 | ``` 84 | S: 220 smtp.example.com ESMTP Postfix 85 | C: HELO relay.example.com 86 | S: 250 smtp.example.com, I am glad to meet you 87 | C: MAIL FROM:<bob@example.com> 88 | S: 250 Ok 89 | C: RCPT TO:<alice@example.com> 90 | S: 250 Ok 91 | C: RCPT TO:<theboss@example.com> 92 | S: 250 Ok 93 | C: DATA 94 | S: 354 End data with <CR><LF>.<CR><LF> 95 | C: From: "Bob Example" <bob@example.com> 96 | C: To: Alice Example <alice@example.com> 97 | C: Cc: theboss@example.com 98 | C: Date: Tue, 15 Jan 2008 16:02:43 -0500 99 | C: Subject: Test message 100 | C: 101 | C: Hello Alice. 102 | C: This is a test message with 5 header fields and 4 lines in the message body. 103 | C: Your friend, 104 | C: Bob 105 | C: . 106 | S: 250 Ok: queued as 12345 107 | C: QUIT 108 | S: 221 Bye 109 | {The server closes the connection} 110 | ``` 111 | 112 | ### istio 不是完全透明 113 | 114 | 当前 istio 的某些特性,不能做到**透明**兼容 Server Speaks First 协议,这些特性包括: 115 | 116 | * 协议嗅探 117 | * PERMISSIVE mTLS 118 | * Authorization Policy 119 | 120 | 这些特性都希望 client 能先发起会话,以协议嗅探为例,envoy 是通过分析 client 发出的初始若干字节来推测协议类型。 121 | 122 | 对于 Server Speaks First 协议,比如 mysql,三次握手后,这时候 mysql client 在等待 mysql server 发起初次会话,而 client 端的 envoy 尝试做协议嗅探,也在等 mysql client 发出数据,这类似一个死锁,最终超时。 123 | 124 | 125 | ## 解决方案 126 | 127 | 以下是一些可行的方案: 128 | 129 | 1. 为 Server Speaks First 协议服务创建一个 ServiceEntry,并指定协议为 TCP。 130 | 2. 避免 Server Speaks First 协议服务端口和网格内服务端口重叠,这样请求可以直接走 passthrough。 131 | 3.
把 Server Speaks First 服务 ip 放到 excludeIPRanges,这样请求不经过 envoy 处理,适用于 DB 服务不需要网格治理的情况。 132 | 133 | 134 | ## 参考资料 135 | 136 | * [Server First Protocols](https://istio.io/latest/docs/ops/deployment/requirements/#server-first-protocols) 137 | * [Server-first TCP protocols are not supported](https://istio.io/latest/docs/ops/best-practices/security/#server-first-tcp-protocols-are-not-supported) 138 | * [Istio Envoy passthrough goes wrong when port 80 are used for SMTP protocol instead of standard ports](https://www.linkedin.com/pulse/istio-envoy-passthrough-goes-wrong-when-port-80-used-smtp-liu-) 139 | * [Server-Speaks-First 有点坑](https://www.cnblogs.com/hacker-linner/p/15122404.html) -------------------------------------------------------------------------------- /content/zh/docs/common-problem/tcp-keepalive/_index.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | title: "长连接未开启 tcp keepalive" 4 | linkTitle: "长连接未开启 tcp keepalive" 5 | weight: 5 6 | date: 2022-07-06 7 | description: 8 | --- 9 | 10 | ## 故障现象 11 | 用户反馈链路偶发 500 错误,频率低但是持续存在。 12 | 13 | 用户访问链路较长,核心链路简化如下: 14 | 15 | ``` 16 | 1. client -> 17 | 2. [istio ingress gateway] -> 18 | 3. podA[app->sidecar] -> 19 | 4. 腾讯云内网CLB -> 20 | 5. [istio ingress gateway] -> 21 | 6. podB[sidecar->app] 22 | ``` 23 | 24 | 应用对外是 https 服务,证书在 istio ingress gateway 上处理。 25 | 26 | ## 故障分析 27 | 28 | 通过分析链路中 sidecar accesslog 日志,有以下现象: 29 | 30 | 1. 第 3 跳 podA 正常发出请求,但接收到 500 返回。 31 | 2. 第 5 跳 istio ingress gateway 没有该 500 对应的访问日志。 32 | 33 | 因此重点分析 第 3,4,5 跳。 34 | 35 | 36 | 在第 3 跳 podA 上抓到 500 对应的数据包: 37 | 38 | 39 | ![podA 抓包](../image/tcp-keepalive-package.png) 40 | 41 | 42 | 抓包显示,podA 向一个已经断开的连接发送数据包,收到 RST 因此返回 500,但抓包并没有发现这个连接之前有主动断开的行为(FIN)。 43 | 44 | 登录 podA,查看连接情况: 45 | 46 | 47 | ![长连接未开启keepalive](../image/tcp-keepalive-ss-1.png) 48 | 49 | ss 显示用户代码里使用了 tcp 长连接,注意这里我们使用了 ss 参数 `-o`, 该参数可以显示 tcp keepalive timer 信息: 50 | 51 | ``` 52 | -o, --options 53 | Show timer information. 
For TCP protocol, the output 54 | format is: 55 | 56 | timer:(<timer_name>,<expire_time>,<retrans>) 57 | 58 | <timer_name> 59 | the name of the timer, there are five kind of timer 60 | names: 61 | 62 | on : means one of these timers: TCP retrans timer, 63 | TCP early retrans timer and tail loss probe timer 64 | 65 | keepalive: tcp keep alive timer 66 | 67 | timewait: timewait stage timer 68 | 69 | persist: zero window probe timer 70 | 71 | unknown: none of the above timers 72 | 73 | <expire_time> 74 | how long time the timer will expire 75 | 76 | <retrans> 77 | how many times the retransmission occurred 78 | ``` 79 | 80 | 但从 ss 结果并未看到 timer 信息,推断 podA 使用的长连接并未开启 keepalive。 81 | 82 | 83 | ## 故障原因 84 | 85 | podA 使用了 tcp 长连接,但是没有开启 keepalive,当长连接出现一段时间空闲,该连接可能被网络中间组件释放,比如 client、server 端的母机, 但 client 端还是持有已断开的连接,后续重用该连接就会导致上述异常。 86 | 87 | ## 解决方案 88 | 89 | 问题本质是因为长连接 idle 过长,且缺乏探活机制,导致 client 没感知到连接已释放,尝试三种方案: 90 | 91 | 1. 应用代码修复 92 | 2. istio 方案:client sidecar 开启 keepalive 93 | 3. istio 方案:server 开启 keepalive 94 | 95 | ### 应用代码修复 96 | 97 | 最直接的方案是应用在使用长连接时,开启 tcp keepalive,以 golang 程序示例,我们尝试用长连接访问 https://www.baidu.com 98 | 99 | 先模拟使用长连接但不开启 keepalive: 100 | 101 | ```golang 102 | var HTTPTransport = &http.Transport{ 103 | DialContext: (&net.Dialer{ 104 | Timeout: 10 * time.Second, 105 | KeepAlive: -1 * time.Second, // disable TCP KeepAlive 106 | }).DialContext, 107 | MaxIdleConns: 50, 108 | IdleConnTimeout: 60 * time.Second, 109 | MaxIdleConnsPerHost: 20, 110 | } 111 | 112 | func main() { 113 | uri := "https://www.baidu.com" 114 | times := 200 115 | 116 | client := http.Client{Transport: HTTPTransport} 117 | for i := 0; i < times; i++ { 118 | time.Sleep(2 * time.Second) 119 | req, err := http.NewRequest(http.MethodGet, uri, nil) 120 | if err != nil { 121 | fmt.Println("NewRequest Failed " + err.Error()) 122 | continue 123 | } 124 | resp, err := client.Do(req) 125 | if err != nil { 126 | fmt.Println("Http Request Failed " + err.Error()) 127 | continue 128 | } 129 | fmt.Println(resp.Status) 130 | ioutil.ReadAll(resp.Body) 131 |
resp.Body.Close() 132 | } 133 | ``` 134 | 135 | 注意 `KeepAlive: -1` 表示禁用了 tcp keepalive 探活,ss 查看: 136 | 137 | ![应用长连接未开启keepalive](../image/tcp-keepalive-ss-2.png) 138 | 139 | 结果显示长连接缺乏 timer。注意测试 pod 在 istio 环境,上述第一个连接是 go 程序到 envoy,第二个连接是 envoy 到 baidu。 140 | 141 | golang 代码修复方案很简单,只需要把 `KeepAlive` 设置为非负数, 代码修改 142 | 143 | ```golang 144 | var HTTPTransport = &http.Transport{ 145 | DialContext: (&net.Dialer{ 146 | Timeout: 10 * time.Second, 147 | KeepAlive: 120 * time.Second, // keepalive 设置为 2 分钟 148 | }).DialContext, 149 | MaxIdleConns: 50, 150 | IdleConnTimeout: 60 * time.Second, 151 | MaxIdleConnsPerHost: 20, 152 | } 153 | ``` 154 | 155 | ss 查看连接情况: 156 | 157 | ![golang 长连接开启keepalive](../image/tcp-keepalive-ss-3.png) 158 | 159 | ss 显示 go client 到 envoy 开启了 keepalive,问题解决。 160 | 161 | **但用户应用程序较多,不方便逐一调整 keepalive,希望通过 istio sidecar 来解决上述问题**。keepalive 可以在 client、server 任意一端开启,以下是使用 istio 的两种方案: 162 | 163 | 164 | ### istio 方案:client sidecar 开启 keepalive 165 | 166 | 该方案需要client 注入 istio sidecar,仍以访问 baidu https 为例,外部服务在 istio 中默认转发到 PassthroughCluster, 要对指定外部服务流量进行流控,我们需要先给该服务创建一个 service entry: 167 | 168 | ```yaml 169 | apiVersion: networking.istio.io/v1alpha3 170 | kind: ServiceEntry 171 | metadata: 172 | name: baidu-https 173 | spec: 174 | hosts: 175 | - www.baidu.com 176 | location: MESH_EXTERNAL 177 | ports: 178 | - number: 443 179 | name: https 180 | protocol: TLS 181 | ``` 182 | 183 | 然后增加 tcp keepalive 设置: 184 | 185 | ```yaml 186 | apiVersion: networking.istio.io/v1alpha3 187 | kind: DestinationRule 188 | metadata: 189 | name: baidu-https 190 | spec: 191 | host: www.baidu.com 192 | trafficPolicy: 193 | connectionPool: 194 | tcp: 195 | maxConnections: 100 196 | tcpKeepalive: 197 | time: 600s 198 | interval: 75s 199 | probes: 9 200 | ``` 201 | 202 | ![client sidecar 开启 keepalive](../image/tcp-keepalive-ss-4.png) 203 | 204 | ss 显示 go client 到 envoy 并没有 keepalive, 但 envoy 到 baidu 开启了 keepalive。 205 | 206 | 207 | ### istio 方案:server 开启 keepalive 208 | 209 | 
用户异常链路的 server 入口 是 CLB 后端的 ingress gateway,在 ingress gateway 上开启 keepalive 会稍微复杂一点,需要使用 envoyfilter 来设置 socket options: 210 | 211 | ``` 212 | apiVersion: networking.istio.io/v1alpha3 213 | kind: EnvoyFilter 214 | metadata: 215 | name: ingress-gateway-socket-options 216 | namespace: istio-system 217 | spec: 218 | configPatches: 219 | - applyTo: LISTENER 220 | match: 221 | context: GATEWAY 222 | listener: 223 | name: 0.0.0.0_443 224 | portNumber: 443 225 | patch: 226 | operation: MERGE 227 | value: 228 | socket_options: 229 | - int_value: 1 230 | level: 1 # SOL_SOCKET 231 | name: 9 # SO_KEEPALIVE 232 | state: STATE_PREBIND 233 | - int_value: 9 234 | level: 6 # IPPROTO_TCP 235 | name: 6 # TCP_KEEPCNT 236 | state: STATE_PREBIND 237 | - int_value: 600 238 | level: 6 # IPPROTO_TCP 239 | name: 4 # TCP_KEEPIDLE 240 | state: STATE_PREBIND 241 | - int_value: 75 242 | level: 6 # IPPROTO_TCP 243 | name: 5 # TCP_KEEPINTVL 244 | state: STATE_PREBIND 245 | ``` 246 | 247 | 上述配置的含义是:对于 443 LDS,tcp 连接设置 socket options:连接空闲 600s 后,开始发送探活 probe;如果探活失败,会持续探测 9 次,探测间隔为 75 s。 248 | 249 | 在 ingress gateway 上 ss, 显示 443 上连接都开启了 keepalive: 250 | 251 | ![ingress gateway 开启 keepalive](../image/tcp-keepalive-ss-5.png) 252 | 253 | 如果用户 client 较多不便调整,更适合在 server (ingress gateway)开启 keepalive。另外该方案对 client 有无 sidecar 没有要求。 254 | 255 | ## 总结 256 | 257 | 使用长连接时,应用需要设置合理的 keepalive 参数,特别是对于访问频次较低的场景,以及链路较长的情况。 258 | 259 | **istio 无入侵式的流量操纵能力,可以很方便的对流量行为进行调优,这也是用户选择 istio 的重要原因。** 260 | 261 | --- 262 | 263 | ## 参考资料 264 | 265 | * https://www.envoyproxy.io/docs/envoy/latest/api-v3/config/core/v3/socket_option.proto 266 | * https://github.com/envoyproxy/envoy/issues/3634 267 | * https://github.com/istio/istio/issues/28879 -------------------------------------------------------------------------------- /content/zh/docs/debug-istio/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Istio 调试指南" 3 | linkTitle: "Istio 调试指南" 4 | weight: 1 5 |
description: > 6 | 如何快速分析处理服务网格系统自身的问题 7 | --- 8 | 9 | 服务网格为微服务提供了一个服务通信的基础设施层,统一为上层的微服务提供了服务发现,负载均衡,重试,断路等基础通信功能,以及服务路由,灰度发布,Chaos 测试等高级管控功能。 10 | 11 | 服务网格的引入大大降低了各微服务应用的开发难度,让微服务应用开发人员不再需要花费大量时间用于保障底层通讯的正确性上,而是重点关注于产生用户价值的业务需求。 12 | 13 | 然而由于微服务架构的分布式架构带来的复杂度并未从系统中消失,而是从各个微服务应用中转移到了服务网格中。由服务网格对所有微服务应用的通讯进行统一控制,好处是可以保证整个系统中分布式通讯策略的一致性,并可以方便地进行集中管控。 14 | 15 | 除微服务之间分布式调用的复杂度之外,服务网格在底层通讯和微服务应用之间引入了新的抽象层,为系统引入了一些额外的复杂度。在此情况下,如果服务网格自身出现故障,将对上层的微服务应用带来灾难性的影响。 16 | 17 | 当系统中各微服务应用之间的通讯出现异常时,我们可以通过服务网格提供的分布式调用跟踪,故障注入,服务路由等手段快速进行分析和处理。但如果服务网格系统自身出现问题的话,我们如何才能快速进行分析处理呢? -------------------------------------------------------------------------------- /content/zh/docs/debug-istio/envoy-log/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Envoy 日志调试指南" 3 | linkTitle: "" 4 | weight: 1 5 | date: 2022-07-06 6 | description: 7 | --- 8 | 9 | 10 | ## 1. 问题背景 11 | 12 | 这是使用 Istio 最常见的困境:在微服务中引入 Envoy 作为代理后,当流量访问和预期行为不符时,用户很难快速确定问题是出在哪个环节。客户端收到的异常响应,诸如 403、404、503 或者连接中断等,可能是链路中任一 Sidecar 执行流量管控的结果, 但也有可能是来自某个服务的合理逻辑响应。 13 | 14 | 特别的,当 Service Mesh 系统的维护者和应用程序的开发者来自不同的团队时,问题尤为凸显。 15 | 16 | 在 Mesh 中引入全链路跟踪系统,可以解决部分问题,我们可以知道请求到达了哪些工作负载,但是对于中断的异常请求,我们仍然很难确定原因。 因为本着最大透明化(Maximize Transparency)的设计目标,Istio 的遥测系统会尽量屏蔽掉 Sidecar 的存在。另一方面,用户自行维护一套全链路跟踪系统成本也很高,受限于遥测采样率和有限的协议支持,我们通常无法采集所有链路数据。 17 | 18 | 幸运的是,Envoy 本身可以记录流量的信息,本文主要介绍如何利用 Envoy 日志,对类似问题进行定位。 19 | 20 | --- 21 | 22 | ## 2. Envoy 流量模型 23 | 24 | 我们先看看 Envoy 的流量模型: 25 | 26 | 1. 监听,接受连接 27 | 2. 根据用户流量操纵规则,进行流量特征识别 28 | 3.
进行流量操纵,如负载均衡,转发,拒绝等 29 | 30 | 在以上流程中, Envoy 接受请求流量叫做 **Downstream**,Envoy 发出请求流量叫做 **Upstream**。在处理 Downstream 和 Upstream 过程中, 分别会涉及 2 个流量端点,即请求的发起端和接收端: 31 | 32 | ![envoy-model](image/envoy-model.png) 33 | 34 | 在这个过程中, Envoy 会根据用户规则,计算出符合条件的转发目的主机集合,这个集合叫做 **UPSTREAM_CLUSTER**, 并根据负载均衡规则,从这个集合中选择一个 host 作为流量转发的接收端点,这个 host 就是 **UPSTREAM_HOST**。 35 | 36 | 以上就是 Envoy 请求处理的 **流量五元组信息**, 这是 Envoy 日志里最重要的部分,通过这个五元组我们可以准确的观测流量「从哪里来」和「到哪里去」。 37 | 38 | * UPSTREAM_CLUSTER 39 | * DOWNSTREAM_REMOTE_ADDRESS 40 | * DOWNSTREAM_LOCAL_ADDRESS 41 | * UPSTREAM_LOCAL_ADDRESS 42 | * UPSTREAM_HOST 43 | 44 | --- 45 | 46 | ## 3. Helloworld example 47 | 48 | 在 Istio 场景中,Envoy 既可以是正向代理,也可以是反向代理。在上图中, 如果 Envoy 处理的是 Outbound 流量, 业务容器是作为 Downstream 端点(右边);如果 Envoy 处理的是 Inbound 流量, 业务容器是作为 Upstream 端点(左边)。 49 | 50 | Istio 中默认不开启 Envoy 中的访问日志,需要手动打开,将 Istio 配置中 `accessLogFile` 设置为 `/dev/stdout`: 51 | 52 | ```yaml 53 | % kubectl -n istio-system edit cm istio 54 | ...... 55 | # Set accessLogFile to empty string to disable access log. 56 | accessLogFile: "/dev/stdout" # 开启日志 57 | 58 | accessLogEncoding: 'JSON' # 默认日志是单行格式, 可选设置为 JSON 59 | ...... 
60 | ``` 61 | 62 | 我们以 sleep Pod 访问 hello 服务来举例说明: 63 | 64 | ```shell 65 | kubectl apply -f sleep-hello.yaml 66 | ``` 67 | 68 | ![image-20200212222251433](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-12-142255.png) 69 | 70 | 该文件定义了 2 个版本的 helloworld 和一个 sleep Pod,helloworld Service 的端口是 4000, 而 Pod 的端口是 5000。 71 | 72 | 从 sleep Pod 中去访问 helloworld 服务, 确认应用正常: 73 | 74 | ```shell 75 | % SLEEP_POD=$(kubectl get pod -l app=sleep -o jsonpath="{.items[0].metadata.name}") 76 | % HELLO_V1_POD=$(kubectl get pod -l app=helloworld -l version=v1 -o jsonpath="{.items[0].metadata.name}") 77 | % kubectl exec -it $SLEEP_POD -csleep -- sh 78 | / # curl helloworld:4000/hello 79 | ``` 80 | 81 | 这时候我们可以去分析 2 个 Pod 各自的 Envoy 日志: 82 | 83 | ![image-20200212222055391](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-12-142111.png) 84 | 85 | 用一张图来说明: 86 | 87 | ![downstream-upstream](image/downstream-upstream.png) 88 | 89 | 从日志中我们可以分析出: 90 | 91 | 对于 sleep Pod, sleep app 发出的流量目的端是 hello Service ip 和 Service port,sleep Envoy 处理的是 Outbound 流量, Envoy 根据规则选择的 「UPSTREAM_CLUSTER」是 `outbound|4000||helloworld.default.svc.cluster.local `, 然后转发给其中的一个 「UPSTREAM_HOST」, 也就是 hello Pod 的 ip 和 port。 92 | 93 | 对于 hello Pod,其 Envoy 处理的是 Inbound 流量,Envoy 根据规则选择的 「UPSTREAM_CLUSTER」 是 `inbound|4000|http|helloworld.default.svc.cluster.local `, 其中的 「UPSTREAM_HOST」 是 「127.0.0.1:5000」, 也就是该 Pod 里的 hello app。 94 | 95 | 因此,我们可以总结出 Istio 中流量端点值的逻辑规则: 96 | 97 | #### UPSTREAM_HOST 98 | 99 | 上游主机的 host,表示从 Envoy 发出的请求的目的端,通常是「ip:port」 100 | 101 | 通常来说,对于 Outbound Cluster,此值是「上游 pod-ip : pod-port」 ,而对于 Inbound Cluster,此值是「127.0.0.1 : pod-port」 102 | 103 | #### UPSTREAM_LOCAL_ADDRESS 104 | 105 | 上游连接中,当前 Envoy 的本地地址,此值是「当前 pod-ip : 随机端口」 106 | 107 | #### DOWNSTREAM_LOCAL_ADDRESS 108 | 109 | 下游连接中,当前 Envoy 的本地地址。 110 | 111 | 通常来说,对于 Outbound Cluster,此值是「目的 service-ip : service-port 」,而对于 Inbound Cluster,此值是「当前 pod-ip : pod-port」 112 | 113 | #### 
DOWNSTREAM_REMOTE_ADDRESS 114 | 115 | 下游连接中远端地址。 116 | 117 | 通常来说,对于 Outbound Cluster,此值是「当前 pod-ip : 随机端口 」,而对于 Inbound Cluster,此值是「下游 pod-ip : 随机端口」 118 | 119 | --- 120 | 121 | ## 4. Envoy 日志格式 122 | 123 | Envoy 允许定制日志格式, 格式通过若干「Command Operators」组合,用于提取请求信息,Istio 没有使用 Envoy 默认的日志格式, Istio 定制的访问日志格式如下: 124 | 125 | ![image-20200205002607125](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-04-162610.png) 126 | 127 | 完整的「Command Operators」含义可查阅 [Envoy Access logging Command Operators](https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/access_log/usage#command-operators) 128 | 129 | 除了以上流量五元组,流量分析中常用的重要信息还有: 130 | 131 | #### RESPONSE_CODE 132 | 133 | 响应状态码 134 | 135 | #### RESPONSE_FLAGS 136 | 137 | 很重要的信息,Envoy 中自定义的响应标志位, 可以认为是 Envoy 附加的流量状态码。 138 | 139 | 如「NR」表示找不到路由,「UH」表示 Upstream Cluster 中没有健康的 host,「RL」表示触发 rate limit,「UO」触发断路器。 140 | 141 | `RESPONSE_FLAGS` 可选值有十几个,这些信息在调试中非常关键。 142 | 143 | #### X-REQUEST-ID 144 | 145 | 一次 C 到 S 的 http 请求,Envoy 会在 C 端生产 request id,并附加到 header 中,传递到 S 端,在 2 端的日志中都会记录该值, 因此可以通过这个 ID 关联请求的上下游。注意不要和全链路跟踪中的 trace id 混淆。 146 | 147 | #### ROUTE_NAME 148 | 149 | 匹配执行的路由名称 150 | 151 | --- 152 | 153 | ## 5. 场景:判断异常返回是来自业务还是 Sidecar? 
154 | 155 | 比如我们希望所有请求 helloworld 都路由到 v1 版本,创建对应的 VirtualService: 156 | 157 | ```shell 158 | % kubectl apply -f hello-v1-virtualservice.yaml 159 | ``` 160 | 161 | ```yaml 162 | apiVersion: networking.istio.io/v1alpha3 163 | kind: VirtualService 164 | metadata: 165 | name: hello 166 | spec: 167 | hosts: 168 | - "helloworld" 169 | http: 170 | - route: 171 | - destination: 172 | host: helloworld 173 | subset: v1 174 | port: 175 | number: 4000 176 | ``` 177 | 178 | 从 sleep 中访问发现响应 503: 179 | 180 | ![image-20200212222518280](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-12-142520.png) 181 | 182 | 183 | 如果没有上下文,我们很难判断 503 是来自业务容器还是 Sidecar,查看 sleep 和 hello 的 Envoy 日志,可以发现:hello Pod 的 Envoy 没有接受到请求,sleep Pod 的 Envoy 里日志: 184 | 185 | ![image-20200212222631659](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-12-142634.png) 186 | 187 | 其中 `"response_flags": "NR"` 表示「No route configured」,也就是 Envoy 找不到路由,我们可以判断出该异常是由 Envoy 返回。 188 | 189 | 通过简单的分析就可以找到原因, 我们在 VirtualService 中使用的 Destination 没有定义,将其补上: 190 | 191 | ```shell 192 | % kubectl apply -f hello-v1-destinationrule.yaml 193 | ``` 194 | 195 | ```yaml 196 | apiVersion: networking.istio.io/v1alpha3 197 | kind: DestinationRule 198 | metadata: 199 | name: hello 200 | spec: 201 | host: helloworld 202 | subsets: 203 | - name: v1 204 | labels: 205 | version: v1 206 | ``` 207 | 208 | 再次访问请求正常,日志中 `response_flags` 为空: 209 | 210 | ![image-20200212222913583](https://zhongfox-blogimage-1256048497.cos.ap-guangzhou.myqcloud.com/2020-02-12-142915.png) 211 | 212 | --- 213 | 214 | ## 6.
开启 debug 模式 215 | 216 | Envoy 默认日志级别是 info,其日志内容能满足大部分调试场景需求,但对于比较复杂的异常,我们往往还需要开启 debug 级别,能获取到更多的流量处理过程和信息,对某个特定的 Pod,调整日志级别为 debug 的命令: 217 | 218 | ``` 219 | kubectl exec {POD-NAME} -c istio-proxy -- curl -X POST http://127.0.0.1:15000/logging?level=debug 220 | ``` -------------------------------------------------------------------------------- /content/zh/docs/debug-istio/envoy-log/image/downstream-upstream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/debug-istio/envoy-log/image/downstream-upstream.png -------------------------------------------------------------------------------- /content/zh/docs/debug-istio/envoy-log/image/envoy-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/debug-istio/envoy-log/image/envoy-model.png -------------------------------------------------------------------------------- /content/zh/docs/debug-istio/envoy-log/image/request-route.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/docs/debug-istio/envoy-log/image/request-route.png -------------------------------------------------------------------------------- /content/zh/docs/tcm/_index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "TCM 介绍" 3 | linkTitle: "TCM 介绍" 4 | weight: 100 5 | description: > 6 | 腾讯云 TCM 介绍 7 | --- 8 | 9 | TCM(Tencent Cloud Mesh)是腾讯云上提供的基于 Istio 进行增强,和 Istio API 完全兼容的 Service Mesh 托管服务,集成腾讯云基础设施,提供全托管服务化的支撑能力保障网格生命周期管理。可以帮助用户以较小的迁移成本和维护代价快速利用到 Service Mesh 提供的流量管理和服务治理能力。 10 | 11 | [体验 
TCM](https://cloud.tencent.com/product/tcm) -------------------------------------------------------------------------------- /content/zh/featured-background.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/featured-background.jpeg -------------------------------------------------------------------------------- /content/zh/featured-background.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aeraki-mesh/istio-operation-bible/ef6c6a65b522bbcf90c288f7a64dd1beed01aaae/content/zh/featured-background.jpg -------------------------------------------------------------------------------- /content/zh/search.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Search Results 3 | layout: search 4 | 5 | --- 6 | 7 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | 3 | services: 4 | 5 | site: 6 | image: docsy/docsy-example 7 | build: 8 | context: . 
9 | command: server 10 | ports: 11 | - "1313:1313" 12 | volumes: 13 | - .:/src 14 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/google/docsy-example 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac // indirect 7 | github.com/google/docsy v0.2.0 // indirect 8 | github.com/twbs/bootstrap v4.6.1+incompatible // indirect 9 | ) 10 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac h1:AjwgwoaDsNEA1Wtc8pgw/BqG7SEk9bKxXPjEPQQ42vY= 2 | github.com/FortAwesome/Font-Awesome v0.0.0-20210804190922-7d3d774145ac/go.mod h1:IUgezN/MFpCDIlFezw3L8j83oeiIuYoj28Miwr/KUYo= 3 | github.com/google/docsy v0.2.0-pre.0.20220404161753-f7b37a0aca2a h1:bnufXLbTD8QCLbqygy/kmYxUK1JINSlHU5rLQYTcFMQ= 4 | github.com/google/docsy v0.2.0-pre.0.20220404161753-f7b37a0aca2a/go.mod h1:yuKLZHMX5CKiLUH55+ePFJaYnoSwUVVffNareaOGQYo= 5 | github.com/google/docsy v0.2.0 h1:DN6wfyyp2rXsjdV1K3wioxOBTRvG6Gg48wLPDso2lc4= 6 | github.com/google/docsy v0.2.0/go.mod h1:shlabwAQakGX6qpXU6Iv/b/SilpHRd7d+xqtZQd3v+8= 7 | github.com/google/docsy/dependencies v0.2.0-pre.0.20220404161753-f7b37a0aca2a h1:fy6IqUmWGMdQngRa7+CP1cRkTseQK7OEsqx6r7dNuSA= 8 | github.com/google/docsy/dependencies v0.2.0-pre.0.20220404161753-f7b37a0aca2a/go.mod h1:oPdn05sNt61uT6K+LqNRhYq1jeqrsbbQMDXkPdPscmA= 9 | github.com/google/docsy/dependencies v0.2.0/go.mod h1:2zZxHF+2qvkyXhLZtsbnqMotxMukJXLaf8fAZER48oo= 10 | github.com/twbs/bootstrap v4.6.1+incompatible h1:75PsBfPU1SS65ag0Z3Cq6JNXVAfUNfB0oCLHh9k9Fu8= 11 | github.com/twbs/bootstrap v4.6.1+incompatible/go.mod h1:fZTSrkpSf0/HkL0IIJzvVspTt1r9zuf7XlZau8kpcY0= 12 | 
-------------------------------------------------------------------------------- /layouts/404.html: -------------------------------------------------------------------------------- 1 | {{ define "main"}} 2 |
3 |
4 |

Not found

5 |

Oops! This page doesn't exist. Try going back to our home page.

6 | 7 |

You can learn how to make a 404 page like this in Custom 404 Pages.

8 |
9 |
10 | {{ end }} 11 | -------------------------------------------------------------------------------- /netlify.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | [build.environment] 3 | HUGO_VERSION = "0.96.0" 4 | GO_VERSION = "1.18" 5 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "learning-envoy", 3 | "version": "0.0.1", 4 | "description": "Envoy 学习笔记.", 5 | "main": "none.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/zhaohuabing/learning-envoy.git" 12 | }, 13 | "author": "Huabing Zhao", 14 | "license": "ISC", 15 | "bugs": { 16 | "url": "https://github.com/zhaohuabing/learning-envoy/issues" 17 | }, 18 | "homepage": "http://zhaohuabing.com/learning-envoy", 19 | "devDependencies": { 20 | "autoprefixer": "^10.4.0", 21 | "postcss": "^8.3.7", 22 | "postcss-cli": "^9.0.2" 23 | } 24 | } 25 | --------------------------------------------------------------------------------