├── .all-contributorsrc ├── .gitignore ├── LICENSE ├── README.md ├── doc ├── Glossary_from_DDIA.pdf ├── Translate_Workflow.pdf ├── how_to_do.md └── manual.md ├── glossary.md ├── lec01 └── introduction.srt ├── lec02 └── rpc_and_threads.srt ├── lec03 └── gfs.srt ├── lec04 ├── Lec4-3.en.txt ├── Lec4-3.zh.txt ├── Lec4-4.en.txt ├── Lec4-4.zh.txt ├── Lec4-5.en.txt ├── Lec4-5.zh.txt ├── Lec4-6.en.txt ├── Lec4-6.zh.txt ├── Lec4-7.en.txt ├── Lec4-7.zh.txt ├── Lec4.en.txt └── primary_backup_replication.en.srt ├── lec05 ├── Lec5.en.txt └── threads_and_raft.en.srt ├── lec06 └── tolerance_raft_1.srt ├── lec07 └── tolerance_raft_2.en.srt ├── lec08 ├── zh-zookeeper.srt └── zookeeper.srt └── lec09 ├── more_replication_craq.srt └── zh-more_replication_craq.srt /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "ZiheLiu", 10 | "name": "zihe.liu", 11 | "avatar_url": "https://avatars2.githubusercontent.com/u/13313784?v=4", 12 | "profile": "https://ziheliu.github.io/", 13 | "contributions": [ 14 | "content" 15 | ] 16 | }, 17 | { 18 | "login": "wildandyang", 19 | "name": "Fan Yang", 20 | "avatar_url": "https://avatars0.githubusercontent.com/u/16045380?v=4", 21 | "profile": "https://github.com/wildandyang", 22 | "contributions": [ 23 | "content" 24 | ] 25 | }, 26 | { 27 | "login": "CyrusF", 28 | "name": "Cyru1s", 29 | "avatar_url": "https://avatars0.githubusercontent.com/u/20309761?v=4", 30 | "profile": "http://blog.cyru1s.com", 31 | "contributions": [ 32 | "content" 33 | ] 34 | }, 35 | { 36 | "login": "hoooga", 37 | "name": "hoooga", 38 | "avatar_url": "https://avatars3.githubusercontent.com/u/8995262?v=4", 39 | "profile": "https://github.com/hoooga", 40 | "contributions": [ 41 | "content" 42 | ] 43 | }, 44 | { 45 | "login": "ivanallen", 46 | "name": "Allen", 47 | "avatar_url": "https://avatars1.githubusercontent.com/u/12481610?v=4", 48 | "profile": "https://allen.blog.csdn.net", 49 | "contributions": [ 50 | "content" 51 | ] 52 | }, 53 | { 54 | "login": "fisheuler", 55 | "name": "fisheuler", 56 | "avatar_url": "https://avatars2.githubusercontent.com/u/4300522?v=4", 57 | "profile": "https://github.com/fisheuler", 58 | "contributions": [ 59 | "doc" 60 | ] 61 | }, 62 | { 63 | "login": "2014BDuck", 64 | "name": "2014bduck", 65 | "avatar_url": "https://avatars0.githubusercontent.com/u/30280396?v=4", 66 | "profile": "https://github.com/2014BDuck", 67 | "contributions": [ 68 | "content" 69 | ] 70 | }, 71 | { 72 | "login": "Laurel-rao", 73 | "name": "Laurel-rao", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/42195541?v=4", 75 | "profile": "https://github.com/Laurel-rao", 76 | "contributions": [ 77 | "bug" 78 | ] 79 | } 80 | ], 81 | "contributorsPerLine": 7, 82 | "projectName": "thor", 83 | "projectOwner": "ivanallen", 84 | "repoType": "github", 85 | "repoHost": "https://github.com", 86 | "skipCi": true 87 | } 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, 
Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. 
(Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. 
Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. 
If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # thor 2 | 3 | [![All Contributors](https://img.shields.io/badge/all_contributors-8-orange.svg?style=flat-square)](#contributors-) 4 | 5 | 雷神项目,翻译 mit 6.824 2020 6 | 7 | ## 组织 8 | 9 | QQ:1035287657 10 | 11 | ## 视频资源 12 | 13 | - https://www.bilibili.com/video/BV1R7411t71W?p=1 14 | 15 | ## 快速开始 16 | 17 | [如何翻译](https://github.com/ivanallen/thor/wiki/%E5%A6%82%E4%BD%95%E7%BF%BB%E8%AF%91) 18 | 19 | ## 版权声明 20 | 21 | 本字幕版权属参与翻译的所有成员,严禁用作商业用途,一经发现,追究法律责任。 22 | 23 | ## Contributors ✨ 24 | 25 | 非常感谢这些可爱的同学: 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |

- [zihe.liu](https://ziheliu.github.io/) 🖋
- [Fan Yang](https://github.com/wildandyang) 🖋
- [Cyru1s](http://blog.cyru1s.com) 🖋
- [hoooga](https://github.com/hoooga) 🖋
- [Allen](https://allen.blog.csdn.net) 🖋
- [fisheuler](https://github.com/fisheuler) 📖
- [2014bduck](https://github.com/2014BDuck) 🖋
- [Laurel-rao](https://github.com/Laurel-rao) 🐛
44 | 45 | 46 | 47 | 48 | 49 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 50 | 51 | ### 参考资料和工具 52 | 53 | - [翻译流程介绍](https://github.com/ivanallen/thor/blob/master/doc/manual.md) 54 | - [MIT6.824翻译工作流分享](https://www.bilibili.com/video/BV1pQ4y1M7dv) 55 | - [分布式系统翻译工作流分享](https://github.com/ivanallen/thor/blob/master/doc/Translate_Workflow.pdf) 56 | - 参考PR:[Lec04-3翻译任务](https://github.com/ivanallen/thor/pull/24) 57 | - [翻译指南](https://docs.qq.com/doc/DZURQaXBrdXhXb0dx?tdsourcetag=s_macqq_grpfile) 58 | - [建议翻译流程](https://docs.qq.com/doc/BXXro31NHmDg4Kega60fkDTU4l51be2cdG2H4OMrVN3NzUlm0huLua1goly331XKV42Dko7Y0) 59 | - [VS Subtitles Editor](https://marketplace.visualstudio.com/items?itemName=pepri.subtitles-editor) 60 | - [抽取视频字幕的网站](https://downsub.com/) 61 | - [Free Google Translate API](https://pypi.org/project/googletrans/) 62 | - [自动切分时间轴](https://jingyan.baidu.com/article/e73e26c07ce0a824acb6a755.html) 63 | -------------------------------------------------------------------------------- /doc/Glossary_from_DDIA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanallen/thor/99446434fb2a135d42630593b32e7ae956011ce9/doc/Glossary_from_DDIA.pdf -------------------------------------------------------------------------------- /doc/Translate_Workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanallen/thor/99446434fb2a135d42630593b32e7ae956011ce9/doc/Translate_Workflow.pdf -------------------------------------------------------------------------------- /doc/how_to_do.md: -------------------------------------------------------------------------------- 1 | # 如何翻译 2 | 3 | ## 1 快速开始 4 | 5 | 例如,这是一个原始的 srt 文件(截取自 lec03/gfs.srt): 6 | 7 | ``` 8 | 10 9 | 00:00:31,410 --> 00:00:34,260 10 | abstraction you might you know if you 11 | 12 | 11 13 | 00:00:34,260 --> 00:00:35,850 14 | didn't know already you might imagine 15 | 16 | 12 17 | 00:00:35,850 --> 00:00:37,230 18 | that there could be all kinds of 19 | 20 | 13 21 | 00:00:37,230 --> 00:00:40,050 22 | different you know important 23 | 24 | 14 25 | 00:00:40,050 --> 00:00:42,030 26 | abstractions you might want to use for 27 | 28 | 15 29 | 00:00:42,030 --> 00:00:43,650 30 | distributed systems but it's turned out 31 | 32 | 16 33 | 00:00:43,650 --> 00:00:47,730 34 | that a simple storage interface is just 35 | 36 | 17 37 | 00:00:47,730 --> 00:00:50,010 38 | incredibly useful and extremely general 39 | ``` 40 | 41 | 翻译之后,会变成这样: 42 | 43 | ``` 44 | 10 45 | 00:00:31,410 --> 00:00:34,260 46 | 因为存储被证明是一种关键抽象 47 | abstraction you might you know if you 48 | 49 | 11 50 | 00:00:34,260 --> 00:00:35,850 51 | 如果你还不知道的话 52 | didn't know already you might imagine 53 | 54 | 12 55 | 00:00:35,850 --> 00:00:37,230 56 | 你可以想象在分布式系统中 57 | that there could be all kinds of 58 | 59 | 13 60 | 00:00:37,230 --> 00:00:40,050 61 | 你希望使用的各种不同的抽象 62 | different you know important 63 | 64 | 14 65 | 00:00:40,050 --> 00:00:42,030 66 | 你希望使用的各种不同的抽象 67 | abstractions you might want to use for 68 | 69 | 15 70 | 00:00:42,030 --> 00:00:43,650 71 | 但事实表明 72 | distributed systems but it's turned out 73 | 74 | 16 75 | 00:00:43,650 --> 00:00:47,730 76 | 简单的存储接口往往更有用而且更加通用 77 | that a simple storage interface is just 78 | 79 | 17 80 | 00:00:47,730 --> 00:00:50,010 81 | 简单的存储接口往往更有用而且更加通用 82 | incredibly useful and extremely general 83 | ``` 84 | 85 | 
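As a rough illustration of how the bilingual format above can be produced mechanically, here is a small Go sketch of the merge step. This is not the project's official tooling: the input file names (`gfs.srt`, `zh.txt`) and the one-translated-line-per-subtitle-block layout are assumptions made for the example, and the merged output still needs to be proofread by hand.

```go
// merge_srt.go: a sketch (not the project's official tool) that produces the
// bilingual format shown above by inserting one translated line between each
// timing line and its English subtitle line. It assumes the translations live
// in zh.txt, one line per subtitle block, in the same order as the .srt blocks.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

func readLines(path string) []string {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	var lines []string
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		lines = append(lines, sc.Text())
	}
	return lines
}

func main() {
	srt := readLines("gfs.srt") // original English subtitles
	zh := readLines("zh.txt")   // one translated line per subtitle block, in order
	out := bufio.NewWriter(os.Stdout)
	defer out.Flush()

	i := 0 // index into the translated lines
	for _, line := range srt {
		fmt.Fprintln(out, line)
		// A timing line looks like "00:00:31,410 --> 00:00:34,260";
		// the English text follows it, so the translation goes in between.
		if strings.Contains(line, "-->") && i < len(zh) {
			fmt.Fprintln(out, zh[i])
			i++
		}
	}
}
```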
你直接修改 gfs.srt 即可。如果你不放心,你可备份一个 gfs.srt,自己再单独创建一个 gfs2.srt。 86 | 87 | ## 2 翻译技巧 88 | 89 | 你可以使用谷歌翻译工具进行批量翻译,这样可以帮助你提高翻译效率。当然,我们正在制作自动化工具,让你第一手拿到的原始 srt 就已经包含了机翻结果,这样你只需要修正就可以了。 90 | 91 | 如果有些英文长句太长,而中文更加紧凑,可以把中文多复制几份,每个时间轴都放上一样的翻译。可以参考第 1 节的示例。 92 | 93 | ## 3 怎么知道自己翻译的部分和别人是否冲突 94 | 95 | 解决方案很简单,新建立一个 issue,标题为 "lec04;20-30min",同时把这个任务分配给自己,这样就不会有人和你翻译冲突了。 96 | 97 | 通常一个小任务为 5min,或者 10min,根据自己的实际情况选择。完成小任务后,一定要及时发起 pr,合入之后,你就可以关闭你的 issue 了。 98 | 99 | ## 4 字幕校正 100 | 101 | 这部分工作你不用担心,会有专门的同学针对完整的字幕文件做统一的修正。当然,如果你在观看过程中,发现有地方翻译的不对,请及时纠正,并提 PR. 102 | -------------------------------------------------------------------------------- /doc/manual.md: -------------------------------------------------------------------------------- 1 | # 规范细节 2 | 3 | > 如果你要认领我们翻译的任务,并成为对应字幕的贡献者,希望你能认真阅读一下这里的规范细节,保证步调的统一。 4 | 5 | ## 1 概述 6 | 你从每个课时中拿到的,有YouTube语音识别自动生成的英文字幕,还有我们抽取的整个讲稿。 7 | 当认领了某一个单元,你就可以按照如下流程,高效统一地开始MIT6.824的翻译了! 8 | 当然,欢迎加入我们字幕组的QQ群,与更多志同道合的小伙伴一起交流,共同进步!(群号:1035287657) 9 | 10 | ## 2 分布式翻译 11 | 12 | ### 2.1 基本思想 13 | 为了提高大家的效率,建议大家尽可能借助机翻,为了提高机翻的准确性,需要大家对汇总的整个讲稿,断句、去掉一些语气词,还有更重要的,纠正机器生成的字幕的错误(记得同步到英文字幕文件中哦)。 14 | 之后,为了翻译像一个老师讲课的效果,需要你在完成机翻后,完整地边听边润色,注意一些术语的翻译,我们在doc中添加了一个中文书籍中的术语表,作为基本参考。更多不熟悉的术语,欢迎进入翻译群讨论。 15 | 这一步翻译的输入输出,我们下面就来仔细介绍~ 16 | 17 | ### 2.2 翻译流程 18 | #### 输入文件 19 | 机器生成英文字幕 + 英文字幕汇总后的完整讲稿 20 | #### 输出文件 21 | 修改后的英文字幕 + 语义断句后的英文讲稿 + 机翻并润色的中文讲稿 22 | 23 | #### 翻译建议流程 24 | 1. 经过认真的听课与对照,你会对上面两文件中的错误进行调整,为了方便讲稿的机翻,你可以去掉讲稿中的一些语气词,并按照语义断句(长的句子可能比较难断,后面我们会有一些建议,辅助你) 25 | 2. 断句修改后的文件保存(作为输出之一),使用Chrome或谷歌翻译,进行机翻。 26 | 3. 但是机翻的效果可能不会特别好,你需要根据一些口语习惯,特定术语,进行调整;机翻之后也能帮助你发现之前步骤的错误,回过头,继续修改字幕文件和英文讲稿。 27 | 28 | #### 输出输出规范 29 | **文件命名:** 30 | 按照每个单元 如自己负责的 Lecx 的第 y 个单元,断句后英语讲稿部分上传 31 | Lecx-y.en.txt;断句后中文翻译上传Lecx-y.zh.txt;原srt修正后并入Lecx 32 | .en.srt。(边界划分原则为向下断句,比如某一行9:59-10:01,属于第一个 33 | 34 | 最后我们会统一合并,生成Lecx.zh.srt和Lecx.srt 35 | 36 | #### 输入输出文件举例 37 | **原始 srt 文件**(截取自 lec07/tolerance_raft_2.en.srt): 38 | ``` 39 | 807 40 | 00:40:53,140 --> 00:40:55,690 41 | to move the arm the right track right so 42 | 43 | 808 44 | 00:40:55,690 --> 00:40:58,570 45 | these persistance can be terribly 46 | 47 | 809 48 | 00:40:58,570 --> 00:41:01,510 49 | terribly expensive and if for sort of 50 | 51 | 810 52 | 00:41:01,510 --> 00:41:03,520 53 | any kind of straightforward design 54 | 55 | 811 56 | 00:41:03,520 --> 00:41:06,220 57 | they’re likely to be the limiting factor 58 | 59 | 812 60 | 00:41:06,220 --> 00:41:09,089 61 | in performance because they mean that 62 | 63 | 813 64 | 00:41:09,089 --> 00:41:13,690 65 | doing anything anything whatsoever on 66 | 67 | 814 68 | 00:41:13,690 --> 00:41:15,339 69 | these Raft servers takes ten 70 | 71 | 815 72 | 00:41:15,339 --> 00:41:18,580 73 | milliseconds a pop and 10 milliseconds 74 | ``` 75 | 76 | **汇总后的讲稿(如Lec7.en.txt)** 77 | ``` 78 | so these persistence can be terribly terribly expensive and if for sort of any kind of straightforward design they’re likely to be the limiting factor in performance because they mean that doing anything anything whatsoever on these Raft servers takes ten milliseconds a pop 79 | ``` 80 | 81 | **修正机器生成字幕错误后字幕并入(Lec7.en.srt)** 82 | >同输入 83 | 84 | **英文断句后输出(Lec7-1.en.txt)** 85 | ``` 86 | so these persistence can be terribly terribly expensive 87 | and if for sort of any kind of straightforward design 88 | they’re likely to be the limiting factor in performance 89 | because they mean that doing anything 90 | anything whatsoever on these Raft servers takes ten milliseconds a pop 91 | ``` 92 | **机翻润色中文输出(Lec7-1.zh.txt)** 93 | ``` 94 | 
所以这些持久化的代价可能会非常大 95 | 如果只是用于一个简单的设计 96 | 它们可能成为性能的限制因素 97 | 因为这意味着做任何事情 98 | 这些Raft服务器上的所有内容都会花费10毫秒的时间 99 | ...... 100 | ``` 101 | 102 | #### 注意事项 103 | 1. 输入英文字幕为机器转语音,可能会有错误 104 | 2. 修改后的英文字幕时间轴不变,只是修改一些错误 105 | 3. 修改的时候,可能会有信息冗余(英文字幕与讲稿之间),所以注意两文件间同步 106 | 107 | #### 分工 108 | 通常一个小任务为 5min,或者 10min,在项目中新建立一个 issue,标题为 "lec04;20-30min",根据自己的实际情况选择。 109 | 完成小任务后,一定要及时发起 pr,合入之后,你就可以关闭你的 issue 了。 110 | 111 | #### 技巧推荐 112 | 1. 对于长句(@鱼蛋是我我是橘猫):使用谷歌对汇总后的讲稿直接翻译,谷歌会帮助去掉语气词,用来辅助断句。 113 | 2. 机翻直接采用Chrome打开后翻译(无字数限制)/Google translate(有字数限制) 114 | 115 | 116 | ## 3 字幕校正 117 | 118 | 这部分工作你不用担心,我们字幕组会有专门的同学针对完整的字幕文件做统一的修正。当然,如果你在观看过程中,发现有地方翻译的不对,请及时纠正,并提 PR. 119 | 120 | ## 最后,感谢每一个贡献者的参与! -------------------------------------------------------------------------------- /glossary.md: -------------------------------------------------------------------------------- 1 | ### 术语对照表 2 | 3 | |英文|中文|备注| 4 | |---|---|---| 5 | |single computer|一台计算机/单机|| 6 | |cooperating conputers|计算机集群|| 7 | |fault tolerance / FT|容错|| 8 | |concurrency|并发/并发问题|| 9 | |partial failure|局部错误|| 10 | |infrastructure|基础设施|| 11 | |replication|复制|| 12 | |shard|(数据)分片|| 13 | |goroutine|go例程|不翻译| 14 | |inline|串联式的|| 15 | |enclosing scope|封闭作用域|| 16 | |RPC(Remote Procedure Call)|远程过程调用|不翻译| 17 | |concurrency primitive|并发原语|| 18 | -------------------------------------------------------------------------------- /lec04/Lec4-3.en.txt: -------------------------------------------------------------------------------- 1 | You know they had replication 2 | but it wasn't replicating every single 3 | you know bit of memory 4 | between the primaries and the backups 5 | It was replicating much more application level table of chunks 6 | I had this abstraction of you know chunks and chunk identifiers 7 | And that's what it was replicating 8 | It wasn't replicating sort of everything else 9 | wasn't going to the expense of 10 | replicating every single other thing in that machines 11 | We're doing okay as long as 12 | they had the same sort of application visible set of chunks 13 | So most replication schemes out there go the GFS route 14 | In fact almost everything except pretty much this paper 15 | and a few handful of similar systems 16 | almost everything uses application 17 | at some level application level of replication 18 | Because it can be much more efficient 19 | Because we don't have to go to the 20 | we don't have to go to the trouble of for example making sure 21 | that interrupts occur at exactly the same point 22 | in the execution of the primary and backup 23 | GFS does not sweat that at all 24 | But this paper has to do 25 | Because it replicates at such a low level 26 | So most people build efficient systems 27 | with applications specific replication 28 | The consequence of that though is that 29 | the replication has to be built into the 30 | right into the application right 31 | If you're getting a feed of application level operations 32 | for example you really need to have the application participate in that 33 | because some generic replication thing like today's paper 34 | doesn't really can't understand 35 | the semantics of what needs to be replicated 36 | So anyway so most teams are application specific 37 | like GFS and every other paper we're going to read on this topic 38 | Today's paper is unique in that 39 | it replicates at the level of the machine 40 | and therefore does not care what software you run on it 41 | Right it replicates the low-level memory and machine registers 42 | You can run any software you like on it 
43 | as long as it runs on that kind of microprocessor 44 | that's being represented 45 | This replication scheme applies to the software can be anything 46 | And you know the downside is that it's not that efficient necessarily 47 | The upside is that you can take any existing piece of software 48 | Maybe you don't even have source code for it or understand how it works 49 | And you know do within some limits 50 | you can just run it under this under VMware's replication scheme 51 | And it'll just work which is sort of 52 | magic fault-tolerance wand for arbitrary software 53 | All right now let me talk about how this is VMware FT 54 | First of all VMware is a virtual machine company 55 | They're what their business is 56 | a lot of their business is selling virtual machine technology 57 | And what virtual machines refer to is the idea of 58 | you know you buy a single computer 59 | And instead of booting an operating system like Linux on the hardware 60 | you boot we'll call a virtual machine monitor 61 | or hypervisor on the hardware 62 | And the hypervisor's job is actually to 63 | simulate multiple multiple computers 64 | multiple virtual computers on this piece of hardware 65 | So the virtual machine monitor may boot up you know one instance of Linux 66 | may be multiple instances of Linux may be a Windows 67 | machine you can The virtual machine monitor on this one computer 68 | can run a bunch of different operating systems 69 | you know Each of these as is itself some sort of operating system kernel and then applications 70 | So this is the technology they're starting with 71 | And you know the reason for this is that 72 | if you know you need to it just turns out 73 | there's many many reasons why it's very convenient 74 | to kind of interpose this level of indirection 75 | between the hardware and the operating systems 76 | And means that we can buy one computer 77 | and run lots of different operating systems on it 78 | we can have each If we run lots and lots of little services 79 | instead of having to have lots and lots of computers one per service 80 | you can just buy one computer 81 | and run each service in the operating system 82 | that it needs I'm using these virtual machines 83 | So this was their starting point 84 | They already had this stuff 85 | and a lot of sophisticated things built around it 86 | at the start of designing VMware FT 87 | So this is just virtual machines um 88 | What the paper's doing is that it's gonna set up one machine 89 | or they did requires two physical machines 90 | Because there's no point in 91 | running the primary and backup software 92 | in different virtual machines on the same physical machine 93 | Because we're trying to guard against hardware failures 94 | So you're gonna to at least you know you 95 | have two machines running their virtual machine monitors 96 | And the primary is going to run on one 97 | the backup is on the other 98 | So on one of these machines we have a guest 99 | you know we only It might be running a lot of virtual machines 100 | We only care about one of them 101 | It's gonna be running some guest operating system 102 | and some sort of server application 103 | Maybe a database server, MapReduce master, or something 104 | So I'll call this the primary 105 | And there'll be a second machine 106 | that you know runs the same virtual machine monitor 107 | and an identical virtual machine holding the backup 108 | So we have the same whatever the operating system 109 | is exactly the same 110 | And the virtual 
machine is you know giving 111 | these guest operating systems the primary 112 | and backup a each range of memory 113 | and this memory images will be identical 114 | or the goal is to make them identical 115 | in the primary in the backup 116 | We have two physical machines 117 | Each one of them running a virtual machine guest 118 | with its own copy of the service we care about 119 | We're assuming that there's a network 120 | connecting these two machines 121 | And in addition on this Local Area Network 122 | in addition on this network there's some set of clients 123 | Really, they don't have to be clients 124 | They're just maybe other computers 125 | that our replicated service needs to talk with 126 | Some of them are clients sending requests 127 | It turns out in this paper there 128 | the replicated service actually doesn't use a local disk 129 | and instead assumes that there's some sort of 130 | disk server that it talks to him 131 | Although it's a little bit hard to realize this from the paper 132 | The scheme actually does not 133 | really treat the server particularly 134 | Especially it's just another external source of packets 135 | and place that the replicated state machine may send packets to 136 | Not very much different from clients 137 | Okay so the basic scheme is that the we assume that 138 | these two replicas, the two virtual machines, primary and backup, are exact replicas 139 | Some client, you know database client who knows who has 140 | Some client of our replicated server 141 | sends a request to the primary 142 | And that really takes the form of a network packet 143 | that's what we're talking about 144 | That generates an interrupt and this interrupt actually goes to 145 | the virtual machine monitor at least in the first instance 146 | The virtual machine monitor sees 147 | here's the input for this replicated service 148 | And so the virtual machine monitor does two things 149 | One is it sort of simulates a network packet arrival interrupt 150 | into the primary guest operating system 151 | to deliver it to the primary copy of the application 152 | And in addition the virtual machine monitor you know knows that 153 | this is an input to a replicated virtual machine 154 | And it's so it sends back out 155 | on the network a copy of that packet 156 | to the backup virtual machine monitor 157 | It also gets it and backup virtual machine monitor knows 158 | ha it is a packet for this particular replicated state machine 159 | And it also fakes a sort of network packet arrival interrupt 160 | at the backup and delivers the packet 161 | So now both the primary and the backup have a copy 162 | This packet they looks at, the same input 163 | you know with a lot of details 164 | are gonna process it in the same way and stay synchronized 165 | Course the service is probably going to reply to the client 166 | On the primary the service will generate a reply packet 167 | and send it on the NIC 168 | that the virtual machine monitor is emulating 169 | And then the virtual machine monitor will we'll 170 | see that output packet on the primary 171 | They'll actually send the reply back out 172 | on the network to the client 173 | Because the backup is running exactly 174 | the same sequence of instructions 175 | It also generates a reply packet back to the client 176 | and sends that reply packet on its emulated NIC 177 | It's the virtual machine monitor 178 | that's emulating that network interface card 179 | And it says aha you know the virtual machine monitor 
says 180 | I know this was the backup 181 | only the primary is allowed to generate output 182 | And the virtual machine monitor drops the reply packet 183 | So both of them see inputs and only the primary generates outputs 184 | As far as terminology goes 185 | the paper calls this stream of input events 186 | and other things, other events we'll talk about from the stream 187 | is called the logging Channel 188 | -------------------------------------------------------------------------------- /lec04/Lec4-3.zh.txt: -------------------------------------------------------------------------------- 1 | 他们的确是有复制的 2 | 但是并没有在主和副本服务之间复制每一个bit的内存 3 | 但是并没有在主和副本服务之间复制每一个bit的内存 4 | 但是并没有在主和副本服务之间复制每一个bit的内存 5 | 而是复制偏应用程序级别的内存块表 6 | 我对块和块标识符进行了这种抽象 7 | 这就是需要复制的东西 8 | 并不需要复制任何其他的东西 9 | 也没有在该机器上复制任何其他的东西的代价 10 | 也没有在该机器上复制任何其他的东西的代价 11 | 这样做是可以的 12 | 只要主服务和副本服务具有相同应用程序可见性的内存块集 13 | 因此大多数复制方案都采用与GFS相似的方案 14 | 实际上,几乎除了这篇论文以及一些类似的系统之外的所有方案 15 | 实际上,几乎除了这篇论文以及一些类似的系统之外的所有方案 16 | 他们几乎都使用了应用级别的复制 17 | 他们几乎都使用了应用级别的复制 18 | 因为这样可以更有效率 19 | 因为我们不必费力的去确保 20 | 因为我们不必费力的去确保 21 | 在主和副本服务运行时 22 | 中断发生在完全相同的时间点 23 | GFS完全不用担心这点 24 | 但是本文必须要确保这点 25 | 因为它在很低的级别进行复制 26 | 因此大多数人使用特定于应用程序的复制来构建高效的系统 27 | 因此大多数人使用特定于应用程序的复制来构建高效的系统 28 | 这样做的后果是 29 | 必须将复制内置到应用程序权限中 30 | 必须将复制内置到应用程序权限中 31 | 例如,如果你需要获取应用程序级别操作的提要 32 | 你就需要让应用程序参与其中 33 | 因为有些通用的复制方案,例如今天的论文 34 | 并不能理解 35 | 哪些东西需要被复制的语义 36 | 因此大多数方案是针对于特定应用的 37 | 例如GFS以及我们将要在这个主题下阅读的所有其他论文 38 | 今天的论文的不同之处在于 39 | 它实在机器级别进行复制的 40 | 因此它不关系在其上运行了什么软件 41 | 它复制低级别的内存以及寄存器 42 | 你可以其上运行任何软件 43 | 只要它可以在这种所表示的微处理器上运行 44 | 只要它可以在这种所表示的微处理器上运行 45 | 这种复制方案可以适应任何软件 46 | 缺点是效率不一定高 47 | 优点是你可以使用任何现有的软件 48 | 甚至你没有源代码或者不知道它是如何工作的 49 | 在一定的限制下 50 | 你就可以在VMware的复制方案下运行它 51 | 它可以正常工作 52 | 且对于任意软件都可以进行容错 53 | 现在我们来讨论VMware FT 54 | 首先,VMware是一家虚拟机公司 55 | 他们的很多业务都是销售虚拟机技术 56 | 他们的很多业务都是销售虚拟机技术 57 | 虚拟机指的是 58 | 你买一台电脑 59 | 在硬件上不是启动像Linux这样的操作系统 60 | 而是启动虚拟机监视器 61 | 而是启动虚拟机监视器 62 | 它的工作实际上是 63 | 在此硬件上模拟多台虚拟的电脑 64 | 在此硬件上模拟多台虚拟的电脑 65 | 因此虚拟机监视器可能会启动一个Linux实例 66 | 多个Linux实例,或者一个Windows实例 67 | 这台计算机上的虚拟机监视器可以运行许多不同的操作系统 68 | 这台计算机上的虚拟机监视器可以运行许多不同的操作系统 69 | 它们每个包含某种操作系统内核以及应用程序 70 | 所以这是他们开始使用的技术 71 | 原因是事实证明 72 | 原因是事实证明 73 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 74 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 75 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 76 | 这意味着我们可以购买一台计算机 77 | 并在其上运行许多不同的操作系统 78 | 如果我们运行大量的小型服务 79 | 而不是使用大量的每台运行一个服务的计算机 80 | 你可以只购买一台计算机 81 | 在基于虚拟机上的操作系统中运行每个服务 82 | 在基于虚拟机上的操作系统中运行每个服务 83 | 这就是他们的出发点 84 | 在最开始设计VMware FT时 85 | 他们已经构建了这项功能和许多其他复杂的东西 86 | 他们已经构建了这项功能和许多其他复杂的东西 87 | 所以这就是虚拟机 88 | 论文要做的是要搭建一台机器 89 | 或者说他们需要两台物理机 90 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 91 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 92 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 93 | 因为我们正在努力应对硬件故障 94 | 因此,你有两台计算机分别运行其虚拟机监视器 95 | 因此,你有两台计算机分别运行其虚拟机监视器 96 | 而主虚拟机将在一台计算机上运行 97 | 而副本虚拟机将在另一台上运行 98 | 在其中一台计算机上有一个guest操作系统... 
99 | 它可能正在运行许多虚拟机 100 | 我们只在乎其中的一个 101 | 它会运行多个guest操作系统 102 | 以及服务应用程序 103 | 也许是数据库服务,MapReduce主数据库或其他东西 104 | 我们称这个为主虚拟机 105 | 这里有第二台计算机 106 | 其运行相同的虚拟机监视器 107 | 也有运行副本服务的相同的虚拟机 108 | 因此,无论是何种操作系统,我们都具有完全相同的东西 109 | 因此,无论是何种操作系统,我们都具有完全相同的东西 110 | 虚拟机为这些guest操作系统、主和副本服务器 111 | 虚拟机为这些guest操作系统、主和副本服务器 112 | 提供一定范围的内存空间 113 | 并且这两个内存镜像是完全相同的 114 | 或其目标是使它们在主和副本虚拟机中完全相同 115 | 或其目标是使它们在主和副本虚拟机中完全相同 116 | 我们有两台物理计算机 117 | 每台都在运行guest虚拟机 118 | 该虚拟机上带有我们关心的服务的副本 119 | 我们假设有一个网络连接了这两台机器 120 | 我们假设有一个网络连接了这两台机器 121 | 此外,在此局域网上还有一些客户端 122 | 此外,在此局域网上还有一些客户端 123 | 事实上,它们不一定是是客户端 124 | 它们可能是其他计算机 125 | 复制服务需要与之通信 126 | 其中一些是来发送请求的客户端 127 | 这篇论文中的复制服务实际上并不使用本地磁盘 128 | 这篇论文中的复制服务实际上并不使用本地磁盘 129 | 而是假设与某种磁盘服务器进行通信 130 | 尽管从本篇论文中很难意识到这一点 131 | 尽管从本篇论文中很难意识到这一点 132 | 该方案实际上并没有特殊对待这种服务器 133 | 该方案实际上并没有特殊对待这种服务器 134 | 它只是数据包的另一个外部来源 135 | 只是复制状态机可能会将数据包发送到的地方 136 | 这与其他客户端没有太大不同 137 | 因此,基本方案是,我们假设 138 | 这两个副本、两个虚拟机、或者说主和副本虚拟机,都是精确的副本 139 | 某个客户端,例如数据库客户端 140 | 复制服务器的某个客户端 141 | 向主虚拟机发送请求 142 | 而这实际上是以网络数据包的形式发送的 143 | 就是我们刚刚讨论的 144 | 它生成一个中断 145 | 该中断进入第一个实例的虚拟机监视器 146 | 虚拟机监视器发现 147 | 复制服务的输入到来了 148 | 因此,虚拟机监视器会做两件事 149 | 第一件事,它模拟网络数据包到达中断,传递给主guest操作系统 150 | 第一件事,它模拟网络数据包到达中断,传递给主guest操作系统 151 | 以此将其传递给应用程序的主副本 152 | 第二件事,虚拟机监视器知道这是复制虚拟机的输入 153 | 第二件事,虚拟机监视器知道这是复制虚拟机的输入 154 | 因此,它通过网络将数据包副本 155 | 因此,它通过网络将数据包副本 156 | 发送给副本虚拟机监视器 157 | 所以它也得到了数据包,副本虚拟机监视器 158 | 知道它是此复制状态机的数据包 159 | 它在副本虚拟机中也会构造网络数据包到达中断,并传送数据包 160 | 它在副本虚拟机中也会构造网络数据包到达中断,并传送数据包 161 | 所以现在主和副本虚拟机都有了数据包的一份副本 162 | 它们看到的这个数据包、这个相同的输入 163 | 通过考虑大量的细节 164 | 会以相同的方式处理并保持同步 165 | 当然,服务可能会回复客户 166 | 在主虚拟机上,服务将生成一个回复数据包 167 | 将其发送到虚拟机监视器所模拟的NIC上 168 | 将其发送到虚拟机监视器所模拟的NIC上 169 | 然后,虚拟机监视器将会 170 | 在主计算机上看到该输出数据包 171 | 它们会将回复通过网络发送回客户端 172 | 它们会将回复通过网络发送回客户端 173 | 由于副本在运行 174 | 完全相同的指令序列 175 | 它也会生成一个回复数据包返回给客户端 176 | 在其模拟的NIC上发送该回复数据包 177 | 虚拟机监视器模拟了该网卡 178 | 虚拟机监视器模拟了该网卡 179 | 虚拟机监视器知道这是副本虚拟机 180 | 虚拟机监视器知道这是副本虚拟机 181 | 而它只允许主虚拟机生成输出 182 | 因此,虚拟机监视器会丢弃回复数据包 183 | 所以他们两个都看到了输入,而只有主虚拟机产生了输出 184 | 就术语而言,这篇论文将这种输入事件流 185 | 就术语而言,这篇论文将这种输入事件流 186 | 以及之后要讨论的其他事件流 187 | 称为日志记录通道 188 | -------------------------------------------------------------------------------- /lec04/Lec4-4.en.txt: -------------------------------------------------------------------------------- 1 | logging Channel. 
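(To make the mechanism above concrete, here is a minimal Go sketch of the two rules the virtual machine monitors enforce: every input is delivered to both guests, but only the primary's output leaves the machine. All names are invented for illustration; the real logic lives inside VMware's hypervisor, not in guest-level code.)

```go
// outputrule.go: a sketch of how a VMM might treat packets for a replicated
// virtual machine. Inputs are copied to the backup over the logging channel
// and delivered to both guests; only the primary's replies are sent on the wire.
package main

import "fmt"

type Role int

const (
	Primary Role = iota
	Backup
)

// VMM stands in for the virtual machine monitor's packet handling.
type VMM struct {
	role  Role
	logCh chan<- []byte // primary -> backup copy of every input (the logging channel)
}

// handleInput runs when a client packet arrives at a VMM.
func (v *VMM) handleInput(pkt []byte) {
	if v.role == Primary {
		v.logCh <- pkt // ship a copy of the input to the backup's VMM
	}
	deliverFakeInterrupt(pkt) // both guests see the same input packet
}

// handleOutput runs when the guest sends a reply on its emulated NIC.
func (v *VMM) handleOutput(pkt []byte) {
	if v.role == Primary {
		sendOnWire(pkt) // only the primary's reply reaches the client
	}
	// on the backup the identical reply is silently dropped
}

func deliverFakeInterrupt(pkt []byte) { /* inject the packet + arrival interrupt into the guest */ }

func sendOnWire(pkt []byte) { fmt.Printf("sent %d bytes to the client\n", len(pkt)) }

func main() {
	logCh := make(chan []byte, 1)
	primary := &VMM{role: Primary, logCh: logCh}
	backup := &VMM{role: Backup}

	req := []byte("increment request")
	primary.handleInput(req)    // primary's VMM gets the client packet...
	backup.handleInput(<-logCh) // ...and the backup's VMM replays the copy

	reply := []byte("new value = 11")
	primary.handleOutput(reply) // sent to the client
	backup.handleOutput(reply)  // discarded
}
```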
it all goes over the 2 | same network presumably but these events 3 | the primary send to the backup are called 4 | log events on the log Channel 5 | where the fault tolerance comes in is 6 | that those, the primary crashes, what the 7 | backup is going to see is that it stops 8 | getting stuff ,on the ,stops getting log 9 | entries ,a log entry ,stops getting log 10 | entries on the logging channel and we 11 | know it it turns out that the backup can 12 | expect to get many per second because 13 | one of the things that generates log 14 | entries is periodic timer interrupts in 15 | the in the primary each one of which 16 | turns out every interrupt generates a 17 | log entries into the backup these timer 18 | interrupts are going to happen like 100 19 | times a second so the backups can 20 | certainly expect to see 21 | a lot of chitchat on the logging Channel 22 | if the primaries up .if the primary 23 | crashes then the virtual machine 24 | monitored over here will say gosh you 25 | know I haven't received anything on the 26 | logging channel for like a second or 27 | however long the primary must be dead or 28 | or something and in that case when the 29 | backup stop seeing log entries from the 30 | primary the paper the way the paper 31 | phrases it is that the backup goes alive 32 | and what that means is that it stops 33 | waiting for these input events on the 34 | logging Channel from the primary and 35 | instead this virtual machine monitor 36 | just lets this backup execute freely 37 | without waiting for without being driven 38 | by input events from the primary ,the vmm 39 | does something to the network to cause 40 | future client requests to go to the 41 | backup instead of the primary and the 42 | VMM here stops discarding the backup 43 | personnel it's the primary not the 44 | backup stops discarding output from this 45 | virtual machine so now this or machine 46 | directly gets the inputs and there's a 47 | lot of produce output and now our backup 48 | is taken over and similarly you know 49 | that this is less interesting but has to 50 | work correctly 51 | if the backup fails a similar primary 52 | has to use a similar process to abandon 53 | the backup stop sending it events and 54 | just sort of act much more like a single 55 | non replicated server so either one of 56 | them can go live if the other one 57 | appears to be dead ,stops, you know stops 58 | generating network traffic. 
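(A minimal Go sketch of the go-live decision just described, assuming a made-up LogEntry type and timeout; in the real system this check happens inside the virtual machine monitor.)

```go
// golive.go: a sketch of how a backup might decide to "go live" once the
// logging channel falls silent for too long. Names are hypothetical.
package main

import (
	"fmt"
	"time"
)

// LogEntry stands in for one event on the logging channel: an input packet,
// a timer interrupt, or the result of a non-deterministic instruction.
type LogEntry struct {
	Instruction uint64 // instruction number at which the backup must deliver it
	Data        []byte
}

// runBackup replays log entries until the channel has been silent for too long.
func runBackup(logCh <-chan LogEntry, timeout time.Duration) {
	for {
		select {
		case e := <-logCh:
			replay(e) // deliver the event at the same instruction number the primary saw it
		case <-time.After(timeout):
			// Timer interrupts alone should arrive ~100 times per second,
			// so a long silence means the primary is probably dead.
			fmt.Println("logging channel silent: backup goes live")
			goLive()
			return
		}
	}
}

func replay(e LogEntry) { /* inject the event into the backup guest */ }

func goLive() {
	// From here on the backup executes freely, takes over the service's
	// network identity, and its output packets are no longer discarded.
}

func main() {
	logCh := make(chan LogEntry)
	go runBackup(logCh, time.Second)
	logCh <- LogEntry{Instruction: 100} // a couple of events from the primary...
	logCh <- LogEntry{Instruction: 250}
	time.Sleep(2 * time.Second) // ...then silence, as if the primary crashed
}
```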
59 | magic, now it depends ,you know depends on 60 | what the networking technology is I 61 | think with the paper one possibility is 62 | that this is sitting on Ethernet every 63 | physical computer on the Internet or 64 | really every NIC has a 48 bit unique ID 65 | I'm making this up now, the ,it could be 66 | that in fact instead of each physical 67 | computer having a unique ID each virtual 68 | machine does and when the backup takes 69 | over it essentially claims the primary's 70 | Ethernet ID as its own and it starts 71 | saying you know I'm the owner of that ID 72 | and then other people on the ethernet 73 | will start sending us packets that's my 74 | interpretation ,the designers believed 75 | they had identified all such sources and 76 | for each one of them the primary does 77 | whatever it is you know executes the 78 | random number generator instruction or 79 | takes an interrupt at some time the 80 | backup does not and the back of virtual 81 | machine monitor sort of detects any such 82 | instruction and and intercepts that and 83 | doesn't do it and he said the backup 84 | waits for an event on the logging 85 | Channel saying this instruction number 86 | you know the random number was whatever 87 | it was on the primary 88 | At which? 89 | yes yes 90 | yeah the paper hints that they got Intel 91 | to add features to the microprocessor to 92 | support exactly this but they don't say 93 | what it was ,okay 94 | okay so on that topic ,the ,so far that 95 | you know the story is sort of assumed 96 | that as long as the backup to sees the 97 | package from the clients it'll execute 98 | in identically to the primary and that's 99 | actually glossing over some huge and 100 | important details so one problem is that 101 | as a couple of people have mentioned 102 | there are some things that are 103 | non-deterministic now it's not the case 104 | that every single thing that happens in 105 | the computer is a deterministic function 106 | of the contents of the memory of the 107 | computer it is for a sort of straight 108 | line code execution often but certainly 109 | not always so worried about is things 110 | that may happen that are not a strict 111 | function of the current state that is 112 | that might be different if we're not 113 | careful on the primary and backup so 114 | these are sort of non-deterministic 115 | events that may happen so the designers 116 | had to sit down and like figure out what 117 | they all work and here are the ones 118 | here's the kind of stuff they talked 119 | about so one is inputs from external 120 | sources like clients which arrive just 121 | whenever they arrive right they're not 122 | predictable there are no sense in which 123 | the time at which a client request 124 | arrives or its content is a 125 | deterministic function of the services 126 | state because it's not ,so these actually , 127 | this system is really dedicated to a 128 | world in which services only talk over 129 | the network and so the only really 130 | basically the only form of input or 131 | output in this system is supported by 132 | this system seems to be network packets 133 | coming and going. 
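(For the special non-deterministic instructions mentioned above, such as time-of-day or random-number instructions, the primary executes the instruction and logs the result, while the backup's monitor intercepts the instruction and substitutes the logged value at the same instruction number. A rough Go sketch with invented names:)

```go
// nondet.go: a sketch of keeping a non-deterministic instruction identical on
// primary and backup. The primary really executes it and logs the result; the
// backup never executes it and uses the logged value instead.
package main

import (
	"fmt"
	"time"
)

// NonDetResult is what the primary logs for one non-deterministic instruction.
type NonDetResult struct {
	Instruction uint64 // instruction number at which the guest executed it
	Value       int64  // whatever the primary actually observed
}

// On the primary: really execute the instruction (here, read the clock) and
// send the result down the logging channel.
func primaryTimeOfDay(instr uint64, logCh chan<- NonDetResult) int64 {
	v := time.Now().UnixNano()
	logCh <- NonDetResult{Instruction: instr, Value: v}
	return v
}

// On the backup: the monitor intercepts the instruction, never executes it,
// and substitutes the value the primary logged for that instruction number.
func backupTimeOfDay(instr uint64, logCh <-chan NonDetResult) int64 {
	r := <-logCh
	if r.Instruction != instr {
		panic("primary and backup executions have diverged")
	}
	return r.Value
}

func main() {
	logCh := make(chan NonDetResult, 1)
	p := primaryTimeOfDay(42, logCh)
	b := backupTimeOfDay(42, logCh)
	fmt.Println("primary observed", p, "- backup uses", b, "- identical:", p == b)
}
```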
so we didn't put 134 | arrives at what that really means it's a 135 | packet 136 | arrives and what a packet really 137 | consists of for us is the data in the 138 | packet plus the interrupt 139 | that's signaled that the packet had 140 | arrived so that's quite important, so 141 | when a packet arrives 142 | I'm ordinarily the NIC DMAs the packet 143 | contents into memory and then raises an 144 | interrupt which the operating system 145 | feels and the interrupt happens at some 146 | point in the instruction stream and so 147 | both of those have to look identical on 148 | the primary and backup or else we're 149 | gonna have they're also executions gonna 150 | diverge and so you know the real issue 151 | is when the interrupt occurs exactly at 152 | which instruction the interrupts happen 153 | to occur and better be the same on the 154 | primary in the backup otherwise their 155 | execution is different and their states 156 | are gonna diverge and so we care about 157 | the content of the packet and the timing 158 | of the interrupt and then as a couple of 159 | people have mentioned there's a few 160 | instructions that that behave 161 | differently on different computers or 162 | differently depending on something like 163 | there's maybe a random number generator 164 | instruction there's I get time-of-day 165 | instructions that will yield different 166 | answers have called at different times 167 | and unique ID instructions another huge 168 | source of non determinism which the 169 | paper basically rules out is multi-core 170 | parallelism this is a uni-process only 171 | system there's no multi-core in this 172 | world the reason for this is that if it 173 | allowed multi-core then then the service 174 | would be running on multiple cores and 175 | the instructions of the service the rest 176 | of you know the different cores are 177 | interleaved in some way which is not 178 | predictable and so really if we run the 179 | same code on the on the backup in the 180 | server if it's parallel code running on 181 | a multi-core the tube interleave the 182 | instructions in the two cores in 183 | different ways the hardware will and 184 | that can just cause 185 | different results because you know 186 | supposing the code and the two cores you 187 | know they both asked for a lock on some 188 | data well on the master you know 189 | core one may get the lock before core two 190 | on the slave just because of a tiny 191 | timing difference core two may got the 192 | lock first and the you know execution 193 | results are totally different likely to 194 | be totally different if different 195 | threads get the lock 196 | so multi-core is the grim source among 197 | non-determinisms just totally 198 | outlawed in this papers world and indeed 199 | like as far as I can tell the techniques 200 | are not really applicable. 
the service 201 | can't use multi-core parallel -------------------------------------------------------------------------------- /lec04/Lec4-4.zh.txt: -------------------------------------------------------------------------------- 1 | 它们都在同一条网络上传输 2 | 推测来看,这些 3 | 被主节点发往备份节点的事件 4 | 被称为在日志通道上的日志记录 5 | 容错出现的场景是 6 | 主节点挂掉了 7 | 备份节点觉察到它接受不到日志了 8 | 备份节点停止接受日志记录了 9 | 接收不到日志通道上的日志数据了 10 | 接收不到日志通道上的日志数据 11 | 我们知道备份节点 12 | 每秒可以接受许多事件, 13 | 因为其中的一些事件就是 14 | 由周期性的时钟中断触发产生的, 15 | 在主节点上,每个中断产生一个日志记录, 16 | 在主节点上,每个中断产生一个日志记录, 17 | 在主节点上,每个中断产生一个日志记录, 18 | 这些时钟中断可以以每秒100次的频率触发, 19 | 所以备份节点 20 | 理应接受到 21 | 日志通道上的许多通信消息 22 | 如果主节点还存活着的话 23 | 如果主节点挂掉的话,VMM监视器 24 | 会说:天哪, 25 | 我已经有一秒或者更长时间没有收到日志同道上的事件了 26 | 我已经有一秒或者更长时间没有收到日志同道上的事件了 27 | 那主节点应该挂掉了, 28 | 或者发生了其他事情,当发生这些事情时候, 29 | 备份节点接收不到主节点的日志记录时候, 30 | 论文中描述说,备份节点会启动 31 | 论文中描述说,备份节点会启动 32 | 它将停止接收来自主节点日志通道上的输入事件 33 | 它将停止接收来自主节点日志通道上的输入事件 34 | 它将停止接收来自主节点日志通道上的输入事件 35 | 相反VMM 36 | 会让备份节点开始自由执行 37 | 而不需要通过主节点的输入事件来驱动执行 38 | 而不需要通过主节点的输入事件来驱动执行 39 | 同时VMM会变更网络层配置信息 40 | 让未来的客户端请求路由到备份节点上 41 | 而不是到原来的主节点上 42 | VMM此时停止之前抛弃备份节点返回数据的策略 43 | 此时原来旧的主节点 44 | 开始抛弃输出数据 45 | 所以这台机器开始 46 | 接受输入事件, 47 | 并且输出输出事件 48 | 这样我们的备份节点完成接管流程 49 | 类似的这个接管流程不是非常有趣 50 | 但是需要能正常的work 51 | 如果备份节点挂掉了,主节点 52 | 需要用同样的一套机制来放弃备份节点 53 | 停止向备份节点发送日志记录 54 | 此时主节点更像是一个单节点的 55 | 没有复制功能的服务, 56 | 所以它们中的一个会变活 57 | 当另一个挂掉的时候,或者停止发送网络流量 58 | 当另一个挂掉的时候,或者停止发送网络流量 59 | 神奇,这个得看具体情况 60 | 看具体的网络拓扑结构是怎样的 61 | 我认为论文中的一个可能是 62 | 在Ethernet层基础上 63 | 每一个物理机或者NIC卡 64 | 有一个48bit的唯一ID 65 | 也有可能是另一种情况 66 | 并不是每一个物理机 67 | 有一个唯一Id,而是每一个虚拟机有一个唯一Id 68 | 当备份节点接管的时候 69 | 它会申明 70 | 主节点的Ethernet Id变成它自己的Id 71 | 然后对外声明它是那个唯一Id的所有者 72 | 这样网络上的其他节点 73 | 就会向我们发送数据包,这是我这边的理解 74 | 这套机制的设计者 75 | 相信他们确认了所有的事件源 76 | 针对每一个这样的事件源,主节点 77 | 都会执行,如你所知,不管是 78 | 一个随机数生成指令,或者 79 | 在某个时间点触发一个中断事件, 80 | 但是备份节点并不执行这些操作,VMM的备份节点 81 | 会检测到这些指令 82 | 会截获这些指令 83 | 并且不执行它,备份节点 84 | 会等待日志通道上的一个对应的日志事件 85 | 告诉它具体的指令结果是啥 86 | 你知道的,那就是在主节点上产生的那个随机数 87 | 你知道的,那就是在主节点上产生的那个随机数 88 | 在哪个节点? 
89 | 是的是的 90 | 论文暗示了他们让Intel 91 | 在微处理器上加入了一些特性 92 | 来支持这个功能,但是他们没有说 93 | 到底是什么 94 | 关于这个主题,到现在为止 95 | 你们知道的是 96 | 只要备份节点能够收到 97 | 从client发送过来的数据包 98 | 它就会和主节点保持一致的执行它 99 | 但实际上我们忽略了一些巨大和重要的细节 100 | 其中的一个问题就是 101 | 许多人之前提到的 102 | 有一些操作是非确定性的 103 | 事情并不是这样的 104 | 发生在机器里的每一个操作并不一定是内存内容 105 | 的确定性函数映射关系 106 | 的确定性函数映射关系 107 | 对一些常见的直接代码执行操作是确定性的 108 | 对一些常见的直接代码执行操作是确定性的 109 | 但是并不总是这样的,我们担心的是 110 | 某些操作并不是当前状态的严格函数关系 111 | 某些操作并不是当前状态的严格函数关系 112 | 即结果可能不一样,如果我们没有细心的考虑 113 | 主节点和备份节点之间的关系 114 | 所以这是一些非确定性的事件 115 | 它们的存在使设计者们不得不 116 | 坐下来好好思考弄清楚 117 | 它们要如何工作才行 118 | 这里是他们要讨论的内容 119 | 其中的一个是外部源的输入事件 120 | 这些事件随便什么时候都可以到来 121 | 这些事件随便什么时候都可以到来 122 | 它们是不可预测的,没有道理可以预先知道这些请求什么时候到来 123 | 没有道理可以预先知道这些请求什么时候到来 124 | 或者它的内容是服务状态的确定性映射关系,因为它并不是 125 | 或者它的内容是服务状态的确定性映射关系,因为它并不是 126 | 所以呢 127 | 所以这个系统真正面对的是一个服务之间通过网络进行交互的场景 128 | 所以这个系统真正面对的是一个服务之间通过网络进行交互的场景 129 | 所以这个系统仅有的的输入和输出方式 130 | 所以这个系统仅有的的输入和输出方式 131 | 是通过网络包的输入和输出形式来支撑的。 132 | 是通过网络包的输入和输出形式来支撑的。 133 | 所以我们没有把到达操作记录下来,它真正的意思是一个数据包到达了 134 | 它真正的意思是一个数据包到达了 135 | 它真正的意思是一个数据包到达了 136 | 同时一个数据包是由 137 | 数据包中的数据 138 | 加上用来通知数据到达的中断事件组成的 139 | 加上用来通知数据到达的中断事件组成的 140 | 这一点是非常重要的 141 | 所以当一个数据包到达的时候 142 | NIC卡通过DMA机制将包内容拷贝到内存 143 | 然后触发中断操作 144 | 操作系统收到中断事件 145 | 同时这个中断会在指令流的某个时间点执行 146 | 同时这个中断会在指令流的某个时间点执行 147 | 所有这些操作在主节点和备份节点必需看起来是一样的 148 | 所有这些操作在主节点和备份节点必需看起来是一样的 149 | 否则我们将会看到有些操作将会引起分叉 150 | 所以你知道真正的问题是 151 | 当中断真正的在某一条指令执行触发的时候, 152 | 当中断真正的在某一条指令执行触发的时候, 153 | 这个需要在主节点和备份节点是一致的 154 | 否则它们的执行可能出现不一致,他们的状态也将出现分叉 155 | 否则它们的执行可能出现不一致,他们的状态也将出现分叉 156 | 所以我们需要关注数据包的内容,中断的时序 157 | 所以我们需要关注数据包的内容,中断的时序 158 | 中断的时序以及 159 | 其他人提到的 160 | 有一些指令 161 | 在不同的机器上有不用的执行结果 162 | 或者依赖其他情况 163 | 比如一个随机数生成器 164 | 或者获取时间的指令 165 | 在不同时间执行会获取到不同的结果 166 | 在不同时间执行会获取到不同的结果 167 | 或者是唯一Id生成指令 168 | 另一个非确定性的主要来源 169 | 但是在论文中被显式剔除掉的 170 | 多核并行的场景,论文假定了一个单核处理, 171 | 并不存在多核情况的世界 172 | 这样做的原因是 173 | 如果允许存在多核处理的话, 174 | 服务将会运行在多个核上 175 | 这样的话服务的指令 176 | 如我们所知晓的,不同的核将交叉执行指令 177 | 不同的核将以某种方式交叉执行指令 178 | 但是这个次序是不可预知的,所以如果我们 179 | 在备份节点执行相同的指令 180 | 这些在多核上执行的并行指令 181 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 182 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 183 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 184 | 那样会导致出现不同的计算结果, 185 | 因为你知道 186 | 设想这些代码执行在两个核上 187 | 它们都在获取某些数据的锁 188 | 在主节点上 189 | 核芯1有可能比核芯2先获得了锁 190 | 但是在备份节点上,可能仅仅是由于一个很小 191 | 的时序差异,核芯2可能先获取到锁 192 | 这样的话执行结果 193 | 是完全不一样的, 194 | 执行结果可能是完全不一样的,如果是由 195 | 不同的线程获取了锁 196 | 所以多核场景是很糟糕的一种 197 | 导致非确定性的源头 198 | 在这篇论文中仅仅是禁止出现这种场景, 199 | 就我所知道的而言,论文中的这些技术 200 | 在实际中不是很实用。 201 | 服务不能使用多核并行方式 -------------------------------------------------------------------------------- /lec04/Lec4-5.en.txt: -------------------------------------------------------------------------------- 1 | so we can't let that happen 2 | because if the primary happens to be playing that trick 3 | it's gonna see 4 | you know if we allowed the network interface card to directly DMA incoming packets into the memory of the primary 5 | the primary we don't have any control over the exact timing of 6 | and so we're not going to know sort of at what times the primary did or didn't observe data from the packet arriving 7 | and so what that means is that in fact the NIC copies incoming packets into private memory of the virtual machine monitor 8 | and then the network interface card interrupts the virtual machine monitor and says oh a packet has arrived 9 | at that point the virtual machine monitor will suspend the primary 10 | and remember what instruction number had suspended at 11 | copy the entire packet into the primaries memory while the primary suspended and not looking at this copy 12 | and then 
emulate a network interface card interrupt into the primary 13 | and then send the packet and the instruction number to the backup 14 | the backup's virtual machine monitor will also suspend the backup, 15 | you know, the virtual machine monitor will suspend the backup at that instruction number, 16 | copy the entire packet in, and again the backup is guaranteed not to be watching the data arrive, 17 | and then fake an interrupt at the same instruction number as on the primary 18 | and this is the bounce buffer mechanism explained in the paper 19 | okay, yeah, the question is whether the only instructions that result in logging channel traffic are weird instructions which are rare 20 | no, it's instructions that might yield a different result if executed on the primary and backup 21 | like instructions to get the current time of day or current processor number 22 | or ask how many instructions have been executed 23 | and those actually turn out to be relatively rare 24 | there's also one on some machines to get random numbers, 25 | to ask for a hardware-generated random number for cryptography or something 26 | but those are not everyday instructions 27 | most instructions, like add instructions, are gonna get the same result on primary and backup 28 | exactly right, each network packet is just packaged up and forwarded as it is, as a network packet 29 | and is interpreted by the tcp/ip stack on both, you know 30 | so I'm expecting 99.99% of the logging channel traffic to be incoming packets 31 | and only a tiny fraction to be results from special non-deterministic instructions 32 | and so we can kind of guess what the traffic load is likely to be for a server that serves clients 33 | basically it's a copy of every client packet 34 | and then we'll sort of know how fast the logging channel has to be 35 | it's worth talking a little bit about how output works 36 | and in this system, really, what output basically means is only sending packets 37 | clients send requests in as network packets, the response goes back out as network packets 38 | and there's really no other form of output 39 | as I mentioned, you know, both primary and backup compute the output packet they want to send 40 | and sort of ask the emulated NIC to send the packet; it's really sent on the primary 41 | and the output packet is simply discarded on the backup 42 | okay, but it turns out to be a little more complicated than that 43 | so supposing what we're running is some sort of simple database server 44 | and the client operation that our database server supports is increment 45 | and the idea is the client sends an increment request, the database server increments the value and sends back the new value 46 | so maybe on the primary, well, let's say everything's fine so far 47 | and the primary and backup both have value 10 in memory, and that's the current value of the counter 48 | and some client on the local area network sends, you know, an increment request 49 | to the primary, that packet is, you know, delivered to the primary, 50 | it's executed by the primary server software, and the primary 51 | says oh, you know, current value's 10, I'm gonna change it to 11 52 | and sends a, you know, response packet back to the client saying 11 53 | the same request is also supposed to be sent to the backup and processed here; it's going to change this 10 to 11 also 54 | generate a reply, and we'll throw that away; that's what's supposed to happen with the output
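To make the output path concrete, here is a small illustrative Go sketch (all names invented, not an actual VMM API) of the behavior just described: both replicas run the same guest code and both hand an identical reply packet to their emulated NIC, but only the primary's copy ever reaches the real network.

```go
// Package ftnic sketches, purely for illustration, how an emulated NIC might
// treat guest output on the primary versus the backup. Names are invented.
package ftnic

type Role int

const (
	Primary Role = iota
	Backup
)

// EmulatedNIC stands in for the virtual NIC the VMM presents to the guest.
type EmulatedNIC struct {
	role Role
	wire func(pkt []byte) error // actually puts bytes on the physical network
}

// Send is called when the guest asks its virtual NIC to transmit a packet.
// Both replicas compute the same reply and both call Send with identical
// bytes; only the primary's VMM lets the packet out onto the wire.
func (n *EmulatedNIC) Send(pkt []byte) error {
	if n.role == Backup {
		return nil // the backup's output is computed but silently discarded
	}
	return n.wire(pkt)
}
```

(As the discussion that follows shows, the primary is actually not allowed to release the packet quite this eagerly; that refinement is the output rule.)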
55 | however, you also need to ask yourself what happens if there's a failure at an awkward time 56 | and you should always, in this class, ask yourself 57 | what's the most awkward time to have a failure, and what would happen if a failure occurred then. so 58 | suppose the primary does indeed generate the reply here back to the client 59 | but the primary crashes just after sending its reply to the client 60 | and furthermore, and much worse, it turns out that, you know, this is just a network, it doesn't guarantee to deliver packets 61 | let's suppose this log entry on the logging channel got dropped also when the primary died 62 | so now the state of play is: the client received a reply saying 11 63 | but the backup did not get the client request, so its state is still 10 64 | now the backup takes over, because it sees the primary is dead, and 65 | this client, or maybe some other client, sends an increment request to the backup 66 | and now it's really processing these requests, and so the backup, when it gets the next increment request, 67 | you know, it's now going to change its state to 11 68 | and generate a second 11 response 69 | maybe to the same client, maybe to a different client 70 | which, if the clients compare notes, or if it's the same client, is just an outcome that obviously should not have been possible 71 | so, you know, because we have to support unmodified software that does not know 72 | that there's any funny business of replication going on 73 | that means we do not have the opportunity to, 74 | you know, you can imagine 75 | we could change the client to realize 76 | something funny happened with the fault tolerance and do I don't know what 77 | but we don't have that option here 78 | because this whole system really only makes sense if we're running unmodified software 79 | so this is a disaster 80 | we can't let this happen; does 81 | anybody remember from the paper how they prevent this from happening? 82 | the output rule, yeah 83 | so the output rule is their solution to this problem 84 | and the idea is that the primary is not allowed to, 85 | you know, generate any output 86 | and what we're talking about now is this output here 87 | until the backup acknowledges that it has received all log records up to this point 88 | so the real sequence at the primary then, let's now un-crash the primary 89 | and go back to the state starting at 10; the 90 | real sequence now, under the output rule, is that 91 | at the time the input arrives 92 | that's when the virtual machine monitor sends a copy of the input to the backup 93 | so the time at which this log message with the input 94 | is sent is strictly before the primary generates the output, sort of obvious 95 | then after firing this log entry off across the network, now it's heading towards the backup 96 | but it might have been lost, might not 97 | the virtual machine monitor delivers the request to the primary server software and it generates the output 98 | so now, you know, the primary has actually changed the state to 11 99 | and generated an output packet that says eleven 100 | but the virtual machine monitor says oh wait a minute, we're not allowed to generate that output 101 | until all previous log records have been acknowledged by the backup 102 | so you know this is the most recent previous log message 103 | so this output is held by the 
virtual machine monitor 104 | until the this log entry containing the input packet from the client 105 | is delivered to the virtual machine monitor and buffered by the virtual machine monitor 106 | but do not necessarily execute it 107 | it may be just waiting for the backup to get to that point in the instruction stream 108 | and then the virtual machine monitor here will send an ACK packet back saying yes I did get that input 109 | and when the acknowledgment comes back -------------------------------------------------------------------------------- /lec04/Lec4-5.zh.txt: -------------------------------------------------------------------------------- 1 | 所以我们不能让这种情况发生 2 | 如果在主节点用的是这种方式 3 | 它就会观察到 4 | 如果我们允许网卡直接将接收到的包DMA到主节点内存 5 | 我们没办法精确控制网卡将数据复制到内存的时间点 6 | 所以我们也不知道 什么时候主节点观察到网络包中的数据 7 | 这就意味着网卡复制包进虚拟机监视器的虚拟内存 8 | 然后打断虚拟机监视器告诉它有抵达的包 9 | 这时虚拟机监视器就会中断主节点 10 | 记住当前位置的指令号 11 | 在主节点中断的时候 复制整个包到主节点的内存 12 | 然后模拟主节点的网卡中断 13 | 然后发送包和中断位置的指令号到副节点 14 | 副节点也会中断 15 | 副节点也会在同样的指令号位置中断 16 | 复制整个包到副节点的内存 同样副节点中断没有注意包的抵达过程 17 | 然后模拟副节点在同样指令号位置中断 18 | 这就是论文中描述的回弹缓冲机制 19 | 会产生记录通道流量的指令都是不寻常的指令 很少见 20 | 一般是在主节点和副节点执行结果会不同的指令 21 | 例如获取当前时间 获取当前处理器数量 22 | 或者获取已执行的指令数量 23 | 这些相对来说都是很少见的 24 | 或者说像生成随机数 25 | 让硬件生成用于加密的随机数之类的 26 | 这些都不是日常的指令 27 | 大多数指令例如相加的指令会在主副节点有相同的结果 28 | 没错 每个网络包都被直接打包转发没有修改 29 | 由两边的TCP/IP栈进行解析 30 | 所以我认为99.99%的记录通道流量都是来自于接收到的包 31 | 只有少部分是因为会产生不确定结果的指令造成 32 | 所以我们可以猜到在有客户端连接的服务器上 流量大概是什么样子的 33 | 基本就是各个客户端发的包 34 | 所以我们就能估算出记录通道需要有怎样的性能 35 | 输出是如何工作的也值得讲一下 36 | 在这个系统中 输出就是指发送数据包 37 | 客户端发送请求的网络包 服务端响应网络包 38 | 没有其他形式的输出了 39 | 主节点和副节点计算他们要发送的数据包 40 | 在主节点上真正进行发送 41 | 副节点的包则被简单地丢弃掉 42 | 真实的情况会稍微复杂一点 43 | 假设我们在跑的是一个简单的数据库服务器 44 | 服务器支持客户端进行计数器自增(Increment)操作 45 | 客户端发送自增的请求 服务端对计数器进行加操作 返回操作后的值 46 | 假设在主节点一切都正常 47 | 主副节点现在都存有计数器值10在内存中 48 | 本地网络的客户端发送自增请求到主节点 49 | 这个网络包被发到主节点 50 | 然后被主节点的程序执行 51 | 比如说现在是10 我要将他变为11 52 | 然后产生回复告诉客户端结果11 53 | 这个请求也会被发送到副本上 将10改为11 54 | 然后同样产生一个回复 并被丢弃掉 理论上是这样 55 | 然而你也要想一下如果在不恰当的时间服务出现失败会怎么样 56 | 上这门课的期间你要一直这样问自己 57 | 何时出现失败是最坏的情况 这种情况下会发生什么 58 | 假设主节点确实生成了返回给客户端的回复 59 | 但是主节点在发送完回复之后就宕机了 60 | 更糟的是 网络向来不保证数据包能传递到接收方 61 | 再假设记录通道也在主节点宕机的时候失效了 62 | 那么现在的状态就是 客户端收到回复11 63 | 但是副节点没有收到转发来的客户端请求 所以计数器仍然是10 64 | 现在因为察觉到主节点宕机 副节点接管服务 65 | 那这个或者其他客户端发送自增请求到原来的副节点 66 | 当收到请求的时候这个接管工作的副节点开始处理 67 | 将计数器从10自增为11 68 | 然后值11就会第二次作为回复出现 69 | 回复给原来的客户端或者不同的客户端 70 | 客户端如果比较会发现这是同样的回复 这本不应该发生 71 | 因为我们需要支撑的程序在没改动的情况下 72 | 并不能处理这些副节点上不寻常的问题 73 | 意思是我们不能对它进行修改 74 | 比如说我们可以 75 | 把客户端改成能处理 76 | 副节点容错性带来的问题 77 | 但是实际上我们没有这种选择 78 | 因为这个系统只有在我们不需要客户端进行改动的情况下才有意义 79 | 所以这是个灾难 80 | 我们不能让它发生 81 | 有人记得论文上是怎么样防止它发生的吗? 
82 | 输出规则 对的 83 | 没错解决的方案就是控制输出规则 84 | 方案就是阻止输出 85 | 在主节点上不允许生成任何输出 86 | 比如我们现在说的这个计数器输出 87 | 直到副节点确认收到了所有的记录 88 | 所以整个流程正确的顺序是 回到主节点宕机之前 89 | 回到计数器还是10的时候 90 | 正确的顺序是 在这个输出规则下 91 | 当输入到来时 92 | 虚拟机监视器发送输入的副本到副节点 93 | 这个输入的日志信息 94 | 在主节点产生输出前发到副节点 95 | 发送之后这个日志就在去往副节点的网络中 96 | 这个日志有可能丢失 97 | 虚拟机监视器同样传递请求到主节点 生成输出 98 | 所以现在主节点生成结果 计数器值变为11 99 | 然后产生一个11的回复 100 | 但是虚拟机监视器说等一下 现在还不允许生成输出 101 | 直到之前的日志记录都被副节点确认 102 | 这是最近的一条日志消息 103 | 所以输出都被虚拟机监视器截停 104 | 直到这条包含客户端输入的日志 105 | 被虚拟机监视器投递和缓冲 106 | 但是不一定马上会执行 107 | 可能需要等待副节点执行到指令流中对应的位置 108 | 到这里虚拟机监视器就会发送一个ACK包说自己收到输入 109 | 直到主节点接收到ACK包时 -------------------------------------------------------------------------------- /lec04/Lec4-6.en.txt: -------------------------------------------------------------------------------- 1 | and then the virtual machine monitor here will send an ACK packet back 2 | saying yes I did get that input and when the acknowledgment comes back 3 | only then will the virtual machine monitor here release the packet out onto the network 4 | and so the idea is that if the client could have seen the reply 5 | then necessarily the backup must have seen the request and at least buffered it 6 | and so we no longer get this weird situation 7 | in which a client can see a reply but then there's a failure and a cut over 8 | and the replica didn't know anything about that reply 9 | if the you know there's also a situation maybe this message was lost 10 | and if this log entry was lost and then the primary crashes 11 | well since it hadn't been delivered so the backup hadn't sent the act 12 | that means if the primary crashed 13 | you know this log entry was brought in the primary crashed 14 | it must have crashed before the virtual machine monitor or at least the output packet 15 | and prayer for this client couldn't have gotten the reply 16 | and so it's not in a position to spot any irregularities 17 | they're really happy with the output rule 18 | brennon see 19 | I don't know they don't paper doesn't mention how the virtual machine monitor is implemented 20 | I mean it's pretty low level stuff because 21 | you know it's sitting there allocating memory and figuring page tables 22 | and talking to device drivers and intercepting instructions 23 | and understanding what instructions the guest was executing 24 | so we're talking about low-level stuff what language is written in you know traditionally C or C++ 25 | but I don't actually know 26 | okay this of the primary has to delay at this point 27 | waiting for the backup to say that it's up to date 28 | this is a real performance thorn in the side of just about every replication scheme 29 | this sort of synchronous wait where the we can't let the primary get too far ahead of the backup 30 | because if the primary failed while it was ahead 31 | that would be the backup lagging 32 | lagging behind clients right 33 | so just about every replication system has this problem that 34 | at some point the primary has to stall waiting for the backup 35 | and it's a real limit on performance 36 | even if the machines are like side-by-side and adjacent racks 37 | it's still you know we're talking about a half a millisecond or something 38 | to send messages back and forth with a primary stalled 39 | and if we wanna like withstand earthquakes or citywide power failures 40 | you know the primary in the backup have to be in different cities 41 | that's probably five milliseconds apart 42 | every time we produce output if we replicate in the two replicas in different city 43 | every packet that it 
produces as output 44 | has to first wait the five milliseconds or whatever for the last log entry to get to the backup 45 | and for the acknowledgment to come back, and then we can release the packet 46 | and you know for sort of low-intensity services that's not a problem 47 | but if we're building a, you know, database server that we would like to, 48 | you know, if it weren't for this, be able to process millions of requests per second 49 | then that's just unbelievably damaging for performance 50 | and this is a big reason why people, 51 | if they possibly can, use a replication scheme 52 | that's operating at a higher level and kind of understands the semantics of operations 53 | and so it doesn't have to stall on every packet 54 | you know, it could stall on every high-level operation, or even notice that, well, 55 | you know, read-only operations don't have to stall at all 56 | it's only writes that have to stall, or something 57 | but there has to be an application-level replication scheme to realize that 58 | you're absolutely right 59 | so the observation is that you don't have to stall the execution of the primary 60 | you only have to hold the output 61 | and so maybe that's not as bad as it could be 62 | but nevertheless it means that, you know, in a service 63 | that could otherwise have responded in a couple of microseconds to the client 64 | you know, if we have to first update the replica in the next city 65 | we turn, you know, a 10 microsecond interaction into a 10 millisecond interaction, possibly 66 | if you have vast numbers of clients submitting concurrent requests 67 | then you may be able to maintain high throughput even with high latency 68 | but you have to be lucky, or a very clever designer, to get that 69 | that's a great idea 70 | but if you log in the memory of the primary 71 | that log will disappear when the primary crashes 72 | the usual semantics of a server failing are that 73 | you lose everything inside the box, like the contents of memory 74 | or you know, even if you didn't, 75 | if the failure is that somebody unplugged the power cable accidentally from the primary 76 | even if the primary has battery-backed RAM or I don't know what 77 | you can't get at it 78 | all right, the backup can't get at it 79 | so in fact this system does log the output, and the place it logs it is in the memory of the backup 80 | and in order to reliably log it there you have to observe the output rule and wait for the acknowledgment 81 | so it's an entirely correct idea, you just can't use the primary's memory for it 82 | say it again? 83 | that's a clever idea 84 | and so the question is maybe input should go to the primary but output should come from the backup 85 | I completely haven't thought this through 86 | that might work 87 | I don't know, that's interesting 88 | yeah, maybe 89 | one possibility this does expose, though, is 90 | the situation where, you know, maybe the primary crashes after its output is released 91 | so the client does receive the reply 92 | then the primary crashes 93 | the backup's input is still in this event buffer 94 | in the virtual machine monitor of the backup 95 | it hasn't been delivered to the actual replicated service 96 | when the backup goes live after the crash of the primary 97 | the backup first has to consume all of the log records that are lying around 98 | that it hasn't consumed yet, has to catch up to the primary 99 | otherwise it won't take over with the same state.
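As a sketch of this go-live sequence, here is a hypothetical Go outline (all types and names are invented; in the real system this logic lives inside the VMM): the backup first drains every buffered log entry so it catches up to everything the primary may already have exposed, and only then starts executing freely and emitting output.

```go
// Package ftbackup is an invented illustration of the go-live order of
// operations described in the lecture, not VMware FT's actual interface.
package ftbackup

type LogEntry struct {
	InstrCount uint64
	Data       []byte
}

type Backup struct {
	pendingEntries []LogEntry
	live           bool
}

// applyAtInstruction would run the guest forward to e.InstrCount and then
// deliver the event (packet, interrupt, or instruction result); elided here.
func (b *Backup) applyAtInstruction(e LogEntry) { /* ... */ }

// announceTakeover would reconfigure the network so future client packets
// reach this machine instead of the dead primary; elided here.
func (b *Backup) announceTakeover() { /* ... */ }

// goLive is invoked when the backup decides the primary is gone.
func (b *Backup) goLive() {
	// 1. Consume every buffered log entry that hasn't yet been delivered to
	//    the replicated service (the last one is typically the client request),
	//    so the backup's state catches up to the primary's.
	for _, e := range b.pendingEntries {
		b.applyAtInstruction(e)
	}
	b.pendingEntries = nil

	// 2. Stop waiting for events from the primary and execute freely; from
	//    now on, output packets are sent instead of discarded.
	b.live = true
	b.announceTakeover()
}
```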
before the backup can go live it actually has to consume all these entries 101 | the last entry is presumably is the request from the client 102 | so the backup will be live after after it 103 | after the interrupt that delivers the request from the client 104 | and that means that the backup well you know increment its counter to eleven 105 | and then generate an output packet and since it's live at this point 106 | it will generate the output packet and the client will get two eleven replies 107 | which is also if it if that really happened would be anomalous 108 | like possibly not something that could happen if there was only one server 109 | the good news is that almost certainly 110 | or the almost certainly the client is talking to this service using TCP 111 | and that this is the request and the response go back and forth on a TCP Channel 112 | the when the backup takes over 113 | the backup since the state is identical to the primaries it knows all about that TCP connection 114 | and whether all the sequence numbers are and whatnot 115 | and when it generates this packet 116 | it will generate it with the same TCP sequence number as an original packet 117 | and the TCP stack on the client will say oh wait a minute that's a duplicate packet 118 | we'll discard the duplicate packet at the TCP level 119 | and the user level software will just never see this duplicate 120 | and so this system really you know 121 | you can view this as a kind of accidental or clever trick 122 | but the fact is for any replication system where cutover can happen 123 | which is to say pretty much any replication system 124 | it's essentially impossible to design them 125 | in a way that they are guaranteed not to generate duplicate output 126 | basically you know you well you can err on either side 127 | I'm not even either not generate the output at all which 128 | would be bad which would be terrible 129 | or you can generate the output twice on a cutover 130 | that's basically no way to generate it guaranteed generated only once 131 | everybody errors on the side of possibly generating duplicate output 132 | and that means that at some level you know the client side of all replication schemes 133 | need some sort of duplicate detection scheme 134 | here we get to use TCP s that we didn't have TCP that would have to be something else -------------------------------------------------------------------------------- /lec04/Lec4-6.zh.txt: -------------------------------------------------------------------------------- 1 | 到这里虚拟机监视器就会发送一个ACK包说自己收到输入 2 | 直到主节点接收到ACK包时 3 | 虚拟机监视器才会将包发送到网络中 4 | 所以这个方案就是 若客户端可以收到回复 5 | 那么副节点肯定也收到过请求 并且至少已经存到缓冲区 6 | 因此我们不在会有以下的异常 7 | 客户端已经收到了回复 然后因为有故障发生 8 | 副节点完全没有接到过相关内容 9 | 比如说有些情况消息可能会中途丢失 10 | 日志记录丢失后主节点宕机 11 | 因为消息没投递成功 所以副节点也没有确认 12 | 如果主节点宕机 13 | 日志记录随主节点宕机丢失 14 | 它肯定在虚拟机监视器发送输出包之前丢失 15 | 客户端不可能提前收到回复 16 | 它们也不会识别出异常发生 17 | 所以我们需要有输出规则进行限制 18 | 19 | 我不了解 论文没有谈到虚拟机监视器的实现 20 | 这是非常底层的知识 21 | 像划分内存空间 计算页表 22 | 与设备驱动交互 拦截指令 23 | 搞明白Guest用户执行的命令等 24 | 所以这是一些底层的东西 一般用C或C++编写的 25 | 但我并不了解 26 | 回到这里 主节点需要延迟回复 27 | 先等副节点确认已经收到最新消息 28 | 这是几乎所有主备复制模式性能的一道坎 29 | 这种同步等待让主节点不会领先于副节点太多 30 | 因为如果主节点在领先的情况下出现故障 31 | 副节点就会出现数据延迟 32 | 副节点的进度和客户端会不一致 33 | 所以每种主备系统都有这样的问题 34 | 某个时间主节点必须等待副节点 35 | 这是对性能实打实的限制 36 | 即使机器是在相邻机架上 37 | 主节点发送消息和接收确认 38 | 仍然需要等上0.5毫秒 39 | 如果说像避免像地震 大范围断电等问题 40 | 主副节点必须位于不同的城市 41 | 那延迟大概会增加到5毫秒 42 | 如果我们主副节点在不同城市进行复制 43 | 每一个发送的包 44 | 都需要等上5毫秒让日志记录到达副节点上 45 | 然后响应确认 最后才能发送回复给客户端 46 | 对于一些可靠性要求低的服务 可能并不成问题 47 | 但是对于一些数据库服务 48 | 比如需要每秒处理百万请求 49 | 那将会对性能有极大的影响 50 | 
这也时在条件允许的情况下 51 | 人们会使用一些不同的主副复制模式 52 | 比如说在更高层次操作 并且需要解析操作内容 53 | 然后不需要每个包都等待确认 54 | 比如只在进行高层次操作时才等待 55 | 只读操作完全不需要等待 56 | 只需要等待写操作同步或者其他一些操作 57 | 但你需要在应用层上实现这些区分 58 | 你说的都是对的 59 | 虚拟机监视器不需要阻止主节点执行命令 60 | 只需要阻止输出就好 61 | 这可能可以做得更好 62 | 但至少这样在一个服务中 63 | 可以在几微秒内响应客户端 64 | 如果我们要先等待处于另一个城市的副节点响应 65 | 那可能会让10微秒变成10毫秒 66 | 如果你有大量客户端并发请求 67 | 虽然可能在高延迟下完成大量处理 68 | 但是你需要非常巧妙的设计才能做到 69 | 这是个很好的想法 70 | 但是如果你将消息记录到主节点内存中 71 | 在主节点宕机时日志就会丢失 72 | 通常认为服务器失效就意味着 73 | 服务器内存中的内容都会丢失 74 | 或者即使你不会如此 75 | 比如说失效是因为主节点电源被意外拔掉 76 | 但你有备用电源之类 77 | 你也做不到 78 | 副节点也做不到如此 79 | 实际上系统在副节点的内存中记录了输出 80 | 为了保证可靠记录 你必须遵守输出规则 等待确认 81 | 所以这是个正确的想法 但是不能使用主节点内存来做 82 | 再说一遍? 83 | 这个想法很棒 84 | 他问的是 能否输入由主节点接收 输出由副节点发送 85 | 我完全没有想过 86 | 这或许可以 87 | 我不确定 这很有意思 88 | 89 | 还有一个可能出现的情况 90 | 主节点在输出已经发送出去之后宕机 91 | 客户端已经收到回复 92 | 然后主节点宕机 93 | 副节点的输入还在事件缓冲中 94 | 在副节点的虚拟机监视器种 95 | 还没有投递到真正的服务副本 96 | 副节点要顶替宕机的主节点 97 | 它首先要消费所有未处理的记录 98 | 以赶上主节点的进度 99 | 否则主副节点进度就会不一致 100 | 副节点在接管服务之前要先消费完记录 101 | 最后一条记录是客户端的请求 102 | 副节点会在它之后开始接管服务 103 | 在传递客户端请求的中断之后 104 | 这意味着副节点计数器自增到11 105 | 然后生成一个输出包 因为这时候它接管服务 106 | 所以生成输出 客户端会受到两个11回复 107 | 如果真的发生这种情况的话是不对的 108 | 如果是单服务器的话这不应该发生 109 | 好消息则是 110 | 如果服务间是使用TCP通信 111 | 请求和响应都是通过TCP通道传输 112 | 当副节点接管时 113 | 副节点的状态和主节点一致 知道所有TCP连接 114 | 还有所有的序列号 115 | 当它产生这个包时 116 | 它会产生和原来的包一样的序列号 117 | 客户端的TCP栈会认为这是个重复的包 118 | 在TCP层就会将它丢弃掉 119 | 而在用户层软件中永远不会看到重复包 120 | 所以这个系统中 121 | 你可以认为问题被意外解决了或者被巧妙处理了 122 | 但事实上对于所有能够进行切换的复制系统 123 | 也就是大部分的复制系统 124 | 很难将他们设计成 125 | 保证切换时不会有重复输出 126 | 你可以在两边都引发报错 127 | 然后在两边都不生成输出 128 | 但是这种做法很糟糕 129 | 或者你可以允许切换时有两次输出 130 | 总之没有办法可以保证只有一次输出 131 | 两边都引发报错或者允许可能的重复输出 132 | 某种程度上说 所有复制模式的客户端 133 | 都需要重复包的检测机制 134 | 在这里我们使用TCP 不然的话也需要其他实现 -------------------------------------------------------------------------------- /lec04/Lec4-7.en.txt: -------------------------------------------------------------------------------- 1 | maybe application level sequence numbers or I don't know what 2 | and you'll see all of this 3 | and actually you'll see versions of 4 | essentially everything I've talked about like the output rule for example in labs 2 & 3 5 | you'll design your own replicated state machine 6 | yes 7 | yes to the first part 8 | so the scenario is 9 | the primary sends the reply 10 | and then either the primary send the close packet 11 | or the client closes the connect the TCP connection after it receives the primary's reply 12 | so now this's like no connection on the client side 13 | but there is a connection on the backup side 14 | and so now the backup 15 | so the backup consumes the very last log entry that is the input is now live 16 | so we're not responsible for replicating anything at this point right 17 | because the backup is now live there's no other replica as the primary died 18 | so there's no like if if we don't if the backup fails to execute in lockstep with the primary 19 | that's fine actually 20 | because the primary is is dead and we do not want to execute in lockstep with it 21 | okay so the primary is now not it's live 22 | it generates an output on this TCP connection that isn't closed yet from the backup point of view 23 | this packet arrives at the client on a TCP connection 24 | that doesn't exist anymore from the clients point of view 25 | like no big whoopee on the client right 26 | he's just going to throw away the packet as if nothing happened the application won't know 27 | the client may send a reset 28 | something like a TCP error or whatever packet 29 | back to the backup and the backup does something or other with it 30 | but it doesn't matter 31 | 
because we're not diverging from anything 32 | because there's no primary to diverge from 33 | you can just handle a straight reset however it likes 34 | and what it'll in fact do is basically ignore it 35 | but there's no now the backup has gone live there's no 36 | we don't owe anybody anything as far as replication 37 | yeah 38 | well you can bet since the backup's memory image is identical to the primary's image 39 | that they're sending packets with the very same source TCP number 40 | and the very same everything 41 | they're sending bit for bit identical packets 42 | you know at this level the server's don't have IP addresses 43 | or for our purposes 44 | the virtual machines you know the primary and backup virtual machines have IP addresses 45 | but the the physical computer and the vmm are transparent to the network 46 | it's not entirely true but it's basically the case that 47 | the virtual machine monitor in the physical machine 48 | don't really have identity of their own on the network 49 | because you can configure that then that way instead these they're not 50 | you know the virtual machine with its own operating system in its own TCP stack 51 | it has IP address and ethernet address and all sort of stuff 52 | which is identical between the primary and the backup 53 | and when it sends a packet 54 | it sends it with the virtual machine's IP address and Ethernet address 55 | and those bits at least in my mental model are just simply passed through on to the local area network 56 | it's exactly what we want 57 | and so it will generate exactly the same packets 58 | that the primary would have generated 59 | there's maybe a little bit of trickery you know what the we 60 | if this is these are actually plugged into an Ethernet switch 61 | into the physical machines maybe plugged into different ports of an Ethernet switch 62 | and we'd like the Ethernet switch to change its mind about 63 | which of these two machines that delivers packets with replicated services Ethernet address 64 | and so there's a little bit of funny business there 65 | for the most part they're just generating identical packets 66 | and we just send them out 67 | okay so another little detail I've been glossing over is that 68 | I've been assuming that the primary just fails or the backup just fails 69 | that is fail-stop right 70 | but that's not the only option 71 | another very common situation that has to be dealt with is 72 | if the two machines are still up and running and executing 73 | but there's something funny happen on the network 74 | that causes them not to be able to talk to each other 75 | but to still be able to talk to some clients 76 | so if that happened if the primary backup couldn't talk to each other 77 | but they could still talk to the clients 78 | they would both think oh the other replica is dead 79 | I better take over and go live 80 | and so now we have two machines going live with this service 81 | and now you know they're no longer sending each other log events or anything 82 | they're just diverging 83 | maybe they're accepting different client inputs and change their states in different ways 84 | so now we have a split brain disaster 85 | if we let the primary and the backup go live 86 | because it was a network that has some kind of failure instead of these machines 87 | and the way that this paper solves it I mean 88 | is by appealing to an outside authority to make the decision about 89 | which of the primary or the backup is allowed to be live 90 | and so 91 | it there 
you know, turns out that their storage is actually not on local disk 92 | this almost doesn't matter 93 | but their storage is on some external disk server 94 | and as well as serving disks, this server, as a totally separate service 95 | that has nothing to do with disks, 96 | happens to export this test-and-set 97 | service over the network, where 98 | you can send a test-and-set request to it 99 | and there's some flag it's keeping in memory 100 | and it'll set the flag and return what the old value was 101 | so both primary and backup have to sort of acquire this test-and-set flag 102 | it's a little bit like a lock 103 | in order to go live; they both maybe send test-and-set requests at the same time 104 | to this test-and-set server 105 | the first one gets back a reply that says oh, the flag used to be zero, 106 | now it's one; for the second request to arrive, 107 | the response from the test-and-set server is 108 | oh, actually the flag was already one when your request arrived 109 | so basically you're not allowed to be primary 110 | and so this test-and-set server, 111 | and we can think of it as a single machine, 112 | is the arbitrator that decides which of the two should go live 113 | if they both think the other one is dead due to a network partition 114 | any questions about this mechanism? 115 | you're busted, 116 | yeah, if the test-and-set server happens to be dead at the critical moment when you need it 117 | and actually, even if there's not a network partition, 118 | under all circumstances in which 119 | one or the other of these wants to go live because it thinks the other's dead, 120 | even when the other one really is dead, 121 | the one that wants to go live still has to acquire the test-and-set lock 122 | because one of the deep rules of the 6.824 game is that 123 | you cannot tell whether another computer is dead or not 124 | all you know is that you stopped receiving packets from it 125 | and you don't know whether it's because the other computer is dead 126 | or because something has gone wrong with the network between you and the other computer 127 | so all the backup sees is, well, I've stopped getting packets 128 | maybe the primary is dead, maybe it's alive 129 | the primary probably sees the same thing 130 | so if there's a network partition 131 | they certainly have to ask the test-and-set server 132 | but since they don't know if it's a network partition 133 | they have to ask the test-and-set server regardless of whether it's a partition or not 134 | so any time either wants to go live 135 | the test-and-set server also has to be alive 136 | because they always have to acquire this test-and-set lock 137 | so the test-and-set server sounds like a single point of failure 138 | they were trying to build a replicated fault-tolerant service 139 | but in the end, you know, we can't fail over unless this is alive, so 140 | that's a bit of a bummer 141 | I'm guessing, though, 142 | I'm making a strong guess, that the test-and-set server is actually 143 | itself a replicated service and is fault tolerant, right 144 | it's almost certainly, I mean, these people at VMware 145 | are happy to sell you a million-dollar highly available storage system 146 | that uses enormous amounts of replication internally 147 | um, since the test-and-set thing is on this server of theirs 148 | I'm guessing it's replicated too 149 | and the stuff you'll be doing in lab 2 and lab 3 is more than powerful enough 150 | for you to build your own fault-tolerant test-and-set server
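Since the set-the-flag-and-return-the-old-value operation is the heart of the arbitration, here is a deliberately minimal test-and-set server sketch in Go (the HTTP shape, route, and port are invented for illustration, and this toy is itself a single point of failure, which is exactly the concern raised above):

```go
package main

import (
	"fmt"
	"net/http"
	"sync"
)

// tasServer holds a single flag in memory. The first caller atomically flips
// it from 0 to 1 and learns the old value was 0, so it may go live; every
// later caller sees 1 and must not become primary.
type tasServer struct {
	mu   sync.Mutex
	flag bool
}

func (s *tasServer) testAndSet(w http.ResponseWriter, r *http.Request) {
	s.mu.Lock()
	old := s.flag
	s.flag = true // set the flag unconditionally
	s.mu.Unlock()
	// Return the old value; "0" means the caller won the right to go live.
	if old {
		fmt.Fprintln(w, "1")
	} else {
		fmt.Fprintln(w, "0")
	}
}

func main() {
	s := &tasServer{}
	http.HandleFunc("/test-and-set", s.testAndSet)
	http.ListenAndServe(":8080", nil) // hypothetical address
}
```

A replica whose request comes back "0" has won the right to go live; one that gets back "1" must not become primary. Making this little flag service fault tolerant in its own right is essentially what the replication machinery of labs 2 and 3 would let you do.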
so this problem can easily be eliminated 152 | -------------------------------------------------------------------------------- /lec04/Lec4-7.zh.txt: -------------------------------------------------------------------------------- 1 | 可能是应用程序级别的序列号,或者我不知道的 2 | 你会看到所有的这些 3 | 实际上你会在lab2和lab3中看到 4 | 基本上我讲过的所有内容的,例如这个输出规则 5 | 你将设计自己的复制状态机 6 | 是 7 | 对第一部分 8 | 场景是 9 | primary发送答复 10 | 然后要么primary服务器发送关闭数据包 11 | 要么客户端在收到primary的答复后关闭TCP连接 12 | 所以现在在客户端,没有连接 13 | 但backup端有连接 14 | 所以现在backup 15 | backup消耗了输入中的最后一个日志条目,(backup)变成在线状态 16 | 在这个时间点我们不负责复制任何内容 17 | 因为backup现在是在线状态,没有其他副本因为primary已死 18 | 因此,如果backup无法与primary步骤一致 19 | 其实也没什么问题 20 | 因为primary已死,我们不想和它步骤一致 21 | 好,现在考虑primary没有死,还活着 22 | 从backup的角度来看TCP连接尚未关闭,primary在这个TCP连接上产生一个输出 23 | 该数据包通过TCP连接到达客户端 24 | 从客户的角度来看已经不存在了 25 | 客户没有大惊小怪 26 | 它只是将数据包丢掉,好像什么都没发生,应用程序不会知道 27 | 客户可以发送一个重置 28 | 类似TCP错误或任何数据包 29 | 回到backup,backup就可以执行其他操作 30 | 但这没关系 31 | 因为我们没有产生分歧 32 | 因为没有primary可以分歧 33 | 你可以应付一个直接的重置,不管怎样 34 | 实际上它基本上会忽略它 35 | 现在backup已经上线了,没有(primary)了 36 | 就复制而言,我们不欠任何人任何东西 37 | 是的 38 | 好吧,你可以打赌,因为backup内存映像与primary映像相同 39 | 他们发送具有相同源TCP编号的数据包 40 | 他们都是一样的 41 | 他们发送每个比特位都相同的的数据包 42 | 在这一层,服务器没有IP地址 43 | 或出于我们的目的 44 | backup虚拟机和primary虚拟机具有IP地址 45 | 但是物理计算机和vmm对网络是透明的 46 | 这不完全准确,但基本上是这样的 47 | 物理机中的虚拟机监视器 48 | 在网络上实际上没有自己的身份 49 | 你可以配置成那样,而不是 50 | 在自己的TCP堆栈中具有自己操作系统的虚拟机 51 | 它有IP地址和以太网地址,和其他所有内容 52 | 在backup和primary都完全相同 53 | 当它发送一个数据包 54 | 它把虚拟机的IP地址和以太网地址一起发送 55 | 根据我脑中的模型,这些比特只是简单地传递到局域网 56 | 这正是我们想要的 57 | 所以它会产生完全相同的数据包 58 | 和primary会产生数据包的一致 59 | 也许有些小诡计,我们 60 | 如果是,这些实际上是插入到以太网交换机中的 61 | 这些物理机器可能插入在以太网交换机的不同端口中 62 | 我们希望以太网交换机改变主意 63 | 这两台机器中的哪台传递复制的以太网地址的数据包 64 | 所以那里有一些有趣的事情 65 | 在大多数情况下,它们只是生成相同的数据包 66 | 我们只是把它们发送出去 67 | 好吧,我一直在讲的另一个小细节是 68 | 我一直假设primary失败或backup失败 69 | 那是fail-stop对吧 70 | 但这不是唯一的可能 71 | 必须处理的另一个非常普遍的情况是 72 | 如果两台计算机仍处于启动状态并且正在运行 73 | 但是网络上发生了一些有趣的事情 74 | 导致他们无法互相交谈 75 | 但仍然能够与一些客户端交谈 76 | 如果发生这种情况,如果prumary,backup无法互相通信 77 | 但他们仍然可以与客户交谈 78 | 他们都会以为另一个副本死了 79 | 我最好接管并开始上线 80 | 因此,现在我们有两台机器上线提供服务 81 | 现在你知道他们不再发送彼此的日志事件或任何其他信息 82 | 他们开始分歧 83 | 也许他们接受不同的客户输入,并且各自的状态变得不同 84 | 所以现在我们发生了脑裂灾难 85 | 如果我们让backup和primary生效 86 | 因为这是一个网络故障,而不是这些机器故障 87 | 这篇论文解决问题的方式 88 | 是通过寻求外部权威做出有关 89 | 允许哪个backup或primary上线的决定 90 | 所以 91 | 他们的存储实际上不在本地磁盘上 92 | 这几乎没关系 93 | 但它们的存储在某些外部磁盘服务器上 94 | 像完全独立的服务一样位于该服务器中 95 | 与磁盘无关 96 | 这个服务器碰巧在网络中提供这个test-and-set服务 97 | 这个服务器碰巧在网络中提供这个test-and-set服务 98 | 你可以发送test-and-set要求给它 99 | 它在内存维护一些标志 100 | 它会设置标志并返回原来的值 101 | 因此primary和backup都必须获得此test-and-set标志 102 | 这有点像锁 103 | 为了上线,他们可能同时发送test-and-set请求 104 | 给这个test-and-set服务器 105 | 第一个返回一个答复,说哦,标志曾经是零 106 | 现在是第二个请求到达 107 | test-and-set服务器的响应是 108 | 哦,实际上,当你的请求到达时,该标志已经是1 109 | 所以你不允许成为primary 110 | 所以这个test-and-set服务器 111 | 我们可以将其视为一台机器 112 | 是决定两者中哪一个应该生效的仲裁员 113 | 如果他们俩都认为其他人由于网络分区而死亡的话 114 | 有关此机制的任何问题 115 | 你被淘汰了 116 | 是的,如果test-and-set服务器在这个关键时刻死了 117 | 实际上即使没有网络分区 118 | 在任何情况下 119 | 其中一个希望上线,因为它认为其他人死了 120 | 即使对方真的死了 121 | 想要上线的那个也必须获得test-and-set锁 122 | 因为像6.824游戏的深层规则之一是 123 | 你无法判断另一台计算机是否坏掉了 124 | 你所知道的是,你不再能接收到数据包 125 | 而且你不知道是因为另一台计算机已死 126 | 还是由于你和另一台计算机之间的网络出了点问题 127 | 所以所有backup都停止,我被卡在数据包中 128 | 也许primary已经死了,也许还活着 129 | primary可能看到相同的东西 130 | 所以如果有网络分区 131 | 他们一定要问test-and-set服务器 132 | 但由于他们不知道这是否是网络分区 133 | 他们必须询问test-and-set服务器,不管它是否是分区 134 | 所以任何时候都想上线 135 | test-and-set服务器也必须处于在线状态 136 | 因为他们总是必须获得此test-and-set锁 137 | 因此test-and-set服务器听起来像是单点故障 138 | 他们试图建立一个复制的容错的东西 139 | 但最后你知道我们无法进行故障转移,除非它仍然活着 140 | 这有点令人烦恼 141 | 我猜 142 | 我强烈猜测test-and-set服务器实际上是 143 | 本身是有复制的服务,是具有容错性的 144 | 几乎可以肯定,我是说,VMware的这些人 145 | 他们很高兴向你出售百万美元的高可用性存储系统 
146 | 在内部使用大量复制 147 | 嗯,因为test-and-set的东西在他们的这个服务器上 148 | 我在猜它也被复制了 149 | 你在lab2中和lab3中做的事情足够强大 150 | 你可以用它构建自己的容错的test-and-set服务器 151 | 所以这个问题很容易消除 152 | -------------------------------------------------------------------------------- /lec04/Lec4.en.txt: -------------------------------------------------------------------------------- 1 | all right today I want to talk about bit more about fault tolerance and replication and then look into the details of today's paper about vmware ft the topics still fault tolerance to provide high availability that is you want to build a server that even if some hardware you know computer crashes is involved in the service we still like to provide the service and to the extent we can we'd like to provide our service also if there's network problems and the tool we're using its replication least for this part of the course so it's worth asking what kind of failures replication can be expected to deal with because it's not everything by any means so maybe the easiest way to characterize the kind of failures we're talking about is fail stop failures of a single computer and what I mean by fail stop it's a sort of generic term and fault tolerance is that if something goes wrong would say the computer the computer simply stops executing it just stops if anything goes wrong and in particular it doesn't compute incorrect results so if somebody kicks the power cable out of your server that's probably gonna generate a fail stop failure similarly if they unplug your servers network connection even though the server is still running so this is a little bit funny you know be totally cut off from the network so it looks at me outside like it just stopped so it's really these failures we can deal with with replication this also covers some hardware problems like you know maybe if the fan on your server breaks because it you know it cost 50 cents maybe that'll cause the CPU to overheat and the CPU will shut itself down cleanly and just stop executing what's not covered by the kind of replication systems we're talking about is things like bugs and software or design defects in hardware so basically not bugs because if we take some service you know say you're a MapReduce master for example you know we replicated and run it on two computers you know if there's a bug in your MapReduce master or my MapReduce master let's say replications not going to help us we're going to compute the same incorrect result on both of our copies of our MapReduce master and everything looked fine they'll agree you just happen to be the wrong answer so we can't depending against bugs in the replicated software and we can't defend against bugs in the whatever scheme we're using to manage the replication and similarly as I mentioned before we can't expect to deal with bugs in the hardware the hardware it computes incorrectly that's just that's the end for us at least with this kind of technique although you know that said there are definitely hardware and software bugs that that replication might if you're lucky might be able to cope it so if there's some unrelated software running in your server and it causes the server to crash maybe because your kernel to panic and reboot or something it has nothing to do with you know with your with the service you're replicating then that kind of failure for us for your service will may well be fail stop you know the kernel will panic and the backup replicas will take over similarly some kinds of hardware errors can be turned into fail stop errors for 
example if you send a packet over the network and the network corrupts it just flips a bit in your packet that will almost certainly be caught by the checksum on the packet same thing for a disk block if you write some data to disk and read it back a month later you know maybe the magnetic surface isn't perfect and you know one of the best couple of bits were wrong in the block as it's right back it's actually error correcting that up to a certain point will fix errors in disk blocks that you'll be turning you know random hardware errors into as either correcting them if you're super lucky or at least detecting them and turning random corruption into a detected fault which you know the software then knows that something that wrong and can turn it into a fail stop fault by stopping executing or take some other remedial action but in general we really can only expect to handle fail stop faults there's other limits to replication to you know the the failures in the if we have a primary in the back of our two replicas or whatever we're really assuming that failures in the two are independent right if there tend to have correlated failures then replication is not going to help us so for example if we're a big outfit and we buy thousands of computers batches of thousands of computers identical computers from the same manufacturer and we run you know our replicas is on all on those computers we bought at the same time from the same place that's a bit of a risk maybe because presumably if one of them has a manufacturing defect in it there's a good chance that the other ones do too you know one of them's prone to overheating because the manufacturer you know didn't provide enough airflow well it probably all had that problem and so one of them overheats and dies it's a good chance that the other ones will too so that's one kind of correlated failure you just have to be careful of another one is that you know if there's an earthquake and the city where our datacenter is probably gonna take out the whole data center you know we can have all the replication we like inside that data center it's not going to help us because the failure caused by an earthquake or a citywide power failure or something the building burning down is like it's correlated failure between our replicas if they're on that building so if we care about dealing with earthquakes then we need to put our replicas in maybe in just different cities at least physically separate enough that they have separate power unlikely to be affected by the same natural disaster okay but that's all sort of hovering in the background for this discussion where we're talking about the technology you might use another question about replication is whether it's worthwhile you may ask yourself gosh you know this literally uses these replication schemes use twice as much or three times as much computer resources right we need to have you know GFS had three copies of every blocks we have to buy three times as much disk space the paper for today you know replicates just once but that means we have twice as many computers and CPUs and RAM it's all for expensive like is that really worth it that expense and you know that's not something we can answer technically right it's an economic question it depends on the value of having an available service you know if you're running a bank and if the consequence is the computer failing is that your customer you can't serve your customers and you can't generate revenue and your customers all hate you then it may well be 
worth it to blow you know an extra ten or twenty thousand bucks on a second computer so you can have a replica on the other hand if you're me and you're running the 6.824 web server I don't consider it worthwhile to have a hot backup of the 84 web server because the consequences of failure are very low so the whether the replication is worthwhile on how many replicas you ought to have and how much you're willing to spend on it is all about how much cost and inconvenience failure would call it cause you all right this paper sort of in the beginning mentions as there's a couple of different approaches to replication really mentions two one two calls state transfer and the other calls replicated state machine most of the schemes we're going to talk about in this class are replicated state machines it'll talk about both anyway the idea behind state transferor's that if we have two replicas of a server the way you cause them to be to stay in sync that is to be actual replicas so that the backup can has everything it needs to take over if the primary fails in a state transfer scheme the way that works is that the primary sends a copy of its entire state that is for example the contents of its RAM to the backup and the backup just sort of stores the latest state and so it's all there the primary fails in the backup can start executing with this last state it got if the primary fails so this is all about sending the state of the of the primary and for today's if today's paper worked as a state transfer system which it doesn't then the state we'd be talking about would be the contents of the RAM the contents of the memory of the primary so maybe every once while the primary would just you know make a big copy of its memory and send it across the network to the backup you can imagine if you wanted to be efficient you know maybe you would only send the parts of the memory that it's changed since the last time you sent in memory to the backup the replicated state machine this approach observes that most services are most computer things we want to replicate have some internal operation that's deterministic except when external input comes in right you know ordinarily if there's no external influences on a computer it just executes one instruction after another and what each instruction does is a deterministic function of what's in the memory and the registers of the computer and it's only when external events intervene that something unexpected may happen like a packet arrives of a some random time and that causes the server to start doing something differently I'm so replicated state machine schemes don't send the state between the replicas instead they just send those external events they just send maybe from a primary to a backup again just send things like arriving input from the outside world that the backup needs to know and the observation is that you know if you have to two computers and they start from the same state and they see the same inputs that that in the same order or at the same time the two computers will continue to be replicas of each other and sort of execute identically as long as they both see the same inputs at the same time so this transfers probably memory and this transfer some primary backup just operations from clients or external external inputs or external events and you know the reason why people tend to favor a replicated state machine is that usually operations are smaller than the state but this you know the state of a server if it's a database server might be the 
entire database might be you know gigabytes whereas the operations are just some clients sending and you know please read or write key 27 operations are usually small the states usually large so replicate a state machine usually looks attractive and slight downside is that the schemes tend to be quite a bit more complicated and rely on sort of more assumptions about how the computers operate whereas this is a really heavy-handed I'm just gonna send you my whole state sort of a nothing to worry about any questions about these strategies yes well the did ok so the question is suppose something went wrong with our scheme and the backup was not actually identical to the primary so you know you're suppose we were running GFS master and it's the primary it just handed out at least two chunks server one but because the two you know because we've allowed the states of the primary back to drift out of sync the backup did not issue at least to anybody it wasn't even away or anybody had asked for these so now the primary thinks you know chunks everyone has lease for some chunk in the backup doesn't the primary fails backup takes over right now chunks over one thinks it has a lease for some chunk but then the current master doesn't and is happy to hand out the lease to some other trunk server now we have to chunk servers serving the same lease okay so that's just a close to home example but really you know almost any bad thing and kind of I think you construct any bad scenario by just imagining some service that confuse the wrong answer because the state's leverage so you're asking about randomization yeah oh y'all talk about this I'll talk about this a bit later on but it is good that the replicated state scheme definitely makes the most sense when the instructions that the primary in the back of our executing do the same thing as long as there's no external events right and that's almost true right you know for an add instruction or something yeah you know if the starting if the registers and memory of the same and they both execute an add instruction add instruction has the same inputs in the same outputs but they're in some instructions as you point out that don't like maybe there's an instruction that gets the current time of day now probably be executed at slightly different times or an instruction that gets the current processors unique ID and a serial number it's going to yield the different answers and the the the uniform answered the questions that sound like this is that the primary does it and sends the answer to the backup and the backup does not execute that instruction but instead at the point where it would execute that instruction it listens for the primary to tell it what the right answer would be and just sort of fakes that answer to the software I'll talk about you know how the VMware scheme does that okay interestingly enough though today's paper is all about a replicated state machine you may have noticed that today's paper only deals with you know processors and it's not that clear how it could be extended to a multi-core and a multi-core machine where the interleavings of the instructions from the two cores organ are non-deterministic all right so we no longer have this situation on a multi-core machine where if we just let the primary and backup execute they're you know all else being equal they're going to be the same because they won't execute on multiple cores VMware has since come out with a new possibly completely different replication system that does work on multi-core 
and the new system appears to me to be using state transfer instead of replicated state machine because state transferred is more robust in the face multi-core and parallelism if you use the machine and send the memory over you know that the memory image is just that just is the state of the machine and sort of it doesn't matter that there was parallelism whereas the replicated state machine scheme really has a problem with the parallelism you know on the other hand I'm guessing that this new multi-core scheme is more expensive okay all right so if we want to build a replicated state machine scheme we got a number of questions to answer so we need to decide at what level we're gonna replicate state right so what state what do we mean by state we have to worry about how how closely synchronized the primary and backup have to be right because it's likely the primary will execute a little bit ahead of the backup after all it it's the primary that sees the inputs so the backup almost necessarily must lag over that gives that means there's an opportunity if the primary fails for the prime for the backup not to be fully caught up having the backup actually executes really in lockstep with the primaries for expensive because it requires a lot of chitchat so a lot of designs a lot of what people sweat about is how close the synchronization is if the primary fails or you know actually if the backup fails too but it's more exciting if the primary fails there has to be some scheme for switching over and the clients have to know oh gosh I instead of talking to the old primary on server one I should now be talking to the the backup on server to all the clients have to somehow figure this out the switch over almost certainly it's almost impossible maybe impossible to design a cut over system in which no anomalies are every are ever visible you know in this sort of ideal world if the primary fails we'd like nobody to ever notice none of the clients to notice turns out that's basically unattainable so there's going to be anomalies during the cut over and we've gotta figure out a way to cope with them and finally if the one of the two if one of our replicas fails we really need to have a new replica right if we have a two replicas and one fails we're just living on borrowed time right because the second replica may fail at some point so we absolutely need to get a new replica back online as fast as possible so and that can be very expensive the state is big you know you know but the reason we like to replicate a state machine was because we thought state transfer would be expensive but the two replicas in a replicated state machine still need to have full state right we just had a cheap way of keeping them both in sync if we need to create a new replica we actually have no choice but state transfer to create the new replicas the new replica needs to have a complete copy of the state so it's going to be expensive to create new replicas and this is often people spending well actually people spend a lot of time worrying about all these questions and you know we'll see them again as we look at other replicated state machine schemes so on the topic of what state to replicate the today's paper has a very interesting answer to this question it replicates the full state of the machine that is all of memory and all the Machine registers it's like a very very detailed replication scheme just no difference at the even of the lowest levels between the primary in the backup that's quite rare for replication schemes 
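Before looking further at where today's paper sits, here is a tiny application-level illustration in Go of the replicated state machine idea just outlined (everything here is invented for illustration; it is the kind of higher-level scheme most systems use, not VMware FT's machine-level mechanism): as long as every replica starts from the same state and applies the same deterministic operations in the same order, their states stay identical.

```go
// Package rsm is a minimal, invented sketch of an application-level
// replicated state machine, of the sort built in the course labs.
package rsm

// Op is one deterministic operation; replicas agree on the order of ops.
type Op struct {
	Key   string
	Value string
}

type StateMachine struct {
	state map[string]string
}

func New() *StateMachine {
	return &StateMachine{state: make(map[string]string)}
}

// Apply must be deterministic: given the same starting state and the same
// op, every replica computes the same result.
func (sm *StateMachine) Apply(op Op) {
	sm.state[op.Key] = op.Value
}

// Replay drives a replica from an ordered log of operations; a backup that
// has seen the same log as the primary ends up in the same state.
func (sm *StateMachine) Replay(log []Op) {
	for _, op := range log {
		sm.Apply(op)
	}
}
```

The apply loop itself is the easy part; as the discussion around it says, the hard parts are keeping replicas closely enough synchronized, cutting over cleanly when the primary fails, and re-creating a full replica afterwards.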
almost always you see something that's more like GFS where GFS absolutely did not replicate you know they had replication but it wasn't replicating every single you know bit of memory between the primaries and the backups it was replicating much more application level table of chunks I had this abstraction of you know chunks and chunk identifiers and that's what it was replicating it wasn't replicating sort of everything else wasn't going to the expense of replicating every single other thing that machines we're doing okay as long as they had the same sort of application visible set of of chunks so most replication schemes out there go the GFS route in fact almost everything except pretty much this paper and a few handful of similar systems almost everything uses application at some level application level of replication because it can be much more efficient because we don't have to go to the we don't have to go to the trouble of for example making sure that interrupts occur at exactly the same point in the execution of the primary and backup GFS does not sweat that at all but this paper has to do because it replicates at such a low level so most people build efficient systems with applications specific replication the consequence of that though is that the replication has to be built into the right into the application right if you're getting a feed of application level operations for example you really need to have the application participate in that because some generic replication thing like today's paper doesn't really can't understand the semantics of what needs to be replicated so anyways so most teams are application specific like GFS and every other paper we're going to read on this topic today's paper is unique in that it replicates at the level of the machine and therefore does not care what software you run on it right it replicates the low-level memory and machine registers you can run any software you like on it as long as it runs on that kind of microprocessor that's being represented this replication scheme applies to the software can be anything and you know the downside is that it's not that efficient necessarily the upside is that you can take any existing piece of software maybe you don't even have source code for it or understand how it works and you know do within some limits you can just run it under this under VMware this replication scheme and it'll just work which is sort of magic fault-tolerance wand for arbitrary software all right now let me talk about how this is VMware FT first of all VMware is a virtual machine company they're what their business is a lot of their business is selling virtual machine technology and what virtual machines refer to is the idea of you know you buy a single computer and instead of booting an operating system like Linux on the hardware you boot we'll call a virtual machine monitor or hypervisor on the hardware and the hypervisor is job is actually to simulate multiple multiple computers multiple virtual computers on this piece of hardware so the virtual machine monitor may boot up you know one instance of Linux may be multiple instances of Linux may be a Windows machine you can the virtual machine monitor on this one computer can run a bunch of different operating systems you know each of these as is itself some sort of operating system kernel and then applications so this is the technology they're starting with and you know the reason for this is that if you know you need to it just turns out there's many many reasons why it's 
very convenient to kind of interpose this level of indirection between the hardware and the operating systems and means that we can buy one computer and run lots of different operating systems on it we can have each if we run lots and lots of little services instead of having to have lots and lots of computers one per service you can just buy one computer and run each service in the operating system that it needs I'm using this virtual machines so this was their starting point they already had this stuff and a lot of sophisticated things built around it at the start of designing vmware ft so this is just virtual machines um what the papers doing is that it's gonna set up one machine or they did requires two physical machines because there's no point in running the primary and backup software in different virtual machines on the same physical machine because we're trying to guard against hardware failures so you're gonna to at least you know you have two machines running their virtual machine monitors and the primary it's going to run on one the backups and the other so on one of these machines we have a guest you know we only it might be running a lot of virtual machines we only care about one of them it's gonna be running some guest operating system and some sort of server application maybe a database server MapReduce master or something so I'll call this the primary and there'll be a second machine that you know runs the same virtual machine monitor and an identical virtual machine holding the backup so we have the same whatever the operating system is exactly the same and the virtual machine is you know giving these guest operating systems the primary and backup a each range of memory and this memory images will be identical or the goal is to make them identical in the primary in the backup we have two physical machines each one of them running a virtual machine guest with a its own copy of the service we care about we're assuming that there's a network connecting these two machines and in addition on this local area network in addition on this network there's some set of clients really they don't have to be clients they're just maybe other computers that our replicated service needs to talk with some of them our clients sending requests it turns out in this paper there the replicated service actually doesn't use a local disk and instead assumes that there's some sort of disk server that it talks to him although it's a little bit hard to realize this from the paper the scheme actually does not really treat the de server particularly especially it's just another external source of packets and place that the replicated state machine may send packets do not very much different from clients okay so the basic scheme is that the we assume that these two replicas the two virtual machines primary and backup are our exact replicas some client you know database client who knows who has some client of our replicated server sends a request to the primary and that really takes the form of a network packet that's what we're talking about that generates an interrupt and this interrupts actually goes to the virtual machine monitor at least in the first instance the virtual machine monitor sees a hot here's the input for this replicated service and so the virtual machine monitor does two things one is it sort of simulates a network packet arrival interrupt into the primary guest operating system to deliver it to the primary copy of the application and in addition the virtual machine monitor you know 
knows that this is an input to a replicated virtual machine and it's so it sends back out on the network a copy of that packet to the backup virtual machine monitor it also gets it and backup virtual machine monitor knows ha it is a packet for this particular replicated state machine and it also fakes a sort of network packet arrival interrupt at the backup and delivers the packet so now both the primary and the back have a copy this packet they looks at the same input you know with a lot of details are gonna process it in the same way and stay synchronized course the service is probably going to reply to the client on the primary the service will generate a reply packet and send it on the NIC that the virtual machine monitor is emulating and then the virtual machine monitor or will we'll see that output packet on the primary they'll actually send the reply back out on the network to the client because the backup is running exactly the same sequence of instructions it also generates a reply packet back to the client and sends that reply packet on its emulated NIC it's the virtual machine monitor that's emulating that network interface card and it says aha you know the virtual machine monitor says I know this was the backup only the primary is allowed to generate output and the virtual machine monitor drops the reply packet so both of them see inputs and only the primary generates outputs as far as terminology goes the paper calls this stream of input events and other things other events we'll talk about from the stream is called the logging Channel it all goes over the same network presumably but these events the primary since the back of our called log events on the log Channel where the fault tolerance comes in is that those the primary crashes what the backup is going to see is that it stops getting stuff on the stops getting log entries a log entry stops getting log entries on the logging channel and we know it it turns out that the backup can expect to get many per second because one of the things that generates log entries is periodic timer interrupts in the in the primary each one of which turns out every interrupt generates a log entries into the backup these timer interrupts are going to happen like 100 times a second so the backups can certainly expect to see a lot of chitchat on the logging Channel if the primaries up if the primary crashes then the virtual machine monitored over here will say gosh you know I haven't received anything on the logging channel for like a second or however long the primary must be dead or or something and in that case when the backup stop seeing log entries from the primary the paper the way the paper freezes it is that the backup goes live and what that means is that it stops waiting for these input events on the logging Channel from the primary and instead this virtual machine monitor just lets this backup execute freely without waiting for without being driven by input events from the primary the vmm does something to the network to cause future client requests to go to the backup instead of the primary and the VMM here stops discarding the backup personnel it's the primary not the backup stops discarding output from this virtual machine so now this or machine directly gets the inputs and there's a lot of produce output and now our backup is taken over and similarly you know that this is less interesting but has to work correctly if the backup fails a similar primary has to use a similar process to abandon the backup stop sending it events and 
just sort of act much more like a single non replicated server so either one of them can go live if the other one appears to be dead stops you know stops generating network traffic magic now it depends you know depends on what the networking technology is I think with the paper one possibility is that this is sitting on Ethernet every physical computer on the Internet or really every NIC has a 48 bit unique ID I'm making this up now the it could be that in fact instead of each physical computer having a unique ID each virtual machine does and when the backup takes over it essentially claims the primary's Ethernet ID as its own and it starts saying you know I'm the owner of that ID and then other people on the ethernet will start sending us packets that's my interpretation the designers believed they had identified all such sources and for each one of them the primary does whatever it is you know executes the random number generator instruction or takes an interrupt at some time the backup does not and the back of virtual machine monitor sort of detects any such instruction and and intercepts that and doesn't do it and he said the backup waits for an event on the logging Channel saying this instruction number you know the random number was whatever it was on the primary Edwige yes yes yeah the paper hints that they got Intel to add features to the microprocessor to support exactly this but they don't say what it was okay okay so on that topic the so far that you know the story is sort of assumed that as long as the backup to sees the package from the clients it'll execute in identically to the primary and that's actually glossing over some huge and important details so one problem is that as a couple of people have mentioned there are some things that are non-deterministic now it's not the case that every single thing that happens in the computer is a deterministic function of the contents of the memory of the computer it is for a sort of straight line code execution often but certainly not always so worried about is things that may happen that are not a strict function of the current state that is that might be different if we're not careful on the primary and backup so these are sort of non-deterministic events that may happen so the designers had to sit down and like figure out what they all work and here are the ones here's the kind of stuff they talked about so one is inputs from external sources like clients which arrive just whenever they arrive right they're not predictable there are no sense in which the time at which a client request arrives or its content is a deterministic function of the services state because it's not so these actually this system is really dedicated to a world in which services only talk over the network and so the only really basically the only form of input or output in this system is supported by this system seems to be network packets coming and going so we didn't put arrives at what that really means it's a packet arrives and what a packet really consists of for us is the data in the packet plus the interrupt that's signaled that the packet had arrived so that's quite important so when a packet arrives I'm ordinarily the NIC DMA is the packet contents into memory and then raises an interrupt which the operating system feels and the interrupt happens at some point in the instruction stream and so both of those have to look identical on the primary and backup or else we're gonna have they're also executions gonna diverge and so you know the real issue is 
when the interrupt occurs: exactly which instruction the interrupt lands on had better be the same on the primary and the backup, otherwise their executions differ and their states diverge. So we care about both the content of the packet and the timing of the interrupt. And then, as a couple of people have mentioned, there are a few instructions that behave differently on different computers or at different times: a random number generator instruction, get-time-of-day instructions that yield different answers when called at different times, unique-ID instructions. Another huge source of non-determinism, which the paper basically rules out, is multi-core parallelism. This is a uniprocessor-only system; there's no multi-core in this world. The reason is that if it allowed multi-core, the service would be running on multiple cores, and the instructions on the different cores are interleaved in some way that is not predictable. If we ran the same parallel code on the primary and on the backup, the hardware would interleave the instructions on the two cores in different ways, and that can cause different results: suppose the code on the two cores both ask for a lock on some data; on the primary, core one may get the lock before core two, while on the backup, just because of a tiny timing difference, core two may get the lock first, and the execution results are likely to be totally different if different threads get the lock. So multi-core is a grim source of non-determinism, just totally outlawed in this paper's world, and as far as I can tell the techniques are not really applicable to it: the service can't use multi-core parallelism. The hardware is almost certainly multi-core, but that's the hardware sitting underneath the virtual machine monitor; the machine that the virtual machine monitor exposes to the guest operating systems running the primary and backup is a uniprocessor machine in this paper, and I'm guessing there's not an easy way for them to adapt this design to multi-core virtual machines. Okay, so these are the events that go over the logging channel. As for the format of a log record, a log entry, they don't quite say, but I'm guessing there are really three things in a log entry. First there's the instruction number at which the event occurred, because if you're delivering an interrupt or an input or whatever, it had better be delivered at exactly the same place in the primary and the backup, so we need to know the instruction number; and by instruction number I mean the number of instructions executed since the machine booted, not the instruction address, so an event that occurs while executing the four billionth instruction gets instruction number four billion. For an interrupt or an input it's the instruction at which the interrupt was delivered on the primary, and for a weird instruction like get-time-of-day it's the instruction number at which that instruction was executed on the primary, so that the backup knows where to cause this event to occur. Then there's a type, say network input or one of these weird instructions, and then there's the data.
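For concreteness, a log entry might contain something like the following; the paper doesn't give the real format, so the Go type below and its field names are invented purely for illustration. The backup's virtual machine monitor buffers entries like this and replays each one at exactly InstructionNum in the backup's execution.

package ft

// Hypothetical sketch of a VMware FT log entry; the paper does not spell out
// the real format, so these names and fields are guesses for illustration only.

type EventType int

const (
	NetworkInput     EventType = iota // an arriving packet, delivered as an interrupt
	TimerInterrupt                    // periodic timer tick
	WeirdInstruction                  // e.g. get-time-of-day, random number, unique ID
)

type LogEntry struct {
	InstructionNum uint64    // instructions executed since boot when the event occurred on the primary
	Type           EventType // which kind of event this is
	Data           []byte    // packet contents, or the result the instruction produced on the primary
}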
For a packet arrival the data is the packet contents; for one of these weird instructions it's the result of the instruction when it was executed on the primary, so that the backup virtual machine can fake the instruction and supply that same result. So as an example: both of these guest operating systems require that the hardware, in this case the emulated hardware of the virtual machine, has a timer that ticks, say, a hundred times a second and causes interrupts to the operating system; that's how the operating system keeps track of time, by counting these timer interrupts. Notice why they have to happen at exactly the same place in the primary and the backup: otherwise the two don't execute the same and they diverge. What really happens is that there's a timer on the physical machine that's running the FT virtual machine monitor. The timer on the physical machine ticks and delivers a timer interrupt up to the virtual machine monitor on the primary. The virtual machine monitor, at the appropriate moment, stops the execution of the primary, writes down the instruction number it was at, instructions since boot, and then simulates an interrupt into the guest operating system in the primary at that instruction number, saying "the timer hardware you're emulating just ticked, here's the interrupt". Then the primary's virtual machine monitor sends the instruction number at which the interrupt happened to the backup. The backup's virtual machine monitor is of course also taking timer interrupts from its own physical timer, but it's not giving those real physical timer interrupts to the backup operating system; it just ignores them. When the log entry for the primary's timer interrupt arrives, the backup virtual machine monitor arranges with the CPU, and this requires special CPU support, to cause the physical machine to interrupt at the same instruction number at which the timer interrupt happened on the primary. At that point the virtual machine monitor gets control back from the guest and fakes the timer interrupt into the backup operating system at exactly the same instruction number as it occurred on the primary. Yes, so the observation is that this relies on the CPU having some special hardware in it, where the VMM can tell the CPU "please interrupt a thousand instructions from now", so that it will interrupt at the right instruction number, the same instruction as on the primary; the VMM then tells the CPU to resume executing the backup, and exactly a thousand instructions later the CPU will force an interrupt into the virtual machine monitor. That's special hardware, but it turns out it's on all Intel chips, so it's not that special anymore, totally normal, and it turns out there are a lot of other uses for it: for example, one way to do CPU time profiling is to have the microprocessor interrupt every thousand instructions, and this is the same hardware that would cause the microprocessor to generate an interrupt every thousand instructions, so it's a very natural sort of gadget to want in your CPU. All right, yes: what if the backup gets ahead of the primary?
we standing above know that oh you know the primary is about to take an interrupt at the millionth instruction but the backup is already you know executed the millionth and first instruction so it's gonna be if we let this happen it's gonna be too late to deliver the interrupts if we let the backup execute ahead of the primary it's going to be too late to deliver the interrupts at the same point in the primary instruction stream and the backup of the instruction stream so we cannot let that happen we cannot let the backup get ahead of the primary in execution and the way VMware ft does that is that the the backup virtual machine monitor it actually keeps a buffer of waiting events that have arrived from the primary and it will not let to the backup execute unless there's at least one event in that buffer and if there's one event in that buffer then it will know from the instruction number the place at which it's got a force the backup to stop executing so always always the backup is executing with the CPU being told exactly where the next stopping point the next instruction number of a stopping point is because the backup only executes if it has a an event here that tells it where to stop next so that means it starts up after the primary because the backup can't even start executing until the primary has generated the first event and that event has arrived at the backup so the backup sort of always one event basically behind the at least one event behind the primary and if it's slower for some other whatever reason maybe there's other stuff running on that physical machine then the backup might get you know multiple events behind at the primary alright there's a one little piece of mess about arriving the specific case of arriving packets ordinarily when a packet arrives from a network interface card if we weren't running a virtual machine the network interface card would DMA the packet content into the memory of the computer that it's attached to sort of as the data arrives from the network interface card and that means you know you should never write software like this but it could be that the operating system that's running on a computer might actually see the data of a packet as its DMA or copied from the network interface card into memory right you know this is and you know we don't know what operating this system is designed so that it can support any operating system and cost maybe there is an operating system that watches arriving packets in memory as they're copied into memory so we can't let that happen because if the primary happens to be playing that trick it's gonna see you know if we allowed the network interface card to directly DMA incoming packets into the memory of the primary the primary we don't have any control over the exact timing of when the network interface card copies data into memory and so we're not going to know sort of at what times the primary did or didn't observe data from the packet arriving and so what that means is that in fact the NIC copies incoming packets into private memory of the virtual machine monitor and then the network interface card interrupts the virtual machine monitor and says oh a packet has arrived at that point the virtual machine monitor will suspend the primary and remember what instruction number had suspended at copy the entire packet into the primaries memory while the primary suspended and not looking at this copy and then emulate a network interface card interrupt into the primary and then send the packet and the instruction 
number to the backup the backup will also suspend the backup rope you know virtual machine monitor will spend the backup at that instruction number copy the entire packet and again to the back-up is guaranteed not to be watching the data arrive and then fakin interrupts at the same instruction numbers of the primary and this is the something the bounce buffer mechanism explained in the paper okay yeah the the only instructions and that result in logging channel traffic or are weird instructions which are rare no its instructions that might yield a different result if executed on the primary and backup like instruction to get the current time of day or current processor number or ask how many instructions have been executed or and those actually turn out to be relatively rare there's also one them to get random tasks when some machines to ask or a hardware generated random number for cryptography or something and but those are not everyday instructions most instructions like add instructions they're gonna get the same result on primary and that go yeah so the way those get replicated on the back up is just by forwarding that's exactly right each network packet just it's packaged up and forwarded as it is as a network packet and is interpreted by the tcp/ip stack on both you know so I'm expecting 99.99% of the logging channel traffic to be incoming packets and only a tiny fraction to be results from special non-deterministic instructions and so we can kind of guess what the traffic load is likely to be for for a server that serves clients basically it's a copy of every client packet and then we'll sort of know what the logging channel how fast the logging channel has to be all right so um so it's worth talking a little bit about how output works and in this system really the only what output basically means only is sending packets that client send requests in as network packets the response goes back out as network packets and there's really no other form of output as I mentioned the you know both primary and backup compute the output packet they want to send and that sort of asks that simulated mix to send the packet it's really sent on the primary and simply discard it the output packet discarded on the backup okay but it turns out is a little more complicated than that so supposing we're what we're running is a some sort of simple database server and the operation the client operation that our database server supports is increment and ideas the client sends an increment requests the database server increments the value and sends back the new value so maybe on the primary well let's say everything's fine so far and the primary backup both have value 10 in memory and that's the current value at the counter and some client on the local area network sends a you know an increment request to the primary that packet is you know delivered to the primary it's you know it's executed the primary server software and the primary says oh you know current values 10 I'm gonna change to 11 and send a you know response packet back to the client saying saying mentioned gonna supposed to be sent to the backup will also be processed here it's going to change this 10 to 11 also generate a reply and we'll throw it away that's what's supposed to happen the output however you also need to ask yourself what happens if there's a failure at an awkward time if you should always in this class should always ask yourself what's the most awkward time to have a failure and what would happen you to failure occurred then so 
suppose the primary does indeed generate the reply back to the client, but the primary crashes just after sending its reply to the client, and furthermore, much worse, this is just a network and it doesn't guarantee to deliver packets, so let's suppose the log entry on the logging channel also got dropped when the primary died. So now the state of play is: the client received a reply saying 11, but the backup did not get the client request, so its state is still 10. Now the backup takes over, because it sees the primary is dead, and this client, or maybe some other client, sends an increment request to the new primary, the old backup, which is now really processing these requests. When it gets the next increment request it's going to change its state to 11 and generate a second 11 response, maybe to the same client, maybe to a different client, and if the clients compare notes, or if it's the same client, that's a result that obviously could not have happened with a single correct server. Because we have to support unmodified software that does not know there's any funny business of replication going on, we do not have the opportunity to change the client to realize something funny happened with the fault tolerance and compensate somehow; this whole system really only makes sense if we're running unmodified software. So this is a disaster; we can't let this happen. Does anybody remember from the paper how they prevent this from happening? The output rule, yeah. The output rule is their solution to this problem, and the idea is that the primary is not allowed to generate any output, and what we're talking about now is this reply, until the backup acknowledges that it has received all log records up to this point. So the real sequence at the primary, and let's now un-crash the primary and go back to both replicas starting at 10, is this when the output rule is in force. At the time the input arrives, the virtual machine monitor sends a copy of the input to the backup, so the log message carrying the input is sent strictly before the primary generates the output. Then, after firing this log entry off across the network, and it's now heading towards the backup but it might be lost, the virtual machine monitor delivers the request to the primary server software, which generates the output: the primary has changed its state to 11 and generated an output packet that says eleven. But the virtual machine monitor says "wait a minute, we're not allowed to release that output until all previous log records have been acknowledged by the backup", and this input is the most recent previous log message. So the output is held by the virtual machine monitor until this log entry containing the input packet from the client is delivered to the backup's virtual machine monitor and buffered by it, though not necessarily executed; the backup may just be waiting to get to that point in the instruction stream. Then the backup's virtual machine monitor sends an ACK packet back saying "yes, I did get that input", and only when the acknowledgment comes back will the primary's virtual machine monitor release the output packet onto the network.
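In code, the ordering the output rule enforces might look something like the toy sketch below; everything in it, the types, channels, and helper names, is invented, since the real mechanism lives inside the virtual machine monitor. The point is only the ordering: ship the input to the backup before the guest sees it, and hold any output until the backup has acknowledged everything sent so far.

package main

import "fmt"

// Toy sketch of the output rule; all names here are invented.
type primary struct {
	logCh   chan []byte // logging channel carrying inputs to the backup
	ackCh   chan int    // acks from the backup: highest log sequence received
	lastSeq int         // sequence number of the last log entry sent
	acked   int         // highest sequence the backup has acknowledged
}

func (p *primary) onClientInput(pkt []byte) {
	p.lastSeq++
	p.logCh <- pkt // forward the input to the backup first...
	// ...then deliver pkt to the guest, which computes a reply.
}

func (p *primary) onGuestOutput(out []byte) {
	// Output rule: hold the output (the guest keeps running) until the
	// backup has acknowledged every log entry sent so far.
	for p.acked < p.lastSeq {
		p.acked = <-p.ackCh
	}
	fmt.Printf("releasing output %q\n", out)
}

func main() {
	p := &primary{logCh: make(chan []byte, 16), ackCh: make(chan int, 16)}
	go func() { // stand-in for the backup: consume log entries and ack them
		seq := 0
		for range p.logCh {
			seq++
			p.ackCh <- seq
		}
	}()
	p.onClientInput([]byte("increment"))
	p.onGuestOutput([]byte("11"))
}

Note that only the release of the output is stalled, not the primary's execution, which is exactly the observation that comes up a little later.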
So the idea is that if the client could have seen the reply, then necessarily the backup must have seen the request and at least buffered it, and so we no longer get this weird situation in which a client can see a reply, but then there's a failure and a cut-over and the replica didn't know anything about that reply. There's also the situation where this log entry was lost and then the primary crashes: since the entry hadn't been delivered, the backup hadn't sent the ack, which means the primary must have crashed before its virtual machine monitor released the output packet, and therefore the client couldn't have gotten the reply and is not in a position to spot any irregularities. So they're really happy with the output rule. On the question: I don't know; the paper doesn't mention how the virtual machine monitor is implemented. It's pretty low-level stuff, because it's sitting there allocating memory and managing page tables and talking to device drivers and intercepting instructions and understanding what instructions the guest was executing, so we're talking about low-level code; traditionally that's written in C or C++, but I don't actually know. Okay, so the primary has to delay at this point, waiting for the backup to say that it's up to date, and this is a real performance thorn in the side of just about every replication scheme, this sort of synchronous wait, where we can't let the primary get too far ahead of the backup, because if the primary failed while it was ahead, the backup would be lagging behind what clients have already seen. Just about every replication system has this problem that at some point the primary has to stall waiting for the backup, and it's a real limit on performance. Even if the machines are side by side in adjacent racks, we're still talking about half a millisecond or something to send messages back and forth while the output is stalled, and if we want to withstand earthquakes or city-wide power failures, the primary and the backup have to be in different cities, probably five milliseconds apart. If we replicate with the two replicas in different cities, every output packet has to first wait those five milliseconds or whatever for the last log entry to get to the backup and for the acknowledgment to come back, and only then can we release the packet. For low-intensity services that's not a problem, but if we're building a database server that, if it weren't for this, could process millions of requests per second, then that's just unbelievably damaging for performance. This is a big reason why people, if they possibly can, use a replication scheme that operates at a higher level and understands the semantics of operations, so it doesn't have to stall on every packet; it could stall on every high-level operation instead, or even notice that read-only operations don't have to stall at all and only writes do. But there has to be an application-level replication scheme to realize that. You're absolutely right: the observation is that you don't have to stall the execution of the primary, you only have to hold the output, and so maybe that's not as bad as
it could be but nevertheless it means that every you know in a service that could otherwise have responded in a couple of microseconds to the client you know if we have to first update the replicas in the next city we turn to you know 10 micro second interaction into it 10 millisecond interactions possibly if you have vast numbers of clients submitting concurrent requests then you may may be able to maintain high throughput even with high latency but you have to be lucky to or very clever designer to get that that's a great idea but if you log in the memory of the primary that log will disappear when the primary crashes or that's usual semantics of a server failing is that you lose everything inside the box like the contents of memory or you know if even if you didn't if the failure is that somebody unplugged the power cable accidentally from the primary even if the primary just has battery backed up RAM or I don't know what you can't get at it all right the backup can't get at it so in fact this system does log the output and the place it logs it is in the memory of the backup and in order to reliably log it there you have to observe the output rule and wait for the acknowledgment so it's entirely correct idea just can't use the primary's memory for it yes say it again that's a clever idea I'd and so the question is maybe input should go to the primary but output should come from the backup I completely haven't thought this through that might work that I don't know that's interesting yeah maybe I will okay one possibility this does expose though is that the situation you know maybe the a primary crashes after its output is released so the client does receive the reply then the primary crashes the backups input is still in this event buffer in the virtual machine monitor of the backup it hasn't been delivered to the actual replicated service when the backup goes live after the crash of the primary the backup first has to consume all of the sort of log records that are lying around that it hasn't consumed yet has to catch up to the primary otherwise it won't take over with the same state so before the backup can go live it actually has to consume all these entries the last entry is presumably is the request from the client so the backup will be live after after it after the interrupt that delivers the request from the client and that means that the backup well you know increment its counter to eleven and then generate an output packet and since it's live at this point it will generate the output packet and the client will get to eleven replies which is also if it if that really happened would be anomalous like possibly not something that could happen if there was only one server the good news is that almost certainly or the almost certainly the client is talking to this service using TCP and that this is the request and the response go back and forth on a TCP Channel the when the backup takes over the backup since the state is identical to the primaries it knows all about that TCP connection and whether all the sequence numbers are and whatnot and when it generates this packet it will generate it with the same TCP sequence number as an original packet and the TCP stack on the client will say oh wait a minute that's a duplicate packet we'll discard the duplicate packet at the TCP level and the user level software will just never see this duplicate and so this system really you know you can view this as a kind of accidental or clever trick but the fact is for any replication system where 
cutover can happen which is to say pretty much any replication system it's essentially impossible to design them in a way that they are guaranteed not to generate duplicate output basically you know you well you can err on either side I'm not even either not generate the output at all which would be bad which would be terrible or you can generate the output twice on a cutover that's basically no way to generate it guaranteed generated only once everybody errors on the side of possibly generating duplicate output and that means that at some level you know the client side of all replication schemes need some sort of duplicate detection scheme here we get to use TCP s that we didn't have TCP that would have to be something else maybe application level sequence numbers or I don't know what and you'll see all of this and actually you'll see versions of essentially everything I've talked about like the output rule for example in labs 2 & 3 you'll design your own replicated state machine yes yes to the first part so the scenario is the primary sends the reply and then either the primary send the close packet or the client closes the connect the TCP connection after it receives the primary's reply so now there's like no connection on the client side but there is a connection on the backup side and so now the backup so the backup consumes the very last log entry that as the input is now live so we're not responsible for replicating anything at this point right because the backup now live there's no other replica as the primary died so there's no like if if we don't if the backup fails to execute in log step with the primary that's fine actually because the primary is is dead and we do not want to execute in log step with it okay so the primer is now not it's live it generates an output on this TCP connection that isn't closed yet from the backup point of view this packet arrives with the client on a TCP connection that doesn't exist anymore from the clients point of view like no big whoopee on the client right he's just going to throw away the packet as if nothing happened the application won't no the client may send a reset something like a TCP error or whatever packet back to the backup and the backup does something or other with it but it doesn't matter because we're not diverging from anything because there's no primary to diverge from you can just handle a stray we said however it likes and what it'll in fact do is basically ignore but there's no now the backup has gone live there's just no we don't owe anybody anything as far as replication yeah well you can bet since the backups memory image is identical to the primaries image that they're sending packets with the very same source TCP number and they're very same everything they're sending bit for bit identical packets you know at this level the server's don't have IP addresses or for our purposes the virtual machines you know the primary in the back up virtual machines have IP addresses but the the physical computer and the vmm are transparent to the network it's not entirely true but it's basically the case that the virtual machine monitor in the physical machine don't really have identity of their own on the network because you can configure that then that way instead these they're not you know the virtual machine with a sewing operating system in its own TCP stack it doesn't IP address underneath there an address and all this other stuff which is identical between the primary in the backup and when it sends a packet it sends it with the 
virtual machine's IP address and Ethernet address, and those bits, at least in my mental model, are simply passed through onto the local area network, which is exactly what we want. So I think the backup does generate exactly the same packets the primary would have generated. There's maybe a little bit of trickery: if these physical machines are actually plugged into two different ports of an Ethernet switch, we'd like the Ethernet switch to change its mind about which of the two machines it delivers packets for the replicated service's Ethernet address to, so there's a little bit of funny business there, but for the most part they're just generating identical packets, so the switch can just send them out. Okay, so another little detail I've been glossing over is that I've been assuming the primary just fails or the backup just fails, that is, fail-stop, but that's not the only option. Another very common situation that has to be dealt with is that the two machines are still up and running and executing, but something funny happened on the network that causes them not to be able to talk to each other while still being able to talk to some clients. If that happened, if the primary and backup couldn't talk to each other but could still talk to the clients, they would both think "the other replica is dead, I'd better take over and go live", and now we have two machines going live with this service. They're no longer sending each other log events or anything; they're just diverging, maybe accepting different client inputs and changing their state in different ways. So we have a split-brain disaster if we let both the primary and the backup go live because it was the network that had some kind of failure instead of these machines. The way this paper solves it is by appealing to an outside authority to make the decision about which of the primary and the backup is allowed to be live. It turns out that their storage is actually not on local disk; this almost doesn't matter, but their storage is on some external disk server, and as well as serving disks, that server happens to export, as a totally separate service that has nothing to do with disks, a test-and-set service over the network: you can send it a test-and-set request, there's some flag it keeps in memory, and it will set the flag and return what the old value was. Both primary and backup have to acquire this test-and-set flag, which is a little bit like a lock, in order to go live. They may both send test-and-set requests at the same time to this test-and-set server; the first one gets back a reply that says "the flag used to be zero, now it's one", and for the second request to arrive the response from the test-and-set server is "actually the flag was already one when your request arrived, so basically you're not allowed to be primary". So this test-and-set server, and we can think of it as a single machine, is the arbitrator that decides which of the two should go live if they both think the other one is dead due to a network partition.
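A toy version of that arbiter, written in the RPC-handler style the labs use, might look like the sketch below; the paper's real test-and-set lives on their shared disk server, so the interface and names here are invented purely for illustration. Both the primary's and the backup's virtual machine monitors would call TestAndSet when they want to go live, and only the caller that sees OldValue == false is allowed to proceed.

package arbiter

import "sync"

// Toy test-and-set arbiter, invented for illustration; the paper's real
// test-and-set service is part of its shared disk server.

type TestAndSetArgs struct{}

type TestAndSetReply struct {
	OldValue bool // what the flag was before this request arrived
}

type Arbiter struct {
	mu   sync.Mutex
	flag bool
}

// TestAndSet atomically sets the flag and reports its previous value. The
// first replica to call it sees OldValue == false and may go live; any later
// caller sees OldValue == true and must not.
func (a *Arbiter) TestAndSet(args *TestAndSetArgs, reply *TestAndSetReply) error {
	a.mu.Lock()
	defer a.mu.Unlock()
	reply.OldValue = a.flag
	a.flag = true
	return nil
}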
Any questions about this mechanism? Yeah, what if the test-and-set server is dead at the critical moment? Then you're busted. And actually, even if there's no network partition, in every circumstance in which one or the other of these wants to go live because it thinks the other is dead, even when the other one really is dead, the one that wants to go live still has to acquire the test-and-set lock, because one of the deep rules of the 6.824 game is that you cannot tell whether another computer is dead or not; all you know is that you stopped receiving packets from it, and you don't know whether that's because the other computer is dead or because something has gone wrong with the network between you and the other computer. So all the backup sees is "well, I've stopped getting packets; maybe the primary is dead, maybe it's alive", and the primary probably sees the same thing. If there's a network partition they certainly have to ask the test-and-set server, but since they can't know whether it's a network partition, they have to ask the test-and-set server regardless of whether it's a partition or not. So any time either wants to go live, the test-and-set server also has to be alive, because they always have to acquire this test-and-set lock. That makes the test-and-set server sound like a single point of failure: we were trying to build a replicated, fault-tolerant thing, but in the end we can't fail over unless this server is alive, which is a bit of a bummer. I'm making a strong guess, though, that the test-and-set server is actually itself a replicated, fault-tolerant service; these are the people at VMware, who are happy to sell you a million-dollar highly available storage system that uses enormous amounts of replication internally, and since the test-and-set thing is on their disk server, I'm guessing it's replicated too. And the stuff you'll be doing in lab 2 and lab 3 is more than powerful enough for you to build your own fault-tolerant test-and-set server, so this problem can easily be eliminated. -------------------------------------------------------------------------------- /lec05/Lec5.en.txt: -------------------------------------------------------------------------------- 1 | Today the TAs are going to be giving a lecture on concurrency and Go. Basically this lecture is going to be full of design patterns and practical tips to help you with the labs. We're going to briefly cover the Go memory model (the reading we went over), then spend most of the lecture talking about concurrency primitives and concurrency patterns in Go, how you do the things you will need to do in the labs, and finally we'll talk through some debugging tips and techniques and show you some interesting tools you might want to use when debugging the labs. So, very briefly, on the Go memory model reading: why did we assign this reading? The goal was to give you some concrete examples of correct ways to write threaded code in Go. The second half of the document has some examples of correct code and incorrect code and how it can go wrong. One thing you might have noticed is that early on it says "if you need to read and understand this, you're being too clever", and we think that's good advice: focus on how to write correct code, don't focus too much on the happens-before relation and being able to reason about exactly why incorrect code isn't correct; we just want to be able to write correct code and call it a day. One question that came up in the lecture questions was about goroutines in relation to performance, and we just wanted to say that goroutines, and concurrency in general, can be used for a couple of different reasons, and the reason we use concurrency in the labs is not necessarily for
performance; we're not going for parallelism, using multiple cores on a single machine to do more work on the CPU. Concurrency gets us something else besides performance through parallelism: it can get us better expressivity. We want to write down some ideas, and it happens that code using threads is a clean way of expressing those ideas. So the takeaway is: when you use threads in lab 2 and beyond, don't try to do the fancy things you might do if you were going for performance, especially CPU performance; we don't care about things like fine-grained locking or other such techniques. Write code that's easy to reason about, use big locks to protect large critical sections, and just don't worry about performance in the sense of CPU performance. With that, that's all we're going to say about the memory model; we'll spend most of this lecture just talking about Go code and Go concurrency patterns, and as we go through these examples, feel free to ask any questions about what's on the screen or anything else you might think about. So I'm going to start off talking about concurrency primitives in Go. The first thing is closures. This is something that will almost certainly be helpful in the labs, and it's related to goroutines. Here's this example program on the screen: the main function declares a bunch of variables and then spawns a goroutine with this go statement, and we notice that the go statement is not taking as its argument a call to some function defined elsewhere, but an anonymous function defined inline here. This is a handy pattern; it's something called a closure, and one neat thing about it is that the function defined here can refer to variables from the enclosing scope. For example, it can mutate this variable a that's defined up here, or refer to this wait group that's defined up here. If we go run this example, it does what you think it does: the wait group's Done here lets the main thread continue past the Wait, and it prints out this variable, which has been mutated by the concurrently running thread that finished before the Wait happened. So this is a useful pattern to be able to use. The reason we're pointing this out is because you might have code that looks like this in your labs, very similar to the previous example, except this code spawns a bunch of threads in a loop. This is useful, for example, when you want to send RPCs in parallel: in lab 2, if you have a candidate asking for votes, you want to ask all the followers in parallel, not one after the other, because an RPC is a blocking operation that might take some time; similarly, the leader might want to send AppendEntries to all the followers in parallel, not in series. Threads are a clean way to express this idea, and so you might have code that looks kind of like this at a high level: in a for loop you spawn a bunch of goroutines. One thing to be careful about here, something that was talked about in a previous lecture, is identifier capture in goroutines when that identifier is mutated in the outer scope. We have this i that's being mutated by the for loop, and we want to use its value inside the goroutine; the correct way of writing this code is to pass the value i as an argument to the function (you can rename it to x inside) and then use that value inside. The correct pattern looks something like the sketch below.
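A reconstruction of that pattern; this is not the exact code from the screen, and the sendRPC stub is invented just to stand in for a real RPC call:

package main

import (
	"fmt"
	"sync"
)

// Spawn one goroutine per peer, passing the loop variable in as an argument
// so each goroutine gets its own copy.

func sendRPC(i int) {
	fmt.Println(i) // stand-in for sending a real RPC to peer i
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(x int) {
			defer wg.Done()
			sendRPC(x) // uses the copy passed in, not the shared loop variable
		}(i)
	}
	wg.Wait()
}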
If we run this program (here I've stubbed out the send-RPC call to just print the index; this i might be the index of the follower we're trying to send an RPC to), it prints the numbers 0 through 4 in some order. That's what we want: send RPCs to all the followers. The reason we're showing you this code is because there's a variation of it that looks really similar, and intuitively you might think it does the right thing, but in fact it doesn't. In that version the only thing that's changed is we've gotten rid of the argument we were explicitly passing and instead let the i inside the goroutine refer to the i from the outer scope. You might think that when you run this it does the same thing, but in this particular run it printed 4 5 5 5 5, so it does the wrong thing. The reason is that this i is being mutated by the outer scope, and by the time the goroutine actually executes this line, the for loop has already changed the value of i. So at a high level, if you're spawning goroutines in a loop, just make sure you use the pattern above and everything will work right. Any questions about that? It's just a small gotcha, but we've seen it a whole bunch of times in office hours, so I wanted to point it out. All right, moving on to other patterns you might want to use in your code. Oftentimes you want code that periodically does something; a very simple way to do that is to have a separate function that, in an infinite loop, does something (in this case just printing out "tick") and then uses time.Sleep to wait for a certain amount of time. Very simple pattern; you don't need anything fancier than this to do something periodically. One modification you might want is to do something periodically until something happens. For example, you might start up a Raft instance and periodically send heartbeats, but when Kill is called on the Raft instance you want to actually shut down all these goroutines so you don't have random goroutines still running in the background. The pattern for that looks something like this: you have a goroutine that runs in an infinite loop, does something, and then waits for a little bit, and you have a shared variable between it and whatever control thread decides whether this goroutine should die. In this example we have a variable done that's a global variable; main waits for a while and sets done to true, and in the goroutine that's ticking and doing work periodically we just check the value of done, and if done is set we terminate the goroutine. Since done is a shared variable mutated and read by multiple threads, we need to guard its use with a lock, and that's where the mu.Lock and mu.Unlock come in. For the purposes of the labs you can actually write something a little simpler than this: you have the rf.killed() method on your Raft instance, so your code can look more like "while my Raft instance is not dead, periodically do some work". Any questions about that so far? Yeah, question: does using the locking mechanisms, or channels, make it so that writes to variables in those functions are guaranteed to be observed by the other thread, or would you need to send done across
the channel? Okay, so let me try to simplify the question a bit. I think the question is: do you need to use locks here, can you use channels instead, can you get away with not using locks, and what's the difference between nothing versus channels versus locks? Or more specifically: does this done not need to be sent across a channel; does just using these locks ensure that this read here observes the write done by the other thread? The answer is yes, basically. At a high level, if you want to ensure cross-thread communication, make sure you use Go synchronization primitives, whether that's channels or locks and condition variables. Here, because of the use of locks, after this thread writes done and calls Unlock, the next Lock that happens is guaranteed to observe the writes done before that Unlock: the write happened, the Unlock happened, then one of these Locks happens, and the next read of done is guaranteed to observe that write of true. Question? That's a good question: why don't we do mu.Unlock here before returning? The answer is that in this particular code it doesn't matter, because the program is done at that point, but you're right that in general we would want to ensure that we unlock before we return; thanks for pointing that out. Next question: I'm not entirely sure what the question is, but maybe it's "can both of these acquire the lock at the same time?" We'll talk more about locks in a moment, but at a high level the semantics of a lock are that the lock is either held by somebody or not. If it's not held and someone calls Lock, they have the chance to acquire the lock, and if somebody else calls Lock before the holder calls Unlock, that other thread is blocked until the Unlock happens and the lock is free again. So between the Lock and the Unlock for any particular lock, only a single thread can be executing what's called the critical section. Any other questions? So this question is about timing: when you set done to true and then unlock, you have no guarantee in terms of real time about when periodic will end up being scheduled, observe that write, and actually terminate. Yes; if you want to actually ensure that periodic has exited, for some particular reason, you could write some code that communicates back from periodic acknowledging this, but in this particular case the only reason we have the sleep is just to demonstrate that tick prints for a while and then periodic is indeed cancelled, because it stops printing before I get my shell prompt back. In general, for a lot of these background threads, you can just say you want to kill them, and it doesn't matter whether they're killed within one second or two seconds or exactly when Go schedules them, because the thread is just going to observe the write to done and then exit and do no more work. Also, another thing in Go is that if you spawn a bunch of goroutines, one of them is the main goroutine, this one here, and the way Go works is that if the main goroutine exits, the whole program terminates and all goroutines are terminated.
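For reference, the done-flag pattern we've been discussing might be reconstructed like this; the variable names are mine rather than the original slide's, and in the labs you would usually loop on your Raft instance's killed check instead of a hand-rolled flag:

package main

import (
	"fmt"
	"sync"
	"time"
)

var (
	mu   sync.Mutex
	done bool
)

// periodic does some work (here, just printing "tick") until another thread
// sets done under the lock.
func periodic() {
	for {
		fmt.Println("tick")
		time.Sleep(100 * time.Millisecond)
		mu.Lock()
		if done {
			mu.Unlock()
			return
		}
		mu.Unlock()
	}
}

func main() {
	go periodic()
	time.Sleep(1 * time.Second)
	mu.Lock()
	done = true
	mu.Unlock()
	time.Sleep(300 * time.Millisecond) // give periodic a chance to notice
	fmt.Println("main: exiting")
}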
That's a great question. Okay, so I think the question is something like: why do you need locks at all, can you just delete all the locks? Looking at this code, it looks like main does a write of true at some point and periodic is repeatedly reading it, so at some point it should observe that write, right? Well, it turns out this is why Go has this fancy memory model and this whole happens-before relation: the compiler is allowed to take this code and emit low-level machine code that does something a little different from what you intuitively thought would happen. We can talk about that in detail offline after the lecture or in office hours, but at a high level one rule you can follow is: if you have accesses to shared variables and you want them to be observed across different threads, you need to be holding a lock before you read or write those shared variables. In this particular case I think the Go compiler would be allowed to optimize this by lifting the read of done outside the for loop, reading the shared variable once, and if done is false, making the inside an infinite loop, because the way this thread would then be written, it uses no synchronization primitives, no mutex Lock or Unlock, no channel sends or receives, so it's not guaranteed to observe any mutations done by other concurrently running threads. If you look on Piazza, I've actually written a particular Go program that is optimized in this unintuitive way: it produces code that does an infinite loop even though, looking at it, you might think the obvious way to compile it would produce something that terminates. So the memory model is pretty fancy, and it's really hard to think about why exactly incorrect programs are incorrect, but if you follow some general rules, like hold locks before you mutate shared variables, you can avoid thinking about some of these nasty issues. Any other questions? All right, so let's talk a little more about mutexes now. Why do you need mutexes? At a high level, whenever you have concurrent access by different threads to some shared data, you want to ensure that reads and writes of that data are atomic. Here's an example program that declares a counter and then spawns a thousand goroutines that each increment the counter by one. You might think, looking at this intuitively, that when I print out the value of the counter at the end it should print a thousand, but it turns out we miss some of the updates, and in this particular run it only printed 947. What's going on is that the update is not protected in any way, so these concurrently running threads can read the value of counter, update it, and clobber other threads' updates. Basically we want to ensure that this entire section happens atomically, and the way you make blocks of code run atomically is by using locks. In the fixed version of this code we create a lock, and all the goroutines that modify the counter first grab the lock, then update the counter, then unlock. We're using the defer keyword here; what it does is basically the same as putting the unlock at the end, so we grab the lock, do the update, then unlock, and defer is just a nice way of remembering to do this, since you might forget to write the unlock later. You can think of defer as scheduling the unlock to run at the end of the current function body.
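A reconstruction of the fixed counter example (again, not the exact code from the screen):

package main

import (
	"fmt"
	"sync"
	"time"
)

// A thousand goroutines each increment a shared counter; holding the mutex
// around the read-modify-write means no updates are lost.
func main() {
	var mu sync.Mutex
	counter := 0
	for i := 0; i < 1000; i++ {
		go func() {
			mu.Lock()
			defer mu.Unlock()
			counter++
		}()
	}
	time.Sleep(1 * time.Second) // crude; see the caveat about this sleep below
	mu.Lock()
	fmt.Println(counter) // 1000 in this run
	mu.Unlock()
}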
And so this is a really common pattern you'll see, for example, in your RPC handlers for the lab. Oftentimes RPC handlers will read or write data on the Raft structure, and those updates should be synchronized with other concurrently happening updates, so the pattern for RPC handlers is often: grab the lock, defer the unlock, and then go do some work inside. So we can see, if we run this code, it produces the expected result: it prints out a thousand, and we haven't lost any of these updates. So at a high level, what a lock or a mutex can do is guarantee mutual exclusion for a region of code, which we call a critical section — in here, this is the critical section — and it ensures that none of these critical sections execute concurrently with one another; they're all serialized, happening one after another. Question? Yes, so this is a good observation: this particular code is actually not guaranteed to produce a thousand, depending on how thread scheduling ends up happening, because all the main goroutine does is wait for one second — which is some arbitrary amount of time — and then print out the value of the counter. I just wanted to keep this example as simple as possible. A different way to write this code that would be guaranteed to print a thousand would be to have the main goroutine wait for all these thousand threads to finish; you could do this using a wait group, for example, but we didn't want to put two synchronization primitives, like wait groups and mutexes, in the same example, so that's why we have this code that is technically incorrect, but I think it still demonstrates the point of locks. Any other questions? Great, so at a very high level you can think of locks as: you grab the lock, you mutate the shared data, and then you unlock. So does this pattern always work? Well, it turns out that that's a useful starting point for how to think about locks, but it's not really the complete story. So here's some code — it doesn't fit on the screen, but I'll explain it to you, we can scroll through it — it basically implements a bank at a high level. I have Alice and Bob, who both start out with some balances, and I keep track of the total balance, the total amount of money I store in my bank, and then I'm going to spawn two goroutines that will transfer money back and forth between Alice and Bob. So this one goroutine, a thousand times, will deduct one from Alice and send it to Bob, and concurrently I have this other goroutine that in a loop will deduct one from Bob and send it to Alice. And notice that I have this mutex here, and whenever I manipulate these shared variables between these two different threads I'm always locking the mutex — this update only happens while this lock is held, right? And so, is this code correct or incorrect? There actually isn't really a straightforward answer to that question; it depends on what the semantics of my bank are, like what behavior I expect. So I'm going to introduce another thread here — I'll call this one the audit thread — and what it's going to do is, every once in a while, check the sum of all the accounts in my bank and make sure that the sum is the same as what it started out as, right, because if I only allow transfers within my bank the total amount should never change. So now, given this other thread: what it does is grab the lock, then sum up Alice plus Bob and compare it to the total, and if it doesn't match, then it reports that it has observed a violation — that the total is no longer what it should be.
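A rough sketch of the bank example as described — transfers that lock around each individual update, plus an audit goroutine checking the invariant. The names and amounts (alice, bob, 10000) are illustrative, not the exact lecture code:

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    alice, bob := 10000, 10000
    var mu sync.Mutex
    total := alice + bob

    go func() { // repeatedly move 1 from alice to bob
        for i := 0; i < 1000; i++ {
            mu.Lock()
            alice -= 1
            mu.Unlock()
            mu.Lock()
            bob += 1
            mu.Unlock()
        }
    }()
    go func() { // repeatedly move 1 from bob to alice
        for i := 0; i < 1000; i++ {
            mu.Lock()
            bob -= 1
            mu.Unlock()
            mu.Lock()
            alice += 1
            mu.Unlock()
        }
    }()

    go func() { // audit thread: check the invariant alice+bob == total
        for {
            mu.Lock()
            if alice+bob != total {
                fmt.Printf("observed violation: alice = %v, bob = %v, sum = %v\n",
                    alice, bob, alice+bob)
            }
            mu.Unlock()
        }
    }()

    time.Sleep(1 * time.Second)
}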
If I run this code, I actually see that a whole bunch of times this concurrently running thread does indeed observe that Alice plus Bob is not equal to the overall sum. So what went wrong here? We're following our basic rule of grabbing a lock whenever we're accessing data that's shared between threads, and it is indeed true that no updates to these shared variables happen while the lock is not held. Exactly — so let me repeat that for everybody to hear: what we intended here was for this decrement and increment to happen atomically, but instead what we ended up writing was code that decrements atomically and then increments atomically. And so in this particular code we actually won't lose money in the long term — if we let these threads run, wait till they finish, and then check the total, it will indeed be what it started out as — but while these are running, since this entire block of code is not atomic, we can temporarily observe these violations. And so, at a higher level, the way you should think about locking is not just that locks protect access to shared data, but that locks are meant to protect invariants. You have some shared data that multiple threads might access, and there are some properties that hold on that shared data — for example, here I, as the programmer, decided that I want the property that Alice plus Bob should equal some constant, and I want that property to always hold. But then it may be the case that different threads running concurrently are making changes to this data and might temporarily break this invariant — like here, when I decrement from Alice, temporarily the sum Alice plus Bob has changed — but this thread eventually ends up restoring the invariant. And so locks are meant to protect invariants: at a high level, you grab a lock, then you do some work that might temporarily break the invariant, but you restore the invariant before you release the lock, so nobody can observe these in-progress updates. And so the correct way to write this code is to actually have fewer uses of lock and unlock: we have lock, then we do a bunch of work, and then we unlock. And when you run this code, we see no more printouts like this — we never have the audit thread observe that the total is not what it should be. All right, so that's the right way to think about locking at kind of a high level. You can think about it as: make sure you grab locks whenever you access shared data — that is a rule — but another important rule is that locks protect invariants, so grab a lock, manipulate things in a way that might break the invariants, but restore them afterwards, and then release the lock. Another way you can think about it is that locks can make regions of code atomic, not just single statements or single updates to shared variables. Any questions about that? Great, so the next synchronization primitive we're going to talk about is something called condition variables, and it seems like this has been a source of confusion from lab one, where we mentioned condition variables but didn't quite explain them, so we're going to take the time to explain them to you now, and we're going to do that in the context of an example that you should all be familiar with: counting votes. So remember, in lab 2A you have this pattern where, whenever a Raft peer becomes a candidate, it wants to send out vote requests to all of its followers, and eventually the followers come back to the candidate and say yes or no — whether or not the candidate got the vote.
Right, and one way we could write this code is to have the candidate serially ask peer number one, peer number two, peer number three, and so on, but that's bad, because we want the candidate to ask all the peers in parallel so it can quickly win the election when possible. And then there are some other complexities: when we ask all the peers in parallel, we don't want to wait until we get a response from all of them before making up our mind, because if a candidate gets a majority of votes, it doesn't need to wait till it hears back from everybody else. So this code is kind of complicated in some ways, and here's a kind of stubbed-out version of what that vote counting code might look like, with a little bit of infrastructure to make it actually run. So here I have this main goroutine that sets count — which is the number of yes votes I've got — to zero, and finished to zero; finished is the number of responses I've gotten in total. The idea is that I want to send out vote requests in parallel and keep track of how many yeses I've got and how many responses I've gotten in general, and then once I know whether I've won the election or whether I've lost it, I can determine that and move on — in the real Raft code you'd then do whatever you need to do, like step up to leader or step down to follower, after you have the result from this. And so, looking at this code, I'm going to spawn — say I have ten peers — ten goroutines in parallel. I pass in this closure here, and what it's gonna do is request a vote, and then if I get the vote I'm going to increment count by one, and I'm also going to increment finished by one — so this is the number of yeses, and this is the total number of responses I've gotten. And then outside, in the main goroutine, what I'm doing is keeping track of this condition; I'm waiting for this condition to become true: either I have enough yes votes that I've won the election, or I've heard back from enough peers and I know that I've lost. So I'm just going to check in a loop and wait until count is greater than or equal to five, or until finished is equal to ten, and then after that's the case I can determine whether I've won or lost. So, does anybody see any problems with this code, given what we just talked about with mutexes? Yes — yeah, exactly, count and finished aren't protected by mutexes. So one thing we certainly need to fix here is that whenever we have shared variables, we need to protect access with mutexes, and that's not too bad to fix: I declare a mutex that's accessible by everybody, and then in the goroutines I'm launching in parallel to request votes — and this pattern here is pretty important — I'm going to first request a vote while I'm not holding the lock, and then after that I'm going to grab the lock and update these shared variables. And then outside I have the same pattern as before, except I make sure to lock and unlock around reading these shared variables: so in an infinite loop I grab the lock and check to see if the result of the election has been determined by this point, and if not I keep running in this infinite loop; otherwise I unlock and then do what I need to do outside of here. And so if I run this example — whoops — it seems to work, and this is actually a correct implementation, it does the right thing, but there are some problems with it.
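The vote-counting code just described, sketched roughly — ten goroutines requesting votes in parallel, shared count and finished protected by a mutex, and a polling loop in the main goroutine. requestVote here is a stand-in for the real RPC (illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "math/rand"
    "sync"
    "time"
)

func main() {
    count := 0    // number of yes votes
    finished := 0 // number of responses received
    var mu sync.Mutex

    for i := 0; i < 10; i++ {
        go func() {
            vote := requestVote() // the slow part runs without holding the lock
            mu.Lock()
            defer mu.Unlock()
            if vote {
                count++
            }
            finished++
        }()
    }

    for { // keep checking until the outcome is known
        mu.Lock()
        if count >= 5 || finished == 10 {
            break
        }
        mu.Unlock()
    }
    if count >= 5 {
        fmt.Println("received 5+ votes!")
    } else {
        fmt.Println("lost")
    }
    mu.Unlock()
}

func requestVote() bool {
    time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond)
    return rand.Int()%2 == 0
}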
So can anybody recognize any problems with this implementation? I'll give you a hint: this code is not as nice as it could be. So, not quite — it is going to wait for exactly the right amount of time; the issue here is that it's busy waiting. What it's doing is, in a very tight loop, grabbing the lock, checking this condition, unlocking, grabbing the lock, checking this condition, unlocking — and it's going to burn up 100% CPU on one core while it's doing this. So this code is correct, but — at a high level we don't care about CPU efficiency for the purpose of the labs, but if you're using a hundred percent of one core, you might actually slow down the rest of your program enough that it won't make progress — that's why this pattern is bad, that we're burning up a hundred percent CPU waiting for some condition to become true. Right, so does anybody have any ideas for how we could fix this? So here's one simple solution: I will change a single line of code. All I've added here is: wait for 50 milliseconds. And so this is a correct transformation of that program, and it kind of seems to solve the problem, right? Before, I was burning up a hundred percent CPU; now, only once every 50 milliseconds, I'm going to briefly wake up, check this condition, and go back to sleep if it doesn't hold. So this is basically a working solution. Any questions? So this kind of sort of works, but one thing you should always be wary of whenever you write code is magic constants: why is this 50 milliseconds, why not a different number? Whenever you have an arbitrary number in your code, it's a sign that you're doing something that's not quite right, or not quite as clean as it could be. And it turns out that there's a concurrency primitive designed to solve exactly this problem: I have some threads running concurrently that are making updates to some shared data, and I have another thread that's waiting for some property, some condition on that shared data, to become true, and until that condition becomes true, the thread is just going to wait. There's a tool designed exactly to solve this problem, and that tool is called a condition variable. And the way you use a condition variable — the pattern basically looks like this. We have our lock from earlier; condition variables are associated with locks, so we have some shared data, a lock that protects that shared data, and then we have this condition variable that is given a pointer to the lock when it's initialized, and we're going to use this condition variable for coordinating around when a certain condition — some property on that shared data — becomes true. And the way we modify our code is: we have two places, one where we're making changes to that data, which might make the condition become true, and another place where we're waiting for that condition to become true. The general pattern is: whenever we do something that changes the data, we call cond dot Broadcast, and we do this while holding the lock; and on the other side, where we're waiting for some condition on that shared data to become true, we call cond dot Wait. So what this does is — let's think about what happens in the main thread for a moment. The main thread grabs the lock, it checks this condition, suppose it's false, it calls cond dot Wait. What this will do is, atomically, you can think of it as: it'll release the lock, in order to let other threads make progress, and it'll add itself to a list of threads that are waiting on this condition variable.
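Concretely, the vote-counting example rewritten with a condition variable has roughly this shape — the polling loop becomes a Wait, and each vote-handling goroutine calls Broadcast after updating the shared counters (requestVote as in the earlier sketch; illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "math/rand"
    "sync"
    "time"
)

func main() {
    count := 0
    finished := 0
    var mu sync.Mutex
    cond := sync.NewCond(&mu)

    for i := 0; i < 10; i++ {
        go func() {
            vote := requestVote()
            mu.Lock()
            defer mu.Unlock()
            if vote {
                count++
            }
            finished++
            cond.Broadcast() // wake up the waiter; called while holding the lock
        }()
    }

    mu.Lock()
    for count < 5 && finished != 10 {
        cond.Wait() // atomically releases mu and sleeps; reacquires mu before returning
    }
    if count >= 5 {
        fmt.Println("received 5+ votes!")
    } else {
        fmt.Println("lost")
    }
    mu.Unlock()
}

func requestVote() bool {
    time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond)
    return rand.Int()%2 == 0
}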
Then, concurrently, one of these threads might be able to acquire the lock after it's gotten a vote, and then it manipulates these variables and calls cond dot Broadcast. What that does is wake up whoever's waiting on the condition variable, and so once this thread unlocks the mutex, this one over here, as it's returning from Wait, will reacquire the mutex and then return to the top of this for loop, which is checking this condition. So this Broadcast wakes up whoever's waiting at this Wait, and this avoids having to have that time dot Sleep for some arbitrary amount of time: this thread that's waiting for some condition to become true only gets woken up when something changes that might make that condition become true. If you think about these threads — if they're very slow and they don't call cond dot Broadcast for a long time, this one will just be waiting; it won't be periodically waking up and checking some condition that can't have changed, because nobody else manipulated the shared data. So, any questions about this pattern? Yeah, so that's a great question — I think you're referring to something called the lost wakeup problem, and this is a topic in operating systems, and we won't talk about it in detail now, but feel free to ask me after lecture. At a high level, you can avoid funny race conditions that might happen between Wait and Broadcast by following the particular pattern I'm showing here, and I'll show you an abstracted version of this pattern in a moment. Basically, the pattern is: for the side that might make changes that will change the outcome of the condition test, you always lock, then manipulate the data, then call Broadcast, and call Unlock afterwards — so the Broadcast must be called while holding the lock. Similarly, when you're checking the condition, you grab the lock, and then you always check the condition in a loop, and inside, when that condition is false, you call cond dot Wait; this is only called while you're holding the lock, and it atomically releases the lock and puts itself in a list of waiting threads, and then, as Wait is returning — as we return from this Wait call and go back to the top of this for loop — it will reacquire the lock, so this check only happens while holding the lock. And outside of this, we still have the lock, and we unlock after we're done doing whatever we need to do here. At a high level, this pattern looks like this: we have one thread, or some number of threads, doing something that might affect the condition, so they grab a lock, do the thing, call Broadcast, then call Unlock; and on the other side we have some thread that's waiting for some condition to become true, and the pattern there looks like: we grab the lock, then in a while loop, while the condition is false, we wait, and so we know that when we get past this while loop, the condition is true and we're holding the lock, and we can do whatever we need to do here, and then finally we call Unlock. So we can talk about all the things that might go wrong if you violate one of these rules after lecture, if you're interested, but at a high level, if you follow this pattern, then you won't need to deal with those issues. So, any questions about that? Yeah, so that's a great question: when do you use Broadcast versus when do you use Signal? So condition variables have three methods on them: one is Wait, for the waiting side, and then on the other side you can use Signal or Broadcast.
The semantics of those are: Signal wakes up exactly one thread that may be waiting, whereas Broadcast wakes up everybody who's waiting, and they'll all try to grab the lock and recheck the condition, and only one of them will proceed at a time, because only one of them will hold the lock until it gets past this point. I think for the purpose of this class: always use Broadcast, never use Signal. If you follow this pattern and just don't use Signal and always use Broadcast, your code will work. You can think of Signal as something used for efficiency, and we don't really care about that level of CPU efficiency in the labs for this class. Any more questions? Okay, so the final topic we're going to cover in terms of Go concurrency primitives is channels. So at a high level, channels are a queue-like synchronization primitive, but they don't behave quite like queues in the intuitive sense. I think some people think of channels as: there's this data structure you can stick things into, and eventually someone will pull those things out. But in fact channels have no queuing capacity, they have no internal storage; basically, channels are synchronous. If you have two goroutines that are going to send and receive on a channel, and someone tries to send on the channel while nobody's receiving, that thread will block until somebody's ready to receive, and at that point, synchronously, it will hand that data over to the receiver. And the same is true in the other direction: if someone tries to receive from a channel while nobody's sending, that receive will block until there's another goroutine that's about to send on the channel, and that send will happen synchronously. So here's a little demo program that demonstrates this: here I declare a channel, and then I spawn a goroutine that waits for a second and then receives from the channel; and in my main goroutine I keep track of the time, then I send on the channel — I just put some dummy data into the channel — and then I print out how long the send took. And if you think of channels as queues with internal storage capacity, you might expect this to complete very fast, but that's not how channels work: this send is going to block until this receive happens, and this receive doesn't happen until that one second has elapsed, so from here to here we're actually blocked in the main goroutine for one whole second. All right, so don't think of channels as queues; think of them as a synchronous communication mechanism. Another example that'll make this really obvious: here we have a goroutine that creates a channel, then sends on the channel, and then tries receiving from it. Does anybody know what'll happen when I try running this? I think the file name might give it away — yeah, exactly, the send is going to block till somebody's ready to receive, but there is no receiver, and Go actually detects this condition: if all your threads are sleeping, it detects this as a deadlock condition and it'll actually crash. But you can have more subtle bugs. If you have some other thread off doing something — if I spawn this goroutine that, you know, just loops doing nothing, and I try running this program again — now Go's deadlock detector won't notice that none of the threads are doing any useful work, because there's one thread running; it's just that nothing ever receives here. We can tell by looking at this program that it'll never terminate, but here it just looks like it hangs. So if you're not careful with channels, you can get these subtle bugs where you end up with deadlocks as a result.
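The first channel demo described above, sketched roughly (illustrative, not the exact lecture file):

package main

import (
    "fmt"
    "time"
)

func main() {
    c := make(chan bool) // unbuffered: no internal storage
    go func() {
        time.Sleep(1 * time.Second)
        <-c // the receiver only shows up after one second
    }()
    start := time.Now()
    c <- true // blocks until the goroutine above is ready to receive
    fmt.Printf("send took %v\n", time.Since(start)) // roughly one second
}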
Yeah — yeah, exactly: there's no data, nobody's sending on this channel, so this is gonna block here; it's never gonna get to this line. Yeah, so channels, as you pointed out, can't really be used just within a single goroutine; it doesn't really make sense, because in order to send, or in order to receive, there has to be another goroutine doing the opposite action at the same time, and if there isn't, you're just gonna block forever, and then that thread will no longer do any useful work. Yeah — sends wait for receives, receives wait for sends, and it happens synchronously once both the sender and the receiver are present. What I've talked about so far is unbuffered channels. I was going to avoid talking about buffered channels, because there are very few problems that they're actually useful for solving. So buffered channels can take a capacity — if I just switch this, here's a buffered channel with a capacity of one — and this program does terminate, because buffered channels have some internal storage space, and until that space fills up, sends are non-blocking, because they can just put the data in the internal storage space. But once the channel does fill up, then it behaves like an unbuffered channel, in the sense that further sends will block until there's a receive to make space in the channel. But I think at a high level we should avoid buffered channels, because they basically don't solve any problems. And another thing you should be thinking about is: whenever you have to make up arbitrary numbers, like this one here, to make your code work, you're probably doing something wrong. Yeah, so I think this is a question about terminology, like what exactly does deadlock mean, does this count as a deadlock — yes, this counts as a deadlock: no useful progress will be made here, these threads are just stuck forever. Any other questions? So what are channels useful for? I think channels are useful for a small set of things, for example producer-consumer-queue sort of situations. Like here, I have a program that makes a channel and spawns a bunch of goroutines that are going to be doing some work — say they're computing some result and producing some data — and I have a bunch of these goroutines running in parallel, and I want to collect all that data as it comes in and do something with it. So this doWork thing just waits for a bit and produces a random number, and in the main goroutine I'm going to continuously receive on this channel and print it out — this is a great use of channels. Another good use of channels is to achieve something similar to what wait groups do. So rather than use a wait group, suppose I want to spawn a bunch of threads and wait till they're all done doing something: one way to do that is to create a channel, and then I spawn a bunch of threads and know how many threads I've spawned — so five goroutines created here — they're going to do something and then send on this channel when they're done, and then in the main goroutine I can just receive from that channel the same number of times, and this has the same effect as a wait group. So, question — so what exactly is the question? So the question is: here, could you use a buffered channel with a capacity of five, because you're waiting for five receives? I think in this particular case, yes, that would have the equivalent effect, but I think there's not really a reason to do that, and at a high level, in your code, you should avoid buffered channels — and maybe even channels — unless you think very hard about what you're doing.
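A rough sketch of the producer-consumer use of channels described above; doWork, the number of workers, and the sleep durations are illustrative:

package main

import (
    "fmt"
    "math/rand"
    "time"
)

func main() {
    c := make(chan int)
    for i := 0; i < 4; i++ {
        go doWork(c) // several producers computing results in parallel
    }
    for { // the consumer handles results as they arrive
        v := <-c
        fmt.Println(v)
    }
}

func doWork(c chan int) {
    for {
        time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)
        c <- rand.Int() // hands the result directly to whoever is receiving
    }
}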
Yeah, so what is a wait group? I think we covered this in a previous lecture, and I talked about it very briefly today, but I do have an example of wait groups. So a wait group is yet another synchronization primitive provided by Go in the sync package, and it kind of does what its name advertises: it lets you wait for a certain number of threads to be done. The way it works is you call waitgroup dot Add, and that basically increments some internal counter, and then when you call waitgroup dot Wait, it waits till Done has been called as many times as Add was called. So this code is basically the same as the code I just showed you that was using a channel, except this is using a wait group; they have the exact same effect, and you can use either one. Yeah, so the question here is about race conditions — like, what happens if this Add doesn't happen fast enough before this Wait happens, or something like that. Well, notice that the pattern here is that we call waitgroup dot Add outside of this goroutine, and it's called before spawning the goroutine — so this happens first and this happens next — and so we'll never have a situation where the Add hasn't happened yet for this particular goroutine by the time we call Wait. How is this implemented by the compiler? I won't talk about that now — talk to me after class or in office hours — but I think for the purposes of this class you need to know the API for these things, not the implementation. All right, so I think that's basically all I have on Go concurrency primitives. One final thought on channels: channels are good for a specific set of things, like I just showed you — the producer-consumer queue, or implementing something like wait groups — but when you try to do fancier things with them, like if you want to kick another goroutine, which may or may not be waiting for you, so that it gets woken up, that's a kind of tricky thing to do with channels, and there are a bunch of other ways to shoot yourself in the foot with them too. I'm going to avoid showing you examples of bad code with channels just because it's not useful to see, but I personally avoid using channels for the most part and just use shared memory and mutexes and condition variables, and I personally find those much easier to reason about. So feel free to use channels when they make sense, but if anything looks especially awkward to do with channels, just use mutexes and condition variables — they're probably a better tool. Yeah, so the question is: what's the difference between this producer-consumer pattern here and a thread-safe FIFO? I think they're kind of equivalent — you could do this with a thread-safe FIFO, and that is basically what a buffered channel roughly is, if you're enqueueing things and dequeueing things. Like, if you want this line to finish and have this thread go do something else while that data sits there in a queue, rather than this goroutine waiting to send it, then a buffered channel might make sense, but I think at least in the lab you will not have a pattern like that.
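To round out these examples, here is a minimal sketch of the wait-group version described above (illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "sync"
)

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 5; i++ {
        wg.Add(1) // Add is called in main, before spawning the goroutine
        go func(x int) {
            defer wg.Done() // Done once this goroutine finishes
            fmt.Printf("goroutine %d finished\n", x)
        }(i)
    }
    wg.Wait() // blocks until Done has been called once per Add
}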
All right, so next Fabian's going to talk about more Raft-related stuff. Do you need this? All right — can you all hear me? Is this working? Yeah? All right. So yeah, basically I'm going to show you two bugs that we commonly see in people's Raft implementations. There are a lot of bugs that are pretty common, but I'm just going to focus on two of them. So in this first example we sort of have the start of a Raft implementation — sort of like what you might see for 2A, just the beginnings of one. So in our Raft state we have, primarily, the current status of the Raft peer — either follower, candidate, or leader — and we have these two state variables where we're keeping track of the current term and who we voted for in the current term. I want us to focus, though, on these two functions: AttemptElection and CallRequestVote. So in AttemptElection we're just going to set our state to candidate, increment our current term, vote for ourselves, and then start sending out request votes to all of our Raft peers — and this is similar to some of the patterns that Anish showed, where we loop through our peers and, for each one, in a separate goroutine, call this CallRequestVote function in order to actually send an RPC to that peer. All right, so in CallRequestVote we're going to acquire the lock, prepare the arguments for our RequestVote RPC call by setting them to the current term, and then actually perform the RPC call over here, and finally, based on the response, we reply back to this AttemptElection function, and AttemptElection eventually should tally up the votes to see if it got a majority and can become leader. So what happens when we run this code? In theory, what we might expect to happen is — there's going to be some code that spawns a few Raft peers and actually tries to attempt elections on them, and what should happen is we just start collecting votes from the other peers; we're not actually going to tally them up, but hopefully nothing weird goes wrong. But actually something is going to go wrong here: we triggered Go's deadlock detector, and somehow we ran into a deadlock. So let's see what happened. For now, let's focus on what's going on with server zero. Server zero says it starts attempting an election at term one — that's just starting the AttemptElection function — it acquires the lock, sets some stuff up for performing the election, and then unlocks. Then it sends out a RequestVote RPC to server two and finishes processing that RequestVote RPC over here — we're just printing right before and after we actually send out the RPC — and then it sends out a RequestVote RPC to server one, but after that we never actually see it finish sending the RequestVote RPC, so it's actually stuck in this function call, waiting for the RPC response from server one. All right, now let's look at what server one is doing. It's pretty much the same thing: it sends a RequestVote RPC to server two, that succeeds, it finishes processing the response from server two, then it sends this RPC to zero, and now what's actually happening is that zero and one are waiting for the RPC responses from each other — they both sent out an RPC call but haven't gotten the response yet — and that's actually the cause of our deadlock. So really, the reason that we're deadlocking is that we're holding this lock through our RPC calls: over here in the CallRequestVote function we acquire the mutex associated with our Raft peer, and we only unlock at the end of this function, so throughout this entire function we're holding the lock, including when we try to contact our peer to get the vote. And later, when we handle this RequestVote RPC — we only see it at the beginning of this function, in the handler — we're also trying to acquire the lock, but we never actually succeed in acquiring the lock.
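The shape of the bug, sketched roughly. The struct fields, argument types, and the rf.peers[server].Call usage here are stand-ins for typical lab code, not the exact code shown on screen:

// Buggy version: rf.mu is held across the RPC, so this peer's own
// RequestVote handler (which also needs rf.mu) can never run, and two
// peers calling each other this way deadlock.
func (rf *Raft) CallRequestVote(server int) bool {
    rf.mu.Lock()
    defer rf.mu.Unlock() // not released until the RPC reply has come back
    args := RequestVoteArgs{Term: rf.currentTerm}
    var reply RequestVoteReply
    ok := rf.peers[server].Call("Raft.RequestVote", &args, &reply)
    return ok && reply.VoteGranted
}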
So just to make this a little bit more clear, the order of operations is: in CallRequestVote, server zero is first going to acquire the lock and send an RPC call to server one, and simultaneously, separately, server one is going to do the same thing — it enters its CallRequestVote function, acquires the lock, and sends an RPC call to server zero. Now, in server zero's handler and server one's handler, they're trying to acquire the lock, but they can't, because each server is already holding its own lock while trying to send the RPC call to the other — and that's what's leading to the deadlock situation. So to solve this, basically, we want you to not hold locks through RPC calls; that's the solution to this problem. In fact, we don't need the lock here at all: instead of trying to read the current term when we enter this CallRequestVote function, we can pass it as an argument — save the term when we had acquired the lock earlier in AttemptElection, and just pass it as a variable to CallRequestVote — and that removes the need to acquire the lock at all in CallRequestVote. Alternatively, we could lock while we're preparing the arguments and then unlock before actually performing the call, and then, if we need to process the reply, we could lock again afterwards. So just make sure to unlock before making the RPC call, and then, if you need to, you can acquire the lock again. So now if I save this — it's still triggering the deadlock detector, but that's actually just because we're not doing anything at the end — it's actually working now: we finished sending the request votes on both sides, and all the operations that we wanted to complete are complete. All right, any questions about this example? Yeah, so — you might need to use locks when you are preparing the arguments or processing the response, but yeah, you shouldn't hold a lock through the RPC call while you're waiting for the other peer to respond. And there's actually another reason, too, in addition to deadlock: the other problem is that in some tests we're going to have this unreliable network that could delay some of your RPC messages, potentially by like 50 milliseconds, and in that case, if you hold the lock through an RPC call, then any other operation that you try to do during those 50 milliseconds won't be able to complete until that RPC response is received. So that's another issue you might run into if you hold the lock — it's both to make things more efficient and to avoid these potential deadlock situations. All right, so just one more example. This is again a similar Raft implementation: again, in our Raft state we're keeping track of whether we're a follower, candidate, or leader, and also these two state variables. In this example I want you to focus on this AttemptElection function. So now we've first implemented the change that I just showed you — storing the term here and passing it as a variable to the function that collects the request votes — but additionally we've implemented some functionality to add up the votes. What we'll do is create a local variable to count the votes, and whenever we get a vote, if the vote was not granted, we return immediately from this goroutine where we're processing the vote; otherwise, we acquire the lock before updating this shared local variable to count up the votes, and then, if we did not get a majority of the votes, we return immediately; otherwise, we make ourselves the leader.
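Roughly, this second example has the following shape; the names are illustrative (and the vote counter is a local variable shared by the goroutines through the closure), so treat it as a sketch rather than the exact code on screen:

func (rf *Raft) AttemptElection() {
    rf.mu.Lock()
    rf.state = Candidate
    rf.currentTerm++
    rf.votedFor = rf.me
    term := rf.currentTerm // remember which term this election is for
    rf.mu.Unlock()

    votes := 1 // we vote for ourselves
    for i := range rf.peers {
        if i == rf.me {
            continue
        }
        go func(server int) {
            granted := rf.CallRequestVote(server, term) // RPC done without the lock
            if !granted {
                return
            }
            rf.mu.Lock()
            defer rf.mu.Unlock()
            votes++
            if votes <= len(rf.peers)/2 {
                return
            }
            rf.state = Leader // becomes leader without re-checking anything
        }(i)
    }
}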
So, as with the other example — initially, if you look at this, it seems reasonable, but let's see if anything can go wrong. All right, so this is the log output from one run, and one thing you might notice is that we've actually elected two leaders on the same term: server zero made itself a leader on term two, and server one did as well. It's okay to have leaders elected on different terms, but here, where we have two on the same term — that should never happen. All right, so how did this actually come about? Let's start from the top. At the beginning, server zero actually attempted an election at term one, not term two, and it got its votes from both of the other peers, but for whatever reason — perhaps because those reply messages were delayed — it didn't actually process those votes until later. And in between attempting the election and finishing the election, server one also decided to attempt an election — perhaps because server zero was delayed so much that server one ran into the election timeout and then started its own election — and it started it on term two, because it couldn't have been term one, since it had already voted for server zero on term one over here. Okay, so then server one sends out its own request votes to servers two and zero. Server two votes for server one at term two — that's fine — but server zero also votes for server one, and this is actually also fine, because server one is asking server zero for a vote on a higher term, and so what server zero should do — if you remember from the spec — is set its current term to the term in the RequestVote RPC message, term two, and also revert itself to a follower instead of a candidate. All right, so the real problem is on this line, where server zero — although it really got enough votes on term one — made itself a leader on term two. One explanation for why this is happening is that in between where we set up the election, where we attempt the election, and where we actually process the votes, some other things are happening — in this case we're actually voting for someone else in between — and so we're no longer on term one, where we thought we started the election; we're now on term two. And so we just need to double-check — because we don't hold the lock while we're performing the RPC calls, which is important for its own reasons — that what we assumed was true when we set ourselves to be the leader is still true, since some things might have changed. There are a few different ways to solve this — you could imagine not voting for others while we're in the middle of attempting an election — but in this case the simplest way, at least in this implementation, is to just double-check that we're still on the same term and we're still a candidate, that we haven't reverted to a follower. Actually, one thing I want to show you: if we print out our state over here, we do see that server zero became a follower, but it's still setting itself to a leader on this line. So yeah, we can just check for that: if we're not a candidate, or the current term doesn't match the term in which we started the election, then let's just quit. And if we do that, then server one becomes the leader, we never see server zero become leader, and the problem is solved. Any questions?
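The fix described here, dropped into the vote-counting goroutine from the sketch above, looks roughly like this (again illustrative names):

            rf.mu.Lock()
            defer rf.mu.Unlock()
            votes++
            if votes <= len(rf.peers)/2 {
                return
            }
            // re-check our assumptions: state may have changed while the lock was released
            if rf.state != Candidate || rf.currentTerm != term {
                return
            }
            rf.state = Leader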
Yeah — I think that would — actually, no, it might not be sufficient, because we might have attempted another election; it depends on your implementation, but it's possible that you could have attempted another election on a higher term afterwards. Right, so it would not be sufficient to only check the state, but I think you're right that if you only check the term, then it is sufficient. All right, any other questions? All right, so yeah, that's it for this part; she's going to show you some more examples of actually debugging some of these Raft implementations. Hi, can you all hear me? Yeah? Okay. So in my section I'm gonna walk you through how I would debug it if you have a bug in your Raft implementation. I prepared a couple of buggy Raft implementations, and I'll just try to walk you through them. So first I'm gonna go into my first buggy implementation, and if I run the test here — so for this one, it doesn't print anything, it just gets stuck, and it's gonna be here forever. Let's assume that I have no idea why this is happening. The first thing that I want to find out is where it gets stuck, and we do have a good tool for that, which is printing. In the starter code, if you go to util.go, we have a function called DPrintf; this is just a nice wrapper around log.Printf with a debug variable to enable or disable the logging messages. So I'm gonna enable that and go back to my Raft code. First of all, when there's some bug happening, I always go check whether the code actually initializes the Raft servers, so here I'll just print something. Okay, so if I run the test again, now I know that there are three servers that get initialized, so this far it's okay, but we still don't know where the bug is happening, so I'll just go deeper into the code to find where it gets stuck. Now, if you look at the code, we are calling the leader election, so I'm gonna go to that function, and just to make this faster I'll check whether it kicks off some election — that part is still fine, so we keep going. Now here we are in the election; I'll see whether we actually send the request votes to the other servers. Now we kind of have more of an idea of where it gets stuck, because it prints that it kicks off the election but not that it's sending the request votes, so I would go back further just to see where it gets stuck — whenever we call some function, I always double-check that it actually gets into the function. So now I'm going to print that this server is at the start of the election, and that works, so now we have an idea that the bug should be between here and here — we are trying to minimize the scope of the code that's causing the bug. Let's say I print something here and it doesn't get there, so I move it up; let's say here — still not there; now it's there — so the bug is probably in this function, and I just go check. So here the problem is that I'm trying to acquire a lock that I actually already hold, so it's gonna be a deadlock. That's how I would find this first bug using DPrintf, and it's nice to use DPrintf because you can just turn off the debugging prints and have a nice test output without all the debugging if you want. So that's how I would use DPrintf to try to track down a bug in your code.
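For reference, the DPrintf helper in the lab's util.go has roughly this shape — the exact variable names and signature differ between course years, so treat this as a sketch:

// util.go (sketch)
package raft

import "log"

// Debug controls whether DPrintf produces output; set it to 0 to get
// clean test output with all debugging silenced.
const Debug = 1

func DPrintf(format string, a ...interface{}) {
    if Debug > 0 {
        log.Printf(format, a...)
    }
}

So a call like DPrintf("[%d] starting election at term %d", rf.me, rf.currentTerm) — with whatever message you find useful — only prints while the debug flag is turned on.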
And for this example there's actually another trick to help you find this kind of deadlock: if you press Ctrl+backslash — you can see in the bottom left that I pressed Control and backslash — this command sends a quit signal to the Go program, and by default it will handle the quit signal, quit all the goroutines, and print out the stack traces. So now you can see up here where it gets stuck, and there are gonna be a couple of functions printed here — just try to go through all the traces. Yes — so it's actually showing that the function that's causing the problem is the convert-to-candidate function, so that's another way for you to find out where the deadlocks are. I can remove all this, and now it works. So that's the first example that I wanted to go through. The second thing that you want to do before you submit your labs is to turn the race flag on when you run the tests; the way to do that is just to add -race before -run. And here, because my implementation doesn't have any races, it's not going to tell you anything — but just be careful about this, because it's not a proof that you don't have any races; it's just that it didn't detect any races for you. I'm going to run the same command again with the race flag, but now, this time, there's actually a race going on in my implementation, so it's gonna yell at you that there are some data races going on in your code. I'm quitting that, and let's see how useful the warnings are. So I'm gonna go to my second buggy Raft implementation, and here, let's look at this race: it's telling us that there's a read going on at one line — probably this one here — and there's also a write at line 412, which is this one. So I'm going to this line, and now we kind of know that this access isn't protected by a lock, so the race flag is actually warning us and helping us find the bug in this data race that we have. So the fix is gonna be to just lock this and unlock it, and that should solve the problem. So at this point we kind of know how to do some basic debugging — does anyone have any questions? No? Okay. Yeah, so I'm going to go to the third one, where it's going to be more difficult to find the bug. I'm going to run the test, and I actually have some debugging messages in there already — and you can see that I also have a debugging message with the test action. This is something you might want to consider doing: if you go into the test file here, you can just see how the test runs, and there are some actions that the test is gonna take to make your code fail, and it's usually a good idea to print out where that action is happening among your actual debugging messages, so you can guess where the bug is happening — in which phase of the test — if that makes sense. So now — I was doing fine in the first case, I passed the first test, but I'm failing the second test, and here you can see which test action it fails around: I'm passing the test up until this point — I'm actually passing until leader two rejoins. So this can give you a nice idea of how the test is working, and just help you have a better guess as to where the bug is in your code. Now let's look at the debugging messages. At least it seems like, when leader two rejoined, it becomes a follower and we have a new leader, so that looks fine to me, and we probably need more debugging messages instead of just the state changes, so I'm going to add some more. My first guess is that when a server becomes a leader, it might not be doing what a leader should do correctly — that's where we got stuck. After a server converts to leader, I have a goroutine for the leader that's just sending heartbeats to all the servers, so I'm gonna print some stuff here saying that a heartbeat was sent.
So when it becomes a leader, it sends the first heartbeat to each server, and one still tries to send heartbeats until it sees the new leader and then one becomes a follower, so this doesn't look like the problem. Now I'm gonna check whether the other servers receive the heartbeats correctly — I'm trying to finish this — yeah, so two becomes a leader, two sends heartbeats, but no one receives a heartbeat from two. So if I go to the function that sends the AppendEntries, I'm actually holding the lock through the RPC call, which is the problem that Fabian went through in the last section, so that's the problem that I need to fix: what I should do is unlock here and then lock again here, and that should work — we pass. And then there are a couple of things that you might want to do when you test your Raft implementation. There's actually a script to run the tests in parallel, and I can show you how we can use it — this script was shared with the class, someone made a post about it — and here's how we can use it: you run the script, specify the number of times to run the test — personally I do like a thousand, but that depends on your preference — then how many tests you wanna run at the same time, and then the test itself. And if you run the script, it'll show you, say, that we have run four tests so far, all are working fine, and it's gonna keep going like that. So that's how I would go about debugging a Raft implementation, and you are all welcome to come to office hours when you need help.
--------------------------------------------------------------------------------