├── .all-contributorsrc ├── .gitignore ├── LICENSE ├── README.md ├── doc ├── Glossary_from_DDIA.pdf ├── Translate_Workflow.pdf ├── how_to_do.md └── manual.md ├── glossary.md ├── lec01 └── introduction.srt ├── lec02 └── rpc_and_threads.srt ├── lec03 └── gfs.srt ├── lec04 ├── Lec4-3.en.txt ├── Lec4-3.zh.txt ├── Lec4-4.en.txt ├── Lec4-4.zh.txt ├── Lec4-5.en.txt ├── Lec4-5.zh.txt ├── Lec4-6.en.txt ├── Lec4-6.zh.txt ├── Lec4-7.en.txt ├── Lec4-7.zh.txt ├── Lec4.en.txt └── primary_backup_replication.en.srt ├── lec05 ├── Lec5.en.txt └── threads_and_raft.en.srt ├── lec06 └── tolerance_raft_1.srt ├── lec07 └── tolerance_raft_2.en.srt ├── lec08 ├── zh-zookeeper.srt └── zookeeper.srt └── lec09 ├── more_replication_craq.srt └── zh-more_replication_craq.srt /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "files": [ 3 | "README.md" 4 | ], 5 | "imageSize": 100, 6 | "commit": false, 7 | "contributors": [ 8 | { 9 | "login": "ZiheLiu", 10 | "name": "zihe.liu", 11 | "avatar_url": "https://avatars2.githubusercontent.com/u/13313784?v=4", 12 | "profile": "https://ziheliu.github.io/", 13 | "contributions": [ 14 | "content" 15 | ] 16 | }, 17 | { 18 | "login": "wildandyang", 19 | "name": "Fan Yang", 20 | "avatar_url": "https://avatars0.githubusercontent.com/u/16045380?v=4", 21 | "profile": "https://github.com/wildandyang", 22 | "contributions": [ 23 | "content" 24 | ] 25 | }, 26 | { 27 | "login": "CyrusF", 28 | "name": "Cyru1s", 29 | "avatar_url": "https://avatars0.githubusercontent.com/u/20309761?v=4", 30 | "profile": "http://blog.cyru1s.com", 31 | "contributions": [ 32 | "content" 33 | ] 34 | }, 35 | { 36 | "login": "hoooga", 37 | "name": "hoooga", 38 | "avatar_url": "https://avatars3.githubusercontent.com/u/8995262?v=4", 39 | "profile": "https://github.com/hoooga", 40 | "contributions": [ 41 | "content" 42 | ] 43 | }, 44 | { 45 | "login": "ivanallen", 46 | "name": "Allen", 47 | "avatar_url": "https://avatars1.githubusercontent.com/u/12481610?v=4", 48 | "profile": "https://allen.blog.csdn.net", 49 | "contributions": [ 50 | "content" 51 | ] 52 | }, 53 | { 54 | "login": "fisheuler", 55 | "name": "fisheuler", 56 | "avatar_url": "https://avatars2.githubusercontent.com/u/4300522?v=4", 57 | "profile": "https://github.com/fisheuler", 58 | "contributions": [ 59 | "doc" 60 | ] 61 | }, 62 | { 63 | "login": "2014BDuck", 64 | "name": "2014bduck", 65 | "avatar_url": "https://avatars0.githubusercontent.com/u/30280396?v=4", 66 | "profile": "https://github.com/2014BDuck", 67 | "contributions": [ 68 | "content" 69 | ] 70 | }, 71 | { 72 | "login": "Laurel-rao", 73 | "name": "Laurel-rao", 74 | "avatar_url": "https://avatars2.githubusercontent.com/u/42195541?v=4", 75 | "profile": "https://github.com/Laurel-rao", 76 | "contributions": [ 77 | "bug" 78 | ] 79 | } 80 | ], 81 | "contributorsPerLine": 7, 82 | "projectName": "thor", 83 | "projectOwner": "ivanallen", 84 | "repoType": "github", 85 | "repoHost": "https://github.com", 86 | "skipCi": true 87 | } 88 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, 
Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. 
(Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. 
Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. 
If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # thor 2 | 3 | [![All Contributors](https://img.shields.io/badge/all_contributors-8-orange.svg?style=flat-square)](#contributors-) 4 | 5 | 雷神项目,翻译 mit 6.824 2020 6 | 7 | ## 组织 8 | 9 | QQ:1035287657 10 | 11 | ## 视频资源 12 | 13 | - https://www.bilibili.com/video/BV1R7411t71W?p=1 14 | 15 | ## 快速开始 16 | 17 | [如何翻译](https://github.com/ivanallen/thor/wiki/%E5%A6%82%E4%BD%95%E7%BF%BB%E8%AF%91) 18 | 19 | ## 版权声明 20 | 21 | 本字幕版权属参与翻译的所有成员,严禁用作商业用途,一经发现,追究法律责任。 22 | 23 | ## Contributors ✨ 24 | 25 | 非常感谢这些可爱的同学: 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 |

- [zihe.liu](https://ziheliu.github.io/) 🖋
- [Fan Yang](https://github.com/wildandyang) 🖋
- [Cyru1s](http://blog.cyru1s.com) 🖋
- [hoooga](https://github.com/hoooga) 🖋
- [Allen](https://allen.blog.csdn.net) 🖋
- [fisheuler](https://github.com/fisheuler) 📖
- [2014bduck](https://github.com/2014BDuck) 🖋
- [Laurel-rao](https://github.com/Laurel-rao) 🐛
44 | 45 | 46 | 47 | 48 | 49 | This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! 50 | 51 | ### 参考资料和工具 52 | 53 | - [翻译流程介绍](https://github.com/ivanallen/thor/blob/master/doc/manual.md) 54 | - [MIT6.824翻译工作流分享](https://www.bilibili.com/video/BV1pQ4y1M7dv) 55 | - [分布式系统翻译工作流分享](https://github.com/ivanallen/thor/blob/master/doc/Translate_Workflow.pdf) 56 | - 参考PR:[Lec04-3翻译任务](https://github.com/ivanallen/thor/pull/24) 57 | - [翻译指南](https://docs.qq.com/doc/DZURQaXBrdXhXb0dx?tdsourcetag=s_macqq_grpfile) 58 | - [建议翻译流程](https://docs.qq.com/doc/BXXro31NHmDg4Kega60fkDTU4l51be2cdG2H4OMrVN3NzUlm0huLua1goly331XKV42Dko7Y0) 59 | - [VS Subtitles Editor](https://marketplace.visualstudio.com/items?itemName=pepri.subtitles-editor) 60 | - [抽取视频字幕的网站](https://downsub.com/) 61 | - [Free Google Translate API](https://pypi.org/project/googletrans/) 62 | - [自动切分时间轴](https://jingyan.baidu.com/article/e73e26c07ce0a824acb6a755.html) 63 | -------------------------------------------------------------------------------- /doc/Glossary_from_DDIA.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanallen/thor/99446434fb2a135d42630593b32e7ae956011ce9/doc/Glossary_from_DDIA.pdf -------------------------------------------------------------------------------- /doc/Translate_Workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivanallen/thor/99446434fb2a135d42630593b32e7ae956011ce9/doc/Translate_Workflow.pdf -------------------------------------------------------------------------------- /doc/how_to_do.md: -------------------------------------------------------------------------------- 1 | # 如何翻译 2 | 3 | ## 1 快速开始 4 | 5 | 例如,这是一个原始的 srt 文件(截取自 lec03/gfs.srt): 6 | 7 | ``` 8 | 10 9 | 00:00:31,410 --> 00:00:34,260 10 | abstraction you might you know if you 11 | 12 | 11 13 | 00:00:34,260 --> 00:00:35,850 14 | didn't know already you might imagine 15 | 16 | 12 17 | 00:00:35,850 --> 00:00:37,230 18 | that there could be all kinds of 19 | 20 | 13 21 | 00:00:37,230 --> 00:00:40,050 22 | different you know important 23 | 24 | 14 25 | 00:00:40,050 --> 00:00:42,030 26 | abstractions you might want to use for 27 | 28 | 15 29 | 00:00:42,030 --> 00:00:43,650 30 | distributed systems but it's turned out 31 | 32 | 16 33 | 00:00:43,650 --> 00:00:47,730 34 | that a simple storage interface is just 35 | 36 | 17 37 | 00:00:47,730 --> 00:00:50,010 38 | incredibly useful and extremely general 39 | ``` 40 | 41 | 翻译之后,会变成这样: 42 | 43 | ``` 44 | 10 45 | 00:00:31,410 --> 00:00:34,260 46 | 因为存储被证明是一种关键抽象 47 | abstraction you might you know if you 48 | 49 | 11 50 | 00:00:34,260 --> 00:00:35,850 51 | 如果你还不知道的话 52 | didn't know already you might imagine 53 | 54 | 12 55 | 00:00:35,850 --> 00:00:37,230 56 | 你可以想象在分布式系统中 57 | that there could be all kinds of 58 | 59 | 13 60 | 00:00:37,230 --> 00:00:40,050 61 | 你希望使用的各种不同的抽象 62 | different you know important 63 | 64 | 14 65 | 00:00:40,050 --> 00:00:42,030 66 | 你希望使用的各种不同的抽象 67 | abstractions you might want to use for 68 | 69 | 15 70 | 00:00:42,030 --> 00:00:43,650 71 | 但事实表明 72 | distributed systems but it's turned out 73 | 74 | 16 75 | 00:00:43,650 --> 00:00:47,730 76 | 简单的存储接口往往更有用而且更加通用 77 | that a simple storage interface is just 78 | 79 | 17 80 | 00:00:47,730 --> 00:00:50,010 81 | 简单的存储接口往往更有用而且更加通用 82 | incredibly useful and extremely general 83 | ``` 84 | 85 | 
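As a rough illustration of how the bilingual format above can be produced mechanically, here is a small Go sketch of the merge step. This is not the project's official tooling: the input file names (`gfs.srt`, `zh.txt`) and the one-translated-line-per-subtitle-block layout are assumptions made for the example, and the merged output still needs to be proofread by hand.

```go
// merge_srt.go: a sketch (not the project's official tool) that produces the
// bilingual format shown above by inserting one translated line between each
// timing line and its English subtitle line. It assumes the translations live
// in zh.txt, one line per subtitle block, in the same order as the .srt blocks.
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

func readLines(path string) []string {
	f, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	var lines []string
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		lines = append(lines, sc.Text())
	}
	return lines
}

func main() {
	srt := readLines("gfs.srt") // original English subtitles
	zh := readLines("zh.txt")   // one translated line per subtitle block, in order
	out := bufio.NewWriter(os.Stdout)
	defer out.Flush()

	i := 0 // index into the translated lines
	for _, line := range srt {
		fmt.Fprintln(out, line)
		// A timing line looks like "00:00:31,410 --> 00:00:34,260";
		// the English text follows it, so the translation goes in between.
		if strings.Contains(line, "-->") && i < len(zh) {
			fmt.Fprintln(out, zh[i])
			i++
		}
	}
}
```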
你直接修改 gfs.srt 即可。如果你不放心,你可备份一个 gfs.srt,自己再单独创建一个 gfs2.srt。 86 | 87 | ## 2 翻译技巧 88 | 89 | 你可以使用谷歌翻译工具进行批量翻译,这样可以帮助你提高翻译效率。当然,我们正在制作自动化工具,让你第一手拿到的原始 srt 就已经包含了机翻结果,这样你只需要修正就可以了。 90 | 91 | 如果有些英文长句太长,而中文更加紧凑,可以把中文多复制几份,每个时间轴都放上一样的翻译。可以参考第 1 节的示例。 92 | 93 | ## 3 怎么知道自己翻译的部分和别人是否冲突 94 | 95 | 解决方案很简单,新建立一个 issue,标题为 "lec04;20-30min",同时把这个任务分配给自己,这样就不会有人和你翻译冲突了。 96 | 97 | 通常一个小任务为 5min,或者 10min,根据自己的实际情况选择。完成小任务后,一定要及时发起 pr,合入之后,你就可以关闭你的 issue 了。 98 | 99 | ## 4 字幕校正 100 | 101 | 这部分工作你不用担心,会有专门的同学针对完整的字幕文件做统一的修正。当然,如果你在观看过程中,发现有地方翻译的不对,请及时纠正,并提 PR. 102 | -------------------------------------------------------------------------------- /doc/manual.md: -------------------------------------------------------------------------------- 1 | # 规范细节 2 | 3 | > 如果你要认领我们翻译的任务,并成为对应字幕的贡献者,希望你能认真阅读一下这里的规范细节,保证步调的统一。 4 | 5 | ## 1 概述 6 | 你从每个课时中拿到的,有YouTube语音识别自动生成的英文字幕,还有我们抽取的整个讲稿。 7 | 当认领了某一个单元,你就可以按照如下流程,高效统一地开始MIT6.824的翻译了! 8 | 当然,欢迎加入我们字幕组的QQ群,与更多志同道合的小伙伴一起交流,共同进步!(群号:1035287657) 9 | 10 | ## 2 分布式翻译 11 | 12 | ### 2.1 基本思想 13 | 为了提高大家的效率,建议大家尽可能借助机翻,为了提高机翻的准确性,需要大家对汇总的整个讲稿,断句、去掉一些语气词,还有更重要的,纠正机器生成的字幕的错误(记得同步到英文字幕文件中哦)。 14 | 之后,为了翻译像一个老师讲课的效果,需要你在完成机翻后,完整地边听边润色,注意一些术语的翻译,我们在doc中添加了一个中文书籍中的术语表,作为基本参考。更多不熟悉的术语,欢迎进入翻译群讨论。 15 | 这一步翻译的输入输出,我们下面就来仔细介绍~ 16 | 17 | ### 2.2 翻译流程 18 | #### 输入文件 19 | 机器生成英文字幕 + 英文字幕汇总后的完整讲稿 20 | #### 输出文件 21 | 修改后的英文字幕 + 语义断句后的英文讲稿 + 机翻并润色的中文讲稿 22 | 23 | #### 翻译建议流程 24 | 1. 经过认真的听课与对照,你会对上面两文件中的错误进行调整,为了方便讲稿的机翻,你可以去掉讲稿中的一些语气词,并按照语义断句(长的句子可能比较难断,后面我们会有一些建议,辅助你) 25 | 2. 断句修改后的文件保存(作为输出之一),使用Chrome或谷歌翻译,进行机翻。 26 | 3. 但是机翻的效果可能不会特别好,你需要根据一些口语习惯,特定术语,进行调整;机翻之后也能帮助你发现之前步骤的错误,回过头,继续修改字幕文件和英文讲稿。 27 | 28 | #### 输出输出规范 29 | **文件命名:** 30 | 按照每个单元 如自己负责的 Lecx 的第 y 个单元,断句后英语讲稿部分上传 31 | Lecx-y.en.txt;断句后中文翻译上传Lecx-y.zh.txt;原srt修正后并入Lecx 32 | .en.srt。(边界划分原则为向下断句,比如某一行9:59-10:01,属于第一个 33 | 34 | 最后我们会统一合并,生成Lecx.zh.srt和Lecx.srt 35 | 36 | #### 输入输出文件举例 37 | **原始 srt 文件**(截取自 lec07/tolerance_raft_2.en.srt): 38 | ``` 39 | 807 40 | 00:40:53,140 --> 00:40:55,690 41 | to move the arm the right track right so 42 | 43 | 808 44 | 00:40:55,690 --> 00:40:58,570 45 | these persistance can be terribly 46 | 47 | 809 48 | 00:40:58,570 --> 00:41:01,510 49 | terribly expensive and if for sort of 50 | 51 | 810 52 | 00:41:01,510 --> 00:41:03,520 53 | any kind of straightforward design 54 | 55 | 811 56 | 00:41:03,520 --> 00:41:06,220 57 | they’re likely to be the limiting factor 58 | 59 | 812 60 | 00:41:06,220 --> 00:41:09,089 61 | in performance because they mean that 62 | 63 | 813 64 | 00:41:09,089 --> 00:41:13,690 65 | doing anything anything whatsoever on 66 | 67 | 814 68 | 00:41:13,690 --> 00:41:15,339 69 | these Raft servers takes ten 70 | 71 | 815 72 | 00:41:15,339 --> 00:41:18,580 73 | milliseconds a pop and 10 milliseconds 74 | ``` 75 | 76 | **汇总后的讲稿(如Lec7.en.txt)** 77 | ``` 78 | so these persistence can be terribly terribly expensive and if for sort of any kind of straightforward design they’re likely to be the limiting factor in performance because they mean that doing anything anything whatsoever on these Raft servers takes ten milliseconds a pop 79 | ``` 80 | 81 | **修正机器生成字幕错误后字幕并入(Lec7.en.srt)** 82 | >同输入 83 | 84 | **英文断句后输出(Lec7-1.en.txt)** 85 | ``` 86 | so these persistence can be terribly terribly expensive 87 | and if for sort of any kind of straightforward design 88 | they’re likely to be the limiting factor in performance 89 | because they mean that doing anything 90 | anything whatsoever on these Raft servers takes ten milliseconds a pop 91 | ``` 92 | **机翻润色中文输出(Lec7-1.zh.txt)** 93 | ``` 94 | 
所以这些持久化的代价可能会非常大 95 | 如果只是用于一个简单的设计 96 | 它们可能成为性能的限制因素 97 | 因为这意味着做任何事情 98 | 这些Raft服务器上的所有内容都会花费10毫秒的时间 99 | ...... 100 | ``` 101 | 102 | #### 注意事项 103 | 1. 输入英文字幕为机器转语音,可能会有错误 104 | 2. 修改后的英文字幕时间轴不变,只是修改一些错误 105 | 3. 修改的时候,可能会有信息冗余(英文字幕与讲稿之间),所以注意两文件间同步 106 | 107 | #### 分工 108 | 通常一个小任务为 5min,或者 10min,在项目中新建立一个 issue,标题为 "lec04;20-30min",根据自己的实际情况选择。 109 | 完成小任务后,一定要及时发起 pr,合入之后,你就可以关闭你的 issue 了。 110 | 111 | #### 技巧推荐 112 | 1. 对于长句(@鱼蛋是我我是橘猫):使用谷歌对汇总后的讲稿直接翻译,谷歌会帮助去掉语气词,用来辅助断句。 113 | 2. 机翻直接采用Chrome打开后翻译(无字数限制)/Google translate(有字数限制) 114 | 115 | 116 | ## 3 字幕校正 117 | 118 | 这部分工作你不用担心,我们字幕组会有专门的同学针对完整的字幕文件做统一的修正。当然,如果你在观看过程中,发现有地方翻译的不对,请及时纠正,并提 PR. 119 | 120 | ## 最后,感谢每一个贡献者的参与! -------------------------------------------------------------------------------- /glossary.md: -------------------------------------------------------------------------------- 1 | ### 术语对照表 2 | 3 | |英文|中文|备注| 4 | |---|---|---| 5 | |single computer|一台计算机/单机|| 6 | |cooperating conputers|计算机集群|| 7 | |fault tolerance / FT|容错|| 8 | |concurrency|并发/并发问题|| 9 | |partial failure|局部错误|| 10 | |infrastructure|基础设施|| 11 | |replication|复制|| 12 | |shard|(数据)分片|| 13 | |goroutine|go例程|不翻译| 14 | |inline|串联式的|| 15 | |enclosing scope|封闭作用域|| 16 | |RPC(Remote Procedure Call)|远程过程调用|不翻译| 17 | |concurrency primitive|并发原语|| 18 | -------------------------------------------------------------------------------- /lec04/Lec4-3.en.txt: -------------------------------------------------------------------------------- 1 | You know they had replication 2 | but it wasn't replicating every single 3 | you know bit of memory 4 | between the primaries and the backups 5 | It was replicating much more application level table of chunks 6 | I had this abstraction of you know chunks and chunk identifiers 7 | And that's what it was replicating 8 | It wasn't replicating sort of everything else 9 | wasn't going to the expense of 10 | replicating every single other thing in that machines 11 | We're doing okay as long as 12 | they had the same sort of application visible set of chunks 13 | So most replication schemes out there go the GFS route 14 | In fact almost everything except pretty much this paper 15 | and a few handful of similar systems 16 | almost everything uses application 17 | at some level application level of replication 18 | Because it can be much more efficient 19 | Because we don't have to go to the 20 | we don't have to go to the trouble of for example making sure 21 | that interrupts occur at exactly the same point 22 | in the execution of the primary and backup 23 | GFS does not sweat that at all 24 | But this paper has to do 25 | Because it replicates at such a low level 26 | So most people build efficient systems 27 | with applications specific replication 28 | The consequence of that though is that 29 | the replication has to be built into the 30 | right into the application right 31 | If you're getting a feed of application level operations 32 | for example you really need to have the application participate in that 33 | because some generic replication thing like today's paper 34 | doesn't really can't understand 35 | the semantics of what needs to be replicated 36 | So anyway so most teams are application specific 37 | like GFS and every other paper we're going to read on this topic 38 | Today's paper is unique in that 39 | it replicates at the level of the machine 40 | and therefore does not care what software you run on it 41 | Right it replicates the low-level memory and machine registers 42 | You can run any software you like on it 
43 | as long as it runs on that kind of microprocessor 44 | that's being represented 45 | This replication scheme applies to the software can be anything 46 | And you know the downside is that it's not that efficient necessarily 47 | The upside is that you can take any existing piece of software 48 | Maybe you don't even have source code for it or understand how it works 49 | And you know do within some limits 50 | you can just run it under this under VMware's replication scheme 51 | And it'll just work which is sort of 52 | magic fault-tolerance wand for arbitrary software 53 | All right now let me talk about how this is VMware FT 54 | First of all VMware is a virtual machine company 55 | They're what their business is 56 | a lot of their business is selling virtual machine technology 57 | And what virtual machines refer to is the idea of 58 | you know you buy a single computer 59 | And instead of booting an operating system like Linux on the hardware 60 | you boot we'll call a virtual machine monitor 61 | or hypervisor on the hardware 62 | And the hypervisor's job is actually to 63 | simulate multiple multiple computers 64 | multiple virtual computers on this piece of hardware 65 | So the virtual machine monitor may boot up you know one instance of Linux 66 | may be multiple instances of Linux may be a Windows 67 | machine you can The virtual machine monitor on this one computer 68 | can run a bunch of different operating systems 69 | you know Each of these as is itself some sort of operating system kernel and then applications 70 | So this is the technology they're starting with 71 | And you know the reason for this is that 72 | if you know you need to it just turns out 73 | there's many many reasons why it's very convenient 74 | to kind of interpose this level of indirection 75 | between the hardware and the operating systems 76 | And means that we can buy one computer 77 | and run lots of different operating systems on it 78 | we can have each If we run lots and lots of little services 79 | instead of having to have lots and lots of computers one per service 80 | you can just buy one computer 81 | and run each service in the operating system 82 | that it needs I'm using these virtual machines 83 | So this was their starting point 84 | They already had this stuff 85 | and a lot of sophisticated things built around it 86 | at the start of designing VMware FT 87 | So this is just virtual machines um 88 | What the paper's doing is that it's gonna set up one machine 89 | or they did requires two physical machines 90 | Because there's no point in 91 | running the primary and backup software 92 | in different virtual machines on the same physical machine 93 | Because we're trying to guard against hardware failures 94 | So you're gonna to at least you know you 95 | have two machines running their virtual machine monitors 96 | And the primary is going to run on one 97 | the backup is on the other 98 | So on one of these machines we have a guest 99 | you know we only It might be running a lot of virtual machines 100 | We only care about one of them 101 | It's gonna be running some guest operating system 102 | and some sort of server application 103 | Maybe a database server, MapReduce master, or something 104 | So I'll call this the primary 105 | And there'll be a second machine 106 | that you know runs the same virtual machine monitor 107 | and an identical virtual machine holding the backup 108 | So we have the same whatever the operating system 109 | is exactly the same 110 | And the virtual 
machine is you know giving 111 | these guest operating systems the primary 112 | and backup a each range of memory 113 | and this memory images will be identical 114 | or the goal is to make them identical 115 | in the primary in the backup 116 | We have two physical machines 117 | Each one of them running a virtual machine guest 118 | with its own copy of the service we care about 119 | We're assuming that there's a network 120 | connecting these two machines 121 | And in addition on this Local Area Network 122 | in addition on this network there's some set of clients 123 | Really, they don't have to be clients 124 | They're just maybe other computers 125 | that our replicated service needs to talk with 126 | Some of them are clients sending requests 127 | It turns out in this paper there 128 | the replicated service actually doesn't use a local disk 129 | and instead assumes that there's some sort of 130 | disk server that it talks to him 131 | Although it's a little bit hard to realize this from the paper 132 | The scheme actually does not 133 | really treat the server particularly 134 | Especially it's just another external source of packets 135 | and place that the replicated state machine may send packets to 136 | Not very much different from clients 137 | Okay so the basic scheme is that the we assume that 138 | these two replicas, the two virtual machines, primary and backup, are exact replicas 139 | Some client, you know database client who knows who has 140 | Some client of our replicated server 141 | sends a request to the primary 142 | And that really takes the form of a network packet 143 | that's what we're talking about 144 | That generates an interrupt and this interrupt actually goes to 145 | the virtual machine monitor at least in the first instance 146 | The virtual machine monitor sees 147 | here's the input for this replicated service 148 | And so the virtual machine monitor does two things 149 | One is it sort of simulates a network packet arrival interrupt 150 | into the primary guest operating system 151 | to deliver it to the primary copy of the application 152 | And in addition the virtual machine monitor you know knows that 153 | this is an input to a replicated virtual machine 154 | And it's so it sends back out 155 | on the network a copy of that packet 156 | to the backup virtual machine monitor 157 | It also gets it and backup virtual machine monitor knows 158 | ha it is a packet for this particular replicated state machine 159 | And it also fakes a sort of network packet arrival interrupt 160 | at the backup and delivers the packet 161 | So now both the primary and the backup have a copy 162 | This packet they looks at, the same input 163 | you know with a lot of details 164 | are gonna process it in the same way and stay synchronized 165 | Course the service is probably going to reply to the client 166 | On the primary the service will generate a reply packet 167 | and send it on the NIC 168 | that the virtual machine monitor is emulating 169 | And then the virtual machine monitor will we'll 170 | see that output packet on the primary 171 | They'll actually send the reply back out 172 | on the network to the client 173 | Because the backup is running exactly 174 | the same sequence of instructions 175 | It also generates a reply packet back to the client 176 | and sends that reply packet on its emulated NIC 177 | It's the virtual machine monitor 178 | that's emulating that network interface card 179 | And it says aha you know the virtual machine monitor 
says 180 | I know this was the backup 181 | only the primary is allowed to generate output 182 | And the virtual machine monitor drops the reply packet 183 | So both of them see inputs and only the primary generates outputs 184 | As far as terminology goes 185 | the paper calls this stream of input events 186 | and other things, other events we'll talk about from the stream 187 | is called the logging Channel 188 | -------------------------------------------------------------------------------- /lec04/Lec4-3.zh.txt: -------------------------------------------------------------------------------- 1 | 他们的确是有复制的 2 | 但是并没有在主和副本服务之间复制每一个bit的内存 3 | 但是并没有在主和副本服务之间复制每一个bit的内存 4 | 但是并没有在主和副本服务之间复制每一个bit的内存 5 | 而是复制偏应用程序级别的内存块表 6 | 我对块和块标识符进行了这种抽象 7 | 这就是需要复制的东西 8 | 并不需要复制任何其他的东西 9 | 也没有在该机器上复制任何其他的东西的代价 10 | 也没有在该机器上复制任何其他的东西的代价 11 | 这样做是可以的 12 | 只要主服务和副本服务具有相同应用程序可见性的内存块集 13 | 因此大多数复制方案都采用与GFS相似的方案 14 | 实际上,几乎除了这篇论文以及一些类似的系统之外的所有方案 15 | 实际上,几乎除了这篇论文以及一些类似的系统之外的所有方案 16 | 他们几乎都使用了应用级别的复制 17 | 他们几乎都使用了应用级别的复制 18 | 因为这样可以更有效率 19 | 因为我们不必费力的去确保 20 | 因为我们不必费力的去确保 21 | 在主和副本服务运行时 22 | 中断发生在完全相同的时间点 23 | GFS完全不用担心这点 24 | 但是本文必须要确保这点 25 | 因为它在很低的级别进行复制 26 | 因此大多数人使用特定于应用程序的复制来构建高效的系统 27 | 因此大多数人使用特定于应用程序的复制来构建高效的系统 28 | 这样做的后果是 29 | 必须将复制内置到应用程序权限中 30 | 必须将复制内置到应用程序权限中 31 | 例如,如果你需要获取应用程序级别操作的提要 32 | 你就需要让应用程序参与其中 33 | 因为有些通用的复制方案,例如今天的论文 34 | 并不能理解 35 | 哪些东西需要被复制的语义 36 | 因此大多数方案是针对于特定应用的 37 | 例如GFS以及我们将要在这个主题下阅读的所有其他论文 38 | 今天的论文的不同之处在于 39 | 它实在机器级别进行复制的 40 | 因此它不关系在其上运行了什么软件 41 | 它复制低级别的内存以及寄存器 42 | 你可以其上运行任何软件 43 | 只要它可以在这种所表示的微处理器上运行 44 | 只要它可以在这种所表示的微处理器上运行 45 | 这种复制方案可以适应任何软件 46 | 缺点是效率不一定高 47 | 优点是你可以使用任何现有的软件 48 | 甚至你没有源代码或者不知道它是如何工作的 49 | 在一定的限制下 50 | 你就可以在VMware的复制方案下运行它 51 | 它可以正常工作 52 | 且对于任意软件都可以进行容错 53 | 现在我们来讨论VMware FT 54 | 首先,VMware是一家虚拟机公司 55 | 他们的很多业务都是销售虚拟机技术 56 | 他们的很多业务都是销售虚拟机技术 57 | 虚拟机指的是 58 | 你买一台电脑 59 | 在硬件上不是启动像Linux这样的操作系统 60 | 而是启动虚拟机监视器 61 | 而是启动虚拟机监视器 62 | 它的工作实际上是 63 | 在此硬件上模拟多台虚拟的电脑 64 | 在此硬件上模拟多台虚拟的电脑 65 | 因此虚拟机监视器可能会启动一个Linux实例 66 | 多个Linux实例,或者一个Windows实例 67 | 这台计算机上的虚拟机监视器可以运行许多不同的操作系统 68 | 这台计算机上的虚拟机监视器可以运行许多不同的操作系统 69 | 它们每个包含某种操作系统内核以及应用程序 70 | 所以这是他们开始使用的技术 71 | 原因是事实证明 72 | 原因是事实证明 73 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 74 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 75 | 在硬件和操作系统之间进行这种级别的间接干预非常方便 的原因有很多 76 | 这意味着我们可以购买一台计算机 77 | 并在其上运行许多不同的操作系统 78 | 如果我们运行大量的小型服务 79 | 而不是使用大量的每台运行一个服务的计算机 80 | 你可以只购买一台计算机 81 | 在基于虚拟机上的操作系统中运行每个服务 82 | 在基于虚拟机上的操作系统中运行每个服务 83 | 这就是他们的出发点 84 | 在最开始设计VMware FT时 85 | 他们已经构建了这项功能和许多其他复杂的东西 86 | 他们已经构建了这项功能和许多其他复杂的东西 87 | 所以这就是虚拟机 88 | 论文要做的是要搭建一台机器 89 | 或者说他们需要两台物理机 90 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 91 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 92 | 因为在同一台物理计算机上的不同虚拟机中运行主和副本软件毫无意义 93 | 因为我们正在努力应对硬件故障 94 | 因此,你有两台计算机分别运行其虚拟机监视器 95 | 因此,你有两台计算机分别运行其虚拟机监视器 96 | 而主虚拟机将在一台计算机上运行 97 | 而副本虚拟机将在另一台上运行 98 | 在其中一台计算机上有一个guest操作系统... 
99 | 它可能正在运行许多虚拟机 100 | 我们只在乎其中的一个 101 | 它会运行多个guest操作系统 102 | 以及服务应用程序 103 | 也许是数据库服务,MapReduce主数据库或其他东西 104 | 我们称这个为主虚拟机 105 | 这里有第二台计算机 106 | 其运行相同的虚拟机监视器 107 | 也有运行副本服务的相同的虚拟机 108 | 因此,无论是何种操作系统,我们都具有完全相同的东西 109 | 因此,无论是何种操作系统,我们都具有完全相同的东西 110 | 虚拟机为这些guest操作系统、主和副本服务器 111 | 虚拟机为这些guest操作系统、主和副本服务器 112 | 提供一定范围的内存空间 113 | 并且这两个内存镜像是完全相同的 114 | 或其目标是使它们在主和副本虚拟机中完全相同 115 | 或其目标是使它们在主和副本虚拟机中完全相同 116 | 我们有两台物理计算机 117 | 每台都在运行guest虚拟机 118 | 该虚拟机上带有我们关心的服务的副本 119 | 我们假设有一个网络连接了这两台机器 120 | 我们假设有一个网络连接了这两台机器 121 | 此外,在此局域网上还有一些客户端 122 | 此外,在此局域网上还有一些客户端 123 | 事实上,它们不一定是是客户端 124 | 它们可能是其他计算机 125 | 复制服务需要与之通信 126 | 其中一些是来发送请求的客户端 127 | 这篇论文中的复制服务实际上并不使用本地磁盘 128 | 这篇论文中的复制服务实际上并不使用本地磁盘 129 | 而是假设与某种磁盘服务器进行通信 130 | 尽管从本篇论文中很难意识到这一点 131 | 尽管从本篇论文中很难意识到这一点 132 | 该方案实际上并没有特殊对待这种服务器 133 | 该方案实际上并没有特殊对待这种服务器 134 | 它只是数据包的另一个外部来源 135 | 只是复制状态机可能会将数据包发送到的地方 136 | 这与其他客户端没有太大不同 137 | 因此,基本方案是,我们假设 138 | 这两个副本、两个虚拟机、或者说主和副本虚拟机,都是精确的副本 139 | 某个客户端,例如数据库客户端 140 | 复制服务器的某个客户端 141 | 向主虚拟机发送请求 142 | 而这实际上是以网络数据包的形式发送的 143 | 就是我们刚刚讨论的 144 | 它生成一个中断 145 | 该中断进入第一个实例的虚拟机监视器 146 | 虚拟机监视器发现 147 | 复制服务的输入到来了 148 | 因此,虚拟机监视器会做两件事 149 | 第一件事,它模拟网络数据包到达中断,传递给主guest操作系统 150 | 第一件事,它模拟网络数据包到达中断,传递给主guest操作系统 151 | 以此将其传递给应用程序的主副本 152 | 第二件事,虚拟机监视器知道这是复制虚拟机的输入 153 | 第二件事,虚拟机监视器知道这是复制虚拟机的输入 154 | 因此,它通过网络将数据包副本 155 | 因此,它通过网络将数据包副本 156 | 发送给副本虚拟机监视器 157 | 所以它也得到了数据包,副本虚拟机监视器 158 | 知道它是此复制状态机的数据包 159 | 它在副本虚拟机中也会构造网络数据包到达中断,并传送数据包 160 | 它在副本虚拟机中也会构造网络数据包到达中断,并传送数据包 161 | 所以现在主和副本虚拟机都有了数据包的一份副本 162 | 它们看到的这个数据包、这个相同的输入 163 | 通过考虑大量的细节 164 | 会以相同的方式处理并保持同步 165 | 当然,服务可能会回复客户 166 | 在主虚拟机上,服务将生成一个回复数据包 167 | 将其发送到虚拟机监视器所模拟的NIC上 168 | 将其发送到虚拟机监视器所模拟的NIC上 169 | 然后,虚拟机监视器将会 170 | 在主计算机上看到该输出数据包 171 | 它们会将回复通过网络发送回客户端 172 | 它们会将回复通过网络发送回客户端 173 | 由于副本在运行 174 | 完全相同的指令序列 175 | 它也会生成一个回复数据包返回给客户端 176 | 在其模拟的NIC上发送该回复数据包 177 | 虚拟机监视器模拟了该网卡 178 | 虚拟机监视器模拟了该网卡 179 | 虚拟机监视器知道这是副本虚拟机 180 | 虚拟机监视器知道这是副本虚拟机 181 | 而它只允许主虚拟机生成输出 182 | 因此,虚拟机监视器会丢弃回复数据包 183 | 所以他们两个都看到了输入,而只有主虚拟机产生了输出 184 | 就术语而言,这篇论文将这种输入事件流 185 | 就术语而言,这篇论文将这种输入事件流 186 | 以及之后要讨论的其他事件流 187 | 称为日志记录通道 188 | -------------------------------------------------------------------------------- /lec04/Lec4-4.en.txt: -------------------------------------------------------------------------------- 1 | logging Channel. 
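(To make the mechanism above concrete, here is a minimal Go sketch of the two rules the virtual machine monitors enforce: every input is delivered to both guests, but only the primary's output leaves the machine. All names are invented for illustration; the real logic lives inside VMware's hypervisor, not in guest-level code.)

```go
// outputrule.go: a sketch of how a VMM might treat packets for a replicated
// virtual machine. Inputs are copied to the backup over the logging channel
// and delivered to both guests; only the primary's replies are sent on the wire.
package main

import "fmt"

type Role int

const (
	Primary Role = iota
	Backup
)

// VMM stands in for the virtual machine monitor's packet handling.
type VMM struct {
	role  Role
	logCh chan<- []byte // primary -> backup copy of every input (the logging channel)
}

// handleInput runs when a client packet arrives at a VMM.
func (v *VMM) handleInput(pkt []byte) {
	if v.role == Primary {
		v.logCh <- pkt // ship a copy of the input to the backup's VMM
	}
	deliverFakeInterrupt(pkt) // both guests see the same input packet
}

// handleOutput runs when the guest sends a reply on its emulated NIC.
func (v *VMM) handleOutput(pkt []byte) {
	if v.role == Primary {
		sendOnWire(pkt) // only the primary's reply reaches the client
	}
	// on the backup the identical reply is silently dropped
}

func deliverFakeInterrupt(pkt []byte) { /* inject the packet + arrival interrupt into the guest */ }

func sendOnWire(pkt []byte) { fmt.Printf("sent %d bytes to the client\n", len(pkt)) }

func main() {
	logCh := make(chan []byte, 1)
	primary := &VMM{role: Primary, logCh: logCh}
	backup := &VMM{role: Backup}

	req := []byte("increment request")
	primary.handleInput(req)    // primary's VMM gets the client packet...
	backup.handleInput(<-logCh) // ...and the backup's VMM replays the copy

	reply := []byte("new value = 11")
	primary.handleOutput(reply) // sent to the client
	backup.handleOutput(reply)  // discarded
}
```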
it all goes over the 2 | same network presumably but these events 3 | the primary send to the backup are called 4 | log events on the log Channel 5 | where the fault tolerance comes in is 6 | that those, the primary crashes, what the 7 | backup is going to see is that it stops 8 | getting stuff ,on the ,stops getting log 9 | entries ,a log entry ,stops getting log 10 | entries on the logging channel and we 11 | know it it turns out that the backup can 12 | expect to get many per second because 13 | one of the things that generates log 14 | entries is periodic timer interrupts in 15 | the in the primary each one of which 16 | turns out every interrupt generates a 17 | log entries into the backup these timer 18 | interrupts are going to happen like 100 19 | times a second so the backups can 20 | certainly expect to see 21 | a lot of chitchat on the logging Channel 22 | if the primaries up .if the primary 23 | crashes then the virtual machine 24 | monitored over here will say gosh you 25 | know I haven't received anything on the 26 | logging channel for like a second or 27 | however long the primary must be dead or 28 | or something and in that case when the 29 | backup stop seeing log entries from the 30 | primary the paper the way the paper 31 | phrases it is that the backup goes alive 32 | and what that means is that it stops 33 | waiting for these input events on the 34 | logging Channel from the primary and 35 | instead this virtual machine monitor 36 | just lets this backup execute freely 37 | without waiting for without being driven 38 | by input events from the primary ,the vmm 39 | does something to the network to cause 40 | future client requests to go to the 41 | backup instead of the primary and the 42 | VMM here stops discarding the backup 43 | personnel it's the primary not the 44 | backup stops discarding output from this 45 | virtual machine so now this or machine 46 | directly gets the inputs and there's a 47 | lot of produce output and now our backup 48 | is taken over and similarly you know 49 | that this is less interesting but has to 50 | work correctly 51 | if the backup fails a similar primary 52 | has to use a similar process to abandon 53 | the backup stop sending it events and 54 | just sort of act much more like a single 55 | non replicated server so either one of 56 | them can go live if the other one 57 | appears to be dead ,stops, you know stops 58 | generating network traffic. 
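(A minimal Go sketch of the go-live decision just described, assuming a made-up LogEntry type and timeout; in the real system this check happens inside the virtual machine monitor.)

```go
// golive.go: a sketch of how a backup might decide to "go live" once the
// logging channel falls silent for too long. Names are hypothetical.
package main

import (
	"fmt"
	"time"
)

// LogEntry stands in for one event on the logging channel: an input packet,
// a timer interrupt, or the result of a non-deterministic instruction.
type LogEntry struct {
	Instruction uint64 // instruction number at which the backup must deliver it
	Data        []byte
}

// runBackup replays log entries until the channel has been silent for too long.
func runBackup(logCh <-chan LogEntry, timeout time.Duration) {
	for {
		select {
		case e := <-logCh:
			replay(e) // deliver the event at the same instruction number the primary saw it
		case <-time.After(timeout):
			// Timer interrupts alone should arrive ~100 times per second,
			// so a long silence means the primary is probably dead.
			fmt.Println("logging channel silent: backup goes live")
			goLive()
			return
		}
	}
}

func replay(e LogEntry) { /* inject the event into the backup guest */ }

func goLive() {
	// From here on the backup executes freely, takes over the service's
	// network identity, and its output packets are no longer discarded.
}

func main() {
	logCh := make(chan LogEntry)
	go runBackup(logCh, time.Second)
	logCh <- LogEntry{Instruction: 100} // a couple of events from the primary...
	logCh <- LogEntry{Instruction: 250}
	time.Sleep(2 * time.Second) // ...then silence, as if the primary crashed
}
```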
59 | magic, now it depends ,you know depends on 60 | what the networking technology is I 61 | think with the paper one possibility is 62 | that this is sitting on Ethernet every 63 | physical computer on the Internet or 64 | really every NIC has a 48 bit unique ID 65 | I'm making this up now, the ,it could be 66 | that in fact instead of each physical 67 | computer having a unique ID each virtual 68 | machine does and when the backup takes 69 | over it essentially claims the primary's 70 | Ethernet ID as its own and it starts 71 | saying you know I'm the owner of that ID 72 | and then other people on the ethernet 73 | will start sending us packets that's my 74 | interpretation ,the designers believed 75 | they had identified all such sources and 76 | for each one of them the primary does 77 | whatever it is you know executes the 78 | random number generator instruction or 79 | takes an interrupt at some time the 80 | backup does not and the back of virtual 81 | machine monitor sort of detects any such 82 | instruction and and intercepts that and 83 | doesn't do it and he said the backup 84 | waits for an event on the logging 85 | Channel saying this instruction number 86 | you know the random number was whatever 87 | it was on the primary 88 | At which? 89 | yes yes 90 | yeah the paper hints that they got Intel 91 | to add features to the microprocessor to 92 | support exactly this but they don't say 93 | what it was ,okay 94 | okay so on that topic ,the ,so far that 95 | you know the story is sort of assumed 96 | that as long as the backup to sees the 97 | package from the clients it'll execute 98 | in identically to the primary and that's 99 | actually glossing over some huge and 100 | important details so one problem is that 101 | as a couple of people have mentioned 102 | there are some things that are 103 | non-deterministic now it's not the case 104 | that every single thing that happens in 105 | the computer is a deterministic function 106 | of the contents of the memory of the 107 | computer it is for a sort of straight 108 | line code execution often but certainly 109 | not always so worried about is things 110 | that may happen that are not a strict 111 | function of the current state that is 112 | that might be different if we're not 113 | careful on the primary and backup so 114 | these are sort of non-deterministic 115 | events that may happen so the designers 116 | had to sit down and like figure out what 117 | they all work and here are the ones 118 | here's the kind of stuff they talked 119 | about so one is inputs from external 120 | sources like clients which arrive just 121 | whenever they arrive right they're not 122 | predictable there are no sense in which 123 | the time at which a client request 124 | arrives or its content is a 125 | deterministic function of the services 126 | state because it's not ,so these actually , 127 | this system is really dedicated to a 128 | world in which services only talk over 129 | the network and so the only really 130 | basically the only form of input or 131 | output in this system is supported by 132 | this system seems to be network packets 133 | coming and going. 
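(For the special non-deterministic instructions mentioned above, such as time-of-day or random-number instructions, the primary executes the instruction and logs the result, while the backup's monitor intercepts the instruction and substitutes the logged value at the same instruction number. A rough Go sketch with invented names:)

```go
// nondet.go: a sketch of keeping a non-deterministic instruction identical on
// primary and backup. The primary really executes it and logs the result; the
// backup never executes it and uses the logged value instead.
package main

import (
	"fmt"
	"time"
)

// NonDetResult is what the primary logs for one non-deterministic instruction.
type NonDetResult struct {
	Instruction uint64 // instruction number at which the guest executed it
	Value       int64  // whatever the primary actually observed
}

// On the primary: really execute the instruction (here, read the clock) and
// send the result down the logging channel.
func primaryTimeOfDay(instr uint64, logCh chan<- NonDetResult) int64 {
	v := time.Now().UnixNano()
	logCh <- NonDetResult{Instruction: instr, Value: v}
	return v
}

// On the backup: the monitor intercepts the instruction, never executes it,
// and substitutes the value the primary logged for that instruction number.
func backupTimeOfDay(instr uint64, logCh <-chan NonDetResult) int64 {
	r := <-logCh
	if r.Instruction != instr {
		panic("primary and backup executions have diverged")
	}
	return r.Value
}

func main() {
	logCh := make(chan NonDetResult, 1)
	p := primaryTimeOfDay(42, logCh)
	b := backupTimeOfDay(42, logCh)
	fmt.Println("primary observed", p, "- backup uses", b, "- identical:", p == b)
}
```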
so we didn't put 134 | arrives at what that really means it's a 135 | packet 136 | arrives and what a packet really 137 | consists of for us is the data in the 138 | packet plus the interrupt 139 | that's signaled that the packet had 140 | arrived so that's quite important, so 141 | when a packet arrives 142 | I'm ordinarily the NIC DMAs the packet 143 | contents into memory and then raises an 144 | interrupt which the operating system 145 | feels and the interrupt happens at some 146 | point in the instruction stream and so 147 | both of those have to look identical on 148 | the primary and backup or else we're 149 | gonna have they're also executions gonna 150 | diverge and so you know the real issue 151 | is when the interrupt occurs exactly at 152 | which instruction the interrupts happen 153 | to occur and better be the same on the 154 | primary in the backup otherwise their 155 | execution is different and their states 156 | are gonna diverge and so we care about 157 | the content of the packet and the timing 158 | of the interrupt and then as a couple of 159 | people have mentioned there's a few 160 | instructions that that behave 161 | differently on different computers or 162 | differently depending on something like 163 | there's maybe a random number generator 164 | instruction there's I get time-of-day 165 | instructions that will yield different 166 | answers have called at different times 167 | and unique ID instructions another huge 168 | source of non determinism which the 169 | paper basically rules out is multi-core 170 | parallelism this is a uni-process only 171 | system there's no multi-core in this 172 | world the reason for this is that if it 173 | allowed multi-core then then the service 174 | would be running on multiple cores and 175 | the instructions of the service the rest 176 | of you know the different cores are 177 | interleaved in some way which is not 178 | predictable and so really if we run the 179 | same code on the on the backup in the 180 | server if it's parallel code running on 181 | a multi-core the tube interleave the 182 | instructions in the two cores in 183 | different ways the hardware will and 184 | that can just cause 185 | different results because you know 186 | supposing the code and the two cores you 187 | know they both asked for a lock on some 188 | data well on the master you know 189 | core one may get the lock before core two 190 | on the slave just because of a tiny 191 | timing difference core two may got the 192 | lock first and the you know execution 193 | results are totally different likely to 194 | be totally different if different 195 | threads get the lock 196 | so multi-core is the grim source among 197 | non-determinisms just totally 198 | outlawed in this papers world and indeed 199 | like as far as I can tell the techniques 200 | are not really applicable. 
the service 201 | can't use multi-core parallel -------------------------------------------------------------------------------- /lec04/Lec4-4.zh.txt: -------------------------------------------------------------------------------- 1 | 它们都在同一条网络上传输 2 | 推测来看,这些 3 | 被主节点发往备份节点的事件 4 | 被称为在日志通道上的日志记录 5 | 容错出现的场景是 6 | 主节点挂掉了 7 | 备份节点觉察到它接受不到日志了 8 | 备份节点停止接受日志记录了 9 | 接收不到日志通道上的日志数据了 10 | 接收不到日志通道上的日志数据 11 | 我们知道备份节点 12 | 每秒可以接受许多事件, 13 | 因为其中的一些事件就是 14 | 由周期性的时钟中断触发产生的, 15 | 在主节点上,每个中断产生一个日志记录, 16 | 在主节点上,每个中断产生一个日志记录, 17 | 在主节点上,每个中断产生一个日志记录, 18 | 这些时钟中断可以以每秒100次的频率触发, 19 | 所以备份节点 20 | 理应接受到 21 | 日志通道上的许多通信消息 22 | 如果主节点还存活着的话 23 | 如果主节点挂掉的话,VMM监视器 24 | 会说:天哪, 25 | 我已经有一秒或者更长时间没有收到日志同道上的事件了 26 | 我已经有一秒或者更长时间没有收到日志同道上的事件了 27 | 那主节点应该挂掉了, 28 | 或者发生了其他事情,当发生这些事情时候, 29 | 备份节点接收不到主节点的日志记录时候, 30 | 论文中描述说,备份节点会启动 31 | 论文中描述说,备份节点会启动 32 | 它将停止接收来自主节点日志通道上的输入事件 33 | 它将停止接收来自主节点日志通道上的输入事件 34 | 它将停止接收来自主节点日志通道上的输入事件 35 | 相反VMM 36 | 会让备份节点开始自由执行 37 | 而不需要通过主节点的输入事件来驱动执行 38 | 而不需要通过主节点的输入事件来驱动执行 39 | 同时VMM会变更网络层配置信息 40 | 让未来的客户端请求路由到备份节点上 41 | 而不是到原来的主节点上 42 | VMM此时停止之前抛弃备份节点返回数据的策略 43 | 此时原来旧的主节点 44 | 开始抛弃输出数据 45 | 所以这台机器开始 46 | 接受输入事件, 47 | 并且输出输出事件 48 | 这样我们的备份节点完成接管流程 49 | 类似的这个接管流程不是非常有趣 50 | 但是需要能正常的work 51 | 如果备份节点挂掉了,主节点 52 | 需要用同样的一套机制来放弃备份节点 53 | 停止向备份节点发送日志记录 54 | 此时主节点更像是一个单节点的 55 | 没有复制功能的服务, 56 | 所以它们中的一个会变活 57 | 当另一个挂掉的时候,或者停止发送网络流量 58 | 当另一个挂掉的时候,或者停止发送网络流量 59 | 神奇,这个得看具体情况 60 | 看具体的网络拓扑结构是怎样的 61 | 我认为论文中的一个可能是 62 | 在Ethernet层基础上 63 | 每一个物理机或者NIC卡 64 | 有一个48bit的唯一ID 65 | 也有可能是另一种情况 66 | 并不是每一个物理机 67 | 有一个唯一Id,而是每一个虚拟机有一个唯一Id 68 | 当备份节点接管的时候 69 | 它会申明 70 | 主节点的Ethernet Id变成它自己的Id 71 | 然后对外声明它是那个唯一Id的所有者 72 | 这样网络上的其他节点 73 | 就会向我们发送数据包,这是我这边的理解 74 | 这套机制的设计者 75 | 相信他们确认了所有的事件源 76 | 针对每一个这样的事件源,主节点 77 | 都会执行,如你所知,不管是 78 | 一个随机数生成指令,或者 79 | 在某个时间点触发一个中断事件, 80 | 但是备份节点并不执行这些操作,VMM的备份节点 81 | 会检测到这些指令 82 | 会截获这些指令 83 | 并且不执行它,备份节点 84 | 会等待日志通道上的一个对应的日志事件 85 | 告诉它具体的指令结果是啥 86 | 你知道的,那就是在主节点上产生的那个随机数 87 | 你知道的,那就是在主节点上产生的那个随机数 88 | 在哪个节点? 
89 | 是的是的 90 | 论文暗示了他们让Intel 91 | 在微处理器上加入了一些特性 92 | 来支持这个功能,但是他们没有说 93 | 到底是什么 94 | 关于这个主题,到现在为止 95 | 你们知道的是 96 | 只要备份节点能够收到 97 | 从client发送过来的数据包 98 | 它就会和主节点保持一致的执行它 99 | 但实际上我们忽略了一些巨大和重要的细节 100 | 其中的一个问题就是 101 | 许多人之前提到的 102 | 有一些操作是非确定性的 103 | 事情并不是这样的 104 | 发生在机器里的每一个操作并不一定是内存内容 105 | 的确定性函数映射关系 106 | 的确定性函数映射关系 107 | 对一些常见的直接代码执行操作是确定性的 108 | 对一些常见的直接代码执行操作是确定性的 109 | 但是并不总是这样的,我们担心的是 110 | 某些操作并不是当前状态的严格函数关系 111 | 某些操作并不是当前状态的严格函数关系 112 | 即结果可能不一样,如果我们没有细心的考虑 113 | 主节点和备份节点之间的关系 114 | 所以这是一些非确定性的事件 115 | 它们的存在使设计者们不得不 116 | 坐下来好好思考弄清楚 117 | 它们要如何工作才行 118 | 这里是他们要讨论的内容 119 | 其中的一个是外部源的输入事件 120 | 这些事件随便什么时候都可以到来 121 | 这些事件随便什么时候都可以到来 122 | 它们是不可预测的,没有道理可以预先知道这些请求什么时候到来 123 | 没有道理可以预先知道这些请求什么时候到来 124 | 或者它的内容是服务状态的确定性映射关系,因为它并不是 125 | 或者它的内容是服务状态的确定性映射关系,因为它并不是 126 | 所以呢 127 | 所以这个系统真正面对的是一个服务之间通过网络进行交互的场景 128 | 所以这个系统真正面对的是一个服务之间通过网络进行交互的场景 129 | 所以这个系统仅有的的输入和输出方式 130 | 所以这个系统仅有的的输入和输出方式 131 | 是通过网络包的输入和输出形式来支撑的。 132 | 是通过网络包的输入和输出形式来支撑的。 133 | 所以我们没有把到达操作记录下来,它真正的意思是一个数据包到达了 134 | 它真正的意思是一个数据包到达了 135 | 它真正的意思是一个数据包到达了 136 | 同时一个数据包是由 137 | 数据包中的数据 138 | 加上用来通知数据到达的中断事件组成的 139 | 加上用来通知数据到达的中断事件组成的 140 | 这一点是非常重要的 141 | 所以当一个数据包到达的时候 142 | NIC卡通过DMA机制将包内容拷贝到内存 143 | 然后触发中断操作 144 | 操作系统收到中断事件 145 | 同时这个中断会在指令流的某个时间点执行 146 | 同时这个中断会在指令流的某个时间点执行 147 | 所有这些操作在主节点和备份节点必需看起来是一样的 148 | 所有这些操作在主节点和备份节点必需看起来是一样的 149 | 否则我们将会看到有些操作将会引起分叉 150 | 所以你知道真正的问题是 151 | 当中断真正的在某一条指令执行触发的时候, 152 | 当中断真正的在某一条指令执行触发的时候, 153 | 这个需要在主节点和备份节点是一致的 154 | 否则它们的执行可能出现不一致,他们的状态也将出现分叉 155 | 否则它们的执行可能出现不一致,他们的状态也将出现分叉 156 | 所以我们需要关注数据包的内容,中断的时序 157 | 所以我们需要关注数据包的内容,中断的时序 158 | 中断的时序以及 159 | 其他人提到的 160 | 有一些指令 161 | 在不同的机器上有不用的执行结果 162 | 或者依赖其他情况 163 | 比如一个随机数生成器 164 | 或者获取时间的指令 165 | 在不同时间执行会获取到不同的结果 166 | 在不同时间执行会获取到不同的结果 167 | 或者是唯一Id生成指令 168 | 另一个非确定性的主要来源 169 | 但是在论文中被显式剔除掉的 170 | 多核并行的场景,论文假定了一个单核处理, 171 | 并不存在多核情况的世界 172 | 这样做的原因是 173 | 如果允许存在多核处理的话, 174 | 服务将会运行在多个核上 175 | 这样的话服务的指令 176 | 如我们所知晓的,不同的核将交叉执行指令 177 | 不同的核将以某种方式交叉执行指令 178 | 但是这个次序是不可预知的,所以如果我们 179 | 在备份节点执行相同的指令 180 | 这些在多核上执行的并行指令 181 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 182 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 183 | 电子器件将在两个核上以不同的组合方式交叉执行这些指令 184 | 那样会导致出现不同的计算结果, 185 | 因为你知道 186 | 设想这些代码执行在两个核上 187 | 它们都在获取某些数据的锁 188 | 在主节点上 189 | 核芯1有可能比核芯2先获得了锁 190 | 但是在备份节点上,可能仅仅是由于一个很小 191 | 的时序差异,核芯2可能先获取到锁 192 | 这样的话执行结果 193 | 是完全不一样的, 194 | 执行结果可能是完全不一样的,如果是由 195 | 不同的线程获取了锁 196 | 所以多核场景是很糟糕的一种 197 | 导致非确定性的源头 198 | 在这篇论文中仅仅是禁止出现这种场景, 199 | 就我所知道的而言,论文中的这些技术 200 | 在实际中不是很实用。 201 | 服务不能使用多核并行方式 -------------------------------------------------------------------------------- /lec04/Lec4-5.en.txt: -------------------------------------------------------------------------------- 1 | so we can't let that happen 2 | because if the primary happens to be playing that trick 3 | it's gonna see 4 | you know if we allowed the network interface card to directly DMA incoming packets into the memory of the primary 5 | the primary we don't have any control over the exact timing of 6 | and so we're not going to know sort of at what times the primary did or didn't observe data from the packet arriving 7 | and so what that means is that in fact the NIC copies incoming packets into private memory of the virtual machine monitor 8 | and then the network interface card interrupts the virtual machine monitor and says oh a packet has arrived 9 | at that point the virtual machine monitor will suspend the primary 10 | and remember what instruction number had suspended at 11 | copy the entire packet into the primaries memory while the primary suspended and not looking at this copy 12 | and then 
emulate a network interface card interrupt into the primary 13 | and then send the packet and the instruction number to the backup 14 | the backup's virtual machine monitor will also suspend the backup, 15 | you know, the virtual machine monitor will suspend the backup at that instruction number, 16 | copy the entire packet in, and again the backup is guaranteed not to be watching the data arrive, 17 | and then fake an interrupt at the same instruction number as on the primary 18 | and this is the bounce buffer mechanism explained in the paper 19 | okay, yeah, the question is whether the only instructions that result in logging channel traffic are weird instructions which are rare 20 | no, it's instructions that might yield a different result if executed on the primary and backup 21 | like instructions to get the current time of day or current processor number 22 | or ask how many instructions have been executed 23 | and those actually turn out to be relatively rare 24 | there's also one on some machines to get random numbers, 25 | to ask for a hardware-generated random number for cryptography or something 26 | but those are not everyday instructions 27 | most instructions, like add instructions, are gonna get the same result on primary and backup 28 | exactly right, each network packet is just packaged up and forwarded as it is, as a network packet 29 | and is interpreted by the tcp/ip stack on both, you know 30 | so I'm expecting 99.99% of the logging channel traffic to be incoming packets 31 | and only a tiny fraction to be results from special non-deterministic instructions 32 | and so we can kind of guess what the traffic load is likely to be for a server that serves clients 33 | basically it's a copy of every client packet 34 | and then we'll sort of know how fast the logging channel has to be 35 | it's worth talking a little bit about how output works 36 | and in this system, really, what output basically means is only sending packets 37 | clients send requests in as network packets, the response goes back out as network packets 38 | and there's really no other form of output 39 | as I mentioned, you know, both primary and backup compute the output packet they want to send 40 | and sort of ask the emulated NIC to send the packet; it's really sent on the primary 41 | and the output packet is simply discarded on the backup 42 | okay, but it turns out to be a little more complicated than that 43 | so supposing what we're running is some sort of simple database server 44 | and the client operation that our database server supports is increment 45 | and the idea is the client sends an increment request, the database server increments the value and sends back the new value 46 | so maybe on the primary, well, let's say everything's fine so far 47 | and the primary and backup both have value 10 in memory, and that's the current value of the counter 48 | and some client on the local area network sends, you know, an increment request 49 | to the primary, that packet is, you know, delivered to the primary, 50 | it's executed by the primary server software, and the primary 51 | says oh, you know, current value's 10, I'm gonna change it to 11 52 | and sends a, you know, response packet back to the client saying 11 53 | the same request is also supposed to be sent to the backup and processed here; it's going to change this 10 to 11 also 54 | generate a reply, and we'll throw that away; that's what's supposed to happen with the output
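To make the output path concrete, here is a small illustrative Go sketch (all names invented, not an actual VMM API) of the behavior just described: both replicas run the same guest code and both hand an identical reply packet to their emulated NIC, but only the primary's copy ever reaches the real network.

```go
// Package ftnic sketches, purely for illustration, how an emulated NIC might
// treat guest output on the primary versus the backup. Names are invented.
package ftnic

type Role int

const (
	Primary Role = iota
	Backup
)

// EmulatedNIC stands in for the virtual NIC the VMM presents to the guest.
type EmulatedNIC struct {
	role Role
	wire func(pkt []byte) error // actually puts bytes on the physical network
}

// Send is called when the guest asks its virtual NIC to transmit a packet.
// Both replicas compute the same reply and both call Send with identical
// bytes; only the primary's VMM lets the packet out onto the wire.
func (n *EmulatedNIC) Send(pkt []byte) error {
	if n.role == Backup {
		return nil // the backup's output is computed but silently discarded
	}
	return n.wire(pkt)
}
```

(As the discussion that follows shows, the primary is actually not allowed to release the packet quite this eagerly; that refinement is the output rule.)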
55 | however, you also need to ask yourself what happens if there's a failure at an awkward time 56 | and you should always, in this class, ask yourself 57 | what's the most awkward time to have a failure, and what would happen if a failure occurred then. so 58 | suppose the primary does indeed generate the reply here back to the client 59 | but the primary crashes just after sending its reply to the client 60 | and furthermore, and much worse, it turns out that, you know, this is just a network, it doesn't guarantee to deliver packets 61 | let's suppose this log entry on the logging channel got dropped also when the primary died 62 | so now the state of play is: the client received a reply saying 11 63 | but the backup did not get the client request, so its state is still 10 64 | now the backup takes over, because it sees the primary is dead, and 65 | this client, or maybe some other client, sends an increment request to the backup 66 | and now it's really processing these requests, and so the backup, when it gets the next increment request, 67 | you know, it's now going to change its state to 11 68 | and generate a second 11 response 69 | maybe to the same client, maybe to a different client 70 | which, if the clients compare notes, or if it's the same client, is just an outcome that obviously should not have been possible 71 | so, you know, because we have to support unmodified software that does not know 72 | that there's any funny business of replication going on 73 | that means we do not have the opportunity to, 74 | you know, you can imagine 75 | we could change the client to realize 76 | something funny happened with the fault tolerance and do I don't know what 77 | but we don't have that option here 78 | because this whole system really only makes sense if we're running unmodified software 79 | so this is a disaster 80 | we can't let this happen; does 81 | anybody remember from the paper how they prevent this from happening? 82 | the output rule, yeah 83 | so the output rule is their solution to this problem 84 | and the idea is that the primary is not allowed to, 85 | you know, generate any output 86 | and what we're talking about now is this output here 87 | until the backup acknowledges that it has received all log records up to this point 88 | so the real sequence at the primary then, let's now un-crash the primary 89 | and go back to the state starting at 10; the 90 | real sequence now, under the output rule, is that 91 | at the time the input arrives 92 | that's when the virtual machine monitor sends a copy of the input to the backup 93 | so the time at which this log message with the input 94 | is sent is strictly before the primary generates the output, sort of obvious 95 | then after firing this log entry off across the network, now it's heading towards the backup 96 | but it might have been lost, might not 97 | the virtual machine monitor delivers the request to the primary server software and it generates the output 98 | so now, you know, the primary has actually changed the state to 11 99 | and generated an output packet that says eleven 100 | but the virtual machine monitor says oh wait a minute, we're not allowed to generate that output 101 | until all previous log records have been acknowledged by the backup 102 | so you know this is the most recent previous log message 103 | so this output is held by the 
virtual machine monitor 104 | until the this log entry containing the input packet from the client 105 | is delivered to the virtual machine monitor and buffered by the virtual machine monitor 106 | but do not necessarily execute it 107 | it may be just waiting for the backup to get to that point in the instruction stream 108 | and then the virtual machine monitor here will send an ACK packet back saying yes I did get that input 109 | and when the acknowledgment comes back -------------------------------------------------------------------------------- /lec04/Lec4-5.zh.txt: -------------------------------------------------------------------------------- 1 | 所以我们不能让这种情况发生 2 | 如果在主节点用的是这种方式 3 | 它就会观察到 4 | 如果我们允许网卡直接将接收到的包DMA到主节点内存 5 | 我们没办法精确控制网卡将数据复制到内存的时间点 6 | 所以我们也不知道 什么时候主节点观察到网络包中的数据 7 | 这就意味着网卡复制包进虚拟机监视器的虚拟内存 8 | 然后打断虚拟机监视器告诉它有抵达的包 9 | 这时虚拟机监视器就会中断主节点 10 | 记住当前位置的指令号 11 | 在主节点中断的时候 复制整个包到主节点的内存 12 | 然后模拟主节点的网卡中断 13 | 然后发送包和中断位置的指令号到副节点 14 | 副节点也会中断 15 | 副节点也会在同样的指令号位置中断 16 | 复制整个包到副节点的内存 同样副节点中断没有注意包的抵达过程 17 | 然后模拟副节点在同样指令号位置中断 18 | 这就是论文中描述的回弹缓冲机制 19 | 会产生记录通道流量的指令都是不寻常的指令 很少见 20 | 一般是在主节点和副节点执行结果会不同的指令 21 | 例如获取当前时间 获取当前处理器数量 22 | 或者获取已执行的指令数量 23 | 这些相对来说都是很少见的 24 | 或者说像生成随机数 25 | 让硬件生成用于加密的随机数之类的 26 | 这些都不是日常的指令 27 | 大多数指令例如相加的指令会在主副节点有相同的结果 28 | 没错 每个网络包都被直接打包转发没有修改 29 | 由两边的TCP/IP栈进行解析 30 | 所以我认为99.99%的记录通道流量都是来自于接收到的包 31 | 只有少部分是因为会产生不确定结果的指令造成 32 | 所以我们可以猜到在有客户端连接的服务器上 流量大概是什么样子的 33 | 基本就是各个客户端发的包 34 | 所以我们就能估算出记录通道需要有怎样的性能 35 | 输出是如何工作的也值得讲一下 36 | 在这个系统中 输出就是指发送数据包 37 | 客户端发送请求的网络包 服务端响应网络包 38 | 没有其他形式的输出了 39 | 主节点和副节点计算他们要发送的数据包 40 | 在主节点上真正进行发送 41 | 副节点的包则被简单地丢弃掉 42 | 真实的情况会稍微复杂一点 43 | 假设我们在跑的是一个简单的数据库服务器 44 | 服务器支持客户端进行计数器自增(Increment)操作 45 | 客户端发送自增的请求 服务端对计数器进行加操作 返回操作后的值 46 | 假设在主节点一切都正常 47 | 主副节点现在都存有计数器值10在内存中 48 | 本地网络的客户端发送自增请求到主节点 49 | 这个网络包被发到主节点 50 | 然后被主节点的程序执行 51 | 比如说现在是10 我要将他变为11 52 | 然后产生回复告诉客户端结果11 53 | 这个请求也会被发送到副本上 将10改为11 54 | 然后同样产生一个回复 并被丢弃掉 理论上是这样 55 | 然而你也要想一下如果在不恰当的时间服务出现失败会怎么样 56 | 上这门课的期间你要一直这样问自己 57 | 何时出现失败是最坏的情况 这种情况下会发生什么 58 | 假设主节点确实生成了返回给客户端的回复 59 | 但是主节点在发送完回复之后就宕机了 60 | 更糟的是 网络向来不保证数据包能传递到接收方 61 | 再假设记录通道也在主节点宕机的时候失效了 62 | 那么现在的状态就是 客户端收到回复11 63 | 但是副节点没有收到转发来的客户端请求 所以计数器仍然是10 64 | 现在因为察觉到主节点宕机 副节点接管服务 65 | 那这个或者其他客户端发送自增请求到原来的副节点 66 | 当收到请求的时候这个接管工作的副节点开始处理 67 | 将计数器从10自增为11 68 | 然后值11就会第二次作为回复出现 69 | 回复给原来的客户端或者不同的客户端 70 | 客户端如果比较会发现这是同样的回复 这本不应该发生 71 | 因为我们需要支撑的程序在没改动的情况下 72 | 并不能处理这些副节点上不寻常的问题 73 | 意思是我们不能对它进行修改 74 | 比如说我们可以 75 | 把客户端改成能处理 76 | 副节点容错性带来的问题 77 | 但是实际上我们没有这种选择 78 | 因为这个系统只有在我们不需要客户端进行改动的情况下才有意义 79 | 所以这是个灾难 80 | 我们不能让它发生 81 | 有人记得论文上是怎么样防止它发生的吗? 
82 | 输出规则 对的 83 | 没错解决的方案就是控制输出规则 84 | 方案就是阻止输出 85 | 在主节点上不允许生成任何输出 86 | 比如我们现在说的这个计数器输出 87 | 直到副节点确认收到了所有的记录 88 | 所以整个流程正确的顺序是 回到主节点宕机之前 89 | 回到计数器还是10的时候 90 | 正确的顺序是 在这个输出规则下 91 | 当输入到来时 92 | 虚拟机监视器发送输入的副本到副节点 93 | 这个输入的日志信息 94 | 在主节点产生输出前发到副节点 95 | 发送之后这个日志就在去往副节点的网络中 96 | 这个日志有可能丢失 97 | 虚拟机监视器同样传递请求到主节点 生成输出 98 | 所以现在主节点生成结果 计数器值变为11 99 | 然后产生一个11的回复 100 | 但是虚拟机监视器说等一下 现在还不允许生成输出 101 | 直到之前的日志记录都被副节点确认 102 | 这是最近的一条日志消息 103 | 所以输出都被虚拟机监视器截停 104 | 直到这条包含客户端输入的日志 105 | 被虚拟机监视器投递和缓冲 106 | 但是不一定马上会执行 107 | 可能需要等待副节点执行到指令流中对应的位置 108 | 到这里虚拟机监视器就会发送一个ACK包说自己收到输入 109 | 直到主节点接收到ACK包时 -------------------------------------------------------------------------------- /lec04/Lec4-6.en.txt: -------------------------------------------------------------------------------- 1 | and then the virtual machine monitor here will send an ACK packet back 2 | saying yes I did get that input and when the acknowledgment comes back 3 | only then will the virtual machine monitor here release the packet out onto the network 4 | and so the idea is that if the client could have seen the reply 5 | then necessarily the backup must have seen the request and at least buffered it 6 | and so we no longer get this weird situation 7 | in which a client can see a reply but then there's a failure and a cut over 8 | and the replica didn't know anything about that reply 9 | if the you know there's also a situation maybe this message was lost 10 | and if this log entry was lost and then the primary crashes 11 | well since it hadn't been delivered so the backup hadn't sent the act 12 | that means if the primary crashed 13 | you know this log entry was brought in the primary crashed 14 | it must have crashed before the virtual machine monitor or at least the output packet 15 | and prayer for this client couldn't have gotten the reply 16 | and so it's not in a position to spot any irregularities 17 | they're really happy with the output rule 18 | brennon see 19 | I don't know they don't paper doesn't mention how the virtual machine monitor is implemented 20 | I mean it's pretty low level stuff because 21 | you know it's sitting there allocating memory and figuring page tables 22 | and talking to device drivers and intercepting instructions 23 | and understanding what instructions the guest was executing 24 | so we're talking about low-level stuff what language is written in you know traditionally C or C++ 25 | but I don't actually know 26 | okay this of the primary has to delay at this point 27 | waiting for the backup to say that it's up to date 28 | this is a real performance thorn in the side of just about every replication scheme 29 | this sort of synchronous wait where the we can't let the primary get too far ahead of the backup 30 | because if the primary failed while it was ahead 31 | that would be the backup lagging 32 | lagging behind clients right 33 | so just about every replication system has this problem that 34 | at some point the primary has to stall waiting for the backup 35 | and it's a real limit on performance 36 | even if the machines are like side-by-side and adjacent racks 37 | it's still you know we're talking about a half a millisecond or something 38 | to send messages back and forth with a primary stalled 39 | and if we wanna like withstand earthquakes or citywide power failures 40 | you know the primary in the backup have to be in different cities 41 | that's probably five milliseconds apart 42 | every time we produce output if we replicate in the two replicas in different city 43 | every packet that it 
produces as output 44 | has to first wait the five milliseconds or whatever for the last log entry to get to the backup 45 | and for the acknowledgment to come back, and then we can release the packet 46 | and you know for sort of low-intensity services that's not a problem 47 | but if we're building a, you know, database server that we would like to, 48 | you know, if it weren't for this, be able to process millions of requests per second 49 | then that's just unbelievably damaging for performance 50 | and this is a big reason why people, 51 | if they possibly can, use a replication scheme 52 | that's operating at a higher level and kind of understands the semantics of operations 53 | and so it doesn't have to stall on every packet 54 | you know, it could stall on every high-level operation, or even notice that, well, 55 | you know, read-only operations don't have to stall at all 56 | it's only writes that have to stall, or something 57 | but there has to be an application-level replication scheme to realize that 58 | you're absolutely right 59 | so the observation is that you don't have to stall the execution of the primary 60 | you only have to hold the output 61 | and so maybe that's not as bad as it could be 62 | but nevertheless it means that, you know, in a service 63 | that could otherwise have responded in a couple of microseconds to the client 64 | you know, if we have to first update the replica in the next city 65 | we turn, you know, a 10 microsecond interaction into a 10 millisecond interaction, possibly 66 | if you have vast numbers of clients submitting concurrent requests 67 | then you may be able to maintain high throughput even with high latency 68 | but you have to be lucky, or a very clever designer, to get that 69 | that's a great idea 70 | but if you log in the memory of the primary 71 | that log will disappear when the primary crashes 72 | the usual semantics of a server failing are that 73 | you lose everything inside the box, like the contents of memory 74 | or you know, even if you didn't, 75 | if the failure is that somebody unplugged the power cable accidentally from the primary 76 | even if the primary has battery-backed RAM or I don't know what 77 | you can't get at it 78 | all right, the backup can't get at it 79 | so in fact this system does log the output, and the place it logs it is in the memory of the backup 80 | and in order to reliably log it there you have to observe the output rule and wait for the acknowledgment 81 | so it's an entirely correct idea, you just can't use the primary's memory for it 82 | say it again? 83 | that's a clever idea 84 | and so the question is maybe input should go to the primary but output should come from the backup 85 | I completely haven't thought this through 86 | that might work 87 | I don't know, that's interesting 88 | yeah, maybe 89 | one possibility this does expose, though, is 90 | the situation where, you know, maybe the primary crashes after its output is released 91 | so the client does receive the reply 92 | then the primary crashes 93 | the backup's input is still in this event buffer 94 | in the virtual machine monitor of the backup 95 | it hasn't been delivered to the actual replicated service 96 | when the backup goes live after the crash of the primary 97 | the backup first has to consume all of the log records that are lying around 98 | that it hasn't consumed yet, has to catch up to the primary 99 | otherwise it won't take over with the same state.
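As a sketch of this go-live sequence, here is a hypothetical Go outline (all types and names are invented; in the real system this logic lives inside the VMM): the backup first drains every buffered log entry so it catches up to everything the primary may already have exposed, and only then starts executing freely and emitting output.

```go
// Package ftbackup is an invented illustration of the go-live order of
// operations described in the lecture, not VMware FT's actual interface.
package ftbackup

type LogEntry struct {
	InstrCount uint64
	Data       []byte
}

type Backup struct {
	pendingEntries []LogEntry
	live           bool
}

// applyAtInstruction would run the guest forward to e.InstrCount and then
// deliver the event (packet, interrupt, or instruction result); elided here.
func (b *Backup) applyAtInstruction(e LogEntry) { /* ... */ }

// announceTakeover would reconfigure the network so future client packets
// reach this machine instead of the dead primary; elided here.
func (b *Backup) announceTakeover() { /* ... */ }

// goLive is invoked when the backup decides the primary is gone.
func (b *Backup) goLive() {
	// 1. Consume every buffered log entry that hasn't yet been delivered to
	//    the replicated service (the last one is typically the client request),
	//    so the backup's state catches up to the primary's.
	for _, e := range b.pendingEntries {
		b.applyAtInstruction(e)
	}
	b.pendingEntries = nil

	// 2. Stop waiting for events from the primary and execute freely; from
	//    now on, output packets are sent instead of discarded.
	b.live = true
	b.announceTakeover()
}
```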
before the backup can go live it actually has to consume all these entries 101 | the last entry is presumably is the request from the client 102 | so the backup will be live after after it 103 | after the interrupt that delivers the request from the client 104 | and that means that the backup well you know increment its counter to eleven 105 | and then generate an output packet and since it's live at this point 106 | it will generate the output packet and the client will get two eleven replies 107 | which is also if it if that really happened would be anomalous 108 | like possibly not something that could happen if there was only one server 109 | the good news is that almost certainly 110 | or the almost certainly the client is talking to this service using TCP 111 | and that this is the request and the response go back and forth on a TCP Channel 112 | the when the backup takes over 113 | the backup since the state is identical to the primaries it knows all about that TCP connection 114 | and whether all the sequence numbers are and whatnot 115 | and when it generates this packet 116 | it will generate it with the same TCP sequence number as an original packet 117 | and the TCP stack on the client will say oh wait a minute that's a duplicate packet 118 | we'll discard the duplicate packet at the TCP level 119 | and the user level software will just never see this duplicate 120 | and so this system really you know 121 | you can view this as a kind of accidental or clever trick 122 | but the fact is for any replication system where cutover can happen 123 | which is to say pretty much any replication system 124 | it's essentially impossible to design them 125 | in a way that they are guaranteed not to generate duplicate output 126 | basically you know you well you can err on either side 127 | I'm not even either not generate the output at all which 128 | would be bad which would be terrible 129 | or you can generate the output twice on a cutover 130 | that's basically no way to generate it guaranteed generated only once 131 | everybody errors on the side of possibly generating duplicate output 132 | and that means that at some level you know the client side of all replication schemes 133 | need some sort of duplicate detection scheme 134 | here we get to use TCP s that we didn't have TCP that would have to be something else -------------------------------------------------------------------------------- /lec04/Lec4-6.zh.txt: -------------------------------------------------------------------------------- 1 | 到这里虚拟机监视器就会发送一个ACK包说自己收到输入 2 | 直到主节点接收到ACK包时 3 | 虚拟机监视器才会将包发送到网络中 4 | 所以这个方案就是 若客户端可以收到回复 5 | 那么副节点肯定也收到过请求 并且至少已经存到缓冲区 6 | 因此我们不在会有以下的异常 7 | 客户端已经收到了回复 然后因为有故障发生 8 | 副节点完全没有接到过相关内容 9 | 比如说有些情况消息可能会中途丢失 10 | 日志记录丢失后主节点宕机 11 | 因为消息没投递成功 所以副节点也没有确认 12 | 如果主节点宕机 13 | 日志记录随主节点宕机丢失 14 | 它肯定在虚拟机监视器发送输出包之前丢失 15 | 客户端不可能提前收到回复 16 | 它们也不会识别出异常发生 17 | 所以我们需要有输出规则进行限制 18 | 19 | 我不了解 论文没有谈到虚拟机监视器的实现 20 | 这是非常底层的知识 21 | 像划分内存空间 计算页表 22 | 与设备驱动交互 拦截指令 23 | 搞明白Guest用户执行的命令等 24 | 所以这是一些底层的东西 一般用C或C++编写的 25 | 但我并不了解 26 | 回到这里 主节点需要延迟回复 27 | 先等副节点确认已经收到最新消息 28 | 这是几乎所有主备复制模式性能的一道坎 29 | 这种同步等待让主节点不会领先于副节点太多 30 | 因为如果主节点在领先的情况下出现故障 31 | 副节点就会出现数据延迟 32 | 副节点的进度和客户端会不一致 33 | 所以每种主备系统都有这样的问题 34 | 某个时间主节点必须等待副节点 35 | 这是对性能实打实的限制 36 | 即使机器是在相邻机架上 37 | 主节点发送消息和接收确认 38 | 仍然需要等上0.5毫秒 39 | 如果说像避免像地震 大范围断电等问题 40 | 主副节点必须位于不同的城市 41 | 那延迟大概会增加到5毫秒 42 | 如果我们主副节点在不同城市进行复制 43 | 每一个发送的包 44 | 都需要等上5毫秒让日志记录到达副节点上 45 | 然后响应确认 最后才能发送回复给客户端 46 | 对于一些可靠性要求低的服务 可能并不成问题 47 | 但是对于一些数据库服务 48 | 比如需要每秒处理百万请求 49 | 那将会对性能有极大的影响 50 | 
这也时在条件允许的情况下 51 | 人们会使用一些不同的主副复制模式 52 | 比如说在更高层次操作 并且需要解析操作内容 53 | 然后不需要每个包都等待确认 54 | 比如只在进行高层次操作时才等待 55 | 只读操作完全不需要等待 56 | 只需要等待写操作同步或者其他一些操作 57 | 但你需要在应用层上实现这些区分 58 | 你说的都是对的 59 | 虚拟机监视器不需要阻止主节点执行命令 60 | 只需要阻止输出就好 61 | 这可能可以做得更好 62 | 但至少这样在一个服务中 63 | 可以在几微秒内响应客户端 64 | 如果我们要先等待处于另一个城市的副节点响应 65 | 那可能会让10微秒变成10毫秒 66 | 如果你有大量客户端并发请求 67 | 虽然可能在高延迟下完成大量处理 68 | 但是你需要非常巧妙的设计才能做到 69 | 这是个很好的想法 70 | 但是如果你将消息记录到主节点内存中 71 | 在主节点宕机时日志就会丢失 72 | 通常认为服务器失效就意味着 73 | 服务器内存中的内容都会丢失 74 | 或者即使你不会如此 75 | 比如说失效是因为主节点电源被意外拔掉 76 | 但你有备用电源之类 77 | 你也做不到 78 | 副节点也做不到如此 79 | 实际上系统在副节点的内存中记录了输出 80 | 为了保证可靠记录 你必须遵守输出规则 等待确认 81 | 所以这是个正确的想法 但是不能使用主节点内存来做 82 | 再说一遍? 83 | 这个想法很棒 84 | 他问的是 能否输入由主节点接收 输出由副节点发送 85 | 我完全没有想过 86 | 这或许可以 87 | 我不确定 这很有意思 88 | 89 | 还有一个可能出现的情况 90 | 主节点在输出已经发送出去之后宕机 91 | 客户端已经收到回复 92 | 然后主节点宕机 93 | 副节点的输入还在事件缓冲中 94 | 在副节点的虚拟机监视器种 95 | 还没有投递到真正的服务副本 96 | 副节点要顶替宕机的主节点 97 | 它首先要消费所有未处理的记录 98 | 以赶上主节点的进度 99 | 否则主副节点进度就会不一致 100 | 副节点在接管服务之前要先消费完记录 101 | 最后一条记录是客户端的请求 102 | 副节点会在它之后开始接管服务 103 | 在传递客户端请求的中断之后 104 | 这意味着副节点计数器自增到11 105 | 然后生成一个输出包 因为这时候它接管服务 106 | 所以生成输出 客户端会受到两个11回复 107 | 如果真的发生这种情况的话是不对的 108 | 如果是单服务器的话这不应该发生 109 | 好消息则是 110 | 如果服务间是使用TCP通信 111 | 请求和响应都是通过TCP通道传输 112 | 当副节点接管时 113 | 副节点的状态和主节点一致 知道所有TCP连接 114 | 还有所有的序列号 115 | 当它产生这个包时 116 | 它会产生和原来的包一样的序列号 117 | 客户端的TCP栈会认为这是个重复的包 118 | 在TCP层就会将它丢弃掉 119 | 而在用户层软件中永远不会看到重复包 120 | 所以这个系统中 121 | 你可以认为问题被意外解决了或者被巧妙处理了 122 | 但事实上对于所有能够进行切换的复制系统 123 | 也就是大部分的复制系统 124 | 很难将他们设计成 125 | 保证切换时不会有重复输出 126 | 你可以在两边都引发报错 127 | 然后在两边都不生成输出 128 | 但是这种做法很糟糕 129 | 或者你可以允许切换时有两次输出 130 | 总之没有办法可以保证只有一次输出 131 | 两边都引发报错或者允许可能的重复输出 132 | 某种程度上说 所有复制模式的客户端 133 | 都需要重复包的检测机制 134 | 在这里我们使用TCP 不然的话也需要其他实现 -------------------------------------------------------------------------------- /lec04/Lec4-7.en.txt: -------------------------------------------------------------------------------- 1 | maybe application level sequence numbers or I don't know what 2 | and you'll see all of this 3 | and actually you'll see versions of 4 | essentially everything I've talked about like the output rule for example in labs 2 & 3 5 | you'll design your own replicated state machine 6 | yes 7 | yes to the first part 8 | so the scenario is 9 | the primary sends the reply 10 | and then either the primary send the close packet 11 | or the client closes the connect the TCP connection after it receives the primary's reply 12 | so now this's like no connection on the client side 13 | but there is a connection on the backup side 14 | and so now the backup 15 | so the backup consumes the very last log entry that is the input is now live 16 | so we're not responsible for replicating anything at this point right 17 | because the backup is now live there's no other replica as the primary died 18 | so there's no like if if we don't if the backup fails to execute in lockstep with the primary 19 | that's fine actually 20 | because the primary is is dead and we do not want to execute in lockstep with it 21 | okay so the primary is now not it's live 22 | it generates an output on this TCP connection that isn't closed yet from the backup point of view 23 | this packet arrives at the client on a TCP connection 24 | that doesn't exist anymore from the clients point of view 25 | like no big whoopee on the client right 26 | he's just going to throw away the packet as if nothing happened the application won't know 27 | the client may send a reset 28 | something like a TCP error or whatever packet 29 | back to the backup and the backup does something or other with it 30 | but it doesn't matter 31 | 
because we're not diverging from anything 32 | because there's no primary to diverge from 33 | you can just handle a straight reset however it likes 34 | and what it'll in fact do is basically ignore it 35 | but there's no now the backup has gone live there's no 36 | we don't owe anybody anything as far as replication 37 | yeah 38 | well you can bet since the backup's memory image is identical to the primary's image 39 | that they're sending packets with the very same source TCP number 40 | and the very same everything 41 | they're sending bit for bit identical packets 42 | you know at this level the server's don't have IP addresses 43 | or for our purposes 44 | the virtual machines you know the primary and backup virtual machines have IP addresses 45 | but the the physical computer and the vmm are transparent to the network 46 | it's not entirely true but it's basically the case that 47 | the virtual machine monitor in the physical machine 48 | don't really have identity of their own on the network 49 | because you can configure that then that way instead these they're not 50 | you know the virtual machine with its own operating system in its own TCP stack 51 | it has IP address and ethernet address and all sort of stuff 52 | which is identical between the primary and the backup 53 | and when it sends a packet 54 | it sends it with the virtual machine's IP address and Ethernet address 55 | and those bits at least in my mental model are just simply passed through on to the local area network 56 | it's exactly what we want 57 | and so it will generate exactly the same packets 58 | that the primary would have generated 59 | there's maybe a little bit of trickery you know what the we 60 | if this is these are actually plugged into an Ethernet switch 61 | into the physical machines maybe plugged into different ports of an Ethernet switch 62 | and we'd like the Ethernet switch to change its mind about 63 | which of these two machines that delivers packets with replicated services Ethernet address 64 | and so there's a little bit of funny business there 65 | for the most part they're just generating identical packets 66 | and we just send them out 67 | okay so another little detail I've been glossing over is that 68 | I've been assuming that the primary just fails or the backup just fails 69 | that is fail-stop right 70 | but that's not the only option 71 | another very common situation that has to be dealt with is 72 | if the two machines are still up and running and executing 73 | but there's something funny happen on the network 74 | that causes them not to be able to talk to each other 75 | but to still be able to talk to some clients 76 | so if that happened if the primary backup couldn't talk to each other 77 | but they could still talk to the clients 78 | they would both think oh the other replica is dead 79 | I better take over and go live 80 | and so now we have two machines going live with this service 81 | and now you know they're no longer sending each other log events or anything 82 | they're just diverging 83 | maybe they're accepting different client inputs and change their states in different ways 84 | so now we have a split brain disaster 85 | if we let the primary and the backup go live 86 | because it was a network that has some kind of failure instead of these machines 87 | and the way that this paper solves it I mean 88 | is by appealing to an outside authority to make the decision about 89 | which of the primary or the backup is allowed to be live 90 | and so 91 | it there 
you know, turns out that their storage is actually not on local disk 92 | this almost doesn't matter 93 | but their storage is on some external disk server 94 | and as well as serving disks, this server, as a totally separate service 95 | that has nothing to do with disks, 96 | happens to export this test-and-set 97 | service over the network, where 98 | you can send a test-and-set request to it 99 | and there's some flag it's keeping in memory 100 | and it'll set the flag and return what the old value was 101 | so both primary and backup have to sort of acquire this test-and-set flag 102 | it's a little bit like a lock 103 | in order to go live; they both maybe send test-and-set requests at the same time 104 | to this test-and-set server 105 | the first one gets back a reply that says oh, the flag used to be zero, 106 | now it's one; for the second request to arrive, 107 | the response from the test-and-set server is 108 | oh, actually the flag was already one when your request arrived 109 | so basically you're not allowed to be primary 110 | and so this test-and-set server, 111 | and we can think of it as a single machine, 112 | is the arbitrator that decides which of the two should go live 113 | if they both think the other one is dead due to a network partition 114 | any questions about this mechanism? 115 | you're busted, 116 | yeah, if the test-and-set server happens to be dead at the critical moment when you need it 117 | and actually, even if there's not a network partition, 118 | under all circumstances in which 119 | one or the other of these wants to go live because it thinks the other's dead, 120 | even when the other one really is dead, 121 | the one that wants to go live still has to acquire the test-and-set lock 122 | because one of the deep rules of the 6.824 game is that 123 | you cannot tell whether another computer is dead or not 124 | all you know is that you stopped receiving packets from it 125 | and you don't know whether it's because the other computer is dead 126 | or because something has gone wrong with the network between you and the other computer 127 | so all the backup sees is, well, I've stopped getting packets 128 | maybe the primary is dead, maybe it's alive 129 | the primary probably sees the same thing 130 | so if there's a network partition 131 | they certainly have to ask the test-and-set server 132 | but since they don't know if it's a network partition 133 | they have to ask the test-and-set server regardless of whether it's a partition or not 134 | so any time either wants to go live 135 | the test-and-set server also has to be alive 136 | because they always have to acquire this test-and-set lock 137 | so the test-and-set server sounds like a single point of failure 138 | they were trying to build a replicated fault-tolerant service 139 | but in the end, you know, we can't fail over unless this is alive, so 140 | that's a bit of a bummer 141 | I'm guessing, though, 142 | I'm making a strong guess, that the test-and-set server is actually 143 | itself a replicated service and is fault tolerant, right 144 | it's almost certainly, I mean, these people at VMware 145 | are happy to sell you a million-dollar highly available storage system 146 | that uses enormous amounts of replication internally 147 | um, since the test-and-set thing is on this server of theirs 148 | I'm guessing it's replicated too 149 | and the stuff you'll be doing in lab 2 and lab 3 is more than powerful enough 150 | for you to build your own fault-tolerant test-and-set server
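Since the set-the-flag-and-return-the-old-value operation is the heart of the arbitration, here is a deliberately minimal test-and-set server sketch in Go (the HTTP shape, route, and port are invented for illustration, and this toy is itself a single point of failure, which is exactly the concern raised above):

```go
package main

import (
	"fmt"
	"net/http"
	"sync"
)

// tasServer holds a single flag in memory. The first caller atomically flips
// it from 0 to 1 and learns the old value was 0, so it may go live; every
// later caller sees 1 and must not become primary.
type tasServer struct {
	mu   sync.Mutex
	flag bool
}

func (s *tasServer) testAndSet(w http.ResponseWriter, r *http.Request) {
	s.mu.Lock()
	old := s.flag
	s.flag = true // set the flag unconditionally
	s.mu.Unlock()
	// Return the old value; "0" means the caller won the right to go live.
	if old {
		fmt.Fprintln(w, "1")
	} else {
		fmt.Fprintln(w, "0")
	}
}

func main() {
	s := &tasServer{}
	http.HandleFunc("/test-and-set", s.testAndSet)
	http.ListenAndServe(":8080", nil) // hypothetical address
}
```

A replica whose request comes back "0" has won the right to go live; one that gets back "1" must not become primary. Making this little flag service fault tolerant in its own right is essentially what the replication machinery of labs 2 and 3 would let you do.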
so this problem can easily be eliminated 152 | -------------------------------------------------------------------------------- /lec04/Lec4-7.zh.txt: -------------------------------------------------------------------------------- 1 | 可能是应用程序级别的序列号,或者我不知道的 2 | 你会看到所有的这些 3 | 实际上你会在lab2和lab3中看到 4 | 基本上我讲过的所有内容的,例如这个输出规则 5 | 你将设计自己的复制状态机 6 | 是 7 | 对第一部分 8 | 场景是 9 | primary发送答复 10 | 然后要么primary服务器发送关闭数据包 11 | 要么客户端在收到primary的答复后关闭TCP连接 12 | 所以现在在客户端,没有连接 13 | 但backup端有连接 14 | 所以现在backup 15 | backup消耗了输入中的最后一个日志条目,(backup)变成在线状态 16 | 在这个时间点我们不负责复制任何内容 17 | 因为backup现在是在线状态,没有其他副本因为primary已死 18 | 因此,如果backup无法与primary步骤一致 19 | 其实也没什么问题 20 | 因为primary已死,我们不想和它步骤一致 21 | 好,现在考虑primary没有死,还活着 22 | 从backup的角度来看TCP连接尚未关闭,primary在这个TCP连接上产生一个输出 23 | 该数据包通过TCP连接到达客户端 24 | 从客户的角度来看已经不存在了 25 | 客户没有大惊小怪 26 | 它只是将数据包丢掉,好像什么都没发生,应用程序不会知道 27 | 客户可以发送一个重置 28 | 类似TCP错误或任何数据包 29 | 回到backup,backup就可以执行其他操作 30 | 但这没关系 31 | 因为我们没有产生分歧 32 | 因为没有primary可以分歧 33 | 你可以应付一个直接的重置,不管怎样 34 | 实际上它基本上会忽略它 35 | 现在backup已经上线了,没有(primary)了 36 | 就复制而言,我们不欠任何人任何东西 37 | 是的 38 | 好吧,你可以打赌,因为backup内存映像与primary映像相同 39 | 他们发送具有相同源TCP编号的数据包 40 | 他们都是一样的 41 | 他们发送每个比特位都相同的的数据包 42 | 在这一层,服务器没有IP地址 43 | 或出于我们的目的 44 | backup虚拟机和primary虚拟机具有IP地址 45 | 但是物理计算机和vmm对网络是透明的 46 | 这不完全准确,但基本上是这样的 47 | 物理机中的虚拟机监视器 48 | 在网络上实际上没有自己的身份 49 | 你可以配置成那样,而不是 50 | 在自己的TCP堆栈中具有自己操作系统的虚拟机 51 | 它有IP地址和以太网地址,和其他所有内容 52 | 在backup和primary都完全相同 53 | 当它发送一个数据包 54 | 它把虚拟机的IP地址和以太网地址一起发送 55 | 根据我脑中的模型,这些比特只是简单地传递到局域网 56 | 这正是我们想要的 57 | 所以它会产生完全相同的数据包 58 | 和primary会产生数据包的一致 59 | 也许有些小诡计,我们 60 | 如果是,这些实际上是插入到以太网交换机中的 61 | 这些物理机器可能插入在以太网交换机的不同端口中 62 | 我们希望以太网交换机改变主意 63 | 这两台机器中的哪台传递复制的以太网地址的数据包 64 | 所以那里有一些有趣的事情 65 | 在大多数情况下,它们只是生成相同的数据包 66 | 我们只是把它们发送出去 67 | 好吧,我一直在讲的另一个小细节是 68 | 我一直假设primary失败或backup失败 69 | 那是fail-stop对吧 70 | 但这不是唯一的可能 71 | 必须处理的另一个非常普遍的情况是 72 | 如果两台计算机仍处于启动状态并且正在运行 73 | 但是网络上发生了一些有趣的事情 74 | 导致他们无法互相交谈 75 | 但仍然能够与一些客户端交谈 76 | 如果发生这种情况,如果prumary,backup无法互相通信 77 | 但他们仍然可以与客户交谈 78 | 他们都会以为另一个副本死了 79 | 我最好接管并开始上线 80 | 因此,现在我们有两台机器上线提供服务 81 | 现在你知道他们不再发送彼此的日志事件或任何其他信息 82 | 他们开始分歧 83 | 也许他们接受不同的客户输入,并且各自的状态变得不同 84 | 所以现在我们发生了脑裂灾难 85 | 如果我们让backup和primary生效 86 | 因为这是一个网络故障,而不是这些机器故障 87 | 这篇论文解决问题的方式 88 | 是通过寻求外部权威做出有关 89 | 允许哪个backup或primary上线的决定 90 | 所以 91 | 他们的存储实际上不在本地磁盘上 92 | 这几乎没关系 93 | 但它们的存储在某些外部磁盘服务器上 94 | 像完全独立的服务一样位于该服务器中 95 | 与磁盘无关 96 | 这个服务器碰巧在网络中提供这个test-and-set服务 97 | 这个服务器碰巧在网络中提供这个test-and-set服务 98 | 你可以发送test-and-set要求给它 99 | 它在内存维护一些标志 100 | 它会设置标志并返回原来的值 101 | 因此primary和backup都必须获得此test-and-set标志 102 | 这有点像锁 103 | 为了上线,他们可能同时发送test-and-set请求 104 | 给这个test-and-set服务器 105 | 第一个返回一个答复,说哦,标志曾经是零 106 | 现在是第二个请求到达 107 | test-and-set服务器的响应是 108 | 哦,实际上,当你的请求到达时,该标志已经是1 109 | 所以你不允许成为primary 110 | 所以这个test-and-set服务器 111 | 我们可以将其视为一台机器 112 | 是决定两者中哪一个应该生效的仲裁员 113 | 如果他们俩都认为其他人由于网络分区而死亡的话 114 | 有关此机制的任何问题 115 | 你被淘汰了 116 | 是的,如果test-and-set服务器在这个关键时刻死了 117 | 实际上即使没有网络分区 118 | 在任何情况下 119 | 其中一个希望上线,因为它认为其他人死了 120 | 即使对方真的死了 121 | 想要上线的那个也必须获得test-and-set锁 122 | 因为像6.824游戏的深层规则之一是 123 | 你无法判断另一台计算机是否坏掉了 124 | 你所知道的是,你不再能接收到数据包 125 | 而且你不知道是因为另一台计算机已死 126 | 还是由于你和另一台计算机之间的网络出了点问题 127 | 所以所有backup都停止,我被卡在数据包中 128 | 也许primary已经死了,也许还活着 129 | primary可能看到相同的东西 130 | 所以如果有网络分区 131 | 他们一定要问test-and-set服务器 132 | 但由于他们不知道这是否是网络分区 133 | 他们必须询问test-and-set服务器,不管它是否是分区 134 | 所以任何时候都想上线 135 | test-and-set服务器也必须处于在线状态 136 | 因为他们总是必须获得此test-and-set锁 137 | 因此test-and-set服务器听起来像是单点故障 138 | 他们试图建立一个复制的容错的东西 139 | 但最后你知道我们无法进行故障转移,除非它仍然活着 140 | 这有点令人烦恼 141 | 我猜 142 | 我强烈猜测test-and-set服务器实际上是 143 | 本身是有复制的服务,是具有容错性的 144 | 几乎可以肯定,我是说,VMware的这些人 145 | 他们很高兴向你出售百万美元的高可用性存储系统 
146 | 在内部使用大量复制 147 | 嗯,因为test-and-set的东西在他们的这个服务器上 148 | 我在猜它也被复制了 149 | 你在lab2中和lab3中做的事情足够强大 150 | 你可以用它构建自己的容错的test-and-set服务器 151 | 所以这个问题很容易消除 152 | -------------------------------------------------------------------------------- /lec04/Lec4.en.txt: -------------------------------------------------------------------------------- 1 | all right today I want to talk about bit more about fault tolerance and replication and then look into the details of today's paper about vmware ft the topics still fault tolerance to provide high availability that is you want to build a server that even if some hardware you know computer crashes is involved in the service we still like to provide the service and to the extent we can we'd like to provide our service also if there's network problems and the tool we're using its replication least for this part of the course so it's worth asking what kind of failures replication can be expected to deal with because it's not everything by any means so maybe the easiest way to characterize the kind of failures we're talking about is fail stop failures of a single computer and what I mean by fail stop it's a sort of generic term and fault tolerance is that if something goes wrong would say the computer the computer simply stops executing it just stops if anything goes wrong and in particular it doesn't compute incorrect results so if somebody kicks the power cable out of your server that's probably gonna generate a fail stop failure similarly if they unplug your servers network connection even though the server is still running so this is a little bit funny you know be totally cut off from the network so it looks at me outside like it just stopped so it's really these failures we can deal with with replication this also covers some hardware problems like you know maybe if the fan on your server breaks because it you know it cost 50 cents maybe that'll cause the CPU to overheat and the CPU will shut itself down cleanly and just stop executing what's not covered by the kind of replication systems we're talking about is things like bugs and software or design defects in hardware so basically not bugs because if we take some service you know say you're a MapReduce master for example you know we replicated and run it on two computers you know if there's a bug in your MapReduce master or my MapReduce master let's say replications not going to help us we're going to compute the same incorrect result on both of our copies of our MapReduce master and everything looked fine they'll agree you just happen to be the wrong answer so we can't depending against bugs in the replicated software and we can't defend against bugs in the whatever scheme we're using to manage the replication and similarly as I mentioned before we can't expect to deal with bugs in the hardware the hardware it computes incorrectly that's just that's the end for us at least with this kind of technique although you know that said there are definitely hardware and software bugs that that replication might if you're lucky might be able to cope it so if there's some unrelated software running in your server and it causes the server to crash maybe because your kernel to panic and reboot or something it has nothing to do with you know with your with the service you're replicating then that kind of failure for us for your service will may well be fail stop you know the kernel will panic and the backup replicas will take over similarly some kinds of hardware errors can be turned into fail stop errors for 
example if you send a packet over the network and the network corrupts it just flips a bit in your packet that will almost certainly be caught by the checksum on the packet same thing for a disk block if you write some data to disk and read it back a month later you know maybe the magnetic surface isn't perfect and you know one of the best couple of bits were wrong in the block as it's right back it's actually error correcting that up to a certain point will fix errors in disk blocks that you'll be turning you know random hardware errors into as either correcting them if you're super lucky or at least detecting them and turning random corruption into a detected fault which you know the software then knows that something that wrong and can turn it into a fail stop fault by stopping executing or take some other remedial action but in general we really can only expect to handle fail stop faults there's other limits to replication to you know the the failures in the if we have a primary in the back of our two replicas or whatever we're really assuming that failures in the two are independent right if there tend to have correlated failures then replication is not going to help us so for example if we're a big outfit and we buy thousands of computers batches of thousands of computers identical computers from the same manufacturer and we run you know our replicas is on all on those computers we bought at the same time from the same place that's a bit of a risk maybe because presumably if one of them has a manufacturing defect in it there's a good chance that the other ones do too you know one of them's prone to overheating because the manufacturer you know didn't provide enough airflow well it probably all had that problem and so one of them overheats and dies it's a good chance that the other ones will too so that's one kind of correlated failure you just have to be careful of another one is that you know if there's an earthquake and the city where our datacenter is probably gonna take out the whole data center you know we can have all the replication we like inside that data center it's not going to help us because the failure caused by an earthquake or a citywide power failure or something the building burning down is like it's correlated failure between our replicas if they're on that building so if we care about dealing with earthquakes then we need to put our replicas in maybe in just different cities at least physically separate enough that they have separate power unlikely to be affected by the same natural disaster okay but that's all sort of hovering in the background for this discussion where we're talking about the technology you might use another question about replication is whether it's worthwhile you may ask yourself gosh you know this literally uses these replication schemes use twice as much or three times as much computer resources right we need to have you know GFS had three copies of every blocks we have to buy three times as much disk space the paper for today you know replicates just once but that means we have twice as many computers and CPUs and RAM it's all for expensive like is that really worth it that expense and you know that's not something we can answer technically right it's an economic question it depends on the value of having an available service you know if you're running a bank and if the consequence is the computer failing is that your customer you can't serve your customers and you can't generate revenue and your customers all hate you then it may well be 
worth it to blow you know an extra ten or twenty thousand bucks on a second computer so you can have a replica on the other hand if you're me and you're running the 6.824 web server I don't consider it worthwhile to have a hot backup of the 84 web server because the consequences of failure are very low so the whether the replication is worthwhile on how many replicas you ought to have and how much you're willing to spend on it is all about how much cost and inconvenience failure would call it cause you all right this paper sort of in the beginning mentions as there's a couple of different approaches to replication really mentions two one two calls state transfer and the other calls replicated state machine most of the schemes we're going to talk about in this class are replicated state machines it'll talk about both anyway the idea behind state transferor's that if we have two replicas of a server the way you cause them to be to stay in sync that is to be actual replicas so that the backup can has everything it needs to take over if the primary fails in a state transfer scheme the way that works is that the primary sends a copy of its entire state that is for example the contents of its RAM to the backup and the backup just sort of stores the latest state and so it's all there the primary fails in the backup can start executing with this last state it got if the primary fails so this is all about sending the state of the of the primary and for today's if today's paper worked as a state transfer system which it doesn't then the state we'd be talking about would be the contents of the RAM the contents of the memory of the primary so maybe every once while the primary would just you know make a big copy of its memory and send it across the network to the backup you can imagine if you wanted to be efficient you know maybe you would only send the parts of the memory that it's changed since the last time you sent in memory to the backup the replicated state machine this approach observes that most services are most computer things we want to replicate have some internal operation that's deterministic except when external input comes in right you know ordinarily if there's no external influences on a computer it just executes one instruction after another and what each instruction does is a deterministic function of what's in the memory and the registers of the computer and it's only when external events intervene that something unexpected may happen like a packet arrives of a some random time and that causes the server to start doing something differently I'm so replicated state machine schemes don't send the state between the replicas instead they just send those external events they just send maybe from a primary to a backup again just send things like arriving input from the outside world that the backup needs to know and the observation is that you know if you have to two computers and they start from the same state and they see the same inputs that that in the same order or at the same time the two computers will continue to be replicas of each other and sort of execute identically as long as they both see the same inputs at the same time so this transfers probably memory and this transfer some primary backup just operations from clients or external external inputs or external events and you know the reason why people tend to favor a replicated state machine is that usually operations are smaller than the state but this you know the state of a server if it's a database server might be the 
entire database might be you know gigabytes whereas the operations are just some clients sending and you know please read or write key 27 operations are usually small the states usually large so replicate a state machine usually looks attractive and slight downside is that the schemes tend to be quite a bit more complicated and rely on sort of more assumptions about how the computers operate whereas this is a really heavy-handed I'm just gonna send you my whole state sort of a nothing to worry about any questions about these strategies yes well the did ok so the question is suppose something went wrong with our scheme and the backup was not actually identical to the primary so you know you're suppose we were running GFS master and it's the primary it just handed out at least two chunks server one but because the two you know because we've allowed the states of the primary back to drift out of sync the backup did not issue at least to anybody it wasn't even away or anybody had asked for these so now the primary thinks you know chunks everyone has lease for some chunk in the backup doesn't the primary fails backup takes over right now chunks over one thinks it has a lease for some chunk but then the current master doesn't and is happy to hand out the lease to some other trunk server now we have to chunk servers serving the same lease okay so that's just a close to home example but really you know almost any bad thing and kind of I think you construct any bad scenario by just imagining some service that confuse the wrong answer because the state's leverage so you're asking about randomization yeah oh y'all talk about this I'll talk about this a bit later on but it is good that the replicated state scheme definitely makes the most sense when the instructions that the primary in the back of our executing do the same thing as long as there's no external events right and that's almost true right you know for an add instruction or something yeah you know if the starting if the registers and memory of the same and they both execute an add instruction add instruction has the same inputs in the same outputs but they're in some instructions as you point out that don't like maybe there's an instruction that gets the current time of day now probably be executed at slightly different times or an instruction that gets the current processors unique ID and a serial number it's going to yield the different answers and the the the uniform answered the questions that sound like this is that the primary does it and sends the answer to the backup and the backup does not execute that instruction but instead at the point where it would execute that instruction it listens for the primary to tell it what the right answer would be and just sort of fakes that answer to the software I'll talk about you know how the VMware scheme does that okay interestingly enough though today's paper is all about a replicated state machine you may have noticed that today's paper only deals with you know processors and it's not that clear how it could be extended to a multi-core and a multi-core machine where the interleavings of the instructions from the two cores organ are non-deterministic all right so we no longer have this situation on a multi-core machine where if we just let the primary and backup execute they're you know all else being equal they're going to be the same because they won't execute on multiple cores VMware has since come out with a new possibly completely different replication system that does work on multi-core 
and the new system appears to me to be using state transfer instead of replicated state machine because state transferred is more robust in the face multi-core and parallelism if you use the machine and send the memory over you know that the memory image is just that just is the state of the machine and sort of it doesn't matter that there was parallelism whereas the replicated state machine scheme really has a problem with the parallelism you know on the other hand I'm guessing that this new multi-core scheme is more expensive okay all right so if we want to build a replicated state machine scheme we got a number of questions to answer so we need to decide at what level we're gonna replicate state right so what state what do we mean by state we have to worry about how how closely synchronized the primary and backup have to be right because it's likely the primary will execute a little bit ahead of the backup after all it it's the primary that sees the inputs so the backup almost necessarily must lag over that gives that means there's an opportunity if the primary fails for the prime for the backup not to be fully caught up having the backup actually executes really in lockstep with the primaries for expensive because it requires a lot of chitchat so a lot of designs a lot of what people sweat about is how close the synchronization is if the primary fails or you know actually if the backup fails too but it's more exciting if the primary fails there has to be some scheme for switching over and the clients have to know oh gosh I instead of talking to the old primary on server one I should now be talking to the the backup on server to all the clients have to somehow figure this out the switch over almost certainly it's almost impossible maybe impossible to design a cut over system in which no anomalies are every are ever visible you know in this sort of ideal world if the primary fails we'd like nobody to ever notice none of the clients to notice turns out that's basically unattainable so there's going to be anomalies during the cut over and we've gotta figure out a way to cope with them and finally if the one of the two if one of our replicas fails we really need to have a new replica right if we have a two replicas and one fails we're just living on borrowed time right because the second replica may fail at some point so we absolutely need to get a new replica back online as fast as possible so and that can be very expensive the state is big you know you know but the reason we like to replicate a state machine was because we thought state transfer would be expensive but the two replicas in a replicated state machine still need to have full state right we just had a cheap way of keeping them both in sync if we need to create a new replica we actually have no choice but state transfer to create the new replicas the new replica needs to have a complete copy of the state so it's going to be expensive to create new replicas and this is often people spending well actually people spend a lot of time worrying about all these questions and you know we'll see them again as we look at other replicated state machine schemes so on the topic of what state to replicate the today's paper has a very interesting answer to this question it replicates the full state of the machine that is all of memory and all the Machine registers it's like a very very detailed replication scheme just no difference at the even of the lowest levels between the primary in the backup that's quite rare for replication schemes 
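Before looking further at where today's paper sits, here is a tiny application-level illustration in Go of the replicated state machine idea just outlined (everything here is invented for illustration; it is the kind of higher-level scheme most systems use, not VMware FT's machine-level mechanism): as long as every replica starts from the same state and applies the same deterministic operations in the same order, their states stay identical.

```go
// Package rsm is a minimal, invented sketch of an application-level
// replicated state machine, of the sort built in the course labs.
package rsm

// Op is one deterministic operation; replicas agree on the order of ops.
type Op struct {
	Key   string
	Value string
}

type StateMachine struct {
	state map[string]string
}

func New() *StateMachine {
	return &StateMachine{state: make(map[string]string)}
}

// Apply must be deterministic: given the same starting state and the same
// op, every replica computes the same result.
func (sm *StateMachine) Apply(op Op) {
	sm.state[op.Key] = op.Value
}

// Replay drives a replica from an ordered log of operations; a backup that
// has seen the same log as the primary ends up in the same state.
func (sm *StateMachine) Replay(log []Op) {
	for _, op := range log {
		sm.Apply(op)
	}
}
```

The apply loop itself is the easy part; as the discussion around it says, the hard parts are keeping replicas closely enough synchronized, cutting over cleanly when the primary fails, and re-creating a full replica afterwards.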
almost always you see something that's more like GFS where GFS absolutely did not replicate you know they had replication but it wasn't replicating every single you know bit of memory between the primaries and the backups it was replicating much more application level table of chunks I had this abstraction of you know chunks and chunk identifiers and that's what it was replicating it wasn't replicating sort of everything else wasn't going to the expense of replicating every single other thing that machines we're doing okay as long as they had the same sort of application visible set of of chunks so most replication schemes out there go the GFS route in fact almost everything except pretty much this paper and a few handful of similar systems almost everything uses application at some level application level of replication because it can be much more efficient because we don't have to go to the we don't have to go to the trouble of for example making sure that interrupts occur at exactly the same point in the execution of the primary and backup GFS does not sweat that at all but this paper has to do because it replicates at such a low level so most people build efficient systems with applications specific replication the consequence of that though is that the replication has to be built into the right into the application right if you're getting a feed of application level operations for example you really need to have the application participate in that because some generic replication thing like today's paper doesn't really can't understand the semantics of what needs to be replicated so anyways so most teams are application specific like GFS and every other paper we're going to read on this topic today's paper is unique in that it replicates at the level of the machine and therefore does not care what software you run on it right it replicates the low-level memory and machine registers you can run any software you like on it as long as it runs on that kind of microprocessor that's being represented this replication scheme applies to the software can be anything and you know the downside is that it's not that efficient necessarily the upside is that you can take any existing piece of software maybe you don't even have source code for it or understand how it works and you know do within some limits you can just run it under this under VMware this replication scheme and it'll just work which is sort of magic fault-tolerance wand for arbitrary software all right now let me talk about how this is VMware FT first of all VMware is a virtual machine company they're what their business is a lot of their business is selling virtual machine technology and what virtual machines refer to is the idea of you know you buy a single computer and instead of booting an operating system like Linux on the hardware you boot we'll call a virtual machine monitor or hypervisor on the hardware and the hypervisor is job is actually to simulate multiple multiple computers multiple virtual computers on this piece of hardware so the virtual machine monitor may boot up you know one instance of Linux may be multiple instances of Linux may be a Windows machine you can the virtual machine monitor on this one computer can run a bunch of different operating systems you know each of these as is itself some sort of operating system kernel and then applications so this is the technology they're starting with and you know the reason for this is that if you know you need to it just turns out there's many many reasons why it's 
very convenient to kind of interpose this level of indirection between the hardware and the operating systems and means that we can buy one computer and run lots of different operating systems on it we can have each if we run lots and lots of little services instead of having to have lots and lots of computers one per service you can just buy one computer and run each service in the operating system that it needs I'm using this virtual machines so this was their starting point they already had this stuff and a lot of sophisticated things built around it at the start of designing vmware ft so this is just virtual machines um what the papers doing is that it's gonna set up one machine or they did requires two physical machines because there's no point in running the primary and backup software in different virtual machines on the same physical machine because we're trying to guard against hardware failures so you're gonna to at least you know you have two machines running their virtual machine monitors and the primary it's going to run on one the backups and the other so on one of these machines we have a guest you know we only it might be running a lot of virtual machines we only care about one of them it's gonna be running some guest operating system and some sort of server application maybe a database server MapReduce master or something so I'll call this the primary and there'll be a second machine that you know runs the same virtual machine monitor and an identical virtual machine holding the backup so we have the same whatever the operating system is exactly the same and the virtual machine is you know giving these guest operating systems the primary and backup a each range of memory and this memory images will be identical or the goal is to make them identical in the primary in the backup we have two physical machines each one of them running a virtual machine guest with a its own copy of the service we care about we're assuming that there's a network connecting these two machines and in addition on this local area network in addition on this network there's some set of clients really they don't have to be clients they're just maybe other computers that our replicated service needs to talk with some of them our clients sending requests it turns out in this paper there the replicated service actually doesn't use a local disk and instead assumes that there's some sort of disk server that it talks to him although it's a little bit hard to realize this from the paper the scheme actually does not really treat the de server particularly especially it's just another external source of packets and place that the replicated state machine may send packets do not very much different from clients okay so the basic scheme is that the we assume that these two replicas the two virtual machines primary and backup are our exact replicas some client you know database client who knows who has some client of our replicated server sends a request to the primary and that really takes the form of a network packet that's what we're talking about that generates an interrupt and this interrupts actually goes to the virtual machine monitor at least in the first instance the virtual machine monitor sees a hot here's the input for this replicated service and so the virtual machine monitor does two things one is it sort of simulates a network packet arrival interrupt into the primary guest operating system to deliver it to the primary copy of the application and in addition the virtual machine monitor you know 
knows that this is an input to a replicated virtual machine and it's so it sends back out on the network a copy of that packet to the backup virtual machine monitor it also gets it and backup virtual machine monitor knows ha it is a packet for this particular replicated state machine and it also fakes a sort of network packet arrival interrupt at the backup and delivers the packet so now both the primary and the back have a copy this packet they looks at the same input you know with a lot of details are gonna process it in the same way and stay synchronized course the service is probably going to reply to the client on the primary the service will generate a reply packet and send it on the NIC that the virtual machine monitor is emulating and then the virtual machine monitor or will we'll see that output packet on the primary they'll actually send the reply back out on the network to the client because the backup is running exactly the same sequence of instructions it also generates a reply packet back to the client and sends that reply packet on its emulated NIC it's the virtual machine monitor that's emulating that network interface card and it says aha you know the virtual machine monitor says I know this was the backup only the primary is allowed to generate output and the virtual machine monitor drops the reply packet so both of them see inputs and only the primary generates outputs as far as terminology goes the paper calls this stream of input events and other things other events we'll talk about from the stream is called the logging Channel it all goes over the same network presumably but these events the primary since the back of our called log events on the log Channel where the fault tolerance comes in is that those the primary crashes what the backup is going to see is that it stops getting stuff on the stops getting log entries a log entry stops getting log entries on the logging channel and we know it it turns out that the backup can expect to get many per second because one of the things that generates log entries is periodic timer interrupts in the in the primary each one of which turns out every interrupt generates a log entries into the backup these timer interrupts are going to happen like 100 times a second so the backups can certainly expect to see a lot of chitchat on the logging Channel if the primaries up if the primary crashes then the virtual machine monitored over here will say gosh you know I haven't received anything on the logging channel for like a second or however long the primary must be dead or or something and in that case when the backup stop seeing log entries from the primary the paper the way the paper freezes it is that the backup goes live and what that means is that it stops waiting for these input events on the logging Channel from the primary and instead this virtual machine monitor just lets this backup execute freely without waiting for without being driven by input events from the primary the vmm does something to the network to cause future client requests to go to the backup instead of the primary and the VMM here stops discarding the backup personnel it's the primary not the backup stops discarding output from this virtual machine so now this or machine directly gets the inputs and there's a lot of produce output and now our backup is taken over and similarly you know that this is less interesting but has to work correctly if the backup fails a similar primary has to use a similar process to abandon the backup stop sending it events and 
just sort of act much more like a single non replicated server so either one of them can go live if the other one appears to be dead stops you know stops generating network traffic magic now it depends you know depends on what the networking technology is I think with the paper one possibility is that this is sitting on Ethernet every physical computer on the Internet or really every NIC has a 48 bit unique ID I'm making this up now the it could be that in fact instead of each physical computer having a unique ID each virtual machine does and when the backup takes over it essentially claims the primary's Ethernet ID as its own and it starts saying you know I'm the owner of that ID and then other people on the ethernet will start sending us packets that's my interpretation the designers believed they had identified all such sources and for each one of them the primary does whatever it is you know executes the random number generator instruction or takes an interrupt at some time the backup does not and the back of virtual machine monitor sort of detects any such instruction and and intercepts that and doesn't do it and he said the backup waits for an event on the logging Channel saying this instruction number you know the random number was whatever it was on the primary Edwige yes yes yeah the paper hints that they got Intel to add features to the microprocessor to support exactly this but they don't say what it was okay okay so on that topic the so far that you know the story is sort of assumed that as long as the backup to sees the package from the clients it'll execute in identically to the primary and that's actually glossing over some huge and important details so one problem is that as a couple of people have mentioned there are some things that are non-deterministic now it's not the case that every single thing that happens in the computer is a deterministic function of the contents of the memory of the computer it is for a sort of straight line code execution often but certainly not always so worried about is things that may happen that are not a strict function of the current state that is that might be different if we're not careful on the primary and backup so these are sort of non-deterministic events that may happen so the designers had to sit down and like figure out what they all work and here are the ones here's the kind of stuff they talked about so one is inputs from external sources like clients which arrive just whenever they arrive right they're not predictable there are no sense in which the time at which a client request arrives or its content is a deterministic function of the services state because it's not so these actually this system is really dedicated to a world in which services only talk over the network and so the only really basically the only form of input or output in this system is supported by this system seems to be network packets coming and going so we didn't put arrives at what that really means it's a packet arrives and what a packet really consists of for us is the data in the packet plus the interrupt that's signaled that the packet had arrived so that's quite important so when a packet arrives I'm ordinarily the NIC DMA is the packet contents into memory and then raises an interrupt which the operating system feels and the interrupt happens at some point in the instruction stream and so both of those have to look identical on the primary and backup or else we're gonna have they're also executions gonna diverge and so you know the real issue is 
when the interrupt occurs: exactly which instruction the interrupt lands on had better be the same on the primary and the backup, otherwise their executions differ and their states diverge. So we care about both the content of the packet and the timing of the interrupt. And then, as a couple of people have mentioned, there are a few instructions that behave differently on different computers or at different times: a random number generator instruction, get-time-of-day instructions that yield different answers when called at different times, unique-ID instructions. Another huge source of non-determinism, which the paper basically rules out, is multi-core parallelism. This is a uniprocessor-only system; there's no multi-core in this world. The reason is that if it allowed multi-core, the service would be running on multiple cores, and the instructions on the different cores are interleaved in some way that is not predictable. If we ran the same parallel code on the primary and on the backup, the hardware would interleave the instructions on the two cores in different ways, and that can cause different results: suppose the code on the two cores both ask for a lock on some data; on the primary, core one may get the lock before core two, while on the backup, just because of a tiny timing difference, core two may get the lock first, and the execution results are likely to be totally different if different threads get the lock. So multi-core is a grim source of non-determinism, just totally outlawed in this paper's world, and as far as I can tell the techniques are not really applicable to it: the service can't use multi-core parallelism. The hardware is almost certainly multi-core, but that's the hardware sitting underneath the virtual machine monitor; the machine that the virtual machine monitor exposes to the guest operating systems running the primary and backup is a uniprocessor machine in this paper, and I'm guessing there's not an easy way for them to adapt this design to multi-core virtual machines. Okay, so these are the events that go over the logging channel. As for the format of a log record, a log entry, they don't quite say, but I'm guessing there are really three things in a log entry. First there's the instruction number at which the event occurred, because if you're delivering an interrupt or an input or whatever, it had better be delivered at exactly the same place in the primary and the backup, so we need to know the instruction number; and by instruction number I mean the number of instructions executed since the machine booted, not the instruction address, so an event that occurs while executing the four billionth instruction gets instruction number four billion. For an interrupt or an input it's the instruction at which the interrupt was delivered on the primary, and for a weird instruction like get-time-of-day it's the instruction number at which that instruction was executed on the primary, so that the backup knows where to cause this event to occur. Then there's a type, say network input or one of these weird instructions, and then there's the data.
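For concreteness, a log entry might contain something like the following; the paper doesn't give the real format, so the Go type below and its field names are invented purely for illustration. The backup's virtual machine monitor buffers entries like this and replays each one at exactly InstructionNum in the backup's execution.

package ft

// Hypothetical sketch of a VMware FT log entry; the paper does not spell out
// the real format, so these names and fields are guesses for illustration only.

type EventType int

const (
	NetworkInput     EventType = iota // an arriving packet, delivered as an interrupt
	TimerInterrupt                    // periodic timer tick
	WeirdInstruction                  // e.g. get-time-of-day, random number, unique ID
)

type LogEntry struct {
	InstructionNum uint64    // instructions executed since boot when the event occurred on the primary
	Type           EventType // which kind of event this is
	Data           []byte    // packet contents, or the result the instruction produced on the primary
}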
For a packet arrival the data is the packet contents; for one of these weird instructions it's the result of the instruction when it was executed on the primary, so that the backup virtual machine can fake the instruction and supply that same result. So as an example: both of these guest operating systems require that the hardware, in this case the emulated hardware of the virtual machine, has a timer that ticks, say, a hundred times a second and causes interrupts to the operating system; that's how the operating system keeps track of time, by counting these timer interrupts. Notice why they have to happen at exactly the same place in the primary and the backup: otherwise the two don't execute the same and they diverge. What really happens is that there's a timer on the physical machine that's running the FT virtual machine monitor. The timer on the physical machine ticks and delivers a timer interrupt up to the virtual machine monitor on the primary. The virtual machine monitor, at the appropriate moment, stops the execution of the primary, writes down the instruction number it was at, instructions since boot, and then simulates an interrupt into the guest operating system in the primary at that instruction number, saying "the timer hardware you're emulating just ticked, here's the interrupt". Then the primary's virtual machine monitor sends the instruction number at which the interrupt happened to the backup. The backup's virtual machine monitor is of course also taking timer interrupts from its own physical timer, but it's not giving those real physical timer interrupts to the backup operating system; it just ignores them. When the log entry for the primary's timer interrupt arrives, the backup virtual machine monitor arranges with the CPU, and this requires special CPU support, to cause the physical machine to interrupt at the same instruction number at which the timer interrupt happened on the primary. At that point the virtual machine monitor gets control back from the guest and fakes the timer interrupt into the backup operating system at exactly the same instruction number as it occurred on the primary. Yes, so the observation is that this relies on the CPU having some special hardware in it, where the VMM can tell the CPU "please interrupt a thousand instructions from now", so that it will interrupt at the right instruction number, the same instruction as on the primary; the VMM then tells the CPU to resume executing the backup, and exactly a thousand instructions later the CPU will force an interrupt into the virtual machine monitor. That's special hardware, but it turns out it's on all Intel chips, so it's not that special anymore, totally normal, and it turns out there are a lot of other uses for it: for example, one way to do CPU time profiling is to have the microprocessor interrupt every thousand instructions, and this is the same hardware that would cause the microprocessor to generate an interrupt every thousand instructions, so it's a very natural sort of gadget to want in your CPU. All right, yes: what if the backup gets ahead of the primary?
we standing above know that oh you know the primary is about to take an interrupt at the millionth instruction but the backup is already you know executed the millionth and first instruction so it's gonna be if we let this happen it's gonna be too late to deliver the interrupts if we let the backup execute ahead of the primary it's going to be too late to deliver the interrupts at the same point in the primary instruction stream and the backup of the instruction stream so we cannot let that happen we cannot let the backup get ahead of the primary in execution and the way VMware ft does that is that the the backup virtual machine monitor it actually keeps a buffer of waiting events that have arrived from the primary and it will not let to the backup execute unless there's at least one event in that buffer and if there's one event in that buffer then it will know from the instruction number the place at which it's got a force the backup to stop executing so always always the backup is executing with the CPU being told exactly where the next stopping point the next instruction number of a stopping point is because the backup only executes if it has a an event here that tells it where to stop next so that means it starts up after the primary because the backup can't even start executing until the primary has generated the first event and that event has arrived at the backup so the backup sort of always one event basically behind the at least one event behind the primary and if it's slower for some other whatever reason maybe there's other stuff running on that physical machine then the backup might get you know multiple events behind at the primary alright there's a one little piece of mess about arriving the specific case of arriving packets ordinarily when a packet arrives from a network interface card if we weren't running a virtual machine the network interface card would DMA the packet content into the memory of the computer that it's attached to sort of as the data arrives from the network interface card and that means you know you should never write software like this but it could be that the operating system that's running on a computer might actually see the data of a packet as its DMA or copied from the network interface card into memory right you know this is and you know we don't know what operating this system is designed so that it can support any operating system and cost maybe there is an operating system that watches arriving packets in memory as they're copied into memory so we can't let that happen because if the primary happens to be playing that trick it's gonna see you know if we allowed the network interface card to directly DMA incoming packets into the memory of the primary the primary we don't have any control over the exact timing of when the network interface card copies data into memory and so we're not going to know sort of at what times the primary did or didn't observe data from the packet arriving and so what that means is that in fact the NIC copies incoming packets into private memory of the virtual machine monitor and then the network interface card interrupts the virtual machine monitor and says oh a packet has arrived at that point the virtual machine monitor will suspend the primary and remember what instruction number had suspended at copy the entire packet into the primaries memory while the primary suspended and not looking at this copy and then emulate a network interface card interrupt into the primary and then send the packet and the instruction 
number to the backup the backup will also suspend the backup rope you know virtual machine monitor will spend the backup at that instruction number copy the entire packet and again to the back-up is guaranteed not to be watching the data arrive and then fakin interrupts at the same instruction numbers of the primary and this is the something the bounce buffer mechanism explained in the paper okay yeah the the only instructions and that result in logging channel traffic or are weird instructions which are rare no its instructions that might yield a different result if executed on the primary and backup like instruction to get the current time of day or current processor number or ask how many instructions have been executed or and those actually turn out to be relatively rare there's also one them to get random tasks when some machines to ask or a hardware generated random number for cryptography or something and but those are not everyday instructions most instructions like add instructions they're gonna get the same result on primary and that go yeah so the way those get replicated on the back up is just by forwarding that's exactly right each network packet just it's packaged up and forwarded as it is as a network packet and is interpreted by the tcp/ip stack on both you know so I'm expecting 99.99% of the logging channel traffic to be incoming packets and only a tiny fraction to be results from special non-deterministic instructions and so we can kind of guess what the traffic load is likely to be for for a server that serves clients basically it's a copy of every client packet and then we'll sort of know what the logging channel how fast the logging channel has to be all right so um so it's worth talking a little bit about how output works and in this system really the only what output basically means only is sending packets that client send requests in as network packets the response goes back out as network packets and there's really no other form of output as I mentioned the you know both primary and backup compute the output packet they want to send and that sort of asks that simulated mix to send the packet it's really sent on the primary and simply discard it the output packet discarded on the backup okay but it turns out is a little more complicated than that so supposing we're what we're running is a some sort of simple database server and the operation the client operation that our database server supports is increment and ideas the client sends an increment requests the database server increments the value and sends back the new value so maybe on the primary well let's say everything's fine so far and the primary backup both have value 10 in memory and that's the current value at the counter and some client on the local area network sends a you know an increment request to the primary that packet is you know delivered to the primary it's you know it's executed the primary server software and the primary says oh you know current values 10 I'm gonna change to 11 and send a you know response packet back to the client saying saying mentioned gonna supposed to be sent to the backup will also be processed here it's going to change this 10 to 11 also generate a reply and we'll throw it away that's what's supposed to happen the output however you also need to ask yourself what happens if there's a failure at an awkward time if you should always in this class should always ask yourself what's the most awkward time to have a failure and what would happen you to failure occurred then so 
suppose the primary does indeed generate the reply back to the client, but the primary crashes just after sending its reply to the client, and furthermore, much worse, this is just a network and it doesn't guarantee to deliver packets, so let's suppose the log entry on the logging channel also got dropped when the primary died. So now the state of play is: the client received a reply saying 11, but the backup did not get the client request, so its state is still 10. Now the backup takes over, because it sees the primary is dead, and this client, or maybe some other client, sends an increment request to the new primary, the old backup, which is now really processing these requests. When it gets the next increment request it's going to change its state to 11 and generate a second 11 response, maybe to the same client, maybe to a different client, and if the clients compare notes, or if it's the same client, that's a result that obviously could not have happened with a single correct server. Because we have to support unmodified software that does not know there's any funny business of replication going on, we do not have the opportunity to change the client to realize something funny happened with the fault tolerance and compensate somehow; this whole system really only makes sense if we're running unmodified software. So this is a disaster; we can't let this happen. Does anybody remember from the paper how they prevent this from happening? The output rule, yeah. The output rule is their solution to this problem, and the idea is that the primary is not allowed to generate any output, and what we're talking about now is this reply, until the backup acknowledges that it has received all log records up to this point. So the real sequence at the primary, and let's now un-crash the primary and go back to both replicas starting at 10, is this when the output rule is in force. At the time the input arrives, the virtual machine monitor sends a copy of the input to the backup, so the log message carrying the input is sent strictly before the primary generates the output. Then, after firing this log entry off across the network, and it's now heading towards the backup but it might be lost, the virtual machine monitor delivers the request to the primary server software, which generates the output: the primary has changed its state to 11 and generated an output packet that says eleven. But the virtual machine monitor says "wait a minute, we're not allowed to release that output until all previous log records have been acknowledged by the backup", and this input is the most recent previous log message. So the output is held by the virtual machine monitor until this log entry containing the input packet from the client is delivered to the backup's virtual machine monitor and buffered by it, though not necessarily executed; the backup may just be waiting to get to that point in the instruction stream. Then the backup's virtual machine monitor sends an ACK packet back saying "yes, I did get that input", and only when the acknowledgment comes back will the primary's virtual machine monitor release the output packet onto the network.
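In code, the ordering the output rule enforces might look something like the toy sketch below; everything in it, the types, channels, and helper names, is invented, since the real mechanism lives inside the virtual machine monitor. The point is only the ordering: ship the input to the backup before the guest sees it, and hold any output until the backup has acknowledged everything sent so far.

package main

import "fmt"

// Toy sketch of the output rule; all names here are invented.
type primary struct {
	logCh   chan []byte // logging channel carrying inputs to the backup
	ackCh   chan int    // acks from the backup: highest log sequence received
	lastSeq int         // sequence number of the last log entry sent
	acked   int         // highest sequence the backup has acknowledged
}

func (p *primary) onClientInput(pkt []byte) {
	p.lastSeq++
	p.logCh <- pkt // forward the input to the backup first...
	// ...then deliver pkt to the guest, which computes a reply.
}

func (p *primary) onGuestOutput(out []byte) {
	// Output rule: hold the output (the guest keeps running) until the
	// backup has acknowledged every log entry sent so far.
	for p.acked < p.lastSeq {
		p.acked = <-p.ackCh
	}
	fmt.Printf("releasing output %q\n", out)
}

func main() {
	p := &primary{logCh: make(chan []byte, 16), ackCh: make(chan int, 16)}
	go func() { // stand-in for the backup: consume log entries and ack them
		seq := 0
		for range p.logCh {
			seq++
			p.ackCh <- seq
		}
	}()
	p.onClientInput([]byte("increment"))
	p.onGuestOutput([]byte("11"))
}

Note that only the release of the output is stalled, not the primary's execution, which is exactly the observation that comes up a little later.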
So the idea is that if the client could have seen the reply, then necessarily the backup must have seen the request and at least buffered it, and so we no longer get this weird situation in which a client can see a reply, but then there's a failure and a cut-over and the replica didn't know anything about that reply. There's also the situation where this log entry was lost and then the primary crashes: since the entry hadn't been delivered, the backup hadn't sent the ack, which means the primary must have crashed before its virtual machine monitor released the output packet, and therefore the client couldn't have gotten the reply and is not in a position to spot any irregularities. So they're really happy with the output rule. On the question: I don't know; the paper doesn't mention how the virtual machine monitor is implemented. It's pretty low-level stuff, because it's sitting there allocating memory and managing page tables and talking to device drivers and intercepting instructions and understanding what instructions the guest was executing, so we're talking about low-level code; traditionally that's written in C or C++, but I don't actually know. Okay, so the primary has to delay at this point, waiting for the backup to say that it's up to date, and this is a real performance thorn in the side of just about every replication scheme, this sort of synchronous wait, where we can't let the primary get too far ahead of the backup, because if the primary failed while it was ahead, the backup would be lagging behind what clients have already seen. Just about every replication system has this problem that at some point the primary has to stall waiting for the backup, and it's a real limit on performance. Even if the machines are side by side in adjacent racks, we're still talking about half a millisecond or something to send messages back and forth while the output is stalled, and if we want to withstand earthquakes or city-wide power failures, the primary and the backup have to be in different cities, probably five milliseconds apart. If we replicate with the two replicas in different cities, every output packet has to first wait those five milliseconds or whatever for the last log entry to get to the backup and for the acknowledgment to come back, and only then can we release the packet. For low-intensity services that's not a problem, but if we're building a database server that, if it weren't for this, could process millions of requests per second, then that's just unbelievably damaging for performance. This is a big reason why people, if they possibly can, use a replication scheme that operates at a higher level and understands the semantics of operations, so it doesn't have to stall on every packet; it could stall on every high-level operation instead, or even notice that read-only operations don't have to stall at all and only writes do. But there has to be an application-level replication scheme to realize that. You're absolutely right: the observation is that you don't have to stall the execution of the primary, you only have to hold the output, and so maybe that's not as bad as
it could be but nevertheless it means that every you know in a service that could otherwise have responded in a couple of microseconds to the client you know if we have to first update the replicas in the next city we turn to you know 10 micro second interaction into it 10 millisecond interactions possibly if you have vast numbers of clients submitting concurrent requests then you may may be able to maintain high throughput even with high latency but you have to be lucky to or very clever designer to get that that's a great idea but if you log in the memory of the primary that log will disappear when the primary crashes or that's usual semantics of a server failing is that you lose everything inside the box like the contents of memory or you know if even if you didn't if the failure is that somebody unplugged the power cable accidentally from the primary even if the primary just has battery backed up RAM or I don't know what you can't get at it all right the backup can't get at it so in fact this system does log the output and the place it logs it is in the memory of the backup and in order to reliably log it there you have to observe the output rule and wait for the acknowledgment so it's entirely correct idea just can't use the primary's memory for it yes say it again that's a clever idea I'd and so the question is maybe input should go to the primary but output should come from the backup I completely haven't thought this through that might work that I don't know that's interesting yeah maybe I will okay one possibility this does expose though is that the situation you know maybe the a primary crashes after its output is released so the client does receive the reply then the primary crashes the backups input is still in this event buffer in the virtual machine monitor of the backup it hasn't been delivered to the actual replicated service when the backup goes live after the crash of the primary the backup first has to consume all of the sort of log records that are lying around that it hasn't consumed yet has to catch up to the primary otherwise it won't take over with the same state so before the backup can go live it actually has to consume all these entries the last entry is presumably is the request from the client so the backup will be live after after it after the interrupt that delivers the request from the client and that means that the backup well you know increment its counter to eleven and then generate an output packet and since it's live at this point it will generate the output packet and the client will get to eleven replies which is also if it if that really happened would be anomalous like possibly not something that could happen if there was only one server the good news is that almost certainly or the almost certainly the client is talking to this service using TCP and that this is the request and the response go back and forth on a TCP Channel the when the backup takes over the backup since the state is identical to the primaries it knows all about that TCP connection and whether all the sequence numbers are and whatnot and when it generates this packet it will generate it with the same TCP sequence number as an original packet and the TCP stack on the client will say oh wait a minute that's a duplicate packet we'll discard the duplicate packet at the TCP level and the user level software will just never see this duplicate and so this system really you know you can view this as a kind of accidental or clever trick but the fact is for any replication system where 
cutover can happen which is to say pretty much any replication system it's essentially impossible to design them in a way that they are guaranteed not to generate duplicate output basically you know you well you can err on either side I'm not even either not generate the output at all which would be bad which would be terrible or you can generate the output twice on a cutover that's basically no way to generate it guaranteed generated only once everybody errors on the side of possibly generating duplicate output and that means that at some level you know the client side of all replication schemes need some sort of duplicate detection scheme here we get to use TCP s that we didn't have TCP that would have to be something else maybe application level sequence numbers or I don't know what and you'll see all of this and actually you'll see versions of essentially everything I've talked about like the output rule for example in labs 2 & 3 you'll design your own replicated state machine yes yes to the first part so the scenario is the primary sends the reply and then either the primary send the close packet or the client closes the connect the TCP connection after it receives the primary's reply so now there's like no connection on the client side but there is a connection on the backup side and so now the backup so the backup consumes the very last log entry that as the input is now live so we're not responsible for replicating anything at this point right because the backup now live there's no other replica as the primary died so there's no like if if we don't if the backup fails to execute in log step with the primary that's fine actually because the primary is is dead and we do not want to execute in log step with it okay so the primer is now not it's live it generates an output on this TCP connection that isn't closed yet from the backup point of view this packet arrives with the client on a TCP connection that doesn't exist anymore from the clients point of view like no big whoopee on the client right he's just going to throw away the packet as if nothing happened the application won't no the client may send a reset something like a TCP error or whatever packet back to the backup and the backup does something or other with it but it doesn't matter because we're not diverging from anything because there's no primary to diverge from you can just handle a stray we said however it likes and what it'll in fact do is basically ignore but there's no now the backup has gone live there's just no we don't owe anybody anything as far as replication yeah well you can bet since the backups memory image is identical to the primaries image that they're sending packets with the very same source TCP number and they're very same everything they're sending bit for bit identical packets you know at this level the server's don't have IP addresses or for our purposes the virtual machines you know the primary in the back up virtual machines have IP addresses but the the physical computer and the vmm are transparent to the network it's not entirely true but it's basically the case that the virtual machine monitor in the physical machine don't really have identity of their own on the network because you can configure that then that way instead these they're not you know the virtual machine with a sewing operating system in its own TCP stack it doesn't IP address underneath there an address and all this other stuff which is identical between the primary in the backup and when it sends a packet it sends it with the 
virtual machine's IP address and Ethernet address, and those bits, at least in my mental model, are simply passed through onto the local area network, which is exactly what we want. So I think the backup does generate exactly the same packets the primary would have generated. There's maybe a little bit of trickery: if these physical machines are actually plugged into two different ports of an Ethernet switch, we'd like the Ethernet switch to change its mind about which of the two machines it delivers packets for the replicated service's Ethernet address to, so there's a little bit of funny business there, but for the most part they're just generating identical packets, so the switch can just send them out. Okay, so another little detail I've been glossing over is that I've been assuming the primary just fails or the backup just fails, that is, fail-stop, but that's not the only option. Another very common situation that has to be dealt with is that the two machines are still up and running and executing, but something funny happened on the network that causes them not to be able to talk to each other while still being able to talk to some clients. If that happened, if the primary and backup couldn't talk to each other but could still talk to the clients, they would both think "the other replica is dead, I'd better take over and go live", and now we have two machines going live with this service. They're no longer sending each other log events or anything; they're just diverging, maybe accepting different client inputs and changing their state in different ways. So we have a split-brain disaster if we let both the primary and the backup go live because it was the network that had some kind of failure instead of these machines. The way this paper solves it is by appealing to an outside authority to make the decision about which of the primary and the backup is allowed to be live. It turns out that their storage is actually not on local disk; this almost doesn't matter, but their storage is on some external disk server, and as well as serving disks, that server happens to export, as a totally separate service that has nothing to do with disks, a test-and-set service over the network: you can send it a test-and-set request, there's some flag it keeps in memory, and it will set the flag and return what the old value was. Both primary and backup have to acquire this test-and-set flag, which is a little bit like a lock, in order to go live. They may both send test-and-set requests at the same time to this test-and-set server; the first one gets back a reply that says "the flag used to be zero, now it's one", and for the second request to arrive the response from the test-and-set server is "actually the flag was already one when your request arrived, so basically you're not allowed to be primary". So this test-and-set server, and we can think of it as a single machine, is the arbitrator that decides which of the two should go live if they both think the other one is dead due to a network partition.
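A toy version of that arbiter, written in the RPC-handler style the labs use, might look like the sketch below; the paper's real test-and-set lives on their shared disk server, so the interface and names here are invented purely for illustration. Both the primary's and the backup's virtual machine monitors would call TestAndSet when they want to go live, and only the caller that sees OldValue == false is allowed to proceed.

package arbiter

import "sync"

// Toy test-and-set arbiter, invented for illustration; the paper's real
// test-and-set service is part of its shared disk server.

type TestAndSetArgs struct{}

type TestAndSetReply struct {
	OldValue bool // what the flag was before this request arrived
}

type Arbiter struct {
	mu   sync.Mutex
	flag bool
}

// TestAndSet atomically sets the flag and reports its previous value. The
// first replica to call it sees OldValue == false and may go live; any later
// caller sees OldValue == true and must not.
func (a *Arbiter) TestAndSet(args *TestAndSetArgs, reply *TestAndSetReply) error {
	a.mu.Lock()
	defer a.mu.Unlock()
	reply.OldValue = a.flag
	a.flag = true
	return nil
}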
Any questions about this mechanism? Yeah, what if the test-and-set server is dead at the critical moment? Then you're busted. And actually, even if there's no network partition, in every circumstance in which one or the other of these wants to go live because it thinks the other is dead, even when the other one really is dead, the one that wants to go live still has to acquire the test-and-set lock, because one of the deep rules of the 6.824 game is that you cannot tell whether another computer is dead or not; all you know is that you stopped receiving packets from it, and you don't know whether that's because the other computer is dead or because something has gone wrong with the network between you and the other computer. So all the backup sees is "well, I've stopped getting packets; maybe the primary is dead, maybe it's alive", and the primary probably sees the same thing. If there's a network partition they certainly have to ask the test-and-set server, but since they can't know whether it's a network partition, they have to ask the test-and-set server regardless of whether it's a partition or not. So any time either wants to go live, the test-and-set server also has to be alive, because they always have to acquire this test-and-set lock. That makes the test-and-set server sound like a single point of failure: we were trying to build a replicated, fault-tolerant thing, but in the end we can't fail over unless this server is alive, which is a bit of a bummer. I'm making a strong guess, though, that the test-and-set server is actually itself a replicated, fault-tolerant service; these are the people at VMware, who are happy to sell you a million-dollar highly available storage system that uses enormous amounts of replication internally, and since the test-and-set thing is on their disk server, I'm guessing it's replicated too. And the stuff you'll be doing in lab 2 and lab 3 is more than powerful enough for you to build your own fault-tolerant test-and-set server, so this problem can easily be eliminated. -------------------------------------------------------------------------------- /lec05/Lec5.en.txt: -------------------------------------------------------------------------------- 1 | Today the TAs are going to be giving a lecture on concurrency and Go. Basically this lecture is going to be full of design patterns and practical tips to help you with the labs. We're going to briefly cover the Go memory model (the reading we went over), then spend most of the lecture talking about concurrency primitives and concurrency patterns in Go, how you do the things you will need to do in the labs, and finally we'll talk through some debugging tips and techniques and show you some interesting tools you might want to use when debugging the labs. So, very briefly, on the Go memory model reading: why did we assign this reading? The goal was to give you some concrete examples of correct ways to write threaded code in Go. The second half of the document has some examples of correct code and incorrect code and how it can go wrong. One thing you might have noticed is that early on it says "if you need to read and understand this, you're being too clever", and we think that's good advice: focus on how to write correct code, don't focus too much on the happens-before relation and being able to reason about exactly why incorrect code isn't correct; we just want to be able to write correct code and call it a day. One question that came up in the lecture questions was about goroutines in relation to performance, and we just wanted to say that goroutines, and concurrency in general, can be used for a couple of different reasons, and the reason we use concurrency in the labs is not necessarily for
performance; we're not going for parallelism, using multiple cores on a single machine to do more work on the CPU. Concurrency gets us something else besides performance through parallelism: it can get us better expressivity. We want to write down some ideas, and it happens that code using threads is a clean way of expressing those ideas. So the takeaway is: when you use threads in lab 2 and beyond, don't try to do the fancy things you might do if you were going for performance, especially CPU performance; we don't care about things like fine-grained locking or other such techniques. Write code that's easy to reason about, use big locks to protect large critical sections, and just don't worry about performance in the sense of CPU performance. With that, that's all we're going to say about the memory model; we'll spend most of this lecture just talking about Go code and Go concurrency patterns, and as we go through these examples, feel free to ask any questions about what's on the screen or anything else you might think about. So I'm going to start off talking about concurrency primitives in Go. The first thing is closures. This is something that will almost certainly be helpful in the labs, and it's related to goroutines. Here's this example program on the screen: the main function declares a bunch of variables and then spawns a goroutine with this go statement, and we notice that the go statement is not taking as its argument a call to some function defined elsewhere, but an anonymous function defined inline here. This is a handy pattern; it's something called a closure, and one neat thing about it is that the function defined here can refer to variables from the enclosing scope. For example, it can mutate this variable a that's defined up here, or refer to this wait group that's defined up here. If we go run this example, it does what you think it does: the wait group's Done here lets the main thread continue past the Wait, and it prints out this variable, which has been mutated by the concurrently running thread that finished before the Wait happened. So this is a useful pattern to be able to use. The reason we're pointing this out is because you might have code that looks like this in your labs, very similar to the previous example, except this code spawns a bunch of threads in a loop. This is useful, for example, when you want to send RPCs in parallel: in lab 2, if you have a candidate asking for votes, you want to ask all the followers in parallel, not one after the other, because an RPC is a blocking operation that might take some time; similarly, the leader might want to send AppendEntries to all the followers in parallel, not in series. Threads are a clean way to express this idea, and so you might have code that looks kind of like this at a high level: in a for loop you spawn a bunch of goroutines. One thing to be careful about here, something that was talked about in a previous lecture, is identifier capture in goroutines when that identifier is mutated in the outer scope. We have this i that's being mutated by the for loop, and we want to use its value inside the goroutine; the correct way of writing this code is to pass the value i as an argument to the function (you can rename it to x inside) and then use that value inside. The correct pattern looks something like the sketch below.
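A reconstruction of that pattern; this is not the exact code from the screen, and the sendRPC stub is invented just to stand in for a real RPC call:

package main

import (
	"fmt"
	"sync"
)

// Spawn one goroutine per peer, passing the loop variable in as an argument
// so each goroutine gets its own copy.

func sendRPC(i int) {
	fmt.Println(i) // stand-in for sending a real RPC to peer i
}

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(x int) {
			defer wg.Done()
			sendRPC(x) // uses the copy passed in, not the shared loop variable
		}(i)
	}
	wg.Wait()
}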
If we run this program (here I've stubbed out the send-RPC call to just print the index; this i might be the index of the follower we're trying to send an RPC to), it prints the numbers 0 through 4 in some order. That's what we want: send RPCs to all the followers. The reason we're showing you this code is because there's a variation of it that looks really similar, and intuitively you might think it does the right thing, but in fact it doesn't. In that version the only thing that's changed is we've gotten rid of the argument we were explicitly passing and instead let the i inside the goroutine refer to the i from the outer scope. You might think that when you run this it does the same thing, but in this particular run it printed 4 5 5 5 5, so it does the wrong thing. The reason is that this i is being mutated by the outer scope, and by the time the goroutine actually executes this line, the for loop has already changed the value of i. So at a high level, if you're spawning goroutines in a loop, just make sure you use the pattern above and everything will work right. Any questions about that? It's just a small gotcha, but we've seen it a whole bunch of times in office hours, so I wanted to point it out. All right, moving on to other patterns you might want to use in your code. Oftentimes you want code that periodically does something; a very simple way to do that is to have a separate function that, in an infinite loop, does something (in this case just printing out "tick") and then uses time.Sleep to wait for a certain amount of time. Very simple pattern; you don't need anything fancier than this to do something periodically. One modification you might want is to do something periodically until something happens. For example, you might start up a Raft instance and periodically send heartbeats, but when Kill is called on the Raft instance you want to actually shut down all these goroutines so you don't have random goroutines still running in the background. The pattern for that looks something like this: you have a goroutine that runs in an infinite loop, does something, and then waits for a little bit, and you have a shared variable between it and whatever control thread decides whether this goroutine should die. In this example we have a variable done that's a global variable; main waits for a while and sets done to true, and in the goroutine that's ticking and doing work periodically we just check the value of done, and if done is set we terminate the goroutine. Since done is a shared variable mutated and read by multiple threads, we need to guard its use with a lock, and that's where the mu.Lock and mu.Unlock come in. For the purposes of the labs you can actually write something a little simpler than this: you have the rf.killed() method on your Raft instance, so your code can look more like "while my Raft instance is not dead, periodically do some work". Any questions about that so far? Yeah, question: does using the locking mechanisms, or channels, make it so that writes to variables in those functions are guaranteed to be observed by the other thread, or would you need to send done across
the channel? Okay, so let me try to simplify the question a bit. I think the question is: do you need to use locks here, can you use channels instead, can you get away with not using locks, and what's the difference between nothing versus channels versus locks? Or more specifically: does this done not need to be sent across a channel; does just using these locks ensure that this read here observes the write done by the other thread? The answer is yes, basically. At a high level, if you want to ensure cross-thread communication, make sure you use Go synchronization primitives, whether that's channels or locks and condition variables. Here, because of the use of locks, after this thread writes done and calls Unlock, the next Lock that happens is guaranteed to observe the writes done before that Unlock: the write happened, the Unlock happened, then one of these Locks happens, and the next read of done is guaranteed to observe that write of true. Question? That's a good question: why don't we do mu.Unlock here before returning? The answer is that in this particular code it doesn't matter, because the program is done at that point, but you're right that in general we would want to ensure that we unlock before we return; thanks for pointing that out. Next question: I'm not entirely sure what the question is, but maybe it's "can both of these acquire the lock at the same time?" We'll talk more about locks in a moment, but at a high level the semantics of a lock are that the lock is either held by somebody or not. If it's not held and someone calls Lock, they have the chance to acquire the lock, and if somebody else calls Lock before the holder calls Unlock, that other thread is blocked until the Unlock happens and the lock is free again. So between the Lock and the Unlock for any particular lock, only a single thread can be executing what's called the critical section. Any other questions? So this question is about timing: when you set done to true and then unlock, you have no guarantee in terms of real time about when periodic will end up being scheduled, observe that write, and actually terminate. Yes; if you want to actually ensure that periodic has exited, for some particular reason, you could write some code that communicates back from periodic acknowledging this, but in this particular case the only reason we have the sleep is just to demonstrate that tick prints for a while and then periodic is indeed cancelled, because it stops printing before I get my shell prompt back. In general, for a lot of these background threads, you can just say you want to kill them, and it doesn't matter whether they're killed within one second or two seconds or exactly when Go schedules them, because the thread is just going to observe the write to done and then exit and do no more work. Also, another thing in Go is that if you spawn a bunch of goroutines, one of them is the main goroutine, this one here, and the way Go works is that if the main goroutine exits, the whole program terminates and all goroutines are terminated.
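For reference, the done-flag pattern we've been discussing might be reconstructed like this; the variable names are mine rather than the original slide's, and in the labs you would usually loop on your Raft instance's killed check instead of a hand-rolled flag:

package main

import (
	"fmt"
	"sync"
	"time"
)

var (
	mu   sync.Mutex
	done bool
)

// periodic does some work (here, just printing "tick") until another thread
// sets done under the lock.
func periodic() {
	for {
		fmt.Println("tick")
		time.Sleep(100 * time.Millisecond)
		mu.Lock()
		if done {
			mu.Unlock()
			return
		}
		mu.Unlock()
	}
}

func main() {
	go periodic()
	time.Sleep(1 * time.Second)
	mu.Lock()
	done = true
	mu.Unlock()
	time.Sleep(300 * time.Millisecond) // give periodic a chance to notice
	fmt.Println("main: exiting")
}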
That's a great question. Okay, so I think the question is something like: why do you need locks at all, can you just delete all the locks? Looking at this code, it looks like main does a write of true at some point and periodic is repeatedly reading it, so at some point it should observe that write, right? Well, it turns out this is why Go has this fancy memory model and this whole happens-before relation: the compiler is allowed to take this code and emit low-level machine code that does something a little different from what you intuitively thought would happen. We can talk about that in detail offline after the lecture or in office hours, but at a high level one rule you can follow is: if you have accesses to shared variables and you want them to be observed across different threads, you need to be holding a lock before you read or write those shared variables. In this particular case I think the Go compiler would be allowed to optimize this by lifting the read of done outside the for loop, reading the shared variable once, and if done is false, making the inside an infinite loop, because the way this thread would then be written, it uses no synchronization primitives, no mutex Lock or Unlock, no channel sends or receives, so it's not guaranteed to observe any mutations done by other concurrently running threads. If you look on Piazza, I've actually written a particular Go program that is optimized in this unintuitive way: it produces code that does an infinite loop even though, looking at it, you might think the obvious way to compile it would produce something that terminates. So the memory model is pretty fancy, and it's really hard to think about why exactly incorrect programs are incorrect, but if you follow some general rules, like hold locks before you mutate shared variables, you can avoid thinking about some of these nasty issues. Any other questions? All right, so let's talk a little more about mutexes now. Why do you need mutexes? At a high level, whenever you have concurrent access by different threads to some shared data, you want to ensure that reads and writes of that data are atomic. Here's an example program that declares a counter and then spawns a thousand goroutines that each increment the counter by one. You might think, looking at this intuitively, that when I print out the value of the counter at the end it should print a thousand, but it turns out we miss some of the updates, and in this particular run it only printed 947. What's going on is that the update is not protected in any way, so these concurrently running threads can read the value of counter, update it, and clobber other threads' updates. Basically we want to ensure that this entire section happens atomically, and the way you make blocks of code run atomically is by using locks. In the fixed version of this code we create a lock, and all the goroutines that modify the counter first grab the lock, then update the counter, then unlock. We're using the defer keyword here; what it does is basically the same as putting the unlock at the end, so we grab the lock, do the update, then unlock, and defer is just a nice way of remembering to do this, since you might forget to write the unlock later. You can think of defer as scheduling the unlock to run at the end of the current function body.
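A reconstruction of the fixed counter example (again, not the exact code from the screen):

package main

import (
	"fmt"
	"sync"
	"time"
)

// A thousand goroutines each increment a shared counter; holding the mutex
// around the read-modify-write means no updates are lost.
func main() {
	var mu sync.Mutex
	counter := 0
	for i := 0; i < 1000; i++ {
		go func() {
			mu.Lock()
			defer mu.Unlock()
			counter++
		}()
	}
	time.Sleep(1 * time.Second) // crude; see the caveat about this sleep below
	mu.Lock()
	fmt.Println(counter) // 1000 in this run
	mu.Unlock()
}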
And so this is a really common pattern you'll see, for example, in your RPC handlers for the lab. Oftentimes RPC handlers will read or write data on the Raft structure, and those updates should be synchronized with other concurrently happening updates, so the pattern for RPC handlers is often: grab the lock, defer the unlock, and then go do some work inside. So we can see, if we run this code, it produces the expected result: it prints out a thousand, and we haven't lost any of these updates. So at a high level, what a lock or a mutex can do is guarantee mutual exclusion for a region of code, which we call a critical section — in here, this is the critical section — and it ensures that none of these critical sections execute concurrently with one another; they're all serialized, happening one after another. Question? Yes, so this is a good observation: this particular code is actually not guaranteed to produce a thousand, depending on how thread scheduling ends up happening, because all the main goroutine does is wait for one second — which is some arbitrary amount of time — and then print out the value of the counter. I just wanted to keep this example as simple as possible. A different way to write this code that would be guaranteed to print a thousand would be to have the main goroutine wait for all these thousand threads to finish; you could do this using a wait group, for example, but we didn't want to put two synchronization primitives, like wait groups and mutexes, in the same example, so that's why we have this code that is technically incorrect, but I think it still demonstrates the point of locks. Any other questions? Great, so at a very high level you can think of locks as: you grab the lock, you mutate the shared data, and then you unlock. So does this pattern always work? Well, it turns out that that's a useful starting point for how to think about locks, but it's not really the complete story. So here's some code — it doesn't fit on the screen, but I'll explain it to you, we can scroll through it — it basically implements a bank at a high level. I have Alice and Bob, who both start out with some balances, and I keep track of the total balance, the total amount of money I store in my bank, and then I'm going to spawn two goroutines that will transfer money back and forth between Alice and Bob. So this one goroutine, a thousand times, will deduct one from Alice and send it to Bob, and concurrently I have this other goroutine that in a loop will deduct one from Bob and send it to Alice. And notice that I have this mutex here, and whenever I manipulate these shared variables between these two different threads I'm always locking the mutex — this update only happens while this lock is held, right? And so, is this code correct or incorrect? There actually isn't really a straightforward answer to that question; it depends on what the semantics of my bank are, like what behavior I expect. So I'm going to introduce another thread here — I'll call this one the audit thread — and what it's going to do is, every once in a while, check the sum of all the accounts in my bank and make sure that the sum is the same as what it started out as, right, because if I only allow transfers within my bank the total amount should never change. So now, given this other thread: what it does is grab the lock, then sum up Alice plus Bob and compare it to the total, and if it doesn't match, then it reports that it has observed a violation — that the total is no longer what it should be.
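A rough sketch of the bank example as described — transfers that lock around each individual update, plus an audit goroutine checking the invariant. The names and amounts (alice, bob, 10000) are illustrative, not the exact lecture code:

package main

import (
    "fmt"
    "sync"
    "time"
)

func main() {
    alice, bob := 10000, 10000
    var mu sync.Mutex
    total := alice + bob

    go func() { // repeatedly move 1 from alice to bob
        for i := 0; i < 1000; i++ {
            mu.Lock()
            alice -= 1
            mu.Unlock()
            mu.Lock()
            bob += 1
            mu.Unlock()
        }
    }()
    go func() { // repeatedly move 1 from bob to alice
        for i := 0; i < 1000; i++ {
            mu.Lock()
            bob -= 1
            mu.Unlock()
            mu.Lock()
            alice += 1
            mu.Unlock()
        }
    }()

    go func() { // audit thread: check the invariant alice+bob == total
        for {
            mu.Lock()
            if alice+bob != total {
                fmt.Printf("observed violation: alice = %v, bob = %v, sum = %v\n",
                    alice, bob, alice+bob)
            }
            mu.Unlock()
        }
    }()

    time.Sleep(1 * time.Second)
}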
If I run this code, I actually see that a whole bunch of times this concurrently running thread does indeed observe that Alice plus Bob is not equal to the overall sum. So what went wrong here? We're following our basic rule of grabbing a lock whenever we're accessing data that's shared between threads, and it is indeed true that no updates to these shared variables happen while the lock is not held. Exactly — so let me repeat that for everybody to hear: what we intended here was for this decrement and increment to happen atomically, but instead what we ended up writing was code that decrements atomically and then increments atomically. And so in this particular code we actually won't lose money in the long term — if we let these threads run, wait till they finish, and then check the total, it will indeed be what it started out as — but while these are running, since this entire block of code is not atomic, we can temporarily observe these violations. And so, at a higher level, the way you should think about locking is not just that locks protect access to shared data, but that locks are meant to protect invariants. You have some shared data that multiple threads might access, and there are some properties that hold on that shared data — for example, here I, as the programmer, decided that I want the property that Alice plus Bob should equal some constant, and I want that property to always hold. But then it may be the case that different threads running concurrently are making changes to this data and might temporarily break this invariant — like here, when I decrement from Alice, temporarily the sum Alice plus Bob has changed — but this thread eventually ends up restoring the invariant. And so locks are meant to protect invariants: at a high level, you grab a lock, then you do some work that might temporarily break the invariant, but you restore the invariant before you release the lock, so nobody can observe these in-progress updates. And so the correct way to write this code is to actually have fewer uses of lock and unlock: we have lock, then we do a bunch of work, and then we unlock. And when you run this code, we see no more printouts like this — we never have the audit thread observe that the total is not what it should be. All right, so that's the right way to think about locking at kind of a high level. You can think about it as: make sure you grab locks whenever you access shared data — that is a rule — but another important rule is that locks protect invariants, so grab a lock, manipulate things in a way that might break the invariants, but restore them afterwards, and then release the lock. Another way you can think about it is that locks can make regions of code atomic, not just single statements or single updates to shared variables. Any questions about that? Great, so the next synchronization primitive we're going to talk about is something called condition variables, and it seems like this has been a source of confusion from lab one, where we mentioned condition variables but didn't quite explain them, so we're going to take the time to explain them to you now, and we're going to do that in the context of an example that you should all be familiar with: counting votes. So remember, in lab 2A you have this pattern where, whenever a Raft peer becomes a candidate, it wants to send out vote requests to all of its followers, and eventually the followers come back to the candidate and say yes or no — whether or not the candidate got the vote.
Right, and one way we could write this code is to have the candidate serially ask peer number one, peer number two, peer number three, and so on, but that's bad, because we want the candidate to ask all the peers in parallel so it can quickly win the election when possible. And then there are some other complexities: when we ask all the peers in parallel, we don't want to wait until we get a response from all of them before making up our mind, because if a candidate gets a majority of votes, it doesn't need to wait till it hears back from everybody else. So this code is kind of complicated in some ways, and here's a kind of stubbed-out version of what that vote counting code might look like, with a little bit of infrastructure to make it actually run. So here I have this main goroutine that sets count — which is the number of yes votes I've got — to zero, and finished to zero; finished is the number of responses I've gotten in total. The idea is that I want to send out vote requests in parallel and keep track of how many yeses I've got and how many responses I've gotten in general, and then once I know whether I've won the election or whether I've lost it, I can determine that and move on — in the real Raft code you'd then do whatever you need to do, like step up to leader or step down to follower, after you have the result from this. And so, looking at this code, I'm going to spawn — say I have ten peers — ten goroutines in parallel. I pass in this closure here, and what it's gonna do is request a vote, and then if I get the vote I'm going to increment count by one, and I'm also going to increment finished by one — so this is the number of yeses, and this is the total number of responses I've gotten. And then outside, in the main goroutine, what I'm doing is keeping track of this condition; I'm waiting for this condition to become true: either I have enough yes votes that I've won the election, or I've heard back from enough peers and I know that I've lost. So I'm just going to check in a loop and wait until count is greater than or equal to five, or until finished is equal to ten, and then after that's the case I can determine whether I've won or lost. So, does anybody see any problems with this code, given what we just talked about with mutexes? Yes — yeah, exactly, count and finished aren't protected by mutexes. So one thing we certainly need to fix here is that whenever we have shared variables, we need to protect access with mutexes, and that's not too bad to fix: I declare a mutex that's accessible by everybody, and then in the goroutines I'm launching in parallel to request votes — and this pattern here is pretty important — I'm going to first request a vote while I'm not holding the lock, and then after that I'm going to grab the lock and update these shared variables. And then outside I have the same pattern as before, except I make sure to lock and unlock around reading these shared variables: so in an infinite loop I grab the lock and check to see if the result of the election has been determined by this point, and if not I keep running in this infinite loop; otherwise I unlock and then do what I need to do outside of here. And so if I run this example — whoops — it seems to work, and this is actually a correct implementation, it does the right thing, but there are some problems with it.
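The vote-counting code just described, sketched roughly — ten goroutines requesting votes in parallel, shared count and finished protected by a mutex, and a polling loop in the main goroutine. requestVote here is a stand-in for the real RPC (illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "math/rand"
    "sync"
    "time"
)

func main() {
    count := 0    // number of yes votes
    finished := 0 // number of responses received
    var mu sync.Mutex

    for i := 0; i < 10; i++ {
        go func() {
            vote := requestVote() // the slow part runs without holding the lock
            mu.Lock()
            defer mu.Unlock()
            if vote {
                count++
            }
            finished++
        }()
    }

    for { // keep checking until the outcome is known
        mu.Lock()
        if count >= 5 || finished == 10 {
            break
        }
        mu.Unlock()
    }
    if count >= 5 {
        fmt.Println("received 5+ votes!")
    } else {
        fmt.Println("lost")
    }
    mu.Unlock()
}

func requestVote() bool {
    time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond)
    return rand.Int()%2 == 0
}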
So can anybody recognize any problems with this implementation? I'll give you a hint: this code is not as nice as it could be. So, not quite — it is going to wait for exactly the right amount of time; the issue here is that it's busy waiting. What it's doing is, in a very tight loop, grabbing the lock, checking this condition, unlocking, grabbing the lock, checking this condition, unlocking — and it's going to burn up 100% CPU on one core while it's doing this. So this code is correct, but — at a high level we don't care about CPU efficiency for the purpose of the labs, but if you're using a hundred percent of one core, you might actually slow down the rest of your program enough that it won't make progress — that's why this pattern is bad, that we're burning up a hundred percent CPU waiting for some condition to become true. Right, so does anybody have any ideas for how we could fix this? So here's one simple solution: I will change a single line of code. All I've added here is: wait for 50 milliseconds. And so this is a correct transformation of that program, and it kind of seems to solve the problem, right? Before, I was burning up a hundred percent CPU; now, only once every 50 milliseconds, I'm going to briefly wake up, check this condition, and go back to sleep if it doesn't hold. So this is basically a working solution. Any questions? So this kind of sort of works, but one thing you should always be wary of whenever you write code is magic constants: why is this 50 milliseconds, why not a different number? Whenever you have an arbitrary number in your code, it's a sign that you're doing something that's not quite right, or not quite as clean as it could be. And it turns out that there's a concurrency primitive designed to solve exactly this problem: I have some threads running concurrently that are making updates to some shared data, and I have another thread that's waiting for some property, some condition on that shared data, to become true, and until that condition becomes true, the thread is just going to wait. There's a tool designed exactly to solve this problem, and that tool is called a condition variable. And the way you use a condition variable — the pattern basically looks like this. We have our lock from earlier; condition variables are associated with locks, so we have some shared data, a lock that protects that shared data, and then we have this condition variable that is given a pointer to the lock when it's initialized, and we're going to use this condition variable for coordinating around when a certain condition — some property on that shared data — becomes true. And the way we modify our code is: we have two places, one where we're making changes to that data, which might make the condition become true, and another place where we're waiting for that condition to become true. The general pattern is: whenever we do something that changes the data, we call cond dot Broadcast, and we do this while holding the lock; and on the other side, where we're waiting for some condition on that shared data to become true, we call cond dot Wait. So what this does is — let's think about what happens in the main thread for a moment. The main thread grabs the lock, it checks this condition, suppose it's false, it calls cond dot Wait. What this will do is, atomically, you can think of it as: it'll release the lock, in order to let other threads make progress, and it'll add itself to a list of threads that are waiting on this condition variable.
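Concretely, the vote-counting example rewritten with a condition variable has roughly this shape — the polling loop becomes a Wait, and each vote-handling goroutine calls Broadcast after updating the shared counters (requestVote as in the earlier sketch; illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "math/rand"
    "sync"
    "time"
)

func main() {
    count := 0
    finished := 0
    var mu sync.Mutex
    cond := sync.NewCond(&mu)

    for i := 0; i < 10; i++ {
        go func() {
            vote := requestVote()
            mu.Lock()
            defer mu.Unlock()
            if vote {
                count++
            }
            finished++
            cond.Broadcast() // wake up the waiter; called while holding the lock
        }()
    }

    mu.Lock()
    for count < 5 && finished != 10 {
        cond.Wait() // atomically releases mu and sleeps; reacquires mu before returning
    }
    if count >= 5 {
        fmt.Println("received 5+ votes!")
    } else {
        fmt.Println("lost")
    }
    mu.Unlock()
}

func requestVote() bool {
    time.Sleep(time.Duration(rand.Intn(100)) * time.Millisecond)
    return rand.Int()%2 == 0
}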
Then, concurrently, one of these threads might be able to acquire the lock after it's gotten a vote, and then it manipulates these variables and calls cond dot Broadcast. What that does is wake up whoever's waiting on the condition variable, and so once this thread unlocks the mutex, this one over here, as it's returning from Wait, will reacquire the mutex and then return to the top of this for loop, which is checking this condition. So this Broadcast wakes up whoever's waiting at this Wait, and this avoids having to have that time dot Sleep for some arbitrary amount of time: this thread that's waiting for some condition to become true only gets woken up when something changes that might make that condition become true. If you think about these threads — if they're very slow and they don't call cond dot Broadcast for a long time, this one will just be waiting; it won't be periodically waking up and checking some condition that can't have changed, because nobody else manipulated the shared data. So, any questions about this pattern? Yeah, so that's a great question — I think you're referring to something called the lost wakeup problem, and this is a topic in operating systems, and we won't talk about it in detail now, but feel free to ask me after lecture. At a high level, you can avoid funny race conditions that might happen between Wait and Broadcast by following the particular pattern I'm showing here, and I'll show you an abstracted version of this pattern in a moment. Basically, the pattern is: for the side that might make changes that will change the outcome of the condition test, you always lock, then manipulate the data, then call Broadcast, and call Unlock afterwards — so the Broadcast must be called while holding the lock. Similarly, when you're checking the condition, you grab the lock, and then you always check the condition in a loop, and inside, when that condition is false, you call cond dot Wait; this is only called while you're holding the lock, and it atomically releases the lock and puts itself in a list of waiting threads, and then, as Wait is returning — as we return from this Wait call and go back to the top of this for loop — it will reacquire the lock, so this check only happens while holding the lock. And outside of this, we still have the lock, and we unlock after we're done doing whatever we need to do here. At a high level, this pattern looks like this: we have one thread, or some number of threads, doing something that might affect the condition, so they grab a lock, do the thing, call Broadcast, then call Unlock; and on the other side we have some thread that's waiting for some condition to become true, and the pattern there looks like: we grab the lock, then in a while loop, while the condition is false, we wait, and so we know that when we get past this while loop, the condition is true and we're holding the lock, and we can do whatever we need to do here, and then finally we call Unlock. So we can talk about all the things that might go wrong if you violate one of these rules after lecture, if you're interested, but at a high level, if you follow this pattern, then you won't need to deal with those issues. So, any questions about that? Yeah, so that's a great question: when do you use Broadcast versus when do you use Signal? So condition variables have three methods on them: one is Wait, for the waiting side, and then on the other side you can use Signal or Broadcast.
The semantics of those are: Signal wakes up exactly one thread that may be waiting, whereas Broadcast wakes up everybody who's waiting, and they'll all try to grab the lock and recheck the condition, and only one of them will proceed at a time, because only one of them will hold the lock until it gets past this point. I think for the purpose of this class: always use Broadcast, never use Signal. If you follow this pattern and just don't use Signal and always use Broadcast, your code will work. You can think of Signal as something used for efficiency, and we don't really care about that level of CPU efficiency in the labs for this class. Any more questions? Okay, so the final topic we're going to cover in terms of Go concurrency primitives is channels. So at a high level, channels are a queue-like synchronization primitive, but they don't behave quite like queues in the intuitive sense. I think some people think of channels as: there's this data structure you can stick things into, and eventually someone will pull those things out. But in fact channels have no queuing capacity, they have no internal storage; basically, channels are synchronous. If you have two goroutines that are going to send and receive on a channel, and someone tries to send on the channel while nobody's receiving, that thread will block until somebody's ready to receive, and at that point, synchronously, it will hand that data over to the receiver. And the same is true in the other direction: if someone tries to receive from a channel while nobody's sending, that receive will block until there's another goroutine that's about to send on the channel, and that send will happen synchronously. So here's a little demo program that demonstrates this: here I declare a channel, and then I spawn a goroutine that waits for a second and then receives from the channel; and in my main goroutine I keep track of the time, then I send on the channel — I just put some dummy data into the channel — and then I print out how long the send took. And if you think of channels as queues with internal storage capacity, you might expect this to complete very fast, but that's not how channels work: this send is going to block until this receive happens, and this receive doesn't happen until that one second has elapsed, so from here to here we're actually blocked in the main goroutine for one whole second. All right, so don't think of channels as queues; think of them as a synchronous communication mechanism. Another example that'll make this really obvious: here we have a goroutine that creates a channel, then sends on the channel, and then tries receiving from it. Does anybody know what'll happen when I try running this? I think the file name might give it away — yeah, exactly, the send is going to block till somebody's ready to receive, but there is no receiver, and Go actually detects this condition: if all your threads are sleeping, it detects this as a deadlock condition and it'll actually crash. But you can have more subtle bugs. If you have some other thread off doing something — if I spawn this goroutine that, you know, just loops doing nothing, and I try running this program again — now Go's deadlock detector won't notice that none of the threads are doing any useful work, because there's one thread running; it's just that nothing ever receives here. We can tell by looking at this program that it'll never terminate, but here it just looks like it hangs. So if you're not careful with channels, you can get these subtle bugs where you end up with deadlocks as a result.
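The first channel demo described above, sketched roughly (illustrative, not the exact lecture file):

package main

import (
    "fmt"
    "time"
)

func main() {
    c := make(chan bool) // unbuffered: no internal storage
    go func() {
        time.Sleep(1 * time.Second)
        <-c // the receiver only shows up after one second
    }()
    start := time.Now()
    c <- true // blocks until the goroutine above is ready to receive
    fmt.Printf("send took %v\n", time.Since(start)) // roughly one second
}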
Yeah — yeah, exactly: there's no data, nobody's sending on this channel, so this is gonna block here; it's never gonna get to this line. Yeah, so channels, as you pointed out, can't really be used just within a single goroutine; it doesn't really make sense, because in order to send, or in order to receive, there has to be another goroutine doing the opposite action at the same time, and if there isn't, you're just gonna block forever, and then that thread will no longer do any useful work. Yeah — sends wait for receives, receives wait for sends, and it happens synchronously once both the sender and the receiver are present. What I've talked about so far is unbuffered channels. I was going to avoid talking about buffered channels, because there are very few problems that they're actually useful for solving. So buffered channels can take a capacity — if I just switch this, here's a buffered channel with a capacity of one — and this program does terminate, because buffered channels have some internal storage space, and until that space fills up, sends are non-blocking, because they can just put the data in the internal storage space. But once the channel does fill up, then it behaves like an unbuffered channel, in the sense that further sends will block until there's a receive to make space in the channel. But I think at a high level we should avoid buffered channels, because they basically don't solve any problems. And another thing you should be thinking about is: whenever you have to make up arbitrary numbers, like this one here, to make your code work, you're probably doing something wrong. Yeah, so I think this is a question about terminology, like what exactly does deadlock mean, does this count as a deadlock — yes, this counts as a deadlock: no useful progress will be made here, these threads are just stuck forever. Any other questions? So what are channels useful for? I think channels are useful for a small set of things, for example producer-consumer-queue sort of situations. Like here, I have a program that makes a channel and spawns a bunch of goroutines that are going to be doing some work — say they're computing some result and producing some data — and I have a bunch of these goroutines running in parallel, and I want to collect all that data as it comes in and do something with it. So this doWork thing just waits for a bit and produces a random number, and in the main goroutine I'm going to continuously receive on this channel and print it out — this is a great use of channels. Another good use of channels is to achieve something similar to what wait groups do. So rather than use a wait group, suppose I want to spawn a bunch of threads and wait till they're all done doing something: one way to do that is to create a channel, and then I spawn a bunch of threads and know how many threads I've spawned — so five goroutines created here — they're going to do something and then send on this channel when they're done, and then in the main goroutine I can just receive from that channel the same number of times, and this has the same effect as a wait group. So, question — so what exactly is the question? So the question is: here, could you use a buffered channel with a capacity of five, because you're waiting for five receives? I think in this particular case, yes, that would have the equivalent effect, but I think there's not really a reason to do that, and at a high level, in your code, you should avoid buffered channels — and maybe even channels — unless you think very hard about what you're doing.
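A rough sketch of the producer-consumer use of channels described above; doWork, the number of workers, and the sleep durations are illustrative:

package main

import (
    "fmt"
    "math/rand"
    "time"
)

func main() {
    c := make(chan int)
    for i := 0; i < 4; i++ {
        go doWork(c) // several producers computing results in parallel
    }
    for { // the consumer handles results as they arrive
        v := <-c
        fmt.Println(v)
    }
}

func doWork(c chan int) {
    for {
        time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond)
        c <- rand.Int() // hands the result directly to whoever is receiving
    }
}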
Yeah, so what is a wait group? I think we covered this in a previous lecture, and I talked about it very briefly today, but I do have an example of wait groups. So a wait group is yet another synchronization primitive provided by Go in the sync package, and it kind of does what its name advertises: it lets you wait for a certain number of threads to be done. The way it works is you call waitgroup dot Add, and that basically increments some internal counter, and then when you call waitgroup dot Wait, it waits till Done has been called as many times as Add was called. So this code is basically the same as the code I just showed you that was using a channel, except this is using a wait group; they have the exact same effect, and you can use either one. Yeah, so the question here is about race conditions — like, what happens if this Add doesn't happen fast enough before this Wait happens, or something like that. Well, notice that the pattern here is that we call waitgroup dot Add outside of this goroutine, and it's called before spawning the goroutine — so this happens first and this happens next — and so we'll never have a situation where the Add hasn't happened yet for this particular goroutine by the time we call Wait. How is this implemented by the compiler? I won't talk about that now — talk to me after class or in office hours — but I think for the purposes of this class you need to know the API for these things, not the implementation. All right, so I think that's basically all I have on Go concurrency primitives. One final thought on channels: channels are good for a specific set of things, like I just showed you — the producer-consumer queue, or implementing something like wait groups — but when you try to do fancier things with them, like if you want to kick another goroutine, which may or may not be waiting for you, so that it gets woken up, that's a kind of tricky thing to do with channels, and there are a bunch of other ways to shoot yourself in the foot with them too. I'm going to avoid showing you examples of bad code with channels just because it's not useful to see, but I personally avoid using channels for the most part and just use shared memory and mutexes and condition variables, and I personally find those much easier to reason about. So feel free to use channels when they make sense, but if anything looks especially awkward to do with channels, just use mutexes and condition variables — they're probably a better tool. Yeah, so the question is: what's the difference between this producer-consumer pattern here and a thread-safe FIFO? I think they're kind of equivalent — you could do this with a thread-safe FIFO, and that is basically what a buffered channel roughly is, if you're enqueueing things and dequeueing things. Like, if you want this line to finish and have this thread go do something else while that data sits there in a queue, rather than this goroutine waiting to send it, then a buffered channel might make sense, but I think at least in the lab you will not have a pattern like that.
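To round out these examples, here is a minimal sketch of the wait-group version described above (illustrative, not the exact lecture code):

package main

import (
    "fmt"
    "sync"
)

func main() {
    var wg sync.WaitGroup
    for i := 0; i < 5; i++ {
        wg.Add(1) // Add is called in main, before spawning the goroutine
        go func(x int) {
            defer wg.Done() // Done once this goroutine finishes
            fmt.Printf("goroutine %d finished\n", x)
        }(i)
    }
    wg.Wait() // blocks until Done has been called once per Add
}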
All right, so next Fabian's going to talk about more Raft-related stuff. Do you need this? All right — can you all hear me? Is this working? Yeah? All right. So yeah, basically I'm going to show you two bugs that we commonly see in people's Raft implementations. There are a lot of bugs that are pretty common, but I'm just going to focus on two of them. So in this first example we sort of have the start of a Raft implementation — sort of like what you might see for 2A, just the beginnings of one. So in our Raft state we have, primarily, the current status of the Raft peer — either follower, candidate, or leader — and we have these two state variables where we're keeping track of the current term and who we voted for in the current term. I want us to focus, though, on these two functions: AttemptElection and CallRequestVote. So in AttemptElection we're just going to set our state to candidate, increment our current term, vote for ourselves, and then start sending out request votes to all of our Raft peers — and this is similar to some of the patterns that Anish showed, where we loop through our peers and, for each one, in a separate goroutine, call this CallRequestVote function in order to actually send an RPC to that peer. All right, so in CallRequestVote we're going to acquire the lock, prepare the arguments for our RequestVote RPC call by setting them to the current term, and then actually perform the RPC call over here, and finally, based on the response, we reply back to this AttemptElection function, and AttemptElection eventually should tally up the votes to see if it got a majority and can become leader. So what happens when we run this code? In theory, what we might expect to happen is — there's going to be some code that spawns a few Raft peers and actually tries to attempt elections on them, and what should happen is we just start collecting votes from the other peers; we're not actually going to tally them up, but hopefully nothing weird goes wrong. But actually something is going to go wrong here: we triggered Go's deadlock detector, and somehow we ran into a deadlock. So let's see what happened. For now, let's focus on what's going on with server zero. Server zero says it starts attempting an election at term one — that's just starting the AttemptElection function — it acquires the lock, sets some stuff up for performing the election, and then unlocks. Then it sends out a RequestVote RPC to server two and finishes processing that RequestVote RPC over here — we're just printing right before and after we actually send out the RPC — and then it sends out a RequestVote RPC to server one, but after that we never actually see it finish sending the RequestVote RPC, so it's actually stuck in this function call, waiting for the RPC response from server one. All right, now let's look at what server one is doing. It's pretty much the same thing: it sends a RequestVote RPC to server two, that succeeds, it finishes processing the response from server two, then it sends this RPC to zero, and now what's actually happening is that zero and one are waiting for the RPC responses from each other — they both sent out an RPC call but haven't gotten the response yet — and that's actually the cause of our deadlock. So really, the reason that we're deadlocking is that we're holding this lock through our RPC calls: over here in the CallRequestVote function we acquire the mutex associated with our Raft peer, and we only unlock at the end of this function, so throughout this entire function we're holding the lock, including when we try to contact our peer to get the vote. And later, when we handle this RequestVote RPC — we only see it at the beginning of this function, in the handler — we're also trying to acquire the lock, but we never actually succeed in acquiring the lock.
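The shape of the bug, sketched roughly. The struct fields, argument types, and the rf.peers[server].Call usage here are stand-ins for typical lab code, not the exact code shown on screen:

// Buggy version: rf.mu is held across the RPC, so this peer's own
// RequestVote handler (which also needs rf.mu) can never run, and two
// peers calling each other this way deadlock.
func (rf *Raft) CallRequestVote(server int) bool {
    rf.mu.Lock()
    defer rf.mu.Unlock() // not released until the RPC reply has come back
    args := RequestVoteArgs{Term: rf.currentTerm}
    var reply RequestVoteReply
    ok := rf.peers[server].Call("Raft.RequestVote", &args, &reply)
    return ok && reply.VoteGranted
}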
So just to make this a little bit more clear, the order of operations is: in CallRequestVote, server zero is first going to acquire the lock and send an RPC call to server one, and simultaneously, separately, server one is going to do the same thing — it enters its CallRequestVote function, acquires the lock, and sends an RPC call to server zero. Now, in server zero's handler and server one's handler, they're trying to acquire the lock, but they can't, because each server is already holding its own lock while trying to send the RPC call to the other — and that's what's leading to the deadlock situation. So to solve this, basically, we want you to not hold locks through RPC calls; that's the solution to this problem. In fact, we don't need the lock here at all: instead of trying to read the current term when we enter this CallRequestVote function, we can pass it as an argument — save the term when we had acquired the lock earlier in AttemptElection, and just pass it as a variable to CallRequestVote — and that removes the need to acquire the lock at all in CallRequestVote. Alternatively, we could lock while we're preparing the arguments and then unlock before actually performing the call, and then, if we need to process the reply, we could lock again afterwards. So just make sure to unlock before making the RPC call, and then, if you need to, you can acquire the lock again. So now if I save this — it's still triggering the deadlock detector, but that's actually just because we're not doing anything at the end — it's actually working now: we finished sending the request votes on both sides, and all the operations that we wanted to complete are complete. All right, any questions about this example? Yeah, so — you might need to use locks when you are preparing the arguments or processing the response, but yeah, you shouldn't hold a lock through the RPC call while you're waiting for the other peer to respond. And there's actually another reason, too, in addition to deadlock: the other problem is that in some tests we're going to have this unreliable network that could delay some of your RPC messages, potentially by like 50 milliseconds, and in that case, if you hold the lock through an RPC call, then any other operation that you try to do during those 50 milliseconds won't be able to complete until that RPC response is received. So that's another issue you might run into if you hold the lock — it's both to make things more efficient and to avoid these potential deadlock situations. All right, so just one more example. This is again a similar Raft implementation: again, in our Raft state we're keeping track of whether we're a follower, candidate, or leader, and also these two state variables. In this example I want you to focus on this AttemptElection function. So now we've first implemented the change that I just showed you — storing the term here and passing it as a variable to the function that collects the request votes — but additionally we've implemented some functionality to add up the votes. What we'll do is create a local variable to count the votes, and whenever we get a vote, if the vote was not granted, we return immediately from this goroutine where we're processing the vote; otherwise, we acquire the lock before updating this shared local variable to count up the votes, and then, if we did not get a majority of the votes, we return immediately; otherwise, we make ourselves the leader.
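Roughly, this second example has the following shape; the names are illustrative (and the vote counter is a local variable shared by the goroutines through the closure), so treat it as a sketch rather than the exact code on screen:

func (rf *Raft) AttemptElection() {
    rf.mu.Lock()
    rf.state = Candidate
    rf.currentTerm++
    rf.votedFor = rf.me
    term := rf.currentTerm // remember which term this election is for
    rf.mu.Unlock()

    votes := 1 // we vote for ourselves
    for i := range rf.peers {
        if i == rf.me {
            continue
        }
        go func(server int) {
            granted := rf.CallRequestVote(server, term) // RPC done without the lock
            if !granted {
                return
            }
            rf.mu.Lock()
            defer rf.mu.Unlock()
            votes++
            if votes <= len(rf.peers)/2 {
                return
            }
            rf.state = Leader // becomes leader without re-checking anything
        }(i)
    }
}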
So, as with the other example — initially, if you look at this, it seems reasonable, but let's see if anything can go wrong. All right, so this is the log output from one run, and one thing you might notice is that we've actually elected two leaders on the same term: server zero made itself a leader on term two, and server one did as well. It's okay to have leaders elected on different terms, but here, where we have two on the same term — that should never happen. All right, so how did this actually come about? Let's start from the top. At the beginning, server zero actually attempted an election at term one, not term two, and it got its votes from both of the other peers, but for whatever reason — perhaps because those reply messages were delayed — it didn't actually process those votes until later. And in between attempting the election and finishing the election, server one also decided to attempt an election — perhaps because server zero was delayed so much that server one ran into the election timeout and then started its own election — and it started it on term two, because it couldn't have been term one, since it had already voted for server zero on term one over here. Okay, so then server one sends out its own request votes to servers two and zero. Server two votes for server one at term two — that's fine — but server zero also votes for server one, and this is actually also fine, because server one is asking server zero for a vote on a higher term, and so what server zero should do — if you remember from the spec — is set its current term to the term in the RequestVote RPC message, term two, and also revert itself to a follower instead of a candidate. All right, so the real problem is on this line, where server zero — although it really got enough votes on term one — made itself a leader on term two. One explanation for why this is happening is that in between where we set up the election, where we attempt the election, and where we actually process the votes, some other things are happening — in this case we're actually voting for someone else in between — and so we're no longer on term one, where we thought we started the election; we're now on term two. And so we just need to double-check — because we don't hold the lock while we're performing the RPC calls, which is important for its own reasons — that what we assumed was true when we set ourselves to be the leader is still true, since some things might have changed. There are a few different ways to solve this — you could imagine not voting for others while we're in the middle of attempting an election — but in this case the simplest way, at least in this implementation, is to just double-check that we're still on the same term and we're still a candidate, that we haven't reverted to a follower. Actually, one thing I want to show you: if we print out our state over here, we do see that server zero became a follower, but it's still setting itself to a leader on this line. So yeah, we can just check for that: if we're not a candidate, or the current term doesn't match the term in which we started the election, then let's just quit. And if we do that, then server one becomes the leader, we never see server zero become leader, and the problem is solved. Any questions?
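The fix described here, dropped into the vote-counting goroutine from the sketch above, looks roughly like this (again illustrative names):

            rf.mu.Lock()
            defer rf.mu.Unlock()
            votes++
            if votes <= len(rf.peers)/2 {
                return
            }
            // re-check our assumptions: state may have changed while the lock was released
            if rf.state != Candidate || rf.currentTerm != term {
                return
            }
            rf.state = Leader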
Yeah — I think that would — actually, no, it might not be sufficient, because we might have attempted another election; it depends on your implementation, but it's possible that you could have attempted another election on a higher term afterwards. Right, so it would not be sufficient to only check the state, but I think you're right that if you only check the term, then it is sufficient. All right, any other questions? All right, so yeah, that's it for this part; she's going to show you some more examples of actually debugging some of these Raft implementations. Hi, can you all hear me? Yeah? Okay. So in my section I'm gonna walk you through how I would debug it if you have a bug in your Raft implementation. I prepared a couple of buggy Raft implementations, and I'll just try to walk you through them. So first I'm gonna go into my first buggy implementation, and if I run the test here — so for this one, it doesn't print anything, it just gets stuck, and it's gonna be here forever. Let's assume that I have no idea why this is happening. The first thing that I want to find out is where it gets stuck, and we do have a good tool for that, which is printing. In the starter code, if you go to util.go, we have a function called DPrintf; this is just a nice wrapper around log.Printf with a debug variable to enable or disable the logging messages. So I'm gonna enable that and go back to my Raft code. First of all, when there's some bug happening, I always go check whether the code actually initializes the Raft servers, so here I'll just print something. Okay, so if I run the test again, now I know that there are three servers that get initialized, so this far it's okay, but we still don't know where the bug is happening, so I'll just go deeper into the code to find where it gets stuck. Now, if you look at the code, we are calling the leader election, so I'm gonna go to that function, and just to make this faster I'll check whether it kicks off some election — that part is still fine, so we keep going. Now here we are in the election; I'll see whether we actually send the request votes to the other servers. Now we kind of have more of an idea of where it gets stuck, because it prints that it kicks off the election but not that it's sending the request votes, so I would go back further just to see where it gets stuck — whenever we call some function, I always double-check that it actually gets into the function. So now I'm going to print that this server is at the start of the election, and that works, so now we have an idea that the bug should be between here and here — we are trying to minimize the scope of the code that's causing the bug. Let's say I print something here and it doesn't get there, so I move it up; let's say here — still not there; now it's there — so the bug is probably in this function, and I just go check. So here the problem is that I'm trying to acquire a lock that I actually already hold, so it's gonna be a deadlock. That's how I would find this first bug using DPrintf, and it's nice to use DPrintf because you can just turn off the debugging prints and have a nice test output without all the debugging if you want. So that's how I would use DPrintf to try to track down a bug in your code.
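For reference, the DPrintf helper in the lab's util.go has roughly this shape — the exact variable names and signature differ between course years, so treat this as a sketch:

// util.go (sketch)
package raft

import "log"

// Debug controls whether DPrintf produces output; set it to 0 to get
// clean test output with all debugging silenced.
const Debug = 1

func DPrintf(format string, a ...interface{}) {
    if Debug > 0 {
        log.Printf(format, a...)
    }
}

So a call like DPrintf("[%d] starting election at term %d", rf.me, rf.currentTerm) — with whatever message you find useful — only prints while the debug flag is turned on.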
And for this example there's actually another trick to help you find this kind of deadlock: if you press Ctrl+backslash — you can see in the bottom left that I pressed Control and backslash — this command sends a quit signal to the Go program, and by default it will handle the quit signal, quit all the goroutines, and print out the stack traces. So now you can see up here where it gets stuck, and there are gonna be a couple of functions printed here — just try to go through all the traces. Yes — so it's actually showing that the function that's causing the problem is the convert-to-candidate function, so that's another way for you to find out where the deadlocks are. I can remove all this, and now it works. So that's the first example that I wanted to go through. The second thing that you want to do before you submit your labs is to turn the race flag on when you run the tests; the way to do that is just to add -race before -run. And here, because my implementation doesn't have any races, it's not going to tell you anything — but just be careful about this, because it's not a proof that you don't have any races; it's just that it didn't detect any races for you. I'm going to run the same command again with the race flag, but now, this time, there's actually a race going on in my implementation, so it's gonna yell at you that there are some data races going on in your code. I'm quitting that, and let's see how useful the warnings are. So I'm gonna go to my second buggy Raft implementation, and here, let's look at this race: it's telling us that there's a read going on at one line — probably this one here — and there's also a write at line 412, which is this one. So I'm going to this line, and now we kind of know that this access isn't protected by a lock, so the race flag is actually warning us and helping us find the bug in this data race that we have. So the fix is gonna be to just lock this and unlock it, and that should solve the problem. So at this point we kind of know how to do some basic debugging — does anyone have any questions? No? Okay. Yeah, so I'm going to go to the third one, where it's going to be more difficult to find the bug. I'm going to run the test, and I actually have some debugging messages in there already — and you can see that I also have a debugging message with the test action. This is something you might want to consider doing: if you go into the test file here, you can just see how the test runs, and there are some actions that the test is gonna take to make your code fail, and it's usually a good idea to print out where that action is happening among your actual debugging messages, so you can guess where the bug is happening — in which phase of the test — if that makes sense. So now — I was doing fine in the first case, I passed the first test, but I'm failing the second test, and here you can see which test action it fails around: I'm passing the test up until this point — I'm actually passing until leader two rejoins. So this can give you a nice idea of how the test is working, and just help you have a better guess as to where the bug is in your code. Now let's look at the debugging messages. At least it seems like, when leader two rejoined, it becomes a follower and we have a new leader, so that looks fine to me, and we probably need more debugging messages instead of just the state changes, so I'm going to add some more. My first guess is that when a server becomes a leader, it might not be doing what a leader should do correctly — that's where we got stuck. After a server converts to leader, I have a goroutine for the leader that's just sending heartbeats to all the servers, so I'm gonna print some stuff here saying that a heartbeat was sent.
So when it becomes a leader, it sends the first heartbeat to each server, and one still tries to send heartbeats until it sees the new leader and then one becomes a follower, so this doesn't look like the problem. Now I'm gonna check whether the other servers receive the heartbeats correctly — I'm trying to finish this — yeah, so two becomes a leader, two sends heartbeats, but no one receives a heartbeat from two. So if I go to the function that sends the AppendEntries, I'm actually holding the lock through the RPC call, which is the problem that Fabian went through in the last section, so that's the problem that I need to fix: what I should do is unlock here and then lock again here, and that should work — we pass. And then there are a couple of things that you might want to do when you test your Raft implementation. There's actually a script to run the tests in parallel, and I can show you how we can use it — this script was shared with the class, someone made a post about it — and here's how we can use it: you run the script, specify the number of times to run the test — personally I do like a thousand, but that depends on your preference — then how many tests you wanna run at the same time, and then the test itself. And if you run the script, it'll show you, say, that we have run four tests so far, all are working fine, and it's gonna keep going like that. So that's how I would go about debugging a Raft implementation, and you are all welcome to come to office hours when you need help.
--------------------------------------------------------------------------------