├── .gitignore ├── LICENSE ├── README.md ├── SUMMARY.md ├── book.json ├── cover.jpg ├── images └── firefox_spnego_setup.png ├── sections ├── acknowledgements.md ├── bibliography.md ├── checklists.md ├── delegationtokenissuer.md ├── errors.md ├── glossary.md ├── hadoop_and_kerberos.md ├── hadoop_tokens.md ├── hdfs.md ├── ipc.md ├── jaas.md ├── jdk_versions.md ├── kerberos_the_madness.md ├── keytabs.md ├── sasl.md ├── secrets.md ├── services.md ├── terrors.md ├── testing.md ├── the_limits_of_hadoop_security.md ├── tokens.md ├── ugi.md ├── web_and_rest.md ├── what_is_kerberos.md ├── yarn.md └── zookeeper.md └── src └── uml ├── auth_token.txt ├── hdfs_uml.txt ├── index.md └── kerberos_login.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Node rules: 2 | ## Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 3 | .grunt 4 | 5 | ## Dependency directory 6 | ## Commenting this out is preferred by some people, see 7 | ## https://docs.npmjs.com/misc/faq#should-i-check-my-node_modules-folder-into-git 8 | node_modules 9 | 10 | # Book build output 11 | _book 12 | 13 | # eBook build output 14 | *.epub 15 | *.mobi 16 | *.pdf 17 | .DS_Store 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Hadoop and Kerberos: The Madness beyond the Gate 3 | 4 | 5 | > The most merciful thing in the world, I think, is the inability of the human mind to correlate all its contents. 6 | > We live on a placid island of ignorance in the midst of black seas of infinity, and it was not meant that we should voyage far. 
7 | > The sciences, each straining in its own direction, have hitherto harmed us little; 8 | > but some day the piecing together of dissociated knowledge will open up such terrifying vistas of reality, 9 | > and of our frightful position therein, that we shall either go mad from the revelation 10 | > or flee from the light into the peace and safety of a new dark age. 11 | 12 | > *[The Call of Cthulhu](https://en.wikisource.org/wiki/The_Call_of_Cthulhu), HP Lovecraft, 1926.* 13 | 14 | 15 | This manuscript discusses low-level issues related to Apache™ Hadoop® and Kerberos 16 | 17 | ## Disclaimer 18 | 19 | Just as the infamous [Necronomicon](http://www.amazon.com/gp/product/0380751925) is a collection 20 | of notes scrawled in blood as a warning to others, this book is 21 | 22 | 1. Incomplete. 23 | 1. Based on experience and superstition, rather than understanding and insight. 24 | 1. Contains information that will drive the reader insane. 25 | 26 | Reading this book implies recognition of these facts and that the reader, their estate and 27 | their heirs accept all risk and liability. The author is not responsible if anything happens 28 | to their Apache Hadoop cluster, including all the data stored in HDFS disappearing into an unknown dimension, 29 | or the YARN scheduler starting to summon pre-human deities. 30 | 31 | **You have been warned** 32 | 33 | 34 | ## Implementation notes. 35 | 36 | 1. This is a work in progress book designed to built using the [gitbook tool chain](https://github.com/GitbookIO/gitbook). 37 | 38 | 1. It is hosted on [github](https://github.com/steveloughran/kerberos_and_hadoop). 39 | Pull requests are welcome. 40 | 41 | 1. All the content is Apache licensed. 42 | 43 | 1. This is not a formal support channel for Hadoop + Kerberos problems. If you have a support 44 | contract with [Cloudera](http://cloudera.com/) then issues related to Kerberos may 45 | eventually reach the author. Otherwise: try 46 | 47 | - [Cloudera Community](https://community.cloudera.com/) 48 | - The users mailing list of Apache Hadoop, the application and you are using on top of it. 49 | - [Stack Overflow](http://stackoverflow.com/search?q=hadoop+kerberos). 50 | 1. The author is very much *not* a Kerberos expert, so (a) he can be wrong and (b) asking hard questions about it will generally get a "I have no idea whatsoever" answer. 
51 | -------------------------------------------------------------------------------- /SUMMARY.md: -------------------------------------------------------------------------------- 1 | # Summary 2 | 3 | * [The Madness Beyond the Gate](sections/kerberos_the_madness.md) 4 | * [What is Kerberos?](sections/what_is_kerberos.md) 5 | * [Hadoop and Kerberos](sections/hadoop_and_kerberos.md) 6 | * [Hadoop Tokens](sections/hadoop_tokens.md) 7 | * [HDFS and Kerberos](sections/hdfs.md) 8 | * [UGI](sections/ugi.md) 9 | * [Java and JDK Versions](sections/jdk_versions.md) 10 | * [JAAS](sections/jaas.md) 11 | * [Keytabs](sections/keytabs.md) 12 | * [SASL](sections/sasl.md) 13 | * [Hadoop IPC Security](sections/ipc.md) 14 | * [Web and REST](sections/web_and_rest.md) 15 | * [YARN and YARN Applications](sections/yarn.md) 16 | * [Zookeeper](sections/zookeeper.md) 17 | * [Testing](sections/testing.md) 18 | * [Low-Level Secrets](sections/secrets.md) 19 | * [Error Messages to Fear](sections/errors.md) 20 | * [Tales of Terror](sections/terrors.md) 21 | * [The Limits of Hadoop Security](sections/the_limits_of_hadoop_security.md) 22 | * [Checklists](sections/checklists.md) 23 | * [Glossary](sections/glossary.md) 24 | * [Bibliography](sections/bibliography.md) 25 | * [Acknowledgements](sections/acknowledgements.md) 26 | -------------------------------------------------------------------------------- /book.json: -------------------------------------------------------------------------------- 1 | { 2 | "gitbook": ">=2.0.0" , 3 | "title": "Hadoop and Kerberos: The Madness beyond the Gate", 4 | "description": "A terrifying dive into the depths of Hadoop security", 5 | "variables": { 6 | "asf": "Apache Software Foundation", 7 | "author":"Steve Loughran", 8 | "title": "Hadoop and Kerberos: The Madness beyond the Gate", 9 | "hadoop-latest": "2.7.1" 10 | }, 11 | "plugins": [ 12 | "autocover", 13 | "katex", 14 | "include-codeblock"], 15 | "pluginsConfig": { 16 | "autocover": { 17 | "title": "Kerberos and Hadoop: The Madness Beyond the Gate", 18 | "author": "Steve Loughran", 19 | "font": { 20 | "size": null, 21 | "family": "Impact", 22 | "color": "#FFF" 23 | }, 24 | "size": { 25 | "w": 4000, 26 | "h": 3000 27 | }, 28 | "background": { 29 | "color": "#09F" 30 | } 31 | } 32 | }, 33 | "pdf": { 34 | "pageNumbers": true, 35 | "fontSize": 11, 36 | "paperSize": "a4", 37 | "margin": { 38 | "right": 62, 39 | "left": 62, 40 | "top": 36, 41 | "bottom": 36 42 | }, 43 | 44 | "comment-1":"//Header HTML template. Available variables: _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_.", 45 | "headerTemplate-off": "_TITLE_", 46 | 47 | "comment":"//Footer HTML template. 
Available variables: _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_.", 48 | "footerTemplate-off": "_PAGENUM_" 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /cover.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steveloughran/kerberos_and_hadoop/dfc110e4e8d7a831a75bf2747f261c3a1b99b1f6/cover.jpg -------------------------------------------------------------------------------- /images/firefox_spnego_setup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steveloughran/kerberos_and_hadoop/dfc110e4e8d7a831a75bf2747f261c3a1b99b1f6/images/firefox_spnego_setup.png -------------------------------------------------------------------------------- /sections/acknowledgements.md: -------------------------------------------------------------------------------- 1 | 14 | # Acknowledgements 15 | 16 | * Everyone who has struggled to secure Hadoop deserves to be recognised, their sacrifice acknowledged. 17 | * Everyone who has got their application to work within a secure Hadoop cluster will have suffered without any appreciation; without anyone appreciating their effort. Indeed, all that they are likely to have received is complaints about how their software is late. 18 | 19 | However, our best praise, our greatest appreciation, has to go to everyone who added logging statements in the Hadoop codepath. 20 | 21 | Some of the content in this document was copied from a 2013 presentation by Kevin Minder. 22 | 23 | ## Contributors to this Document 24 | 25 | It is through the work of these brave people that we shall prevail! 26 | 27 | * Samson Scharfrichter 28 | * [Aloklal99](https://github.com/aloklal99) 29 | * [chuckleberryfinn](https://github.com/chuckleberryfinn) 30 | * [camypaj](https://github.com/camypaj) 31 | * [Sean Busbey](https://github.com/busbey) 32 | * [Daniel Darabos](https://github.com/darabos) 33 | * [Vipin Rathor](https://github.com/VipinRathor) 34 | * [Josh Elser](https://github.com/joshelser) 35 | * [Peter MacNaughton](https://github.com/pmacn) 36 | 37 | 38 | 39 | We shall honour their memories, and mourn their messy and painful departures from the world of the sane, cherishing the github PRs they left as their glimpsed the underpinnings of the world, and so condemned themselvers forever for a world of Kerberos support calls. 40 | -------------------------------------------------------------------------------- /sections/bibliography.md: -------------------------------------------------------------------------------- 1 | 14 | # Bibliography 15 | 16 | 1. [IETF RFC 4120](https://www.ietf.org/rfc/rfc4120.txt) 17 | 1. [Java 7 Kerberos Requirements](http://docs.oracle.com/javase/7/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) 18 | 1. [Java 8 Kerberos Requirements](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/KerberosReq.html) 19 | 1. [Troubleshooting Kerberos on Java 7](http://docs.oracle.com/javase/7/docs/technotes/guides/security/jgss/tutorials/Troubleshooting.html) 20 | 1. [Troubleshooting Kerberos on Java 8](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/Troubleshooting.html) 21 | 1. [JAAS Configuration (Java 8)](http://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/LoginConfigFile.html) 22 | 1. For OS/X users, the GUI ticket viewer is `/System/Library/CoreServices/Ticket\ Viewer.app` 23 | 1. 
[Colouris01], Colouris, Dollimore & Kindberg, 2001, *Distributed System Concepts and Design*, 24 | 1. [Java 8 GSS API](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/jgss-features.html) 25 | 1. [Ubuntu Kerberos Wiki](https://help.ubuntu.com/community/Kerberos) 26 | 1. [Kerberos FAQ](http://www.cmf.nrl.navy.mil/krb/kerberos-faq.html). Dates from 2000; many of the links are worthless 27 | 1. [Kerberos With Clocks Adrift: History, Protocols, and Implementation](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.204.9216&rep=rep1&type=pdf) 28 | 29 | 30 | ## Hadoop Security 31 | 32 | 1. [Adding Security to Apache Hadoop](http://hortonworks.com/wp-content/uploads/2011/10/security-design_withCover-1.pdf) 33 | 1. [The Role of Delegation Tokens in Apache Hadoop Security](http://hortonworks.com/blog/the-role-of-delegation-tokens-in-apache-hadoop-security/) 34 | 1. [Chapter 8. Secure Apache HBase](http://hbase.apache.org/book/security.html) 35 | 1. Hadoop Operations p135+ 36 | 1. [Hadoop Security Architecture](http://www.slideshare.net/oom65/hadoop-security-architecture) 37 | 1. [HADOOP-9621] Document/analyze current Hadoop security model, [HADOOP-9621)(https://issues.apache.org/jira/browse/HADOOP-9621) 38 | 39 | 40 | 41 | ## Kerberos, Active Directory and Apache Hadoop 42 | 43 | 1. [Microsoft Technet Introduction to Kerberos](https://technet.microsoft.com/en-us/library/cc772815(v=ws.10).aspx) 44 | 1. [Kabakov14], Kabakov, 45 | [Securing Hadoop environments with Kerberos and active directory](https://developer.ibm.com/hadoop/blog/2014/09/03/securing-hadoop-environments-kerberos-active-directory/), 46 | IBM, 2014 47 | 1. [Cesir14], Cesir, 48 | [Enabling Kerberos on HDP and Integrating with Active Directory](http://hortonworks.com/blog/enabling-kerberos-hdp-active-directory-integration/), 49 | Hortonworks, 2014. 50 | 1. [Cloudera15] Cloudera, 51 | [Integrating Hadoop Security with Active Directory](http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_sg_hadoop_security_active_directory_integrate.html), 52 | 2015 53 | 1. [Troubleshooting Kerberos Encryption Types](https://ping.force.com/Support/servlet/fileField?retURL=%2FSupport%2Fapex%2FPingIdentityArticle%3Fid%3DkA3400000008RZLCA2&entityId=ka3400000008XOTAA2&field=Associated_File__Body__s); Nathan Park, Ping Identity, 2013. Grat 54 | -------------------------------------------------------------------------------- /sections/checklists.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Checklists 16 | 17 | ## All programs 18 | 19 | [ ] Sets up security before accessing any Hadoop services, including FileSystem APIs 20 | 21 | [ ] Sets up security after loading local configuration files, such as `core-site.xml`. 22 | If you try to open an `hdfs://` filesystem, an `HdfsConfiguration` instance is created, which 23 | pulls in `hdfs-default.xml` and `hdfs-site.xml`. To load in the Yarn settings, create an 24 | instance of `YarnConfiguration`. 25 | 26 | [ ] Are tested against a secure cluster from a logged in user. 27 | 28 | [ ] Are tested against a secure cluster from a not logged in user, and the program set up to 29 | use a keytab (if supported). 30 | 31 | [ ] Are tested from a logged out user with a token file containing the tokens and the environment variable 32 | `HADOOP_TOKEN_FILE_LOCATION` set to this file. This verifies Oozie can support it without needing 33 | a keytab. 
34 | 35 | [ ] Can get tokens for all services which they may optionally need. 36 | 37 | [ ] Don't crash on a secure cluster if the cluster filesystem does not issue tokens. That is: 38 | Kerberized clusters where the FS is something other than HDFS. 39 | 40 | ## Hadoop RPC Service 41 | 42 | [ ] Principal for Service defined. This is generally a configuration property. 43 | 44 | [ ] `SecurityInfo` subclass written. 45 | 46 | [ ] `META-INF/services/org.apache.hadoop.security.SecurityInfo` resource lists. 47 | 48 | [ ] the `SecurityInfo` subclass written 49 | 50 | [ ] `PolicyProvider` subclass written. 51 | 52 | [ ] RPC server handed `PolicyProvider` subclass during setup. 53 | 54 | [ ] Service verifies that caller has authorization for the action before executing it. 55 | 56 | [ ] Service records authorization failures to audit log. 57 | 58 | [ ] Service records successful action to audit log. 59 | 60 | [ ] Uses `doAs()` to perform operations as the user making the RPC call. 61 | 62 | ## YARN Client/launcher 63 | 64 | [ ] `HADOOP_USER` env variable set on AM launch context in insecure clusters, and in launched containers. 65 | 66 | [ ] In secure cluster: all delegation tokens needed (HDFS, Hive, HBase, Zookeeper) created and added to launch context. 67 | 68 | ## YARN Application 69 | 70 | [ ] Delegation tokens extracted and saved. 71 | 72 | [ ] When launching containers, the relevant subset of delegation tokens are passed to the containers. (This normally omits the RM/AM token). 73 | 74 | [ ] Container Credentials are retrieved in AM and containers. 75 | 76 | [ ] Delegation tokens revoked during (managed) teardown. 77 | 78 | ## YARN Web UIs and REST endpoints 79 | 80 | [ ] Primary Web server: `AmFilterInitializer` used to redirect requests to the RM Proxy. 81 | 82 | [ ] Other web servers: a custom authentication strategy is chosen and implemented. 83 | 84 | ## Yarn Service 85 | 86 | [ ] A strategy for token renewal is chosen and implemented 87 | 88 | ## Web Service 89 | 90 | [ ] `AuthenticationFilter` added to web filter chain 91 | 92 | [ ] Token renewal policy defined and implemented. (Look at `TimelineClientImpl` for an example of this) 93 | 94 | 95 | ## Clients 96 | 97 | ### All clients 98 | 99 | [ ] Supports keytab login and calls `UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename)` during initialization. 100 | 101 | [ ] Issues `UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab()` call during connection setup/token reset. This is harmless on an insecure or non-keytab client. 102 | 103 | [ ] Client supports Authentication Token option 104 | 105 | [ ] Client supports Delegation Token option. (not so relevant for most YARN clients) 106 | 107 | [ ] For Delegation-token authenticated connections, something runs in the background to regularly update delegation tokens. 108 | 109 | [ ] Tested against secure clusters with user logged out (kdestroy). 110 | 111 | [ ] Logs basic security operations at INFO, with detailed operations at DEBUG level. 112 | 113 | ### RESTful client 114 | 115 | [ ] Jersey: URL constructor handles SPNEGO Auth 116 | 117 | [ ] Code invoking Jersey Client reacts to 401/403 exception responses when using Authentication Token by deleting creating a new Auth Token and re-issuing request. 
(this triggers re-authentication) 118 | 119 | ### Debugging Workflow 120 | 121 | [ ] host has an IP address (`ifconfig` / `ipconfig`) 122 | 123 | [ ] host has an FQDN: `hostname -f` 124 | 125 | [ ] FQDN resolves to hostname `nslookup $hostname` 126 | 127 | [ ] hostname responds to pings `ping $hostname` 128 | 129 | [ ] reverse DNS lookup of IPAddr returns hostname 130 | 131 | [ ] clock is in sync with rest of cluster: `date` 132 | 133 | [ ] JVM has Java Crypto Extensions 134 | 135 | [ ] keytab exists 136 | 137 | [ ] keytab is readable by account running service. 138 | 139 | [ ] keytab contains principals in listing `klist -kt $keytab` 140 | 141 | [ ] keytab FQDN is in entry of form `shortname/$FQDN` 142 | -------------------------------------------------------------------------------- /sections/delegationtokenissuer.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # `interface DelegationTokenIssuer` 16 | 17 | This interface was extracted from the delegation token methods of the `FileSystem` 18 | class; it was extracted so that it can also be used to create delegation 19 | tokens from other sources, specifically the KMS Key manager. 20 | 21 | 22 | 23 | ### `String getCanonicalServiceName()` 24 | 25 | The method `getCanonicalServiceName()` is used to return a string for the filesystem 26 | by which a Delegation Token issued by the filesystem may be indexed within 27 | a map of `ServiceName` to `Token`. 28 | 29 | If non-null, it must be unique to the extent that: 30 | 31 | _all filesystem instances or 32 | other token issuers which declare their canonical service name to be identical 33 | MUST be able to use the same marshalled delegation token to authenticate._ 34 | 35 | Generally the service name is the URI of the filesystem or other service. 36 | 37 | For any filesystem or service where the port is used to distinguish the service identity, 38 | the port MUST be included in the canonical service name. e.g. `hdfs://namenode:8088/`. 39 | 40 | For object stores accessed over HTTP or HTTPS, the port is generally excluded: 41 | 42 | `s3a://bucket1/` 43 | 44 | 45 | 46 | *Note* `AbstractFileSystem.getUriDefaultPort()` is used by 47 | `AbstractFileSystem.checkPath()` to verify that a path passed in to a `FileContext` 48 | API call applies to the filesystem being invoked. 49 | 50 | If the canonical URI of a filesystem includes a port, the same value should 51 | be returned by `getDefaultPort()`. 52 | 53 | 54 | The implementation of the method in `FileSystem` is: 55 | 56 | ```java 57 | public String getCanonicalServiceName() { 58 | return (getChildFileSystems() == null) 59 | ? SecurityUtil.buildDTServiceName(getUri(), getDefaultPort()) 60 | : null; 61 | } 62 | ``` 63 | That is: if there are no nested child filesystems, then the Canonical Service 64 | URI is derived from `SecurityUtil.buildDTServiceName(getUri(), getDefaultPort())`. 65 | 66 | 67 | 68 | If a filesystem returns `null` from `getCanonicalServiceName()`, it is 69 | declaring that it does not issue delegation tokens; that is, `getDelegationToken()` 70 | will also return `null`. 71 | 72 | ### `Token getDelegationToken(renewer)` 73 | 74 | Request a delegation token from the filesystem/delegation token issuer. 75 | 76 | 77 | ### `Token[] addDelegationTokens` 78 | 79 | 80 | ### `DelegationTokenIssuer[] getAdditionalTokenIssuers()` 81 | 82 | 83 | Return a possibly empty array of other token issuers, or `null`.
84 | 85 | The token collector is expected to recursively collect all tokens served 86 | up by issuers listed. This is to support filesystems which contain one 87 | or more nested filesystem through mount points, and services such as key 88 | management associated with the store. 89 | 90 | The return value *MUST NOT* contain a reference to the current issuer: callers 91 | do not check for this when recursing through the tree of token issuers. 92 | -------------------------------------------------------------------------------- /sections/glossary.md: -------------------------------------------------------------------------------- 1 | 14 | # Glossary 15 | 16 | 17 | * KPW - Kerberos Password 18 | Used to encrypt and validate session keys 19 | 20 | * TGT - Kerberos Ticket Granting Ticket 21 | A special KST granting user access to TGS 22 | Stored in user's keytab via kinit or windows login 23 | 24 | * KST - Kerberos Service Ticket 25 | 26 | * KST[P,S] - A KST granting principal P access to service S 27 | * DT - Delegation Token 28 | * DT[P,R] - Allows holder to impersonate principal P and renew DT with service R 29 | * JT - Job Token 30 | * A secure random value authenticating communication between JT and TT about a given task 31 | * BT - Block Access Token 32 | * BT[P,B] - A BT granting principal P access to block B 33 | -------------------------------------------------------------------------------- /sections/hadoop_and_kerberos.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Hadoop's support for Kerberos 16 | 17 | Hadoop can use Kerberos to authenticate users, and processes running within a 18 | Hadoop cluster acting on behalf of the user. It is also used to authenticate services running 19 | within the Hadoop cluster itself -so that only authenticated HDFS Datanodes can join the HDFS 20 | filesystem, that only trusted Node Managers can heartbeat to the YARN Resource Manager and 21 | receive work. 22 | 23 | * The exact means by which all this is done is one of the most complicated pieces of code to span the 24 | entire Hadoop codebase.* 25 | 26 | Users of Hadoop do not need to worry about the implementation details, and, ideally, nor should 27 | the operations team. 28 | 29 | Developers of core Hadoop code, anyone writing a YARN application, and anyone writing code 30 | to interact with a Hadoop cluster and applications running in it *do need to know those details*. 31 | 32 | This is what this book attempts to cover. 33 | 34 | ## Why do they inflict so much pain on us? 35 | 36 | Before going in there, here's a recurring question: why? Why Kerberos and not, say some 37 | SSL-certificate like system? Or OAuth? 38 | 39 | Kerberos was written to support centrally managed accounts in a local area network, one in 40 | which adminstrators manage individual accounts. This is actually much simpler to manage than 41 | PKI-certificate based systems: look at the effort it takes to revoke a certificate in a browser. 42 | -------------------------------------------------------------------------------- /sections/hadoop_tokens.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Introducing Hadoop Tokens 16 | 17 | So far we've covered Kerberos and *Kerberos Tickets*. Hadoop complicates 18 | things by adding another form of delegated authentication, *Hadoop Tokens*. 19 | 20 | 21 | ### Why does Hadoop have another layer on top of Kerberos? 
22 | 23 | That's a good question, one developers ask on a regular basis —at least once 24 | every hour based on our limited experiments. 25 | 26 | Hadoop clusters are some of the largest "single" distributed systems on the planet 27 | in terms of numbers of services: a YARN cluster of 10,000 nodes would have 28 | 10,000 hdfs principals, 10,000 yarn principals and the principals of the users 29 | running the applications. That's a lot of principals, all talking to each other, 30 | all having to talk to the KDC, having to re-authenticate all the time, and making 31 | calls to the KDC whenever they wish to talk to another principal in the system. 32 | 33 | Tokens are wire-serializable objects issued by Hadoop services, which grant access 34 | to services. Some services issue tokens to callers which are then used by those callers 35 | to directly interact with other services *without involving the KDC at all*. 36 | 37 | As an example, The HDFS NameNode has to give callers access to the blocks comprising a file. 38 | This isn't done in the DataNodes: all filenames and the permissions are stored in the NN. 39 | All the DNs have is their set of blocks. 40 | 41 | To get at these blocks, HDFS gives an authenticated caller a *Block Tokens* for every block 42 | they need to read in a file. The caller then requests the blocks of any of the datanodes 43 | hosting that block, including the block token in the request. 44 | 45 | These HDFS Block Tokens do not contain any specific knowledge of the principal running the 46 | Datanodes, instead they declare that the caller has stated access rights to the specific block, up until 47 | the token expires. 48 | 49 | 50 | ``` 51 | public class BlockTokenIdentifier extends TokenIdentifier { 52 | static final Text KIND_NAME = new Text("HDFS_BLOCK_TOKEN"); 53 | 54 | private long expiryDate; 55 | private int keyId; 56 | private String userId; 57 | private String blockPoolId; 58 | private long blockId; 59 | private final EnumSet modes; 60 | private byte [] cache; 61 | 62 | ... 63 | ``` 64 | 65 | Alongside the fields covering the block and permissions, that `cache` data contains the token 66 | identifier 67 | 68 | ## Kerberos Tickets vs Hadoop Tokens 69 | 70 | 71 | | Token | Function | 72 | |--------------------------|----------------------------------------------------| 73 | | Authentication Token | Directly authenticate a caller. | 74 | | Delegation Token | A token which can be passed to another process. | 75 | 76 | 77 | ### Authentication Tokens 78 | 79 | Authentication Tokens are explicitly issued by services to allow the caller to 80 | interact with the service without having to re-request tickets from the TGT. 81 | 82 | When an Authentication Tokens expires, the caller must request a new one from the service. 83 | If the Kerberos ticket to interact with the service has expired, this may include 84 | re-requesting a ticket off the TGS, or even re-logging in to Kerberos to obtain a new TGT. 85 | 86 | As such, they are almost equivalent to Kerberos Tickets -except that it is the 87 | distributed services themselves issuing the Authentication Token, not the TGS. 88 | 89 | ### Delegation Tokens 90 | 91 | A delegation token is requested by a client of a service; they can be passed to 92 | other processes. 93 | 94 | When the token expires, the original client must request a new delegation token 95 | and pass it on to the other process, again. 
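 
A minimal sketch of the client side of this exchange, using the `FileSystem` API (the `"yarn"` renewer principal and the output path are placeholders; substitute whatever matches your deployment):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;

public class TokenFetcher {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // The default filesystem of the cluster; in a kerberized cluster the
    // caller must already be logged in (kinit or keytab) for this to work.
    FileSystem fs = FileSystem.get(conf);

    // Ask the filesystem (and any services it aggregates) for delegation
    // tokens, collecting them into a Credentials object.
    Credentials credentials = new Credentials();
    fs.addDelegationTokens("yarn", credentials);

    // Marshal the tokens to a file; another process can load this file
    // (for example via HADOOP_TOKEN_FILE_LOCATION) and act as this user
    // until the tokens expire.
    credentials.writeTokenStorageFile(new Path("file:///tmp/tokens.bin"), conf);
  }
}
```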
96 | 97 | What is more important is: 98 | 99 | * delegation tokens can be renewed before they expire.* 100 | 101 | This is a fundamental difference between Kerberos Tickets and Hadoop Delegation Tokens. 102 | 103 | Holders of delegation tokens may renew them with a token-specific `TokenRenewer` service, 104 | so refresh them without needing the Kerberos credentials to log in to kerberos. 105 | 106 | More subtly 107 | 108 | 1. The tokens must be renewed before they expire: once expired, a token is worthless. 109 | 1. Token renewers can be implemented as a Hadoop RPC service, or by other means, *including HTTP*. 110 | 1. Token renewal may simply be the updating of an expiry time in the server, without pushing 111 | out new tokens to the clients. This scales well when there are many processes across 112 | the cluster associated with a single application.. 113 | 114 | For the HDFS Client protocol, the client protocol itself is the token renewer. A client may 115 | talk to the Namenode using its current token, and request a new one, so refreshing it. 116 | 117 | In contrast, the YARN timeline service is a pure REST API, which implements its token renewal over 118 | HTTP/HTTPS. To refresh the token, the client must issue an HTTP request (a PUT operation, interestingly 119 | enough), receiving a new token as a response. 120 | 121 | Other delegation token renewal mechanisms alongside Hadoop RPC and HTTP could be implemented, 122 | that is a detail which client applications do not need to care about. All the matters is that 123 | they have the code to refresh tokens, usually code which lives alongside the RPC/REST client, 124 | *and keep renewing the tokens on a regularl basis*. Generally this is done by starting 125 | a thread in the background. 126 | 127 | 128 | # Delegation Token Revocation 129 | 130 | Delegation tokens can be revoked —such as when the YARN which needed them completes. 131 | 132 | In Kerberos, the client obtains a ticket off the KDC, then hands it to the service —a service 133 | which does not need to contain any information about the tickets issued to clients. 134 | 135 | With delegation tokens, the specific services supporting them have to implement their 136 | own internal tracking of issued tokens. That comes with benefits as well as a cost. 137 | 138 | The cost? The services now have to maintain some form of state, either locally or, in HA deployments, 139 | in some form of storage shared across the failover services. 140 | 141 | The benefit: there's no need to involve the KDC in authenticating requests, yet short-lived access 142 | can be granted to applications running in the cluster. This explicitly avoid the problem of having 143 | 1000+ containers in a YARN application each trying to talk to the KDC. (Issue: surely tickets 144 | offer that feature?). 145 | 146 | ## Example 147 | 148 | Imagine a user deploying a YARN application in a cluster, one which needs 149 | access to the user's data stored in HDFS. The user would be required to be authenticated with 150 | the KDC, and have been granted a *Ticket Granting Ticket*; the ticket needed to work with 151 | the TGS. 152 | 153 | The client-side launcher of the YARN application would be able to talk to HDFS and the YARN 154 | resource manager, because the user was logged in to Kerberos. This would be managed in the Hadoop 155 | RPC layer, requesting tickets to talk to the HDFS NameNode and YARN ResourceManager, if needed. 
156 | 157 | To give the YARN application the same rights to HDFS, the client-side application must 158 | request a Delegation Token to talk to HDFS, a key which is then passed to the YARN application in 159 | the `ContainerLaunchContext` within the `ApplicationSubmissionContext` used to define the 160 | application to launch: its required container resources, artifacts to download, "localize", 161 | environment to set up and command to run. 162 | 163 | The YARN resource manager finds a location for the Application Master, and requests that 164 | hosts' Node Manager start the container/application. 165 | 166 | The Node Manager uses the "delegated HDFS token" to download the launch-time resources into 167 | a local directory space, then executes the application. 168 | 169 | *Somehow*, the HDFS token (and any other supplied tokens) are passed to the application that 170 | has been launched. 171 | 172 | The launched application master can use this token to interact with HDFS *as the original user*. 173 | 174 | The AM can also pass token(s) on to launched containers, so that they too have access to HDFS. 175 | 176 | 177 | The Hadoop NameNode does not need to care whether the caller is the user themselves, the Node Manager 178 | localizing the container, the launched application or any launched containers. All it does is verify 179 | that when a caller requests access to the HDFS filesystem metadata or the contents of a file, 180 | it must have a ticket/token which declares that they are the specific user, and that the token 181 | is currently considered valid (based on the expiry time and the clock value of the Name Node) 182 | 183 | 184 | 185 | ## What does this mean for my application? 186 | 187 | If you are writing an application, what does this mean? 188 | 189 | You need to worry about tokens in servers if: 190 | 191 | 1. You want an application deploying work into a YARN cluster to access 192 | your service *as the user submitting the job*. 193 | 1. You want to support secure connections without requiring Kerberos 194 | authentication at the rate of the maximum life of a kerberos ticket. 195 | 1. You want to allow applications to delegate authority, such 196 | as to YARN applications, or other services. (Example, filesystem delegation tokens 197 | provided to a Hive thrift server could be used to access the filesystem 198 | as that user). 199 | 1. You want a consistent client/server authentication and identification 200 | mechanism across secure and insecure clusters. This is exactly what YARN does: 201 | a token is issued by the YARN Resource Manager to an application instance's 202 | Application Manager at launch time; this is used in all communications from 203 | the AM to the RM. Using tokens *always* means there is no separate codepath 204 | between insecure and secure clusters. 205 | 206 | You need to worry about tokens in client applications if you wish 207 | to interact with Hadoop services. If the client is required to run 208 | on a kerberos-authenticated account (e.g. kinit or keytab), then 209 | your main concern is simply making sure the principal is logged in. 210 | 211 | If your application wishes to run code in the cluster using the YARN scheduler, you need to 212 | directly worry about Hadoop tokens. You will need to request delegation tokens 213 | from every service with which your application will interact, include them in the YARN 214 | launch information —and propagate them from your Application Master to all 215 | containers the application launches. 
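 
As a sketch of the client-side half of that work, loosely modelled on the YARN distributed shell client (the local resources, environment, commands and renewer principal are assumed to have been built up elsewhere):

```java
import java.nio.ByteBuffer;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;

// Collect delegation tokens for every service the application will use;
// here, just the cluster filesystem.
Credentials credentials = new Credentials();
FileSystem fs = FileSystem.get(conf);
fs.addDelegationTokens(renewerPrincipal, credentials);

// Marshal the credentials into the ByteBuffer form the launch context expects.
DataOutputBuffer dob = new DataOutputBuffer();
credentials.writeTokenStorageToStream(dob);
ByteBuffer tokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());

// Attach the tokens to the AM's container launch context; the AM is then
// responsible for passing the relevant subset on to its own containers.
ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(
    localResources, environment, commands, null, tokens, null);
```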
216 | 217 | ## Design 218 | 219 | (from Owen's design document) 220 | 221 | Namenode Token 222 | 223 | TokenID = {ownerID, renewerID, issueDate, maxDate, sequenceNumber} 224 | TokenAuthenticator = HMAC-SHA1(masterKey, TokenID) 225 | Delegation Token = {TokenID, TokenAuthenticator} 226 | 227 | The token ID is used in messages from the client to identify the client; service can 228 | rebuild the `TokenAuthenticator` from it; this is the secret used for DIGEST-MD5 signing 229 | of requests. 230 | 231 | 232 | Token renewal: caller asks service provider for a token to be renewed. The server updates 233 | the expiry date in its local table to `min(maxDate, now()+renew_period)`. A non-HA NN 234 | can use these renewal requests to actually rebuild its token table —provided the master 235 | key has been persisted. 236 | 237 | ## Implementation Details 238 | 239 | What is inside a Hadoop Token? Whateve marshallable data the service 240 | wishes to supply. 241 | 242 | A token is treated as a byte array to be passed 243 | in communications, such as when setting up an IPC 244 | connection, or as a data to include on an HTTP header 245 | while negotiating with a remote REST endpoint. 246 | 247 | 248 | 249 | The code on the server which issues tokens, 250 | the `SecretManager` is free to fill its byte arrays with 251 | structures of its choice. Sometimes serialized java objects 252 | are used; more recent code, such as that in YARN, serializes 253 | data as a protobuf structure and provides that in the byte array 254 | (example, `NMTokenIdentifier`). 255 | 256 | ### `Token` 257 | 258 | The abstract class `org.apache.hadoop.yarn.api.records.Token` is 259 | used to represent a token in Java code; it contains 260 | 261 | | field | type | role | 262 | |-------|------|------| 263 | | identifier | `ByteBuffer` | the service-specific data within a token | 264 | | password | `ByteBuffer` | a password 265 | | tokenKind | `String` | token kind for looking up tokens. 266 | 267 | 268 | ### `TokenIdentifier implements Writable` 269 | 270 | Implementations of the the abstract class `org.apache.hadoop.security.token.TokenIdentifier` 271 | must contain everything needed for a caller to be validated by a 272 | service which uses tokens for authentication. 273 | 274 | The base class has: 275 | 276 | | field | type | role | 277 | |-------|------|------| 278 | | `kind` | `Text` | Unique type of token identifier| 279 | 280 | The `kind` field must be unique across all tokens, as it is used in bonding 281 | to tokens. 282 | 283 | 284 | ### `DelegationTokenIdentifier` 285 | 286 | The abstract class `org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier` is 287 | the base of all Delegation Tokens in HDFS and elsewhere. 288 | 289 | It extends `TokenIdentifier` with: 290 | 291 | 292 | | field | type | role | 293 | |-------|------|------| 294 | | `owner` | `Text` | owner of the token | 295 | | `renewer` | `Text` | | 296 | | `realUser` | `Text` | | 297 | 298 | 299 | It is straightforward to extend this with more data, which can be 300 | used to add information into a token which can then be read by the 301 | applications which have loaded the tokens. This has been used 302 | in [HADOOP-14556](https://issues.apache.org/jira/browse/HADOOP-14556) 303 | to pass Amazon AWS login secrets as if they were a filesystem token. 304 | 305 | 306 | ### `SecretManager` 307 | 308 | Every server which handles tokens through Hadoop RPC should implement an 309 | a `org.apache.hadoop.security.token.SecretManager` subclass. 
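 
The outline of such a subclass looks roughly like the following sketch. Here `MyTokenIdentifier` is a hypothetical `TokenIdentifier` subclass of the kind described above, and a real service would roll and persist its master keys rather than hold a single one in memory:

```java
import javax.crypto.SecretKey;
import org.apache.hadoop.security.token.SecretManager;

// Sketch only: MyTokenIdentifier is a hypothetical TokenIdentifier subclass.
public class MySecretManager extends SecretManager<MyTokenIdentifier> {

  // A single in-memory master key; real services roll keys over time and
  // may persist them (as the HDFS NameNode does).
  private final SecretKey masterKey = generateSecret();

  @Override
  protected byte[] createPassword(MyTokenIdentifier identifier) {
    // The "password" is an HMAC of the serialized identifier and the key.
    return createPassword(identifier.getBytes(), masterKey);
  }

  @Override
  public byte[] retrievePassword(MyTokenIdentifier identifier)
      throws InvalidToken {
    // Recompute the password to validate a caller; a real implementation
    // would also check expiry and whether the token has been cancelled.
    return createPassword(identifier.getBytes(), masterKey);
  }

  @Override
  public MyTokenIdentifier createIdentifier() {
    return new MyTokenIdentifier();
  }
}
```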
310 | 311 | 312 | In `org.apache.hadoop.ipc.Server` and `org.apache.hadoop.security.SaslRpcServer`, 313 | a specific implementation of the class `SecretManager` 314 | creates instances of the TokenIdentifier 315 | 316 | 317 | ### `DelegationKey` 318 | 319 | This contains a "secret" (generated by the `javax.crypto` libraries), adding serialization 320 | and equality checks. Because of this the keys can be persisted (as HDFS does) or sent 321 | over a secure channel. Uses crop up in YARN's `ZKRMStateStore`, the MapReduce History server 322 | and the YARN Application Timeline Service. 323 | 324 | 325 | 326 | ## Working with tokens 327 | 328 | ### How tokens are issued 329 | 330 | 331 | A first step is determining the Kerberos Principal for a service: 332 | 333 | 1. The Service name is derived from the URI (see `SecurityUtil.buildDTServiceName`)...different 334 | services on the same host have different service names 335 | 1. Every service has a protocol (usually defined by the RPC protocol API) 336 | 1. To find a token for a service, client enumerates all `SecurityInfo` instances; these 337 | return info about the provider. One class `AnnotatedSecurityInfo`, examines the annotations 338 | on the class to determine these values, including looking in the Hadoop configuration 339 | to determine the kerberos principal declared for that service (see [IPC](ipc.html) for specifics). 340 | 341 | 342 | 343 | ### How tokens are renewed 344 | 345 | Token renewal is the second part of token work. If you are implementing token 346 | support, it is easiest to postpone this until the core issuing is working. 347 | 348 | Implement a subclass of `org.apache.hadoop.security.token.TokenRenewer`, 349 | 350 | ```java 351 | public abstract class TokenRenewer { 352 | 353 | /** 354 | * Does this renewer handle this kind of token? 355 | * @param kind the kind of the token 356 | * @return true if this renewer can renew it 357 | */ 358 | public abstract boolean handleKind(Text kind); 359 | 360 | /** 361 | * Is the given token managed? Only managed tokens may be renewed or 362 | * cancelled. 363 | * @param token the token being checked 364 | * @return true if the token may be renewed or cancelled 365 | * @throws IOException 366 | */ 367 | public abstract boolean isManaged(Token token) throws IOException; 368 | 369 | /** 370 | * Renew the given token. 371 | * @return the new expiration time 372 | * @throws IOException 373 | * @throws InterruptedException 374 | */ 375 | public abstract long renew(Token token, 376 | Configuration conf 377 | ) throws IOException, InterruptedException; 378 | 379 | /** 380 | * Cancel the given token 381 | * @throws IOException 382 | * @throws InterruptedException 383 | */ 384 | public abstract void cancel(Token token, 385 | Configuration conf 386 | ) throws IOException, InterruptedException; 387 | } 388 | ``` 389 | 390 | 1. In `handleKind()`, verify the token kind is that which the renewer supports. 391 | 1. In `isManaged()`, return true, unless the ability to renew a token is made on 392 | a token-by-token basis. 393 | 1. In `reneww()`, renew the credential by notifying the bonded service that 394 | the token is to be renewed. 395 | 1. In `cancel()`, notify the bonded service that the token should be cancelled. 396 | 397 | Finally, declare the class in the resource 398 | `META-INF/services/org.apache.hadoop.security.token.TokenRenewer`. 399 | 400 | 401 | ### How delegation tokens are shared 402 | 403 | DTs can be serialized; that is done when issued/renewed. 
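 
A receiving process can unmarshal them again; here is a sketch of that side, assuming a token file such as the one written in the earlier example (the path is a placeholder, and inside a YARN container this load is normally done for you from `HADOOP_TOKEN_FILE_LOCATION`):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;

public class TokenLoader {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Read the marshalled tokens back in.
    Credentials creds = Credentials.readTokenStorageFile(
        new Path("file:///tmp/tokens.bin"), conf);
    // Attach them to the current user so that subsequent IPC/REST clients
    // can select them when authenticating.
    UserGroupInformation.getCurrentUser().addCredentials(creds);
  }
}
```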
404 | 405 | When making requests over Hadoop RPC, you don't need to include the DT, simply 406 | include the Hash to indicate that you have it (Revisit: what does this mean?) 407 | 408 | 409 | Every token which is deserialized at the far end must 410 | have a service declaration in 411 | 412 | `META-INF/services/org.apache.hadoop.security.token.TokenIdentifier` 413 | 414 | Here is the one in `hadoop-common` as of Hadoop 3.2: 415 | 416 | ``` 417 | org.apache.hadoop.crypto.key.kms.KMSDelegationToken$KMSDelegationTokenIdentifier 418 | ``` 419 | 420 | When Hadoop is looking for an implementation of a token, it 421 | enumerates all available token identifiers registered this way through the 422 | java `ServiceLoader.load(class)` API. 423 | 424 | 425 | 426 | `Token.decodeIdentifier()` is then invoked to extract the identifier 427 | information. 428 | 429 | *Important*: All java classes directly invoked in a token 430 | implementation class must be on the classpath. 431 | 432 | If this condition is not met, the identifier cannot be loaded. 433 | 434 | 435 | ### Delegation Tokens 436 | 437 | ### Token Propagation in YARN Applications 438 | 439 | YARN applications depend on delegation tokens to gain access to cluster 440 | resources and data on behalf of the principal. It is the task of 441 | the client-side launcher code to collect the tokens needed, and pass them 442 | to the launch context used to launch the Application Master. 443 | 444 | 445 | ### Proxy Users 446 | 447 | Proxy users are a feature which was included in the Hadoop security model for services 448 | such as Oozie; a service which needs to be able to execute work on behalf of a user 449 | 450 | Because the time at which Oozie would execute future work cannot be determined, delegation 451 | tokens cannot be used to authenticate requests issued by Oozie on behalf of a user. 452 | Kerberos keytabs are a possible solution here, but it would require every user submitting 453 | work to Oozie to have a keytab and to pass it to Oozie. 454 | 455 | See [Proxy user - Superusers Acting On Behalf Of Other Users](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html). 456 | 457 | Also a clarification in [HADOOP-15758](https://issues.apache.org/jira/browse/HADOOP-15758) 458 | 459 | > You cannot mimic a proxy user. A proxy user is specific construct. 460 | There is no substitute. 461 | A proxy user is a ugi that lacks its own authentication credentials, 462 | thus it explicitly encapsulates a "real" ugi that does contain kerberos credentials. 463 | The real ugi's user must be specifically configured on the target service to allow impersonation of the proxied user. 464 | 465 | > There is no correlation between a proxy user and a ticket cache. 466 | The real ugi can supply ticket cache or keytab based credentials. 467 | All that matters is the real user has credentials 468 | 469 | There's also a special bit of advice for service implementors 470 | 471 | > An impersonating service should never ever be ticket cache based. 472 | > Use a keytab. 473 | > Otherwise you may be very surprised with proxy user service morphs into a 474 | > different user if/when someone/something does a kinit as a different user. 475 | 476 | ## Weaknesses 477 | 478 | 1. In HDFS Any compromised DN can create block tokens. 479 | 1. Possession of the tokens is sufficent to impersonate a user. This means it is critical 480 | to transport tokens over the network in an encrypted form. 
Typically, this is done 481 | by SASL-encrypting the Hadoop IPC channel. 482 | 483 | 484 | ## How Delegation Tokens work in File System implementations 485 | 486 | This is for relevance of people implementing/using FileSystem APIs. 487 | 488 | 1. Any FileSystem instance may issue zero or more delegation tokens when 489 | asked. 490 | 1. These can be used to grant time-limited access to the filesystem in different processes. 491 | 1. The reason more than one can be issued is to support aggregate filesystems 492 | where multiple filesystems may eventually be invoked (e.g ViewFS), or 493 | when tokens for multiple services need to be included (e.g KMS access tokens). 494 | 1. Clients must invoke `FileSystem.addDelegationTokens()` to get an 495 | array of tokens which may then be written to persistent storage, marshalled, 496 | and loaded in later. 497 | 1. Each filesystem must have a unique Canonical Name -a URI which will 498 | refer only to the FS instance to which the token matches. 499 | 1. When working with YARN, container and application launch contexts may 500 | include a list of tokens to include. the FS Tokens should be included here 501 | along with any needed to talk to the YARN RM, and, for Hadoop 3+, 502 | docker tokens. See `org.apache.hadoop.yarn.applications.distributedshell.Client` 503 | for an example of this. 504 | 505 | 506 | In the remote application, these tokens can be unmarshalled 507 | from a the byte array, the tokens for specific services retrieved, 508 | and then used for authentication. 509 | 510 | 511 | ## Technique: how to propagate secrets in TokenIdentifiers 512 | 513 | A TokenIdentifier is a `Writable` object; it can contain arbitrary 514 | data. 515 | For this reason it can be used to propagate secrets from a client 516 | to deployed YARN applications and containers, even when there is no 517 | actual service which reads the token identifiers. 518 | All that is required is that the far end implements the token identifier 519 | and declares itself as such. 520 | -------------------------------------------------------------------------------- /sections/hdfs.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # HDFS 16 | 17 | > It seemed to be a sort of monster, or symbol representing a monster, of a form which only a diseased fancy could conceive. If I say that my somewhat extravagant imagination yielded simultaneous pictures of an octopus, a dragon, and a human caricature, I shall not be unfaithful to the spirit of the thing. A pulpy, tentacled head surmounted a grotesque and scaly body with rudimentary wings; but it was the general outline of the whole which made it most shockingly frightful. 18 | > *[The Call of Cthulhu](https://en.wikisource.org/wiki/The_Call_of_Cthulhu), HP Lovecraft, 1926.* 19 | 20 | HDFS uses Kerberos to 21 | 22 | 1. Authenticate caller's access to the Namenode and filesystem metadata (directory and file manipulation). 23 | 1. Authenticate Datanodes attempting to join the HDFS cluster. This prevents malicious code 24 | from claiming to be part of HDFS and having blocks passed to it. 25 | 1. Authenticate the Namenode with the Datanodes (prevents malicious code claiming to be 26 | the Namenode and granting access to data or just deleting it) 27 | 1. Grant callers read and write access to data within HDFS. 28 | 29 | Kerberos is used to set up the initial trust between a client and the NN, by way of 30 | Hadoop tokens. 
A client with an Authentication Token can request a Delegation Token, 31 | which it can then pass to other services or YARN applications, so giving them time-bound 32 | access to HDFS with the rights of that user. 33 | 34 | The namenode also issues "Block Tokens" which are needed to access HDFS data stored on the 35 | Datanodes: the DNs validate these tokens, rather than requiring clients to authenticate 36 | with the DNs in any way. This avoids any authentication overhead on block requests, 37 | and the need to somehow acquire/share delegation tokens to access specific DNs. 38 | 39 | HDFS Block Tokens do not (as of August 2015) contain information about the identity of the caller or 40 | the process which is running. This is somewhat of an inconvenience, as it prevents 41 | the HDFS team from implementing user-specific priority/throttling of HDFS data access 42 | —something which would allow YARN containers to manage the IOPs and bandwith of containers, 43 | and allow multi-tenant Hadoop clusters to prioritise high-SLA applications over lower-priority 44 | code. 45 | 46 | ## HDFS NameNode 47 | 48 | 49 | 1. NN reads in a keytab and initializes itself from there (i.e. no need to `kinit`; ticket 50 | renewal handed by `UGI`). 51 | 1. Generates a *Secret* 52 | 53 | Delegation tokens in the NN are persisted to the edit log, the operations `OP_GET_DELEGATION_TOKEN` 54 | `OP_RENEW_DELEGATION_TOKEN` and `OP_CANCEL_DELEGATION_TOKEN` covering the actions. This ensures 55 | that on failover, the tokens are still valid 56 | 57 | 58 | ### Block Keys 59 | 60 | A `BlockKey` is the secret used to show that the caller has been granted access to a block 61 | in a DN. 62 | 63 | The NN issues the block key to a client, which then asks a DN for that block, supplying 64 | the key as proof of authorization. 65 | 66 | Block Keys are managed in the `BlockTokenSecretManager`, one in the NN 67 | and another in every DN to track the block keys to which it has access. 68 | It is the DNs which issue block keys as blocks are created; when they heartbeat to the NN 69 | they include the keys. 70 | 71 | ### Block Tokens 72 | 73 | A `BlockToken` is the token issued for access to a block; it includes 74 | 75 | (userId, (BlockPoolId, BlockId), keyId, expiryDate, access-modes) 76 | 77 | The block key itself isn't included, just the key to the referenced block. The access modes declare 78 | what access rights the caller has to the data 79 | 80 | public enum AccessMode { 81 | READ, WRITE, COPY, REPLACE 82 | } 83 | 84 | It is the NN which has the permissions/ACLs on each file —DNs don't have access to that data. 85 | Thus it is the BlockToken which passes this information to the DN, by way of the client. 86 | Obviously, they need to be tamper-proof. 87 | 88 | 89 | ## DataNodes 90 | 91 | DataNodes do not use Hadoop RPC —they transfer data over HTTP. This delivers better performance, 92 | though the (historical) use of Jetty introduced other problems. At scale, obscure race conditions 93 | in Jetty surfaced. Hadoop now uses Netty for its DN block protocol. 94 | 95 | ### DataNodes and SASL 96 | 97 | Pre-2.6, all that could be done to secure the DN was to bring it up on a secure (<1024) port 98 | and so demonstrate that an OS superuser started the process. 
Hadoop 2.6 supports SASL 99 | authenticated HTTP connections, which works *provided all clients are all running Hadoop 2.6+* 100 | 101 | See [Secure DataNode](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SecureMode.html#Secure_DataNode) 102 | 103 | ## HDFS Bootstrap 104 | 105 | 1. NN reads in a keytab and initializes itself from there (i.e. no need to `kinit`; ticket 106 | renewal handed by `UGI`). 107 | 1. Generates a *Secret* 108 | 109 | 110 | Delegation tokens in the NN are persisted to the edit log, the operations `OP_GET_DELEGATION_TOKEN` 111 | `OP_RENEW_DELEGATION_TOKEN` and `OP_CANCEL_DELEGATION_TOKEN` covering the actions. This ensures 112 | that on failover, the tokens are still valid 113 | 114 | 115 | 116 | ## HDFS Client interaction 117 | 118 | 1. Client asks NN for access to a path, identifying via Kerberos or delegation token. 119 | 1. NN authenticates caller, if access to path is authorized, returns Block Token to the client. 120 | 1. Client talks to 1+ DNs with the block, using the Block Token. 121 | 1. DN authenticates Block Token using shared-secret with NameNode. 122 | 1. if authenticated, DN compares permissions in Block Token with operation requested, then 123 | grants or rejects the request. 124 | 125 | The client does not have its identity checked by the DNs. That is done by the NN. This means 126 | that the client can in theory pass a Block Token on to another process for delegated access to a single 127 | block. It has another implication: DNs can't do IO throttling on a per-user basis, as they do 128 | not know the user requesting data. 129 | 130 | ### WebHDFS 131 | 132 | 1. In a secure cluster, Web HDFS requires SPNEGO 133 | 1. After authenticating with a SPNEGO-negotiated mechanism, webhdfs sends an HTTP redirect, 134 | including the BlockTocken in the redirect 135 | 136 | ### NN/Web UI 137 | 138 | 1. If web auth is enabled in a secure cluster, the DN web UI will requires SPNEGO 139 | 1. In a secure cluster, if webauth is disabled, kerberos/SPNEGO auth may still be needed 140 | to access the HDFS browser. This is a point of contention: its implicit from the delegation 141 | to WebHDFS --but a change across Hadoop versions, as before an unauthed user could still browse 142 | as "dr who". 143 | 144 | -------------------------------------------------------------------------------- /sections/ipc.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | > Man's respect for the imponderables varies according to his mental constitution and environment. Through certain modes of thought and training it can be elevated tremendously, yet there is always a limit. 17 | 18 | > *[At the Root](https://en.wikisource.org/wiki/At_the_Root), HP Lovecraft, 1918.* 19 | 20 | # Hadoop IPC Security 21 | 22 | The Hadoop IPC system handles Kerberos ticket and Hadoop token authentication automatically. 23 | 24 | 1. The identity of the principals of services are configured in the hadoop site configuration 25 | files. 26 | 1. Every IPC services uses java annotations, a metadata resource file and a custom `SecurityInfo` 27 | subclass to define the security information of the IPC, including the key in the configuration 28 | used to define the principal. 29 | 1. If a caller making a connection has a valid token (auth or delegate) it is used 30 | to authenticate with the remote principal. 31 | 1. If a caller lacks a token, the Hadoop ticket will be used to acquire an authentication 32 | token. 33 | 1. 
Applications may explicitly request delegation tokens to forward to other processes. 34 | 1. Delegation tokens are renewed in a background thread (which?). 35 | 36 | 37 | ## IPC authentication options 38 | 39 | Hadoop IPC uses [SASL](sasl.html) to authenticate, sign and potentially encrypt 40 | communications. 41 | 42 | ## Use Kerberos to authenticate sender and recipient 43 | 44 | ```xml 45 | 46 | hadoop.rpc.protection 47 | authentication 48 | 49 | ``` 50 | 51 | ## Kerberos to authenticate sender and recipient, Checksums for tamper-protection 52 | 53 | ```xml 54 | 55 | hadoop.rpc.protection 56 | integrity 57 | 58 | ``` 59 | 60 | ## Kerberos to authenticate sender and recipient, Wire Encryption 61 | 62 | ```xml 63 | 64 | hadoop.rpc.protection 65 | privacy 66 | 67 | ``` 68 | 69 | 70 | 71 | 72 | 73 | ## Adding a new IPC interface to a Hadoop Service/Application 74 | 75 | This is "fiddly". It's not impossible, it just involves effort. 76 | 77 | In its favour: it's a lot easier than SPNEGO. 78 | 79 | ### Annotating a service interface 80 | 81 | ```java 82 | @KerberosInfo(serverPrincipal = "my.kerberos.principal") 83 | public interface MyRpc extends VersionedProtocol { 84 | long versionID = 0x01; 85 | ... 86 | } 87 | ``` 88 | 89 | ### `SecurityInfo` subclass 90 | 91 | Every exported RPC service will need its own extension of the `SecurityInfo` class, to provide two things: 92 | 93 | 1. The name of the principal to use in this communication 94 | 1. The token used to authenticate ongoing communications. 95 | 96 | ### `PolicyProvider` subclass 97 | 98 | 99 | ```java 100 | public class MyRpcPolicyProvider extends PolicyProvider { 101 | 102 | public Service[] getServices() { 103 | return new Service[] { 104 | new Service("my.protocol.acl", MyRpc.class) 105 | }; 106 | } 107 | 108 | } 109 | ``` 110 | 111 | This is used to inform the RPC infrastructure of the ACL policy: who may talk to the service. It must be explicitly passed to the RPC server 112 | 113 | ```java 114 | rpcService.getServer() .refreshServiceAcl(serviceConf, new MyRpcPolicyProvider()); 115 | ``` 116 | 117 | In practise, the ACL list is usually configured with a list of groups, rather than a user. 118 | 119 | ### `SecurityInfo` class 120 | 121 | ``` 122 | public class MyRpcSecurityInfo extends SecurityInfo { ... } 123 | 124 | ``` 125 | 126 | ### `SecurityInfo` resource file 127 | 128 | The resource file `META-INF/services/org.apache.hadoop.security.SecurityInfo` lists all RPC APIs which have a matching SecurityInfo subclass in that JAR. 129 | 130 | org.example.rpc.MyRpcSecurityInfo 131 | 132 | The RPC framework will read this file and build up the security information for the APIs (server side? Client side? both?) 133 | 134 | 135 | ### Authenticating a caller 136 | 137 | How does an IPC endpoint validate the caller? If security is turned on, 138 | the client will have had to authenticate with Kerberos, ensuring that 139 | the server can determine the identity of the principal. 140 | 141 | This is something it can ask for when handling the RPC Call: 142 | 143 | ```java 144 | UserGroupInformation callerUGI; 145 | 146 | // #1: get the current user identity 147 | try { 148 | callerUGI = UserGroupInformation.getCurrentUser(); 149 | } catch (IOException ie) { 150 | LOG.info("Error getting UGI ", ie); 151 | AuditLogger.logFailure("UNKNOWN", "Error getting UGI"); 152 | throw RPCUtil.getRemoteException(ie); 153 | } 154 | ``` 155 | 156 | The `callerUGI` variable is now set to the identity of the caller. 
If the caller 157 | has delegated authority (tickets, tokens) then they still authenticate as 158 | that principal they were acting as (possibly via a `doAs()` call). 159 | 160 | 161 | ```java 162 | // #2 verify their permissions 163 | String user = callerUGI.getShortUserName(); 164 | if (!checkAccess(callerUGI, MODIFY)) { 165 | AuditLog.unauthorized(user, 166 | KILL_CONTAINER_REQUEST, 167 | "User doesn't have permissions to " + MODIFY); 168 | throw RPCUtil.getRemoteException(new AccessControlException( 169 | + user + " lacks access " 170 | + MODIFY_APP.name())); 171 | } 172 | AuditLog.authorized(user, KILL_CONTAINER_REQUEST) 173 | ``` 174 | 175 | In ths example, there's a check to see if the caller can make a request which modifies 176 | something in the service, if not the calls is rejected. 177 | 178 | Note how failures are logged to an audit log; successful operations should be logged too. 179 | The purpose of the audit log is determine the actions of a principal —both successful 180 | and unsuccessful. 181 | 182 | ### Downgrading to unauthed IPC 183 | 184 | IPC can be set up on the client to fall back to unauthenticated IPC if it can't negotiate 185 | a kerberized connection. While convenient, this opens up some security vulnerabilitie -hence 186 | the feature is generally disabled on secure clusters. It can/should be enabled when needed 187 | 188 | ``` 189 | -D ipc.client.fallback-to-simple-auth-allowed=true 190 | ``` 191 | 192 | As an example, this is the option on the command line for DistCp to copy from a secure cluster 193 | to an insecure cluster, the destination only supporting simple authentication. 194 | 195 | ``` 196 | hadoop distcp -D ipc.client.fallback-to-simple-auth-allowed=true hdfs://secure:8020/lovecraft/books hdfs://insecure:8020/lovecraft/books 197 | ``` 198 | 199 | Although you can set it in a core-site.xml, this is dangerous from a security perpective 200 | 201 | ```xml 202 | 203 | ipc.client.fallback-to-simple-auth-allowed 204 | true 205 | 206 | ``` 207 | 208 | *warning* it's tempting to turn this on during development, as it makes problems go away. As it is 209 | not recommended in production: avoid except on the CLI during attempts to debug problems. 210 | -------------------------------------------------------------------------------- /sections/jaas.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | # JAAS 17 | 18 | JAAS is a nightmare from the Enterprise Java Bean era, one which surfaces from the depths to pull the unwary under. You can see its heritage whenever you search for documentation; it's generally related to managing the context of callers to EJB operations. 19 | 20 | 21 | JAAS provides for a standard configuration file format for specifying a *login context*; how code trying to run in a specific context/role should login and authenticate. 22 | 23 | As a single `jaas.conf` file can have multiple contexts, the same file can be used to configure the server and clients of a service, each with different binding information. Different contexts can have different login/auth mechanisms, including Kerberos and LDAP, so that you can even specify different auth mechanisms for different roles. 24 | 25 | In Hadoop, the JAAS context is invariably Kerberos when it comes to talking to HDFS, YARN, etc. 26 | However, if Zookeeper enters the mix, it may be interacted with differently —and so need a different JAAS context. 27 | 28 | Fun facts about JAAS 29 | 30 | 1. 
Nobody ever mentions it, but the file takes backslashed-escapes like a Java string. 31 | 1. It needs escaped backlash directory separators on Windows, such as: `C:\\security\\krb5.conf`. 32 | Get that wrong and your code will fail with what will inevitably be an unintuitive message. 33 | 1. Each context must declare the authentication module to use. 34 | The kerberos authentication model on IBM JVMs is different from that on Oracle and OpenJDK JVMs. 35 | You need to know the target JVM for the context —or create separate contexts for the different JVMs. 36 | 1. The rules about when to use `=` within an entry, and when to complete an entry with a `;` appear to be: 37 | start with the login module, one key=value line per entry, quote strings, finish with a `;` 38 | within the same file. 39 | 40 | Hadoop's UGI class will dynamically create a JAAS context for Hadoop logins, dynamically determining the name of the kerberos module to use. For interacting purely with HDFS and YARN, you may be able to avoid needing to know about or understand JAAS. 41 | 42 | Example of a JAAS file valid for an Oracle JVM: 43 | 44 | 45 | ``` 46 | Client { 47 | com.sun.security.auth.module.Krb5LoginModule required 48 | useKeyTab=false 49 | useTicketCache=true 50 | doNotPrompt=true; 51 | }; 52 | ``` 53 | 54 | 55 | # Setting a JAAS Config file for a Java process 56 | 57 | ``` 58 | -Djava.security.auth.login.config=/path/to/server/jaas/file.conf 59 | ``` 60 | 61 | In Hadoop applications, this has to be set in whichever environment variable is picked up 62 | by the command which your are invoking. 63 | 64 | # Disabling JAAS from doing something other than what you told it to do 65 | 66 | Or, as known on the [Oracle JGSS docs](https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/single-signon.html) 67 | under the section "Exceptions to the Model". 68 | 69 | JAAS includes a rather obscure system property which can have a significant impact on how the `LoginModule` (in our case, the Krb5LoginModule used to authenticate using Kerberos v5) uses the JAAS configuration which was provided. 70 | 71 | ``` 72 | -Djavax.security.auth.useSubjectCredsOnly=true 73 | ``` 74 | 75 | When this system property is set to `true` (the default), the `LoginModule` will never try to obtain credentials when they are not present in the context of the current call (e.g. the current `doAs` scope). When this property is set to `false`, the `LoginModule` 76 | has the ability to try to obtain credentials by `any other means`. This means that the implementation of the `LoginModule` has the "latitude" to do whatever it likes. 77 | 78 | For the `Krb5LoginModule` on Oracle Java, the implementation will attempt to obtain credentials via a prompt on standard input/output. 79 | When the provided JAAS configuration otherwise instructs the `Krb5LoginModule` to never prompt or use a ticket cache, this can be a 80 | very jarring and unexpected action. 
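Both of these settings can also be pinned down in application code before any login is attempted. Below is a minimal sketch, not Hadoop's own login path: it assumes a JAAS file containing the `Client` context shown above, and the file path is purely illustrative.

```java
import javax.security.auth.login.LoginContext;

public class JaasClientLogin {
    public static void main(String[] args) throws Exception {
        // Point the JVM at the JAAS configuration file (illustrative path).
        System.setProperty("java.security.auth.login.config", "/etc/myapp/jaas.conf");
        // Keep the default: the Krb5LoginModule may only use credentials
        // already present in the current Subject; no prompting on stdin.
        System.setProperty("javax.security.auth.useSubjectCredsOnly", "true");

        // "Client" must match a context name declared in the JAAS file.
        LoginContext login = new LoginContext("Client");
        login.login();
        System.out.println("Logged in principals: " + login.getSubject().getPrincipals());
    }
}
```

Both properties need to be set before the first login attempt; changing them after the JAAS configuration has been loaded may have no effect.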
81 | -------------------------------------------------------------------------------- /sections/jdk_versions.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | > Johansen, thank God, did not know quite all, even though he saw the city and the Thing, but I shall never sleep calmly again when I think of the horrors that lurk ceaselessly behind life in time and in space, and of those unhallowed blasphemies from elder stars which dream beneath the sea, known and favoured by a nightmare cult ready and eager to loose them upon the world whenever another earthquake shall heave their monstrous stone city again to the sun and air. 17 | 18 | > HP Lovecraft [The Call of Cthulhu](http://www.hplovecraft.com/writings/texts/fiction/cc.aspx), 1926 19 | 20 | # Java and JDK Versions 21 | 22 | Kerberos support is built into the Java JRE. It comes in two parts 23 | 24 | 1. The public APIs with some guarantee of stability at the class/method level, along with assertions of functionality behind those classes and methods. 25 | 2. The internal implementation classes which are needed to do anything sophisticated. 26 | 27 | The Hadoop UGI code uses part (2), the internal `com.sun` classes, as the public APIs 28 | are too simplistic for the authentication system. 29 | 30 | This comes at a price 31 | 32 | 1. Different JRE vendors (e.g IBMs) use different implementation classes, so will not work. 33 | 2. Different Java versions can (and do) change those implementation classes. 34 | 35 | Using internal classes is one of those "don't do this your code will be unreliable" rules; 36 | it wasn't something done lightly. 37 | In its defence, (a) it was needed and (b) it turns out that the public APIs 38 | are brittle across versions and JDKs too, so we haven't lost as much as you'd think. 39 | 40 | ### Key things to know 41 | 42 | * Hadoop is built and tested against the Oracle JDKs 43 | * Open JDK has the same classes and methods, so will behave consistently; it's tested against too. 44 | * It's left to the vendors of other JVMs to test their code; the patches are taken on trust. 45 | * The Kerberos internal access usually needs fixing across Java versions. 46 | This means secure Hadoop clusters absolutely require the Java versions listed on the download requirements. 47 | * Releases within a Java version may break the internals and/or the public API's behaviour. 48 | * If you want to see the details of Hadoop's binding, look in `org.apache.hadoop.security.authentication.util.KerberosUtil` in the `hadoop-auth` module. 49 | 50 | To put it differently: 51 | 52 | ## The Hadoop security team is always scared of a new version of Java 53 | 54 | -------------------------------------------------------------------------------- /sections/kerberos_the_madness.md: -------------------------------------------------------------------------------- 1 | 2 | # Hadoop and Kerberos: The Madness Beyond the Gate 3 | 4 | 5 | Authors: 6 | 7 | S.A. Loughran 8 | 9 | 10 | ---- 11 | 12 | # Introduction 13 | 14 | When HP Lovecraft wrote his books about forbidden knowledge which would reduce the reader to insanity, of "Elder Gods" to whom all of humanity were a passing inconvenience, most people assumed that he was making up a fantasy world. 15 | In fact he was documenting Kerberos. 16 | 17 | What is remarkable is that he did this fifty years before kerberos was developed. This makes him less of an author, 18 | instead: a prophet. 
19 | 20 | What he wrote was true: there are some things humanity was not meant to know. Most people are better off living lives of naive innocence, never having to see an error message about SASL or GSS, never fear building up scripts of incantations to `kadmin.local`, incantations which you hope to keep evil and chaos away. To never stare in dismay at the code whose true name must never be spoken, but instead it's initials whispered, "UGI". For those of us who have done all this, our lives are forever ruined. From now on we will cherish any interaction with a secure Hadoop cluster —from a client application to HDFS, or application launch on a YARN cluster, and simply viewing a web page in a locked down web UI —all as a miracle against the odds, against the forces of chaos struggling to destroy order. 21 | And forever more, we shall fear those voices calling out to us in the night, the machines by our bed talking to us, saying things like "we have an urgent support call related to REST clients on a remote kerberos cluster —can you help?" 22 | 23 | 24 | | HP Lovecraft | Kerberos | 25 | |-------------------------------------------------------|----------------------------| 26 | | Evil things lurking in New England towns and villages | MIT Project Athena | 27 | | Ancient, evil deities oblivious to humanity | Kerberos Domain Controller | 28 | | Books whose reading will drive the reader insane | IETF RFC 4120 | 29 | | Entities which are never spoken of aloud | UserGroupInformation | 30 | | People driven insane by their knowledge | You | 31 | 32 | This documents contains the notes from previous people who have delved too deep into the mysteries of Apache™ Hadoop® and Kerberos, who have read the forbidden source code, maybe who have even contributed to it. If you wish to preserve your innocence, to view the world as a place of happiness: stop now. 33 | 34 | ## Disclaimer 35 | 36 | This document is a collection of notes based on the experience of the author. There are no guarantees that any of the information contained within was correct at the time of writing, let alone the time of reading. The author does not accept any responsibility for actions made on the basis of the information contained herein, be it correct or incorrect. 37 | 38 | The reader of this document is likely to leave with some basic realisation that Kerberos, while important, is an uncontrolled force of suffering and devastation. The author does not accept any responsibility for the consequences of such knowledge. 39 | 40 | What has been learned cannot be unlearned(*) 41 | 42 | (*) Except for Kerberos workarounds you wrote 18 months ago and for which you now field support calls. 43 | 44 | ---- 45 | 46 | # Foundational Concepts 47 | 48 | What is the problem that Hadoop security is trying to address? Securing Hadoop. 49 | 50 | Apache Hadoop is "an OS for data". 51 | A Hadoop cluster can rapidly become the largest stores of data in an organisation. 52 | That data can explicitly include sensitive information: financial, personal, business, and can often implicitly contain data which needs to be sensitive about the privacy of individuals (for example, log data of web accesses alone). 53 | Much of this data is protected by laws of different countries. 54 | This means that access to the data needs to be strictly controlled, and accesses made of that data potentially logged to provide an audit trail of use. 55 | 56 | You have to also consider, "why do people have Hadoop clusters?". 
57 | 58 | It's not just because they have lots of data --its because they want to make use of it. 59 | A data-driven organisation needs to trust that data, or at least be confident of its origins. 60 | Allowing entities to tamper with that data is dangerous. 61 | 62 | For the protection of data, then, read and write access to data stored directly in the HDFS filesystem needs to be protected. 63 | Applications which work with their data in HDFS also need to have their accesses restricted: Apache HBase and Apache Accumulo store their data in HDFS, Apache Hive submits SQL queries to HDFS-stored data, etc. 64 | All these accesses need to be secured; applications like HBase and Accumulo granted restricted access to their data, and themselves securing and authenticating communications with their clients. 65 | 66 | YARN allows arbitrary applications to be deployed within a Hadoop cluster. 67 | This needs to be done without granting open access to the entire cluster from those user-launched applications, while isolating different users' work. 68 | A YARN application started by user Alice should not be able to directly manipulate an application launched by user "Bob", even if they are running on the same host. 69 | This means that not only do they need to run as different users on the same host (or in some isolated virtual/container), the applications written by Alice and Bob themselves need to be secure. 70 | In particular, any web UI or IPC service they instantiate needs to have its access restricted to trusted users. here Alice and Bob 71 | 72 | ## Authentication 73 | 74 | The authentication problem: who is a caller identifying themselves as —and can you verify 75 | that they really are this person. 76 | 77 | In an unsecure cluster, all callers to HDFS, YARN and other services are trusted to be 78 | who they say they are. In a secure cluster, services need to authenticate callers. 79 | That means some information must be passed with remote IPC/REST calls to declare 80 | a caller's identity and authenticate that identity 81 | 82 | ## Authorization 83 | 84 | Does an (authenticated) user have the permissions to perform the desired request? 85 | 86 | This isn't handled by Keberos: this is Hadoop-side, and is generally done 87 | in various ways across systems. HDFS has file and directory permissions, with the 88 | user+group model now extended to ACLs. YARN allows job queues to be restricted 89 | to different users and groups, so restricting the memory & CPU limits of those 90 | users. When cluster node labels are used to differentiate parts of the cluster (e.g. servers with 91 | more RAM, GPUs or other features), then the queues can be used to restrict access 92 | to specific sets of nodes. 93 | 94 | Similarly, HBase and Accumulo have their users and permissions, while Hive uses the 95 | permissions of the source files as its primary access control mechanism. 96 | 97 | These various mechanisms are all a bit disjoint, hence the emergence of tools 98 | to work across the entire stack for a unified view, Apache Ranger being one example. 99 | 100 | 101 | ## Encryption 102 | 103 | Can data be intercepted on disk or over the wire? 104 | 105 | 106 | ### Encrytion of Persistent Data. 107 | 108 | HDFS now supports *at rest encryption*; the data is encrypted while stored on disk. 109 | 110 | Before rushing to encrypt all the data, consider that it isn't a magic solution to 111 | security: the authentication and authorisation comes first. 
Encryption adds a new problem, 112 | secure key management, as well as the inevitable performance overhead. It also complicates 113 | some aspects of HDFS use. 114 | 115 | Data stored in HDFS by applications is implicitly encrypted. However, applications like 116 | Hive have had to be reworked to ensure 117 | that when making queries across encrypted datasets, temporary data files are also stored 118 | in the same encryption zone, to stop the intermediate data being stored unencrypted. 119 | And of course, analytics code running in the servers may also intentionally or unintentionally 120 | persist the sensitive data in an unencrypted form: the local filesystem, OS swap space 121 | and even OS hibernate-time memory snapshots need to be managed. 122 | 123 | Before rushing to enable persistent data encryption, then, you need to consider: what is the 124 | goal here? 125 | 126 | What at-REST encryption does deliver is better guarantees that data stored in hard disks 127 | is not recoverable —at least on the HDFS side. However, as OS-level data can persist, 128 | (strongly) wiping HDDs prior to disposal is still going to be necessary to guarantee 129 | destruction of the data. 130 | 131 | ## Auditing and Governance 132 | 133 | Authenticated and Authorized users should not just be able to perform actions 134 | or read and write data —this should all be logged in *Audit Logs* so that 135 | if there is ever a need to see which files a user accessed, or what individual 136 | made specific requests of a service —that information is available. Audit logs 137 | should be 138 | 139 | 1. Separate log categories from normal processing logs, so log configurations 140 | can store them in separate locations, with different persistence policies. 141 | 142 | 1. Machine Parseable. This allows the audit logs themselves to be analyzed. This 143 | does not just have to be for security reasons; Spotify have disclosed that they 144 | run analysis over their HDFS audit logs to identify which files are most popular (and 145 | hence should have their replication factor increased), and which do not get 146 | used more then 7 days after their creation —and hence can be automatically deleted 147 | as part of a workflow. 148 | 149 | -------------------------------------------------------------------------------- /sections/keytabs.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Keytabs 16 | 17 | Keytabs are critical for secure Hadoop clusters, as they allow the services to be launched 18 | without prompts for passwords 19 | 20 | 21 | ## Creating a Keytab 22 | 23 | If your management tools sets up keytabs for you: use it. 24 | 25 | ```bash 26 | 27 | kadmin.local 28 | 29 | ktadd -k zk.service.keytab -norandkey zookeeper/devix@COTHAM 30 | ktadd -k zk.service.keytab -norandkey zookeeper/devix.cotham.uk@COTHAM 31 | exit 32 | ``` 33 | 34 | and of course, make it accessible 35 | 36 | ```bash 37 | chgrp hadoop zk.service.keytab 38 | chown zookeeper zk.service.keytab 39 | ``` 40 | 41 | check that the user can login 42 | 43 | ```bash 44 | # sudo -u zookeeper klist -e -kt zk.service.keytab 45 | # sudo -u zookeeper kinit -kt zk.service.keytab zookeeper/devix.cotham.uk 46 | # sudo -u zookeeper klist 47 | ``` 48 | 49 | ### Keytab Expiry 50 | 51 | Keytabs expire 52 | 53 | That is: entries in them have a limited lifespan (default: 1 year) 54 | 55 | This is actually a feature —it limits how long a lost/stolen keytab can have access to the system. 
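Since nothing in the cluster warns that those entries are approaching the end of their lifespan, a scheduled login check can catch an expiring keytab before services start failing. The sketch below is one way to do it with Hadoop's `UserGroupInformation`; it reuses the zookeeper principal from the example above, and the keytab path is simply wherever the file was distributed to.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class KeytabHealthCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("hadoop.security.authentication", "kerberos");
        UserGroupInformation.setConfiguration(conf);

        // A real login attempt: this fails once the KDC no longer accepts
        // the keys held in the keytab, for example after they have expired.
        UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(
            "zookeeper/devix.cotham.uk@COTHAM",
            "/etc/security/keytabs/zk.service.keytab");
        System.out.println("Keytab login succeeded as " + ugi.getUserName());
    }
}
```

Run something like this regularly under the service account and alert on failure, and the expiry date stops being a surprise.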
56 | 57 | At the same time, it's a major inconvenience as (a) the keytabs expire and (b) it's never 58 | immediately obvious why your cluster has stopped working. 59 | 60 | ### Keytab security 61 | 62 | Keytabs are sensitive items. They need to be treated as having all the access to the data of that principal 63 | 64 | ### Keytabs and YARN applications 65 | 66 | 67 | -------------------------------------------------------------------------------- /sections/sasl.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # SASL: Simple Authentication and Security Layer 16 | 17 | This is an [IETF-managed](http://www.iana.org/assignments/sasl-mechanisms/sasl-mechanisms.xhtml) 18 | specification for securing network channels, with [RFC4422](http://tools.ietf.org/html/rfc4422) 19 | covering the core of it. 20 | 21 | SASL is not an authentication mechanism. SASL is a mechanism for applications to set up 22 | an authenticated communications channel by way of a shared authentication mechanism. 23 | SASL covers the protocol for the applications to negotiate as to which authentication 24 | mechanism to use, then to perform whatever challenge/response exchanges are needed for 25 | that authentication to take place. Kerberos is one authentication mechanism, but SASL 26 | supports others, such as x.509 certificates. 27 | 28 | Similarly, SASL does not address wire-encryption, or anything related to authorization. 29 | SASL is purely about a client authenticating its actual or delegated identity with a server, 30 | while the client verifies that the server also has the required identity (usually one 31 | declared in a configuration file). 32 | 33 | As well as being independent of the authentication mechanism, SASL is independent of the 34 | underlying wire format/communications protocol. The SASL implementation libraries 35 | can be used by applications to secure whatever network protocol they've implemented. 36 | 37 | In Hadoop, "SASL" can be taken to mean "authentication negotiated using SASL". 38 | It doesn't define which protocol itself is authenticated —and you don't really need to care. 39 | Furthermore, if you implement your own protocol, if you add SASL-based authentication to it, 40 | you get to use Kerberos, x509, Single-Sign-On, Open Auth (when completed), etc. 41 | 42 | For Hadoop RPC, there are currently two protocols for authentication: 43 | 44 | * KERBEROS: Kerberos ticket-based authentication 45 | * DIGEST-MD5: MD5 checksum-based authentication; shows caller has a secret which the 46 | recipient also knows. 47 | 48 | Note that there is also the protocol `PLAIN`; SASL-based negotiation to not have any authentication 49 | at all. That doesn't surface in Hadoop —yet— though it does crop up in JIRAs. 50 | 51 | ## SASL-enabled services 52 | 53 | Services which use SASL include 54 | 55 | 1. Hadoop RPC 56 | 1. [Zookeeper](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Zookeeper+and+SASL) 57 | 1. HDFS 2.6+ DataNode bulk IO protocol (HTTP based) 58 | -------------------------------------------------------------------------------- /sections/secrets.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | # Low-Level Secrets 17 | 18 | 19 | 20 | > Among the agonies of these after days is that chief of torments — inarticulateness. What I learned and saw in those hours of impious exploration can never be told — for want of symbols or suggestions in any language. 
21 | 22 | > *[The Shunned House](https://en.wikipedia.org/wiki/The_Shunned_House), HP Lovecraft, 1924.* 23 | 24 | 25 | ## `krb5.conf` and system property `java.security.krb5.conf` 26 | 27 | 28 | You can do two things when setting up the JVM binding to the `krb5.conf` kerberos 29 | binding file. 30 | 31 | 32 | *1. Change the realm with System Property `java.security.krb5.realm`* 33 | 34 | This system property sets the realm for the kerberos binding. This allows you to use a different one from the default in the krb5.conf file. 35 | 36 | 37 | Examples 38 | 39 | -Djava.security.krb5.realm=PRODUCTION 40 | 41 | System.setProperty("java.security.krb5.realm", "DEVELOPMENT"); 42 | 43 | The JVM property MUST be set before UGI is initialized. 44 | 45 | 46 | *2. Switch to an alternate `krb5.conf` file.* 47 | 48 | The JVM kerberos operations are configured via the `krb5.conf` file specified in the JVM option 49 | `java.security.krb5.conf` which can be done on the JVM command line, or inside the JVM 50 | 51 | ```java 52 | System.setProperty("java.security.krb5.conf", krbfilepath); 53 | ``` 54 | 55 | The JVM property MUST be set before UGI is initialized. 56 | 57 | Notes 58 | 59 | * use double backslash to escape paths on Windows platforms, e.g. `C:\\keys\\key1`, or `\\\\server4\\shared\\tokens` 60 | * Different JVMs (e.g. IBM JVM) want different fields in their `krb5.conf` file. How can you tell? Kerberos will fail with a message 61 | 62 | 63 | ## JVM Kerberos Library logging 64 | 65 | You can turn Kerberos low-level logging on 66 | 67 | ``` 68 | -Dsun.security.krb5.debug=true 69 | ``` 70 | 71 | This doesn't come out via Log4J, or `java.util logging;` it just comes out on the console. Which is somewhat inconvenient —but bear in mind they are logging at a very low level part of the system. And it does at least log. 72 | If you find yourself down at this level you are in trouble. Bear that in mind. 73 | 74 | 75 | ## JVM SPNEGO Logging 76 | 77 | If you want to debug what is happening in SPNEGO, another system property lets you enable this: 78 | 79 | ``` 80 | -Dsun.security.spnego.debug=true 81 | ``` 82 | 83 | You can ask for both of these in the `HADOOP_OPTS` environment variable 84 | 85 | ``` 86 | export HADOOP_OPTS=-Dsun.security.krb5.debug=true -Dsun.security.spnego.debug=true 87 | ``` 88 | 89 | 90 | ## Hadoop-side JAAS debugging 91 | 92 | Set the env variable `HADOOP_JAAS_DEBUG` to true and UGI will set the "debug" flag on any JAAS 93 | files it creates. 94 | 95 | You can do this on the client, before issuing a `hadoop`, `hdfs` or `yarn` command, 96 | and set it in the environment script of a YARN service to turn it on there. 
97 | 98 | ``` 99 | export HADOOP_JAAS_DEBUG=true 100 | ``` 101 | 102 | On the next Hadoop command, you'll see a trace like 103 | 104 | [UnixLoginModule]: succeeded importing info: 105 | uid = 503 106 | gid = 20 107 | supp gid = 20 108 | supp gid = 501 109 | supp gid = 12 110 | supp gid = 61 111 | supp gid = 79 112 | supp gid = 80 113 | supp gid = 81 114 | supp gid = 98 115 | supp gid = 399 116 | supp gid = 33 117 | supp gid = 100 118 | supp gid = 204 119 | supp gid = 395 120 | supp gid = 398 121 | Debug is true storeKey false useTicketCache true useKeyTab false doNotPrompt true ticketCache is null isInitiator true KeyTab is null refreshKrb5Config is false principal is null tryFirstPass is false useFirstPass is false storePass is false clearPass is false 122 | Acquire TGT from Cache 123 | Principal is stevel@COTHAM 124 | [UnixLoginModule]: added UnixPrincipal, 125 | UnixNumericUserPrincipal, 126 | UnixNumericGroupPrincipal(s), 127 | to Subject 128 | Commit Succeeded 129 | 130 | [UnixLoginModule]: logged out Subject 131 | [Krb5LoginModule]: Entering logout 132 | [Krb5LoginModule]: logged out Subject 133 | [UnixLoginModule]: succeeded importing info: 134 | uid = 503 135 | gid = 20 136 | supp gid = 20 137 | supp gid = 501 138 | supp gid = 12 139 | supp gid = 61 140 | supp gid = 79 141 | supp gid = 80 142 | supp gid = 81 143 | supp gid = 98 144 | supp gid = 399 145 | supp gid = 33 146 | supp gid = 100 147 | supp gid = 204 148 | supp gid = 395 149 | supp gid = 398 150 | Debug is true storeKey false useTicketCache true useKeyTab false doNotPrompt true ticketCache is null isInitiator true KeyTab is null refreshKrb5Config is false principal is null tryFirstPass is false useFirstPass is false storePass is false clearPass is false 151 | Acquire TGT from Cache 152 | Principal is stevel@COTHAM 153 | [UnixLoginModule]: added UnixPrincipal, 154 | UnixNumericUserPrincipal, 155 | UnixNumericGroupPrincipal(s), 156 | to Subject 157 | Commit Succeeded 158 | 159 | 160 | ## OS-level Kerberos Debugging 161 | 162 | Starting MIT Kerberos v1.9, Kerberos libraries introduced a debug option which is a boon to any person breaking his/her head over a nasty Kerberos issue. It is also a good way to understand how does Kerberos library work under the hood. User can set an environment variable called `KRB5_TRACE` to a filename or to `/dev/stdout` and Kerberos programs (like kinit, klist and kvno etc.) as well as Kerberos libraries (libkrb5* ) will start printing more interesting details. 163 | 164 | This is a very powerfull feature and can be used to debug any program which uses Kerberos libraries (e.g. CURL). It can also be used in conjunction with other debug options like `HADOOP_JAAS_DEBUG` and `sun.security.krb5.debug`. 
165 | 166 | ``` 167 | export KRB5_TRACE=/tmp/kinit.log 168 | ``` 169 | 170 | After setting this up in the terminal, the kinit command will produce something similar to this: 171 | 172 | ``` 173 | # kinit admin/admin 174 | Password for admin/admin@MYKDC.COM: 175 | 176 | # cat /tmp/kinit.log 177 | [5709] 1488484765.450285: Getting initial credentials for admin/admin@MYKDC.COM 178 | [5709] 1488484765.450556: Sending request (200 bytes) to MYKDC.COM 179 | [5709] 1488484765.450613: Resolving hostname sandbox.hortonworks.com 180 | [5709] 1488484765.450954: Initiating TCP connection to stream 172.17.0.2:88 181 | [5709] 1488484765.451060: Sending TCP request to stream 172.17.0.2:88 182 | [5709] 1488484765.461681: Received answer from stream 172.17.0.2:88 183 | [5709] 1488484765.461724: Response was not from master KDC 184 | [5709] 1488484765.461752: Processing preauth types: 19 185 | [5709] 1488484765.461764: Selected etype info: etype aes256-cts, salt "(null)", params "" 186 | [5709] 1488484765.461767: Produced preauth for next request: (empty) 187 | [5709] 1488484765.461771: Salt derived from principal: MYKDC.COMadminadmin 188 | [5709] 1488484765.461773: Getting AS key, salt "MYKDC.COMadminadmin", params "" 189 | [5709] 1488484770.985461: AS key obtained from gak_fct: aes256-cts/93FB 190 | [5709] 1488484770.985518: Decrypted AS reply; session key is: aes256-cts/2C56 191 | [5709] 1488484770.985531: FAST negotiation: available 192 | [5709] 1488484770.985555: Initializing FILE:/tmp/krb5cc_0 with default princ admin/admin@MYKDC.COM 193 | [5709] 1488484770.985682: Removing admin/admin@MYKDC.COM -> krbtgt/MYKDC.COM@MYKDC.COM from FILE:/tmp/krb5cc_0 194 | [5709] 1488484770.985688: Storing admin/admin@MYKDC.COM -> krbtgt/MYKDC.COM@MYKDC.COM in FILE:/tmp/krb5cc_0 195 | [5709] 1488484770.985742: Storing config in FILE:/tmp/krb5cc_0 for krbtgt/MYKDC.COM@MYKDC.COM: fast_avail: yes 196 | [5709] 1488484770.985758: Removing admin/admin@MYKDC.COM -> krb5_ccache_conf_data/fast_avail/krbtgt\/MYKDC.COM\@MYKDC.COM@X-CACHECONF: from FILE:/tmp/krb5cc_0 197 | [5709] 1488484770.985763: Storing admin/admin@MYKDC.COM -> krb5_ccache_conf_data/fast_avail/krbtgt\/MYKDC.COM\@MYKDC.COM@X-CACHECONF: in FILE:/tmp/krb5cc_0 198 | ``` 199 | 200 | 201 | ## KRB5CCNAME 202 | 203 | The environment variable [`KRB5CCNAME`](http://web.mit.edu/kerberos/krb5-1.4/krb5-1.4/doc/klist.html) 204 | As the docs say: 205 | 206 | If the KRB5CCNAME environment variable is set, its value is used to name the default ticket cache. 207 | 208 | ## IP addresses vs. Hostnames 209 | 210 | Kerberos principals are traditionally defined with hostnames of the form `hbase@worker3/EXAMPLE.COM`, not `hbase/10.10.15.1/EXAMPLE.COM` 211 | 212 | The issue of whether Hadoop should support IP addresses has been raised [HADOOP-9019](https://issues.apache.org/jira/browse/HADOOP-9019) & [HADOOP-7510](https://issues.apache.org/jira/browse/HADOOP-7510) 213 | Current consensus is no: you need DNS set up, or at least a consistent and valid /etc/hosts file on every node in the cluster. 214 | 215 | ## Windows 216 | 217 | 1. Windows does not reverse-DNS 127.0.0.1 to localhost or the local machine name; this can cause problems with MiniKDC tests in Windows, where adding a `user/127.0.0.1@REALM` principal will be needed [example](https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-registry/src/test/java/org/apache/hadoop/registry/secure/AbstractSecureRegistryTest.java#L209). 218 | 1. Windows hostnames are often upper case. 
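For the MiniKDC point above, one workaround is to register the service principal under both the hostname and the loopback address, so that whichever form the lookup produces is known to the KDC. A minimal sketch follows; the work directory, keytab name and principal names are illustrative, not taken from any particular test.

```java
import java.io.File;
import java.util.Properties;

import org.apache.hadoop.minikdc.MiniKdc;

public class WindowsFriendlyMiniKdc {
    public static void main(String[] args) throws Exception {
        Properties conf = MiniKdc.createConf();
        MiniKdc kdc = new MiniKdc(conf, new File("target/kdc"));
        kdc.start();
        try {
            File keytab = new File("target/kdc/zookeeper.keytab");
            // Register both forms of the principal in the same keytab, so the
            // missing reverse-DNS mapping of 127.0.0.1 no longer matters.
            kdc.createPrincipal(keytab,
                "zookeeper/localhost",
                "zookeeper/127.0.0.1");
            System.out.println("MiniKDC realm: " + kdc.getRealm());
        } finally {
            kdc.stop();
        }
    }
}
```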
219 | 220 | ## Kerberos's defences against replay attacks 221 | 222 | From the javadocs of `org.apache.hadoop.ipc.Client.handleSaslConnectionFailure()`: 223 | 224 | /** 225 | * If multiple clients with the same principal try to connect to the same 226 | * server at the same time, the server assumes a replay attack is in 227 | * progress. This is a feature of kerberos. In order to work around this, 228 | * what is done is that the client backs off randomly and tries to initiate 229 | * the connection again. 230 | */ 231 | 232 | That's a good defence on the surface, "multiple connections from same principal == attack", which 233 | doesn't scale to Hadoop clusters. Hence the sleep. It is also why large Hadoop clusters define 234 | a different principal for every service/host pair in the keytab, ensuring giving the principal 235 | for the HDFS blockserver on host1 an identity such as `hdfs/host1`, for host 2 `hdfs/host2`, etc. 236 | When a cluster is completely restarted, instead of the same principal trying to authenticate from 237 | 1000+ hosts, only the HDFS services on a single node try to authenticate as the same principal. 238 | 239 | ## Asymmetric Kerberos Realm Trust 240 | 241 | It is possible to configure Kerberos KDCs such that one realm, e.g `"hadoop-kdc"` 242 | can trust principals from a remote realm -but for that 243 | remote realm not to trust the principals from that `"hadoop-kdc"` realm. 244 | What does that permit? It means that a Hadoop-cluster-specific KDC can be created and configured 245 | to trust principals from the enterprise-wide (Active-Directory Managed) KDC infrastructure. 246 | The hadoop cluster KDC will contain the principals for the various services, with these exported 247 | into keytabs. 248 | 249 | As a result, even if the keytabs are compromised, *they do not grant any access to and 250 | enterprise-wide kerberos-authenticated services. 251 | 252 | -------------------------------------------------------------------------------- /sections/services.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Securing Hadoop Services 16 | 17 | ## Proxy Users and superuser services 18 | 19 | Supporting Proxy users is covered in the hadoop docs 20 | [Proxy user - Superusers Acting On Behalf Of Other Users](http://hadoop.apache.org/docs/r2.7.1/hadoop-project-dist/hadoop-common/Superusers.html) 21 | 22 | In an insecure user, the proxy user creation is automatic: services act on behalf of whoever 23 | they claim to be. 24 | 25 | In a secure cluster, they must have (how??) received the delegation token for 26 | the user. 27 | 28 | 29 | Note that in Hadoop 2.6/2.7 the Filesystem cache creates a new filesystem instance for the given user, even 30 | if there is an FS client for that specific filesystem already in the cache. (VERIFY) 31 | 32 | -------------------------------------------------------------------------------- /sections/terrors.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Tales of Terror 16 | 17 | The following are all true stories. We welcome more submissions of these stories, especially 18 | covering the steps taken to determine what was wrong. 19 | 20 | 21 | ## The Zookeeper's Birthday Present 22 | 23 | 24 | A client program could not work with zookeeper: the connections were being broken. But it 25 | was working for everything else. 26 | 27 | The cluster was one year old that day. 
28 | 29 | It turns out that ZK reacts to an auth failure by logging something in its logs, and breaking 30 | the client connection —without any notification to the client. Rather than a network problem 31 | (initial hypothesis), this a Kerberos problem. How was that worked out? By examining the 32 | Zookeeper logs —there was nothing client-side except the reports of connections being closed 33 | and the ZK client attempting to retry. 34 | 35 | When a Kerberos keytab is created, the entries in it have a lifespan. The default value is one 36 | year. This was its first birthday, hence ZK wouldn't trust the client. 37 | 38 | **Fix: create new keytabs, valid for another year, and distribute them.** 39 | 40 | ## The Principal With No Realm 41 | 42 | This one showed up during release testing —credit to Andras Bokor for tracking it all down. 43 | 44 | A stack trace 45 | 46 | ``` 47 | 16/01/16 01:42:39 WARN ipc.Client: Exception encountered while connecting to the server : 48 | javax.security.sasl.SaslException: GSS initiate failed 49 | [Caused by GSSException: No valid credentials provided (Mechanism level: Failed to find any Kerberos tgt)] 50 | java.io.IOException: Failed on local exception: java.io.IOException: 51 | javax.security.sasl.SaslException: GSS initiate failed 52 | [Caused by GSSException: No valid credentials provided 53 | (Mechanism level: Failed to find any Kerberos tgt)]; Host Details : 54 | local host is: "os-u14-2-2.novalocal/172.22.73.243"; destination host is: "os-u14-2-3.novalocal":8020; 55 | at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:773) 56 | at org.apache.hadoop.ipc.Client.call(Client.java:1431) 57 | at org.apache.hadoop.ipc.Client.call(Client.java:1358) 58 | at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:229) 59 | at com.sun.proxy.$Proxy11.getFileInfo(Unknown Source) 60 | at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:771) 61 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 62 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57) 63 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 64 | at java.lang.reflect.Method.invoke(Method.java:606) 65 | at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:252) 66 | at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:104) 67 | at com.sun.proxy.$Proxy12.getFileInfo(Unknown Source) 68 | at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:2116) 69 | at org.apache.hadoop.hdfs.DistributedFileSystem$22.doCall(DistributedFileSystem.java:1315) 70 | at org.apache.hadoop.hdfs.DistributedFileSystem$22.doCall(DistributedFileSystem.java:1311) 71 | at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) 72 | at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1311) 73 | at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1424) 74 | ``` 75 | 76 | This looks like a normal "not logged in" problem, except for some little facts: 77 | 78 | 1. The user was logged in. 79 | 1. The failure was replicable. 80 | 1. It only surfaced on OpenJDK, not oracle JDK. 81 | 1. Everything worked on OpenJDK 7u51, but not on OpenJDK 7u91. 82 | 83 | Something had changed in the JDK to reject the login on this system (ubuntu, virtual test cluster). 84 | 85 | `Kdiag` didn't throw up anything obvious. 
What did show some warning was `klist`: 86 | 87 | ``` 88 | Ticket cache: FILE:/tmp/krb5cc_2529 89 | Default principal: qe@REALM 90 | 91 | Valid starting Expires Service principal 92 | 01/16/2016 11:07:23 01/16/2016 21:07:23 krbtgt/REALM@REALM 93 | renew until 01/23/2016 11:07:23 94 | 01/16/2016 13:13:11 01/16/2016 21:07:23 HTTP/hdfs-3-5@ 95 | renew until 01/23/2016 11:07:23 96 | 01/16/2016 13:13:11 01/16/2016 21:07:23 HTTP/hdfs-3-5@REALM 97 | renew until 01/23/2016 11:07:23 98 | ``` 99 | 100 | See that? There's a principal which doesn't have a stated realm. Does that matter? 101 | 102 | In OracleJDK, and OpenJDK 7u51, apparently not. In OpenJDK 7u91, yes 103 | 104 | There's some new code in `sun.security.krb5.PrincipalName` (Oracle OpenJDK copyright) 105 | 106 | ```java 107 | // Validate a nameStrings argument 108 | private static void validateNameStrings(String[] ns) { 109 | if (ns == null) { 110 | throw new IllegalArgumentException("Null nameStrings not allowed"); 111 | } 112 | if (ns.length == 0) { 113 | throw new IllegalArgumentException("Empty nameStrings not allowed"); 114 | } 115 | for (String s: ns) { 116 | if (s == null) { 117 | throw new IllegalArgumentException("Null nameString not allowed"); 118 | } 119 | if (s.isEmpty()) { 120 | throw new IllegalArgumentException("Empty nameString not allowed"); 121 | } 122 | } 123 | } 124 | ``` 125 | 126 | This checks the code, and rejects if nothing is valid. Now, how does something invalid get in? 127 | Setting `HADOOP_JAAS_DEBUG=true` and logging at debug turned up output, 128 | 129 | With 7u51: 130 | ``` 131 | 16/01/20 15:13:20 DEBUG security.UserGroupInformation: using kerberos user:qe@REALM 132 | ``` 133 | 134 | With 7u91: 135 | 136 | ``` 137 | 16/01/20 15:10:44 DEBUG security.UserGroupInformation: using kerberos user:null 138 | ``` 139 | 140 | Which means that the default principal wasn't being picked up, instead some JVM specific introspection 141 | had kicked in —and it was finding the principal without a realm, rather than the one that was. 142 | 143 | 144 | *Fix: add a `domain_realm` in `/etc/krb5.conf` mapping hostnames to realms * 145 | 146 | ``` 147 | [domain_realm] 148 | hdfs-3-5.novalocal = REALM 149 | ``` 150 | 151 | A `klist` then returns a list of credentials without this realm-less one in. 152 | 153 | ``` 154 | Valid starting Expires Service principal 155 | 01/17/2016 14:49:08 01/18/2016 00:49:08 krbtgt/REALM@REALM 156 | renew until 01/24/2016 14:49:08 157 | 01/17/2016 14:49:16 01/18/2016 00:49:08 HTTP/hdfs-3-5@REALM 158 | renew until 01/24/2016 14:49:08 159 | ``` 160 | 161 | Because this was a virtual cluster, DNS/RDNS probably wasn't working, presumably kerberos 162 | didn't know what realm the host was in, and things went downhill. It just didn't show in 163 | any validation operations, merely in the classic "no TGT" error. 164 | 165 | ## The Principal With No Realm - Hive Edition 166 | 167 | A connection issue from hiveserver2 to the metastore produced similar error message - credit to Attila 168 | Kreiner for tracking it all down. 
169 | 170 | A stack trace 171 | 172 | ``` 173 | <15>1 2021-08-31T06:09:52.229Z hiveserver2-0 hiveserver2 1 cfbab91e-b63f-4ebe-9c31-bc2e9e9ad77e [mdc@18060 class="security.UserGroupInformation" level="DEBUG" operationLogLevel="EXECUTION" queryId="hive_20210831060952_17b5006c-758f-4a0a-b734-cdf23bbfa204" sessionId="155509ce-2d5a-41f3-840d-c4352f041353" thread="HiveServer2-Background-Pool: Thread-222"] PrivilegedAction as:hive (auth:PROXY) via hive/dwx-env-kt566l@VPC.CLOUDERA.COM (auth:KERBEROS) from:org.apache.hadoop.hive.metastore.security.TUGIAssumingTransport.open(TUGIAssumingTransport.java:48) 174 | <11>1 2021-08-31T06:09:52.237Z hiveserver2-0 hiveserver2 1 cfbab91e-b63f-4ebe-9c31-bc2e9e9ad77e [mdc@18060 class="transport.TSaslTransport" level="ERROR" operationLogLevel="EXECUTION" queryId="hive_20210831060952_17b5006c-758f-4a0a-b734-cdf23bbfa204" sessionId="155509ce-2d5a-41f3-840d-c4352f041353" thread="HiveServer2-Background-Pool: Thread-222"] SASL negotiation failure 175 | javax.security.sasl.SaslException: GSS initiate failed [Caused by GSSException: No valid credentials provided (Mechanism level: Attempt to obtain new INITIATE credentials failed! (null))] 176 | at com.sun.security.sasl.gsskerb.GssKrb5Client.evaluateChallenge(GssKrb5Client.java:211) 177 | at org.apache.thrift.transport.TSaslClientTransport.handleSaslStartMessage(TSaslClientTransport.java:94) 178 | at org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:271) 179 | at org.apache.thrift.transport.TSaslClientTransport.open(TSaslClientTransport.java:37) 180 | at org.apache.hadoop.hive.metastore.security.TUGIAssumingTransport$1.run(TUGIAssumingTransport.java:51) 181 | at org.apache.hadoop.hive.metastore.security.TUGIAssumingTransport$1.run(TUGIAssumingTransport.java:48) 182 | ... 183 | Caused by: GSSException: No valid credentials provided (Mechanism level: Attempt to obtain new INITIATE credentials failed! (null)) 184 | at sun.security.jgss.krb5.Krb5InitCredential.getTgt(Krb5InitCredential.java:385) 185 | at sun.security.jgss.krb5.Krb5InitCredential.getInstance(Krb5InitCredential.java:160) 186 | at sun.security.jgss.krb5.Krb5MechFactory.getCredentialElement(Krb5MechFactory.java:122) 187 | ... 188 | Caused by: javax.security.auth.login.LoginException: Cannot read from System.in 189 | at com.sun.security.auth.module.Krb5LoginModule.promptForName(Krb5LoginModule.java:869) 190 | at com.sun.security.auth.module.Krb5LoginModule.attemptAuthentication(Krb5LoginModule.java:708) 191 | at com.sun.security.auth.module.Krb5LoginModule.login(Krb5LoginModule.java:618) 192 | ``` 193 | 194 | The investigation turned out the root cause of the problem was the HADOOP_USER_NAME=hive environment 195 | setting. In this case hive added the principal **hive** to the Subject and this triggered the "Principal 196 | With No Realm" problem described above. 197 | 198 | The problem affected only some of the threads that logged only the misterious GSSException, meanwhile 199 | the other threads were running fine and logging a lot of normal/healthy log messages. The problem only 200 | appeared after the original TGT has expired. The java version was OpenJDK 1.8.0_302-b08. 201 | 202 | 203 | ## The AD realm redirection failure 204 | 205 | Real-life example: 206 | 207 | * Company ACME has one ActiveDirectory domain per continent. 208 | * Domains have mutual trust enabled. 209 | * AD is also used for Kerberos authentication. 210 | * Kerberos trust is handled by a few AD servers in the "root" domain. 211 | * Hadoop cluster is running in Europe. 
212 | 213 | When a South American user opens a SSH session on the edge node, authentication is done by LDAP, no issue 214 | the dirty work is done by the AD servers. 215 | But when a S.A. user tries to connect to HDFS or HiveServer2, with principal `Me@SAM.ACME.COM`, 216 | then the Kerberos client must make several hops... 217 | 218 | 1. AD server for @SAM.ACME.COM says "no, can't create ticket for svc/somehost@EUR.ACME.COM" 219 | 1. AD server for @SAM.ACME.COM says "OK, I can get you a credential to @ACME.COM, see what they can do there" alas, 220 | 1. There's no AD server defined in conf file for @ACME.COM 221 | 1. This leads to the all to familiar message, `Fail to create credential. (63) - No service creds` 222 | 223 | Of course the only thing displayed in logs was the final error message. 224 | Even after enabling the "secret" debug flags, it was not clear what the client was trying to do 225 | with all these attempts. 226 | But the tell-tale was the "following capath" comment on each hop, because `CAPATH` is actually 227 | an optional section in `krb5.conf`. Fix: add the information about cross realm authentication 228 | to the `krb5.conf` file. 229 | 230 | (CAPATH coverage: [MIT](http://web.mit.edu/kerberos/krb5-1.5/krb5-1.5.4/doc/krb5-admin/capaths.html), 231 | [Redhat](https://access.redhat.com/documentation/en-US/Red_Hat_Enterprise_Linux/6/html/Managing_Smart_Cards/Setting_Up_Cross_Realm_Authentication.html) 232 | -------------------------------------------------------------------------------- /sections/testing.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Testing 16 | 17 | 18 | ## Writing Kerberos Tests with MiniKDC 19 | 20 | The Hadoop project has an in-VM Kerberos Controller for tests, MiniKDC, which is packaged as its own JAR for downstream use. The core of this code actually comes from the Apache Directory Services project. 21 | 22 | ---- 23 | 24 | ## Testing against Kerberized Hadoop clusters 25 | 26 | This is not actually the hardest form of testing; getting the MiniKDC working holds that honour. 27 | It does have some pre-requisites. 28 | 29 | 1. Everyone running the tests has set up a Hadoop cluster/single VM with Kerberos enabled. 30 | 2. The software project has a test runner capable of deploying applications into a remote Hadoop cluster/VM and assessing the outcome. 31 | 32 | It's primarily the test runner which matters. Without that you cannot do functional tests against any Hadoop cluster. 33 | However, once you have such a test runner, you have a powerful tool: the ability to run tests against real Hadoop clusters, rather than simply minicluster and miniKDC tests which, while better than nothing, are unrealistic. 34 | 35 | If this approach is so powerful, why not bypass the minicluster tests altogether? 36 | 37 | 1. Minicluster tests are easier to run. Build tools can run them; Jenkins can trivially run them as part of test runs. 38 | 2. The current state of the cluster affects the outcome of the tests. Its useful not only to have tests tear down properly, but for the setup phase of each test suite to verify that the cluster is in the valid initial state/get it into that state. For YARN applications, this generally means that there are no running applications in the cluster. 39 | 3. Setup often includes the overhead of copying files into HDFS. As the secure user. 40 | 4. The host launching the tests needs to be setup with kinit/keytabs. 41 | 5. Retrieving and interpreting the results is harder. 
Often it involved manually going to the YARN RM to get through to the logs (assuming that yarn-site is configured to preserve them), and/or collecting other service logs. 42 | 6. If you are working with nightly builds of Hadoop, VM setup needs to be automated. 43 | 7. Unless you can mandate and verify that all developers run the tests against secure clusters, they may not get run by everyone. 44 | 8. The tests can be slow. 45 | 9. Fault injection can be harder. 46 | 47 | Overall, tests are less deterministic. 48 | 49 | In the slider project, different team members have different test clusters, Linux and Windows, Kerberized and non-Kerberized, Java-7 and Java 8. This means that test runs do test a wide set of configurations without requiring every developer to have a VM of every form. The Hortonworks QE team also run these tests against the nightly HDP stack builds, catching regressions in both the HDP stack and in the Slider project. 50 | 51 | For fault injection the Slider Application Master has an integral "chaos monkey" which can be configured to start after a defined period of time, then randomly kill worker containers and/or the application master. This is used in conjunction with the functional tests of the deployed applications to verify that they remain resilient to failure. When tests do fail, we are left with the problem of retrieving the logs and identifying problems from them. The QE test runs do collect all the logs from all the services across the test clusters —but this still leaves the problem of trying to correlate events from the logs across the machines. 52 | 53 | # Tuning a Hadoop cluster for aggressive token timeouts 54 | 55 | ## Kinit 56 | 57 | You can ask for a limited lifespan of a ticket when logging in on the console 58 | 59 | kinit -l 15m 60 | 61 | ## KDC 62 | 63 | 64 | Here is an example `/etc/krb5.conf` which limits the lifespan of a ticket 65 | to 1h 66 | 67 | ``` 68 | [libdefaults] 69 | 70 | default_realm = DEVIX 71 | renew_lifetime = 2h 72 | forwardable = true 73 | 74 | ticket_lifetime = 1h 75 | dns_lookup_realm = false 76 | dns_lookup_kdc = false 77 | 78 | [realms] 79 | 80 | DEVIX = { 81 | kdc = devix 82 | admin_server = devix 83 | } 84 | ``` 85 | 86 | The KDC host here, `devix` is a Linux VM. Turning off the DNS lookups avoids 87 | futile attempts to work with DNS/rDNS. 88 | 89 | ## Hadoop tokens 90 | 91 | 92 | *TODO: Table of properties for hdfs, yarn, hive, ... listing token timeout properties* 93 | 94 | ## Enabling Kerberos for different Hadoop components 95 | 96 | ### Core Hadoop 97 | 98 | 99 | ```xml 100 | 101 | hadoop.security.authentication 102 | kerberos 103 | 104 | 105 | hadoop.security.authorization 106 | true 107 | 108 | ``` 109 | 110 | 111 | ### HBase 112 | 113 | ```xml 114 | 115 | hbase.security.authentication 116 | kerberos 117 | 118 | 119 | hbase.security.authorization 120 | true 121 | 122 | 123 | hbase.regionserver.kerberos.principal 124 | hbase/_HOST@YOUR-REALM.COM 125 | 126 | 127 | hbase.regionserver.keytab.file 128 | /etc/hbase/conf/keytab.krb5 129 | 130 | 131 | hbase.master.kerberos.principal 132 | hbase/_HOST@YOUR-REALM.COM 133 | 134 | 135 | hbase.master.keytab.file 136 | /etc/hbase/conf/keytab.krb5 137 | 138 | 139 | hbase.coprocessor.region.classes 140 | org.apache.hadoop.hbase.security.token.TokenProvider 141 | 142 | ``` 143 | 144 | 145 | ## Tips 146 | 147 | 148 | 1. Use `kdestroy` to destroy your local ticket cache. 
Do this to ensure that code
149 | running locally is reading data in from a nominated keytab and not falling back
150 | to the user's ticket cache.
151 | 1. VMs: Make sure the clocks between VM and host are in sync; it's easy for a VM clock
152 | to drift when suspended and resumed.
153 | 1. VMs: Make sure that all hosts are listed in the host table, so that hostname lookup works.
154 | 1. Try to log in from a web browser without SPNEGO enabled; this will catch any WebUI
155 | that wasn't actually authenticating callers.
156 | 1. Try to issue RPC and REST calls from an unauthenticated client, and from a user that is not granted
157 | access rights.
158 | 1. YARN applications: verify that REST/Web requests against the real app URL (which can be
159 | determined from the YARN application record) are redirected to the RM proxy (i.e. that
160 | all GET calls result in a 30x redirect). If this does not take place, it means that the RM-hosted
161 | SPNEGO authentication layer can be bypassed.
162 | 
-------------------------------------------------------------------------------- /sections/the_limits_of_hadoop_security.md: --------------------------------------------------------------------------------
1 | 
14 | 
15 | # The Limits of Hadoop Security
16 | 
17 | What are the limits of Hadoop security? Even with Kerberos enabled, what vulnerabilities exist?
18 | 
19 | ## Unpatched and 0-day holes in the layers underneath.
20 | 
21 | The underlying OS in a Hadoop cluster may have known or 0-day security holes, allowing a malicious (YARN?) application to gain root access to a host in the cluster. Once this is done it would have direct access to blocks stored by the datanode, and to secrets held in the various processes, including keytabs in the local filesystems.
22 | 
23 | ### Defences
24 | 
25 | 1. Keep up to date with security issues (SANS is worth tracking), and keep servers up to date.
26 | 2. Isolate the Hadoop cluster from the rest of your network infrastructure, apart from some "edge" nodes, so that only processes running in the cluster or on those edge nodes can talk to the Hadoop services.
27 | 3. Developers: ensure that your code works with the more up-to-date versions of operating systems, JDKs and dependent libraries, so that you are not holding back the upgrades. Do not increase the risk for the operations team.
28 | 
29 | ## Failure of users to keep their machines secure
30 | 
31 | The eternal problem. Securing end-user machines is beyond the scope of the Hadoop project.
32 | 
33 | However, one area where Hadoop may impose risk on the end-user systems is the use of Java as the runtime for client-side code, mandating an installation of the JVM for those users who need to talk directly to the Hadoop services.
34 | Ops teams should
35 | 
36 | * Make sure that an up-to-date JVM/JRE is installed, out-of-date ones are uninstalled, and that Java Applets in browsers are completely disabled.
37 | * Control access to those Hadoop clusters and the services deployed on them.
38 | * Use HDFS Quotas and YARN Queues to limit the resources malicious code can consume.
39 | * Collect the HDFS audit logs and learn how to use them, so you can check whether, after any possible security breach, you are in a position to state what data was accessed by a specific user in a given time period.
40 | 
41 | We Hadoop developers need to
42 | 
43 | 1. Make sure that our code works with current versions of Java, and test against forthcoming releases (a permanent trouble spot).
44 | 2. Make sure that our own systems are not vulnerable due to the tools installed locally.
45 | 3.
Work to enable thin-client access to services, through REST APIs over Hadoop IPC and other Java protocols, and by helping the native-client work. 46 | 4. Ensure our applications do not blindly trust users —and do as much as possible to prevent privilege escalation. 47 | 5. Log information for ops teams to use in security audits. 48 | 49 | ## Denial of service attacks. 50 | 51 | Hadoop is its own Distributed Denial of Service platform. A misconfiguration could easily trigger all datanodes to attempt to report in so frequently that the namenode gets overloaded, triggering apparent timeouts of some DN heartbeats, leading to the namenode assuming it has failed and starting block transfers of under-replicated blocks, so impacting network load and reporting even more. This is not a hypothetical example: Facebook had a cluster outage from precisely such an event, a failing switch partitioning the cluster and triggering a cascade failure. Nowadays IPC throttling (from Twitter) and the use of different ports on the namenode for heartbeating and filesystem operations (from Facebook) try to keep this under control. 52 | 53 | We're not aware of any reported deliberate attempts to use a Hadoop cluster to overload local/remote services, though there are some anecdotes of the Yahoo! search engines having be written so as to deliberately stripe searches not just across hosts, but domains and countries, so as not to overload the DNS infrastructure of small countries. If you have some network service in your organisation which is considered critical (Examples: sharepoint, exchange), then configure the firewall rules to block access to those hosts and service ports from the Hadoop cluster. 54 | 55 | Other examples of risk points and mitigation strategies 56 | 57 | ### YARN resource overload 58 | 59 | Too many applications asking for small numbers of containers, consuming resources in the Node Managers and RM. There are minimum size values for YARN container allocations for a reason: it's good to set them low on a single node development VM, but in production, they are needed 60 | 61 | ### DNS overload. 62 | 63 | This is easily done by accident. Many of the large clusters have local caching DNS servers for this reason, especially those doing any form of search. 64 | 65 | ### CPU, network IO, disk IO, memory 66 | 67 | YARN applications can consume so much local resources that they hurt the performance of other applications running on the same nodes. 68 | 69 | In Linux and Windows, CPU can be throttled, the amount of physical and virtual memory limited. We could restrict disk and network IO (see relevant JIRAs), but that won't limit HDFS IO, which takes place in a different process. 70 | YARN labels do let you isolate parts of the cluster, so that low-latency YARN applications have access to machines across the racks which IO-heavy batch/background applications do not. 71 | 72 | ## Deliberate insertion of malicious code into the Hadoop stack, dependent components or underlying OS. 73 | 74 | We haven't encountered this yet. Is it conceivable? Yes: in the security interfaces and protocols themselves. Anything involving encryption protocols, random number generation and authentication checks would be the areas most appealing as targets: break the authentication or weaken the encryption and data in a Hadoop cluster becomes more accessible. As stated, we've not seen this. 
As Hadoop relies on external libraries for encryption, we have to trust them (and any hardware implementations), leaving random number generation and authentication code as targets. Given that few committers understand Hadoop Kerberos, especially at the REST/SPNEGO layer, it is hard for new code submissions in this area to be audited well. 75 | 76 | One risk we have to consider is: if someone malicious had access to the committer credentials of a developer, could they insert malicious code? Everyone in the Hadoop team would notice changes in the code appearing without associated 77 | JIRA entries, though it's not clear how well reviewed the code is. 78 | 79 | Mitigation strategies: 80 | 81 | A key one has to be "identify those areas which would be vulnerable to deliberate weakening, and audit patch submissions extra rigorously there", "reject anything which appears to weaken security -even something as simple as allowing IP addresses instead of Hostnames in kerberos binding (cite: JIRA) could be dangerous. And while the submitters are probably well-meaning, we should assume maliciousness or incompetence in the high-risk areas. (* yes, this applies to my own patches too. The accusation of incompetence is defendable based on past submissions anyway). 82 | 83 | ### Insecure applications 84 | 85 | SQL injection attacks are the classic example here. It doesn't matter how secure the layers are underneath if the front end application isn't handling untrusted data. Then there are things like emergency patches to apple watches because of a binary parse error in fonts. 86 | 87 | Mitigation strategies 88 | 89 | 1. Assume all incoming data is untrusted. In particular, all strings used in queries, while all documents (XML, HTML, binary) should be treated as potentially malformed, if not actually malicious. 90 | 2. Use source code auditing tools such as Coverity Scan to audit the code. Apache projects have free access to some of these tools. 91 | 3. Never have your programs ask for more rights than they need, to data, to database tables (and in HBase and Accumulo: columns) 92 | 4. Log data in a form which can be used for audit logs. (Issue: what is our story here? Logging to local/remote filesystems isn't it, not if malware could overwrite the logs) 93 | 94 | -------------------------------------------------------------------------------- /sections/tokens.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Hadoop Service Tokens 16 | 17 | > Man rules now where They ruled once; 18 | > They shall soon rule where man rules now. 19 | > After summer is winter, and after winter summer. 20 | > They wait patient and potent, for here shall They reign again. 21 | 22 | 23 | Hadoop Service "Tokens" are the other side of the complexity of Kerberos and Hadoop; 24 | close enough to be confusing, different enough that the confusion becomes 25 | dangerous. 26 | 27 | 28 | ## Core Concepts 29 | 30 | 1. Hadoop tokens are issued by services, *for use by that service and its 31 | distributed components*. 32 | 1. Tokens are obtained by clients from services, usually through some IPC call 33 | which returns a token. 34 | 1. Tokens can contain arbitrary data issued by a service and marshalled 35 | into a byte array via the Hadoop `Writable` interfce. 36 | 1. This data is kept opaque to clients by encrypting the marshalled data with 37 | a password. The encrypted data is given to the client, which can then resubmit it 38 | to the service or peers *with the same password*. 
Here it can be decoded and used. 39 | 1. Therefore, provided the password is sufficiently complex, it should be impossible 40 | for a client application to view *or tamper with* the data. 41 | 1. Delegation tokens *may* be passed to other services or other applications. 42 | The services processing tokens do not (normally) validate the identity of the caller, merely 43 | that the token is valid. 44 | 1. Tokens expire; they are invalid after that expiry time. 45 | 1. Tokens *may* be renewed before they expire, so returning a new token. 46 | (Not all services support token renewal). 47 | 1. Token renewal may be repeated, until a final expiry time is reached, often 7 days. 48 | 1. Tokens may be revoked, after which they are not valid. This is possible if the 49 | (stateful) service maintains a table of valid tokens. 50 | 51 | 52 | Here are some example uses of tokens 53 | 54 | ### HDFS access tokens in a launched YARN application 55 | 56 | 1. An application running in an account logged in to Kerberos requests a delegation token 57 | (DT) from HDFS, (API call: `FileSystem.addDelegationTokens()`). 58 | 1. This token instance is added to the `ContainerLaunchContext`. 59 | 1. The application is launched. 60 | 1. The launched App master can retrieve the tokens it has been launched with. One 61 | of these will be the delegation token. 62 | 1. The delegation token is used to authenticate the application, renewing it 63 | as needed. 64 | 1. When the application completes, the token is revoked. 65 | (`ApplicationSubmissionContext.setCancelTokensWhenComplete`). 66 | 67 | ### Block Access tokens within HDFS 68 | 69 | 1. An authenticated client asks the NN for access to data. 70 | 1. The NN determines the block ID of the data, and returns a block token to the caller. 71 | 1. This contains encrypted information about the block, particularly its ID the access 72 | rights granted to the caller. 73 | 1. The client application locates a DN hosting the block, and issues an access request 74 | on the block, passing in the block token. 75 | 1. The DN (which has the same secret password as the NN), decrypts the token to validate 76 | access permissions of the caller. 77 | 1. If valid, access to the data is granted. 78 | 79 | 80 | ### HBase Access token in Spark Job submission 81 | 82 | The Spark submission client, if configured to do so, will ask HBase for a token. This 83 | is added to the application launch context, so that a spark job may talk to HBase. 84 | 85 | 86 | ## Supporting Tokens in a Service 87 | 88 | 89 | To support tokens you need to define your own token identifier, which is a marshallable 90 | object containing all data to be contained within a token. 91 | 92 | 93 | ### Binding 94 | 95 | Binding information must be declared in service metadata files, so the Java 96 | ServiceLoader can find them. 
The Token Identifier class is declared in 97 | `META-INF/services/org.apache.hadoop.security.token.TokenIdentifier`; 98 | 99 | 100 | 101 | Contents of `hadoop-tools/hadoop-azure/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenIdentifier` 102 | 103 | ``` 104 | org.apache.hadoop.fs.azure.security.WasbDelegationTokenIdentifier 105 | ``` 106 | 107 | If a token is renewable, it must also provide a token renewer declaration 108 | in in `META-INF/services/org.apache.hadoop.security.token.TokenRenewer` 109 | 110 | 111 | Contents of `hadoop-tools/hadoop-azure/src/main/resources/META-INF/services/org.apache.hadoop.security.token.TokenRenewer` 112 | 113 | ``` 114 | org.apache.hadoop.fs.azure.security.WasbTokenRenewer 115 | ``` 116 | 117 | Important: make these classes fast to load, and resilient to not having dependencies 118 | on the classpath. Why? They classes will be loaded whenever *any* token is decoded. 119 | A slow loading class applications down; one which actually fails during 120 | loading creates needless support calls. If a transient dependency is not present, 121 | the entire load process will break for every token load, irrespective 122 | of token kind. 123 | 124 | 125 | ### Token Service and Kind 126 | 127 | A *token kind* is the string identifier used to uniquely identify the binding implementation 128 | class; this class's implementation of `TokenIdentifier.getKind()` is used to look up 129 | the implementation of `TokenIdentifier` registered. This string is saved as a `Text` entry 130 | in the token file; when the token is decoded the implementation is located. 131 | 132 | The token kind *must* be unique amongst all possible token implementation classes. 133 | 134 | A *Token Service* is a reference to a service, used to locate it in Credentials. 135 | `Credentials` persists tokens as a map from (service -> token), where the service 136 | is that returned by `Token.getService()`. Generally this *must* be unique to 137 | both the service *and the endpoint offering that service*. HDFS uses the IP address 138 | of the NN host as part of its service identifier. 139 | See `SecurityUtil.buildDTServiceName()` for the algorithm here. It is not mandatory 140 | to use this -any identifier unique for a service kind and installation is sufficient. 141 | -------------------------------------------------------------------------------- /sections/ugi.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # UGI 16 | 17 | > From the pictures I turned to the bulky, closely written letter itself; and for the next three hours was immersed in a gulf of unutterable horror. Where Akeley had given only outlines before, he now entered into minute details; presenting long transcripts of words overheard in the woods at night, long accounts of monstrous pinkish forms spied in thickets at twilight on the hills, and a terrible cosmic narrative derived from the application of profound and varied scholarship to the endless bygone discourses of the mad self-styled spy who had killed himself. 18 | 19 | > HP Lovecraft [The Whisperer in Darkness](http://www.hplovecraft.com/writings/texts/fiction/wid.aspx), 1931 20 | 21 | If there is one class guaranteed to strike fear into anyone with experience in Hadoop+Kerberos code it is `UserGroupInformation`, abbreviated to "UGI" 22 | 23 | Nobody says `UserGroupInformation` out loud; it is the *that which must not be named* of the stack 24 | 25 | ## What does UGI do? 
26 | 
27 | Here are some of the things it can do:
28 | 
29 | 1. Handle the initial login process, using any environmental `kinit`-ed tokens or a keytab.
30 | 1. Spawn off a thread to renew the TGT.
31 | 1. Support an operation for on-demand verification/re-initialization of Kerberos ticket details before issuing a request.
32 | 1. Appear in stack traces which warn the viewer of security-related trouble.
33 | 
34 | 
35 | ## UGI Strengths
36 | 
37 | * It's one place for almost all Kerberos/user authentication to live.
38 | * Being fairly widely used, once you've learned it, your knowledge works through
39 | the entire Hadoop stack.
40 | 
41 | 
42 | ## UGI Troublespots
43 | 
44 | * It's a singleton. Don't expect to have one "real user" per process.
45 | This does sort of make sense: even a single service has its own "service" identity.
46 | 
47 | * Once initialized, it stays initialized *and cannot be reset*.
48 | This makes it critical to load in your configuration information, including keytabs and principals,
49 | before that first initialization of the UGI.
50 | (There is actually a `UGI.reset()` call, but it is package-scoped and exists purely to allow tests to
51 | reset the information.)
52 | * UGI initialization can take place in code which you don't expect.
53 | A specific example is the Hadoop filesystem APIs:
54 | create a Hadoop filesystem instance and UGI is likely to be initialized immediately, even if it is a local `file://` reference.
55 | As a result: initialize with the principal you want before you go near the filesystem (see the sketch below).
56 | * It has to do some low-level reflection-based access to Java-version-specific Kerberos internal classes.
57 | This can break across Java versions and JVM implementations. Specifically, Java 8 has classes that Java 6 doesn't; the IBM JVM is very different.
58 | * All its exceptions are basic `IOException` instances, so they are hard to match on without looking at the text, which is very brittle.
59 | * Some invoked operations are relayed without the stack trace (this should now be fixed).
60 | * Diagnostics could be improved. (This is one of those British understatements; it really means "it would be really nice if you could actually get any hint as to WTF is going on inside the class, as otherwise you are left with nothing to go on other than some message that a user at a random bit of code wasn't authorized".)
61 | 
62 | The issues related to diagnostics, logging, exception types and inner causes could be addressed. It would be nice to also have an exception cached at init time, so that diagnostics code could even track down where the init took place. Volunteers welcome. That said, here are some bits of the code where patches would be vetoed:
63 | 
64 | * Replacing the current text of exceptions. We don't know what is scanning for that text, or what documents go with them.
65 | Extending the text should be acceptable.
66 | * All exceptions must remain subclasses of `IOException`.
67 | * Logging must not leak secrets, such as tokens.
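To make the "initialize early" troublespot above concrete, here is a minimal sketch of a keytab-based login performed before any filesystem access. The principal, keytab path and namenode URI are hypothetical; `UserGroupInformation.setConfiguration()`, `loginUserFromKeytab()` and `FileSystem.get()` are the Hadoop calls involved.

```java
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;

public class EarlyKeytabLogin {
  public static void main(String[] args) throws Exception {
    // Load the security configuration *before* anything triggers UGI initialization.
    Configuration conf = new Configuration();
    conf.set("hadoop.security.authentication", "kerberos");
    UserGroupInformation.setConfiguration(conf);

    // Log in from a keytab (hypothetical principal and path) while UGI is still untouched.
    UserGroupInformation.loginUserFromKeytab(
        "myservice/host.example.com@EXAMPLE.COM",
        "/etc/security/keytabs/myservice.keytab");

    // Only now go near the filesystem; it picks up the logged-in principal.
    FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020/"), conf);
    System.out.println("/ exists: " + fs.exists(new Path("/")));
  }
}
```

Reversing the order (creating the `FileSystem` first) would initialize UGI from whatever ticket cache happened to be lying around, which is exactly the trap described above.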
68 | 69 | 70 | ## Core UGI Operations 71 | 72 | 73 | ### `isSecurityEnabled()` 74 | 75 | One of the foundational calls is the `UserGroupInformation.isSecurityEnabled()` 76 | 77 | It crops up in code like this 78 | 79 | ``` 80 | if(!UserGroupInformation.isSecurityEnabled()) { 81 | stayInALifeOfNaiveInnocence(); 82 | } else { 83 | sufferTheEternalPainOfKerberos(); 84 | } 85 | ``` 86 | 87 | Having two branches of code, the "insecure" and "secure mode" is actually dangerous: the entire 88 | security-enabled branch only ever gets executed when run against a secure Hadoop cluster 89 | 90 | **Important** 91 | 92 | *If you have separate secure and insecure codepaths, you must test on a secure cluster* 93 | *alongside an insecure one. Otherwise coverage of code and application state will be* 94 | *fundamentally lacking.* 95 | 96 | *Unless you put in the effort, all your unit tests will be of the insecure codepath.* 97 | 98 | *This means there's an entire codepath which won't get exercised until you run integration* 99 | *tests on a secure cluster, or worse: until you ship.* 100 | 101 | What to do? Alongside the testing, the other strategy is: keep the differences between 102 | the two branches down to a minimum. If you look at what YARN does, it always uses 103 | renewable tokens to authenticate communication between the YARN Resource Manager and 104 | a deployed application. As a result, one codepath for token creation, while token propagation 105 | and renewal is automatically tested on all applications. 106 | 107 | Could your applications do the same? Certainly as far as token- and delegation-token based 108 | mechanisms for callers to show that they have been granted access rights to a service. 109 | 110 | ### `getLoginUser()` 111 | 112 | This returns the logged in user 113 | 114 | UserGroupInformation user = UserGroupInformation.getLoginUser(); 115 | 116 | If there is no logged user --that is, the login process hasn't started yet, 117 | this triggers the login and the starting of the background refresh thread. 118 | 119 | This makes it a point where the security kicks in: all configuration resources 120 | must be loaded in advance. 121 | 122 | ### `checkTGTAndReloginFromKeytab()` 123 | 124 | 125 | UserGroupInformation.checkTGTAndReloginFromKeytab(); 126 | 127 | If security is not enabled, this is a no-op. 128 | 129 | If security is enabled, and the last login took place "long enough ago", 130 | this will trigger a re-login if needed (which may fail, 131 | of course). 132 | 133 | If the last successful login was recent enough, this will be a no-op. This makes it a low 134 | cost operation to include in IPC/REST client operations so as to ensure that your 135 | tickets are up to date. 136 | 137 | *Important*: If the login fails, UGI will remember this and not retry until a time 138 | limit has passed, even if other methods invoke the operation. The property 139 | `hadoop.kerberos.min.seconds.before.relogin` controls this delay; the default is 60s. 140 | 141 | What does that mean? A failure lasts for a while, even if it is a transient one. 142 | 143 | ### `getCurrentUser()` 144 | 145 | This returns the *current* user. 146 | 147 | 148 | 149 | 150 | ## Environment variable-managed UGI Initialization 151 | 152 | There are some environment variables which configure UGI. 
153 | 154 | 155 | | Environment Variable | Meaning | 156 | |----------------------------------------------------|----------------------------| 157 | | HADOOP_PROXY_USER | identity of a proxy user to authenticate as | 158 | | HADOOP_TOKEN_FILE_LOCATION | local path to a token file | 159 | 160 | Why environment variables? They offer some features 161 | 162 | 1. Hadoop environment setup scripts can set them 163 | 1. When launching YARN containers, they may be set as environment variables. 164 | 165 | As the UGI code is shared across all clients of HDFS and YARN; these environment 166 | variables can be used to configure *any* application which communicates with Hadoop 167 | services via the UGI-authenticated clients. Essentially: all Java IPC clients and 168 | those REST clients using (the Hadoop-implemented REST clients)[web_and_rest.html]. 169 | 170 | ## Debugging UGI 171 | 172 | UGI supports low-level logging via the log4J log `org.apache.hadoop.security.UserGroupInformation`; 173 | set the system property `HADOOP_JAAS_DEBUG=true` to have the JAAS context logging at 174 | the debug level via some Java log API. 175 | 176 | It's helpful to back this up with logging the `org.apache.hadoop.security.authentication` 177 | package in `hadoop-auth` 178 | 179 | ``` 180 | log4j.logger.org.apache.hadoop.security.authentication=DEBUG 181 | log4j.logger.org.apache.hadoop.security=DEBUG 182 | ``` 183 | 184 | 185 | ## Proxy Users 186 | 187 | Some applications need to act on behalf of other users. For example: Oozie wants to run scheduled 188 | jobs as people, YARN services 189 | 190 | 191 | The current user is not always the same as the logged in user; it changes 192 | when a service performs an action on the user's behalf 193 | 194 | ### `createProxyUser()` 195 | 196 | Proxy users are a feature which was included in the Hadoop security model for services 197 | such as Oozie; a service which needs to be able to execute work on behalf of a user 198 | 199 | ### `doAs()` 200 | 201 | 202 | This method is at the core of UGI. A call to `doAs()` executes the inner code 203 | *as the user*. In secure, that means using the Kerberos tickets and Hadoop delegation 204 | tokens belonging to them. 205 | 206 | Example: loading a filesystem as a user 207 | 208 | ``` 209 | 210 | UserGroupInformation proxy = 211 | UserGroupInformation.createProxyUser(user, 212 | UserGroupInformation.getLoginUser()); 213 | 214 | FileSystem userFS = proxy.doAs( 215 | new PrivilegedExceptionAction() { 216 | public FileSystem run() throws Exception { 217 | return FileSystem.get(FileSystem.getDefaultUri(), conf); 218 | } 219 | }); 220 | ``` 221 | 222 | Here the variable `userFS` contains a client of the Hadoop Filesystem with 223 | the home directory and access rights of the user `user`. If the user identity 224 | had come in via an RPC call, they'd 225 | -------------------------------------------------------------------------------- /sections/web_and_rest.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Web, REST and SPNEGO 16 | 17 | SPNEGO is the acronym of the protocol by which HTTP clients can authenticate with a web site using Kerberos. This allows the client to identify and authenticate itself to a web site or a web service. 
18 | SPNEGO is supported by 19 | 20 | * The standard browsers, to different levels of pain of use 21 | * `curl` on the command line 22 | * `java.net.URL` 23 | 24 | The final point is key: it can be used programmatically in Java, so used by REST client applications to authenticate with a remote Web Service. 25 | 26 | Exactly how the Java runtime implements its SPNEGO authentication is a mystery to all. 27 | Unlike, say Hadoop IPC, where the entire authentication code has been implemented by people whose email addresses you can identify from the change log and so ask hard questions, what the JDK does is a black hole. 28 | 29 | The sole source of information is the JDK source, and anything which IDE decompilers 30 | can add if you end up stepping in to vendor-specific classes. 31 | 32 | There is [one readme file](https://github.com/ddopson/openjdk-test/blob/master/sun/net/www/protocol/http/spnegoReadme) hidden in the test documentation. 33 | 34 | 35 | ## Configuring Firefox to use SPNEGO 36 | 37 | Firefox is the easiest browser to set up with SPNEGO support, as it is done in `about:config `and then persisted 38 | Here are the settings for a local VM, a VM which has an entry in the `/etc/hosts`: 39 | 40 | ``` 41 | 192.168.1.134 devix.cotham.uk devix 42 | ``` 43 | 44 | This hostname is then listed in firefox's config as a URL to trust. 45 | 46 | ![firefox spnego](../images/firefox_spnego_setup.png) 47 | 48 | ## Chrome and SPNEGO 49 | 50 | Historically, Chrome needed to be configured on the command line to use SPNEGO, which was complicated to the point of unusability. 51 | 52 | Fortunately, there is a better way, [Chromium Policy Templates](https://www.chromium.org/administrators/policy-templates). 53 | 54 | See [Google Chrome, SPNEGO, and WebHDFS on Hadoop](http://www.ghostar.org/2015/06/google-chrome-spnego-and-webhdfs-on-hadoop/) 55 | 56 | 57 | ## Why not use Apache HTTP Components? 58 | 59 | The Apache HTTP Client/http components have a well-deserved reputation for being great libraries to work with remote HTTP servers. 60 | 61 | Should you use them for Kerberos/SPNEGO authenticated applications? 62 | 63 | **No.** 64 | 65 | As [the documentation says](http://hc.apache.org/httpcomponents-client-4.3.x/tutorial/html/authentication.html#spnego). 66 | 67 | 68 | > There are a lot of issues that can happen but if lucky it'll work without too much of a problem. It should also provide some output to debug with. 69 | 70 | 71 | That's not the kind of information you want to read when working out how to talk to a SPNEGO-authed server. 72 | In its favour: it's being honest, 73 | and "if you are lucky it will work" could probably be used to describe the entire 74 | JDK Kerberos libraries. However: they are being honest; 75 | it hasn't been a good experience trying to get Jersey to work with secure REST endpoints using the http components as the back end. 76 | 77 | 78 | *Don't waste time or make things worse: go with the JDK libraries from the outset* 79 | 80 | 81 | # Jersey SPNEGO support 82 | 83 | There is not enough space to describe how to do this; examine the code. 84 | 85 | # Apache CXF 86 | 87 | I've been told that Apache CXF supports SPNEGO —but not yet experimented with it. Any 88 | insight here would be welcome. 89 | 90 | ## SPNEGO REST clients in the Hadoop codebase 91 | 92 | The first point to note is that there is more than one piece of code 93 | adding SPNEGO support to Jersey in the Hadoop libraries -there are at 94 | least three slightly different ones. 
95 | 96 | Code in: 97 | 98 | ### WebHDFS 99 | 100 | In the HDFS codebase. 101 | 102 | ### KMS 103 | 104 | This is probably the best starting point for any REST client which does 105 | not want to address the challenge of delegation token renewal. 106 | 107 | ### YARN timeline server 108 | 109 | This handles delegation token renewal by supporting an explicit 110 | renew-token REST operation. A scheduled operation in the client is used to issue this call 111 | regularly and so keep the token up to date. 112 | 113 | ## Implementing a SPNEGO-authenticated endpoint 114 | 115 | This isn't as hard as you think: you need to add an authentication filter 116 | 117 | ## Fun facts 118 | 119 | * [HADOOP-10850](https://issues.apache.org/jira/browse/HADOOP-10850) The Java SPNEGO code 120 | will blacklist any host where initializing the negotiation code fails. 121 | The blacklist lasts the duration of the JVM. 122 | 123 | ## Adding Delegation token renewal 124 | 125 | 126 | Simplest way to do this is to have something in the background which makes `OPTIONS` or `HEAD` 127 | calls of the endpoint (the former relies on `OPTIONS` not being disabled, the latter on `HEAD`) 128 | being inexpensive. 129 | 130 | ## Supporting custom webauth initializers 131 | 132 | Many large organizations implement their own authentication system. This can be a source 133 | of "entertainment", that is, if fielding support calls in stack traces which include 134 | private modules is considered entertaining. 135 | 136 | TODO: 137 | 1. How to declare a custom webauth renderer in the RM proxy 138 | 1. How to handle it in a client 139 | 140 | ## Identifying and Authenticating callers in Web/REST endpoints 141 | 142 | 143 | 144 | ```java 145 | private static UserGroupInformation getUser(HttpServletRequest req) { 146 | String remoteUser = req.getRemoteUser(); 147 | UserGroupInformation callerUGI = null; 148 | if (remoteUser != null) { 149 | callerUGI = UserGroupInformation.createRemoteUser(remoteUser); 150 | } 151 | return callerUGI; 152 | } 153 | ``` 154 | 155 | Note: the remote caller doesn't have any credentials. The service 156 | 157 | This can then be used to process the events 158 | ```java 159 | @PUT 160 | @Path("/jobs/{jobid}/tasks/{taskid}/attempts/{attemptid}/state") 161 | @Produces({ MediaType.APPLICATION_JSON, MediaType.APPLICATION_XML }) 162 | @Consumes({ MediaType.APPLICATION_JSON, MediaType.APPLICATION_XML }) 163 | public Response updateJobTaskAttemptState(JobTaskAttemptState targetState, 164 | @Context HttpServletRequest request, @PathParam("jobid")) 165 | throws IOException, InterruptedException { 166 | init(); 167 | UserGroupInformation callerUGI = getUser(request); 168 | // if the UGI is null, no remote user. 169 | 170 | ``` 171 | 172 | -------------------------------------------------------------------------------- /sections/what_is_kerberos.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # What is Kerberos? 16 | 17 | > Yog-Sothoth knows the gate. 18 | > Yog-Sothoth is the gate. 19 | > Yog-Sothoth is the key and guardian of the gate. 20 | > Past, present, future, all are one in Yog-Sothoth. 21 | > He knows where the Old Ones broke through of old, and where They shall break through again. 22 | > He knows where They have trod earth's fields, and where They still tread them, and why no one can behold Them as They tread. 
23 | 24 | > *["The Dunwich Horror"](https://en.wikisource.org/wiki/The_Dunwich_Horror), HP Lovecraft, 1928* 25 | 26 | 27 | Kerberos is a system for authenticating access to distributed services: 28 | 29 | 1. that callers to a service represent a `principal` in the system, or 30 | 1. That a caller to a service has been granted the right to act on behalf of a principal 31 | —a right which the principal can grant for a limited amount of time. 32 | 33 | In Hadoop, feature #2 is key: a user or a process may *delegate* the authority to another 34 | process, which can then talk to the desired service with the delegated authority. These 35 | delegation rights are both limited in scope --- the principal delegates authority on a 36 | service-by-service basis --- and in time. The latter is for security reasons ---it guarantees 37 | that if the secret used to act as a delegate, the *token*, is stolen, there is 38 | only a finite time for which it can be used. 39 | 40 | How does it work? That is beyond the scope of this book and its author. 41 | 42 | It is covered in detail in [Coluris01], S7.6.2. 43 | However, anyone attempting to read this will generally come out with at least a light headache 44 | and no better informed. 45 | 46 | 47 | For an approximate summary of the concepts 48 | 49 | ## Kerberos Domain Controller, *the KDC* 50 | 51 | The KDC is the gate, it is is the key and guardian of the gate, it is the gateway to the 52 | madness that is Kerberos. 53 | 54 | Every Kerberos Realm needs at least one. There's one for Linux and Active Directory 55 | can act as a federated KDC infrastructure. Hadoop cluster management tools often 56 | aid in setting up a KDC for a Hadoop cluster. 57 | There's even a minature one, the `MiniKDC` in the Hadoop source for [testing](testing.html). 58 | 59 | KDCs are managed by operations teams. If a developer finds themselves maintaining a KDC outside 60 | of a test environment, they are in trouble and probably out of their depth. 61 | 62 | ## Kerberos Principal 63 | 64 | A principal is an identity in the system; a person or a thing like the hadoop namenode 65 | which has been given an identity. 66 | 67 | In Hadoop, a different principal is usually created for each service and machine in the cluster, 68 | such as `hdfs/node1`, `hdfs/node2`, ... etc. These principals would then be used for 69 | all HDFS daemons running on node1, node2, etc. 70 | 71 | It's possible to shortcut this and skip the machine specific principal, downgrading 72 | to one per service, such as `hdfs`, `yarn`, `hbase` —or even one for all hadoop applications, 73 | such as `hadoop`. This can be done on a small cluster, but doesn't scale well, or makes 74 | working out WTF is going on difficult. 75 | 76 | In particular, the bits of Kerberos which handle logins, the Kerberos Domain Controllers, 77 | treat repeated attempts to log in as the same principal within a short period of time 78 | as some attack on the system, such as a replay or key guessing attack. The requests 79 | are all automatically rejected (presumably without any validation, so as to reduce 80 | CPU load on the server). Even a small Hadoop cluster could generate enough authentication 81 | requests on a cluster restart for this to happen —hence a different principal for every 82 | service on every node. 83 | 84 | How do the Hadoop services know which principal to identify themselves at? Generally, 85 | though Hadoop configuration files. 
They also determine the hostname, and use this to 86 | decide which of the possible principals in their keytab (see below) to identify themselves 87 | at. For this to work, machines have to know who they are. 88 | 89 | Specifically 90 | 1. They have to have a name 91 | 1. That name has to be in their host table or DNS 92 | 1. It has to match the IP address of the host 93 | 94 | #### what if there is more than one IP address on the host? 95 | 96 | Generally Hadoop services 97 | are single-IP address, which is a limitation that is likely to be addressed at some time. 98 | So who knows. Actually, it comes from `org.apache.hadoop.net.NetUtils.getHostname()`, which 99 | invokes `InetAddress.getLocalHost()` and relies on this to return a hostname. 100 | It's because of this need to know hostnames for principals, that requests for 101 | Hadoop services to use IP Addresses over hostnames, such as [MAPREDUCE-6463](https://issues.apache.org/jira/browse/MAPREDUCE-6463) 102 | are declined. It also means that if a machine does not know its own hostname, things 103 | do not work [HADOOP-3426](https://issues.apache.org/jira/browse/HADOOP-3426), 104 | [HADOOP-10011](https://issues.apache.org/jira/browse/HADOOP-10011). 105 | 106 | ## Kerberos Realm 107 | 108 | A Kerberos *Realm* is the security equivalent of a subnet: all principals live in a realm. 109 | It is conventional, though not mandatory, to use capital letters and a single name, rather 110 | than a dotted network address. Examples: `ENTERPRISE`, `HCLUSTER` 111 | 112 | Kerberos allows different realms to have some form of trust of others. This would allow 113 | a Hadoop cluster with its own KDC and realm to trust the `ENTERPRISE` realm, but for the 114 | enterprise realm to not trust the HCLUSTER realm, and hence all its principals. This would 115 | prevent a principal `hdfs/node1@HCLUSTER` from having access to the `ENTERPRISE` systems. 116 | While this is a bit tricky to set up, it means that keytabs created for the Hadoop cluster 117 | (see below) are only a security risk for the Hadoop cluster and all data kept in/processed 118 | by it, rather than the entire organisation. 119 | 120 | ## Kerberos login, `kinit` 121 | 122 | The command line program `kinit` is how a user authenticates with a KDC on a unix system; 123 | it uses the information stored in `/etc/krb` 124 | 125 | Alongside `kinit`, comes `kdestroy`, to destroy credentials/log out, and `klist` to list the current 126 | status. The `kdestroy` command is invaluable if you want to verify that any program you start 127 | on the command line really is reading in and using keytab. 
128 | 129 | Here's what a full `klist -v` listing looks like 130 | 131 | ``` 132 | $ klist -v 133 | Credentials cache: API:489E6666-45D0-4F04-9A1D-FCD5D48EEA07 134 | Principal: stevel@COTHAM 135 | Cache version: 0 136 | 137 | Server: krbtgt/COTHAM@COTHAM 138 | Client: stevel@COTHAM 139 | Ticket etype: aes256-cts-hmac-sha1-96, kvno 1 140 | Ticket length: 326 141 | Auth time: Sep 2 11:52:02 2015 142 | End time: Sep 3 11:52:01 2015 143 | Renew till: Sep 2 11:52:02 2015 144 | Ticket flags: enc-pa-rep, initial, renewable, forwardable 145 | Addresses: addressless 146 | 147 | Server: HTTP/devix.cotham.uk@COTHAM 148 | Client: stevel@COTHAM 149 | Ticket etype: aes256-cts-hmac-sha1-96, kvno 25 150 | Ticket length: 333 151 | Auth time: Sep 2 11:52:02 2015 152 | Start time: Sep 2 12:20:00 2015 153 | End time: Sep 3 11:52:01 2015 154 | Ticket flags: enc-pa-rep, transited-policy-checked, forwardable 155 | Addresses: addressless 156 | 157 | ``` 158 | 159 | A shorter summary comes from the basic `klist` 160 | 161 | ``` 162 | $ klist 163 | Credentials cache: API:489E6666-45D0-4F04-9A1D-FCD5D48EEA07 164 | Principal: stevel@COTHAM 165 | 166 | Issued Expires Principal 167 | Sep 2 11:52:02 2015 Sep 3 11:52:01 2015 krbtgt/COTHAM@COTHAM 168 | Sep 2 12:20:00 2015 Sep 3 11:52:01 2015 HTTP/devix.cotham.uk@COTHAM 169 | ``` 170 | 171 | This shows that 172 | 1. The user is logged in as `stevel@COTHAM` 173 | 1. They have a ticket to work with the ticket granting service, `krbtgt/COTHAM@COTHAM`. 174 | 1. They have a ticket to authenticate with the principal 175 | `HTTP/devix.cotham.uk@COTHAM`. This is used by some HTTP services running on the host 176 | (`devix.cotham.uk`), specifically the Hadoop Namenode and Resource Manager web pages. 177 | These have both been configured to require Kerberos authentication via [SPNEGO](web_and_rest.html), 178 | and to use `HTTP` as the user. The full principal `HTTP/devix.cotham.uk` is determined 179 | from the host running the service. 180 | 181 | ## Keytab 182 | 183 | A (binary) file containing the secrets needed to log in as a principal 184 | 185 | 1. It contains all the information to log in as a principal, so is a sensitive file. 186 | 1. It can hold many principals, so one can be created for, say, hdfs, 187 | which contails all its principals, `hdfs/node1@HCLUSTER`, `hdfs/node2@HCLUSTER`, ...etc. 188 | Thus only one keytab per service is needed. 189 | 1. It is created by the KDC administrators, who must then securely propagate that file 190 | to where it can be used. 191 | 192 | Keytabs are the only way in which programs can directly authenticate themselves with Kerberos, 193 | (though they can indirectly do this with credentials passed to them). This means 194 | that for any long-lived process, a keytab is needed. 195 | 196 | Operations teams are generally very reluctant to provide keytabs. They will need to 197 | create them for all long-lived services which run in the cluster. For services 198 | such as HDFS and YARN this is generally done at cluster setup time. YARN services 199 | have to deal with this problem whenever a user wants to run a long lived *YARN service* 200 | within the cluster: the technical one of keytab management and the organisational one 201 | of getting the keytab in the first place. 202 | 203 | 204 | To look at and work with keytabs, the `ktutil` command line program is the tool of choice. 205 | 206 | ## Tickets 207 | 208 | Kerberos is built around the notion of *tickets*. 
209 | 210 | A ticket is something which can be passed to a server to identify that the caller 211 | and to provide a secret key that can be used between the client an the server 212 | —for the duration of the ticket's lifetime. It is all that a server needs to 213 | authenticate a client: there's no need for the server to talk to the KDC. 214 | 215 | What's important is that tickets can be passed on: an authenticated principal 216 | can obtain a ticket to a service, and pass that on to another process in the distributed 217 | system. The recipient can then issue requests on behalf of the original principal, 218 | using that ticket. That recipient only has the permissions granted to the ticket 219 | (it does not have any other permissions of the principal, unless those tickets are 220 | also provided), and those permissions are only valid for as long as the ticket 221 | is valid. 222 | 223 | The limited lifetime of tickets ensures that even if a ticket is captured by a malicious 224 | attacker, they can only make use of the credential for the lifetime of the ticket. 225 | The ops team doesn't need to worry about lost/stolen tickets, to have a process for 226 | revoking them, as they expire within a short time period, usually a couple of days. 227 | 228 | This notion of tickets starts right at the bottom of Kerberos. When a principal 229 | authenticates with the KDC, it doesn't get any special authentication secrets 230 | —it gets a ticket to the *Ticket Granting Service*. This ticket can then be used 231 | to get tickets to other services —and, like any other ticket, can be forwarded. 232 | Equally importantly, the ticket will expire —forcing the principal to re-authenticate 233 | via the command line or a keytab. 234 | 235 | ## Kerberos Authentication Service 236 | 237 | This is network-accessible service which runs in the KDC, and which is used 238 | to authenticate callers. The protocol to authenticate callers is one of those 239 | low level details found in text books. What is important to know is that 240 | 241 | 1. The KDC contains 'a secret' shared with the principal. There is no public/private 242 | key system here, just a shared secret. 243 | 1. When a client authenticates on the command line, the password is (somehow) used 244 | to generate some data which is passed to the authentication service to show that 245 | at least during the authentication process, the client had the password for the principal 246 | they were trying to authenticate as. (i.e., part of the process includes a challenge issued by the 247 | KDC, a challenged hashed by the password to show that's in the callers' possession). 248 | 1. When a client authenticates via the keytab, a similar challenge-reponse operation 249 | takes place to allow the client to show they have the principal's (secret) data in that keytab. 250 | 1. When the KDC's secret key for a principal is changed, all existing keytabs stop working. 251 | 252 | ## Ticket Granting Service, *TGS* 253 | 254 | 1. A *Kerberos Domain Controller*, *KDC* exists on the network to manage Kerberos security 255 | 1. It contains an *Authentication Service*, which authenticates remote principals, and 256 | a *Ticket Granting Service*, *TGS*, which grants access to specific services. 257 | 1. The Authentication Service can authenticate via a password-based login, or though the principal having a stored copy of a shared secret, a *key*. 258 | 1. 
The TGS can issue *tickets*, secrets which declare that a caller has duration-limited access 259 | to a requested service, with the rights of the authenticated principal. 260 | 1. An authenticated principal can request tickets to services, which they can then use to authenticate 261 | directly with those services, and interact with them until the ticket expires. 262 | 1. A principal can also forward a ticket to any other process/service within the distributed system, 263 | to *delegate* rights. 264 | 1. This delegate can use the ticket to access the service, with the identity of the principal, for 265 | the lifespan of that ticket. 266 | 267 | Note that Hadoop goes beyond this with the notion of *delegation tokens*, secrets which are similar 268 | to *tickets*, but which can be issued and renewed directly by Hadoop services. That will 269 | be covered in a later chapter. 270 | 271 | ## Kerberos User Login 272 | 273 | A user logs in with the Kerberos Authentication Service 274 | 275 | 276 | ## Examples of Kerberos 277 | 278 | To put things into the context of Hadoop, here are some examples of how it could be used. 279 | 280 | 281 | ### Example: User listing an HDFS directory 282 | 283 | A user wishes to submit some work to a Hadoop cluster, a new YARN application. 284 | 285 | First, they must be logged in to the Kerberos infrastructure, 286 | 287 | 1. On unix, this is done by running `kinit` 288 | 1. The `kinit` program asks the user for their password. 289 | 1. This is used to authenticate the user with the *Authentication Service* of the 290 | KDC configured in `/etc/krb5.conf`. 291 | 1. The Kerberos *Authentication Service* authenticates the user and issues a TGT ticket, 292 | which is stored in the client's *Credentials Cache*. A call to `klist` can be used to verify this. 293 | 294 | Then, they must run a hadoop command 295 | 296 | hadoop fs -ls / 297 | 298 | 1. The HDFS client code attempts to talk to the HDFS Namenode via the 299 | `org.apache.hadoop.hdfs.protocol.ClientProtocol` IPC protocol 300 | 1. It checks to see if If security is enabled (via `UserGroupInformation.isSecurityEnabled()`) 301 | 1. If it is, it looks in metadata assocated with the protocol, metadata which is used 302 | to identify the Kerberos principal, the identity, of the namenode. 303 | 304 | @InterfaceAudience.Private 305 | @InterfaceStability.Evolving 306 | @KerberosInfo(serverPrincipal ="dfs.namenode.kerberos.principal") 307 | @TokenInfo(DelegationTokenSelector.class) 308 | public interface ClientProtocol { 309 | ... 310 | } 311 | 1. The Hadoop `Configuration` class instance used to initialise the client is 312 | used to retrieve the value of `"dfs.namenode.kerberos.principal"` —so identifying 313 | the service to which the client must have a valid ticket to talk to. 314 | 1. The Hadoop Kerberos code (this is in Java, not the OS), asks the Kerberos *Ticket Granting 315 | Service*, *the TGS*, for a ticket to talk to the Namenode's principal. It does this in a request 316 | authenticated with the *TGT* received during the `kinit` process. 317 | 1. This ticket is granted by the TGT, and cached in the memory of the JVM. 318 | 1. The Hadoop RPC layer then uses the ticket to authenticate the caller to the Namenode, and 319 | implicitly, authenticate the NameNode to the caller. 320 | 1. The Namenode can use the Kerberos information to determine the identity of the (authenticated) 321 | caller. 322 | 1. 
It can then look at the permissions of the user as recorded in the HDFS directory and file metadata 323 | and determine if they have the rights to perform the requested action. 324 | 1. If they do, the action is performed and the results returned to the caller. 325 | 326 | (Note there's some glossing over of details here, specifically how the client to Namenode 327 | authentication takes place, how they stay authenticated, how a users principal gets mapped to user name and 328 | how its group membership is ascertained for authorization purposes.) 329 | 330 | 331 | If a second request is made against the Namenode in the same Java process, there is no 332 | need to ask the TGT for a new ticket —not until the previous one expires. Instead cached 333 | authentication data is reused. This avoids involving the KDC in any further interactions with the 334 | Namenode. 335 | 336 | In Hadoop —as we will see— things go one step further, with Delegation Tokens. For now: ignore them. 337 | 338 | This example shows Kerberos at work, and the Hadoop IPC integration. 339 | 340 | As described, this follows the original Kerberos architecture, one principal per user, tickets 341 | between users and services. Hadoop/Kerberos integration has to jump one step further to 342 | address the scale problem, to avoid overloading the KDC with requests, to avoid 343 | problems such as having to have the client ask the TGT for a ticket to talk to individual 344 | Datanodes when reading or writing a file across the HDFS filesystem, or even handle the problem 345 | with a tens of thousands of clients having to refresh their Namenode tickets every few hours. 346 | 347 | This is done with a concept called *Hadoop Delegation Tokens*. These will be covered later. 348 | 349 | For now, know that the core authentication between principals and services utterly depends 350 | upon the Hadoop infrastructure, with an initial process as describe above. 351 | 352 | 353 | ## Kerberos and Windows Active Directory 354 | 355 | A lot of people are blissfully unaware of Kerberos. Such a life is one to treasure. Many of 356 | these people, do, however, log in to an enterprise network by way of Microsoft Active Directory. 357 | "AD" is a Kerberos Controller. [Kerberos Explained](https://msdn.microsoft.com/en-us/library/bb742516.aspx) 358 | 359 | If an organisation uses Active Directory to manage users, they are running Kerberos, so 360 | have the basic infrastructure needed to authenticate users and services within a Hadoop cluster. 361 | Users should be able to submit jobs as themselves, interacting with "Kerberized" Hadoop services. 362 | 363 | Setting up Hadoop to work with Active Directory is beyond the scope of this book. Please 364 | consult the references in the bibliography, and/or any vendor-specific documentation. 365 | 366 | For Developers, it is worth knowing that AD is subtly different from the MIT/Unix Kerberos controller, 367 | enough so that you should really test with a cluster using AD as the Kerberos infrastructure, alongside 368 | the MIT KDC. 369 | 370 | ## Limitations of Kerberos 371 | 372 | Kerberos is considered "the best there is" in terms of securing distributed systems. Its 373 | use of tickets is designed to limit the load on the KDC, as it is only interacted with when 374 | a principal requests a ticket, rather than having to validate every single request. 375 | 376 | The ability to delegate tokens to other processes allows transitive authentication as the original 377 | principal. 
This can be used by core Hadoop services to act on a users behalf, and by processes 378 | launched by the user. 379 | 380 | The fact that tickets/tokens are time limited means that if one is stolen, the time for which 381 | unauthorized access is possible is limited to the lifespan of the token. 382 | 383 | Finally, the fact that kerberos clients are standard in Windows, Linux and OS/X, and built 384 | into the Java runtime, means that it is possible to use Kerberos widely. 385 | 386 | This does not mean it is perfect. Known limitations are 387 | 388 | 1. The KDC is a Single Point of Failure, unless an HA system is set up (which Active Directory 389 | can do). 390 | 1. Excess load can overload the KDC. 391 | 1. The communications channels between services still need to be secure. Kerberos does not 392 | address data encryption. If those channels are not secure, then tickets can be intercepted or 393 | communications forged. 394 | 1. Time needs to be roughly consistent across machines, else the time-limited tokens won't work. 395 | 1. If time cannot be securely managed across machines (i.e. an insecure time synchronization, 396 | protocol is used), then it is theoretically possible to extend the lifetime of a stolen token. 397 | 1. Because a stolen ticket can be used directly against a service, there's no log of its use 398 | in the KDC. Every application needs to have its own audit log of actions performed by 399 | a user, so that the history of actions by a client authenticated with a stolen ticket 400 | can be traced. 401 | 1. It's an authentication service: it verifies callers and allows callers to pass that authentication 402 | information on. It doesn't deal with permissions *at all*. 403 | 404 | 405 | There's some coverage of other issues in 406 | [Kerberos in the Crosshairs: Golden Tickets, Silver Tickets, MITM, and More](https://digital-forensics.sans.org/blog/2014/11/24/kerberos-in-the-crosshairs-golden-tickets-silver-tickets-mitm-more) 407 | 408 | ## Hadoop/Kerberos Integration Issues 409 | 410 | Hadoop specific issues are: 411 | 412 | 1. While the ticketing process reduces KDC load, an entire 413 | Hadoop cluster starting up can generate the login requests of a few thousand principals over 414 | a short period of time. The Hadoop code contains some back-off logic to handle connection and 415 | authentication failures here. 416 | 1. Because granted tokens expire, long-lived YARN services need to have a mechanism for updating 417 | tokens. 418 | 1. It's both hard to code for kerberos, and test against it. 419 | 420 | Finally, it is *necessary but not sufficient*. 421 | 422 | Having a Kerberized application does not guarantee that it is secure: you need to think about 423 | possible weaknesses, ways in which untrusted callers can make use of the service, ways 424 | in which tokens and keytabs may be leaked (that includes log messages!) and defend against them. 425 | -------------------------------------------------------------------------------- /sections/yarn.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # YARN and YARN Applications 16 | 17 | YARN applications are somewhere where Hadoop authentication becomes some of its most complex. 18 | 19 | Anyone writing a YARN application will encounter Hadoop security, and will end up spending 20 | time debugging the problems. This is "the price of security". 21 | 22 | ## YARN Service security 23 | 24 | YARN Resource Managers (RM) and Node Managers (NM) perform work on behalf of the user. 
25 |
 26 | The NMs:
 27 |
 28 | 1. `Localize` resources: Download from HDFS or other filesystem into a local directory. This
 29 | is done using the delegation tokens attached to the container launch context. (For non-HDFS
 30 | resources, other credentials are used, such as object store login details in cluster configuration
 31 | files.)
 32 |
 33 | 1. Start the application as the user.
 34 |
 35 | ## Securing YARN Application Web UIs and REST APIs
 36 |
 37 | YARN provides a straightforward way of giving every YARN application SPNEGO-authenticated
 38 | web pages: it implements SPNEGO authentication in the Resource Manager Proxy.
 39 | A YARN application is expected to load the AM proxy filter when setting up its web UI; this filter
 40 | will redirect all HTTP requests coming from any host other than the RM Proxy hosts to an
 41 | RM proxy, to which the client app/browser must re-issue the request. The client will authenticate
 42 | against the principal of the RM Proxy (usually `yarn`), and, once authenticated, have its
 43 | request forwarded.
 44 |
 45 | As a result, all client interactions are SPNEGO-authenticated, without the YARN application
 46 | itself needing any Kerberos principal for the clients to authenticate against.
 47 |
 48 | Known weaknesses in this approach are:
 49 |
 50 | 1. As calls coming from the proxy hosts are not redirected, any application running
 51 | on those hosts has unrestricted access to the YARN applications. This is why in a secure cluster
 52 | the proxy hosts must run on cluster nodes which do not run end-user code (i.e. which do not run YARN
 53 | NodeManagers and hence do not schedule YARN containers).
 54 | 1. The HTTP requests between proxy and YARN RM Server are not encrypted.
 55 |
 56 | ## Securing YARN Application REST APIs
 57 |
 58 | YARN REST APIs running on the same port as the registered web UI of a YARN application are
 59 | automatically authenticated via SPNEGO authentication in the RM proxy.
 60 |
 61 | Any REST endpoint (and equally, any web UI) brought up on a different port does not
 62 | support SPNEGO authentication unless implemented in the YARN application itself.
 63 |
 64 | ## Strategies for token renewal on YARN services
 65 |
 66 |
 67 | ### Keytabs for AM and containers
 68 |
 69 | A keytab is provided for the application. This can be done by:
 70 |
 71 | 1. Installing it on every cluster node, then providing the path
 72 | to this in a configuration directory. The keytab must be in a secure directory path, where
 73 | only the service (and other trusted accounts) can read it.
 74 |
 75 | 1. Including the keytab as a resource for the container, relying on the Node Manager localizer
 76 | to download it from HDFS and store it locally. This avoids the administration task of
 77 | installing keytabs for specific services. It does require the client to have access to the keytab
 78 | and, as the keytab is uploaded to the distributed filesystem, it must be secured through the appropriate
 79 | path permissions.
 80 |
 81 | This is the strategy adopted by Apache Slider (incubating). Slider also pushes out specified
 82 | keytabs for deployed applications such as HBase, with the Application Master declaring the
 83 | HDFS paths to them in its Container Launch Requests.
 84 |
 85 | ### AM keytab + renewal and forwarding of Delegation Tokens to containers
 86 |
 87 | The Application Master is given the path to a keytab (usually a client-uploaded localized resource),
 88 | and so can stay authenticated with Kerberos. Launched containers are only given delegation tokens.
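A minimal sketch of the Application Master side, using Hadoop's `UserGroupInformation` API to log in from the localized keytab and keep the ticket fresh; the principal, keytab path and re-check interval here are illustrative:

```java
import java.io.IOException;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.security.UserGroupInformation;

/** Sketch: keep an Application Master logged in from a localized keytab. */
public class AMKerberosLogin {

  public static void startKeytabLogin(String principal, String keytabPath)
      throws IOException {
    // Initial login: after this, UserGroupInformation.getLoginUser() is the keytab identity.
    UserGroupInformation.loginUserFromKeytab(principal, keytabPath);

    // Re-check the TGT periodically: this is a no-op while the ticket is fresh,
    // and re-logs in from the keytab once the ticket is close to expiry.
    ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
    scheduler.scheduleAtFixedRate(() -> {
      try {
        UserGroupInformation.getLoginUser().checkTGTAndReloginFromKeytab();
      } catch (IOException e) {
        System.err.println("Re-login from keytab failed: " + e);
      }
    }, 1, 1, TimeUnit.HOURS);
  }
}
```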
89 | Before a delegation token is due to expire, the processes running in the containers must request new
 90 | tokens from the Application Master. Obviously, communications between containers and their Application
 91 | Master must themselves be authenticated, so as to prevent malicious code from requesting new tokens
 92 | from an Application Master.
 93 |
 94 | This is the strategy used by Spark 1.5+. Communications between container processes and the AM
 95 | are over HTTPS-authenticated Akka channels.
 96 |
 97 |
 98 | ### Client-side push of renewed Delegation Tokens
 99 |
 100 | This strategy may be the sole one acceptable to a strict operations team: a client process
 101 | running on an account holding a Kerberos TGT negotiates with all needed cluster services
 102 | for delegation tokens, tokens which are then pushed out to the Application Master via
 103 | some RPC interface.
 104 |
 105 | This does require the client process to be re-executed on a regular basis; a cron or Oozie job
 106 | can do this.
 107 | -------------------------------------------------------------------------------- /sections/zookeeper.md: --------------------------------------------------------------------------------
 1 |  14 |
 15 | # Zookeeper
 16 |
 17 | Apache Zookeeper uses Kerberos + [SASL](sasl.md) to authenticate callers.
 18 | The specifics are covered in [Zookeeper and SASL](https://cwiki.apache.org/confluence/display/ZOOKEEPER/Zookeeper+and+SASL).
 19 |
 20 | Other than SASL, its access control is all based around secret "Digests" which are shared between client and server, and sent over the (unencrypted) channel.
 21 | The Digest is stored in the ZK node; any client which provides the same Digest is considered to be that principal, and so gains
 22 | those rights. Digests cannot be relied upon to securely identify callers.
 23 |
 24 | What's generally more troublesome about ZK is that its ACL-based permission scheme is "confusing".
 25 | Most developers are used to hierarchical permissions, in which the permissions of the parent
 26 | paths propagate down. If a root directory is world writeable, then anyone with those permissions
 27 | can implicitly manipulate entries below it.
 28 |
 29 | ZK nodes are not like this: permissions are set on a zknode-by-zknode basis.
 30 |
 31 | For extra fun, ZK does not have any notion of groups; you can't have users in specific groups
 32 | (e.g. 'administrators').
 33 |
 34 | The Hadoop Security book covers the client-side ZK APIs briefly.
 35 |
 36 | ## Enabling SASL in ZK
 37 |
 38 | SASL is enabled in ZK by setting a system property. While adequate for a server,
 39 | it's less than convenient when using ZK in an application as it means something very important:
 40 | you cannot have a non-SASL and a SASL ZK connection at the same time.
 41 | Although you could theoretically create one connection, change the system properties and then
 42 | create the next, Apache Curator doesn't do this.
 43 |
 44 | ```java
 45 | System.setProperty("zookeeper.sasl.client", "true");
 46 | ```
 47 |
 48 | As everyone sensible uses Curator to handle transient disconnections and ZK node failover,
 49 | this isn't practicable. (Someone needs to fix this —volunteers welcome)
 50 |
 51 | ## Working with ZK
 52 |
 53 | If you want to use ZK in production you have to:
 54 |
 55 | 1. Remember that even in a secure cluster, parts of the ZK path, including the root `/` znode,
 56 | are usually world writeable. You may unintentionally be relying on this and be creating
 57 | insecure paths.
(Some of our production tests explicitly check this, BTW).
 58 |
 59 | 1. Remember that ZK permissions have to be explicitly asked for: there is no inheritance.
 60 | Set them for every node you intend to work with.
 61 |
 62 | 1. Lock down node permissions in a secure cluster, so that only authenticated users can
 63 | read secret data or manipulate data which must only be written by specific services.
 64 | As an example, HBase and Accumulo both publish their binding information to ZK. The
 65 | YARN Registry has per-user zknode paths set up so that all nodes under `/users/${username}`
 66 | are written by the user `${username}`, and so carry that user's authority.
 67 |
 68 | 1. On an insecure cluster, do not try to create an ACL with "the current user" until the user
 69 | is actually authenticated.
 70 |
 71 | 1. If you want administrative accounts to have access to znodes, explicitly set it.
 72 |
 73 | ## Basic code
 74 |
 75 | ```java
 76 | List<ACL> perms = new ArrayList<>();
 77 | if (UserGroupInformation.isSecurityEnabled()) {
 78 |   perms.add(new ACL(ZooDefs.Perms.ALL, ZooDefs.Ids.AUTH_IDS));
 79 |   perms.add(new ACL(ZooDefs.Perms.READ, ZooDefs.Ids.ANYONE_ID_UNSAFE));
 80 | } else {
 81 |   perms.add(new ACL(ZooDefs.Perms.ALL, ZooDefs.Ids.ANYONE_ID_UNSAFE));
 82 | }
 83 | zk.createPath(path, null, perms, CreateMode.PERSISTENT);
 84 | ```
 85 |
 86 |
 87 | ## Example YARN Registry
 88 |
 89 | In the Hadoop YARN registry, in order to allow admin rights, we added a yarn
 90 | property to list principals who would be given full access.
 91 |
 92 | To avoid requiring all configuration files to list the explicit realm to use,
 93 | we added the concept that if the principal was listed purely as `user@`, rather than
 94 | `user@REALM`, we'd append the value of `hadoop.registry.kerberos.realm` —and
 95 | if that value was unset, the realm of the (logged in) caller.
 96 |
 97 | This means that in a secure cluster, all users of the YARN registry get znodes which grant
 98 | admin access to the `yarn`, `mapred` and `hdfs` users in the current Kerberos realm, unless
 99 | otherwise configured in the cluster's `core-site.xml`.
 100 |
 101 | ```xml
 102 | <property>
 103 |   <description>
 104 |     Key to set if the registry is secure. Turning it on
 105 |     changes the permissions policy from "open access"
 106 |     to restrictions on kerberos with the option of
 107 |     a user adding one or more auth key pairs down their
 108 |     own tree.
 109 |   </description>
 110 |   <name>hadoop.registry.secure</name>
 111 |   <value>false</value>
 112 | </property>
 113 |
 114 | <property>
 115 |   <description>
 116 |     A comma separated list of Zookeeper ACL identifiers with
 117 |     system access to the registry in a secure cluster.
 118 |
 119 |     These are given full access to all entries.
 120 |
 121 |     If there is an "@" at the end of a SASL entry it
 122 |     instructs the registry client to append the default kerberos domain.
 123 |   </description>
 124 |   <name>hadoop.registry.system.acls</name>
 125 |   <value>sasl:yarn@, sasl:mapred@, sasl:hdfs@</value>
 126 | </property>
 127 |
 128 | <property>
 129 |   <description>
 130 |     The kerberos realm: used to set the realm of
 131 |     system principals which do not declare their realm,
 132 |     and any other accounts that need the value.
 133 |
 134 |     If empty, the default realm of the running process
 135 |     is used.
 136 |
 137 |     If neither are known and the realm is needed, then the registry
 138 |     service/client will fail.
 139 |   </description>
 140 |   <name>hadoop.registry.kerberos.realm</name>
 141 |   <value></value>
 142 | </property>
 143 | ```
 144 |
 145 | ## ZK Client and JAAS
 146 |
 147 | Zookeeper needs a [jaas context](jaas.html) in SASL mode.
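As a sketch of how that context is usually supplied: the client JVM is pointed at a JAAS file via system properties, set before the first ZooKeeper/Curator connection is created (the file path below is illustrative):

```java
// Point the JVM at a JAAS configuration file containing a "Client" entry;
// the path is illustrative.
System.setProperty("java.security.auth.login.config", "/etc/myapp/zk-client-jaas.conf");

// Enable SASL on the ZK client, as discussed above.
System.setProperty("zookeeper.sasl.client", "true");

// Optional: use a login context named something other than the default "Client".
// System.setProperty("zookeeper.sasl.clientconfig", "MyZooKeeperClient");
```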
148 |
 149 | It will actually attempt to fall back to an unauthenticated connection if it doesn't get one.
 150 |
 151 | ZK's example JAAS client config:
 152 |
 153 | ```
 154 | Client {
 155 |   com.sun.security.auth.module.Krb5LoginModule required
 156 |   useKeyTab=true
 157 |   keyTab="/path/to/client/keytab"
 158 |   storeKey=true
 159 |   useTicketCache=false
 160 |   principal="yourzookeeperclient";
 161 | };
 162 | ```
 163 |
 164 | And here is one which should work when using your login credentials instead:
 165 |
 166 | ```
 167 | Client {
 168 |   com.sun.security.auth.module.Krb5LoginModule required
 169 |   useKeyTab=false
 170 |   useTicketCache=true
 171 |   principal="user@REALM"
 172 |   doNotPrompt=true;
 173 | };
 174 | ```
 175 |
 176 | ## How ZK reacts to authentication failures
 177 |
 178 | The ZK server appears to react to a SASL authentication failure by closing the connection
 179 | _without sending any error back to the client_.
 180 |
 181 | This means that, for a client, authentication problems surface as connection failures:
 182 |
 183 | ```
 184 | 2015-12-15 13:56:30,066 [main] DEBUG zk.CuratorService (zkList(695)) - ls /registry
 185 | Exception: `/': Failure of ls() on /: org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /registry: KeeperErrorCode = ConnectionLoss for /registry
 186 | 2015-12-15 13:56:58,892 [main] ERROR main.ServiceLauncher (error(344)) - Exception: `/': Failure of ls() on /: org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /registry: KeeperErrorCode = ConnectionLoss for /registry
 187 | org.apache.hadoop.registry.client.exceptions.RegistryIOException: `/': Failure of ls() on /: org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /registry: KeeperErrorCode = ConnectionLoss for /registry
 188 | at org.apache.hadoop.registry.client.impl.zk.CuratorService.operationFailure(CuratorService.java:403)
 189 | at org.apache.hadoop.registry.client.impl.zk.CuratorService.operationFailure(CuratorService.java:360)
 190 | at org.apache.hadoop.registry.client.impl.zk.CuratorService.zkList(CuratorService.java:701)
 191 | at org.apache.hadoop.registry.client.impl.zk.RegistryOperationsService.list(RegistryOperationsService.java:154)
 192 | at org.apache.hadoop.registry.client.binding.RegistryUtils.statChildren(RegistryUtils.java:204)
 193 | at org.apache.slider.client.SliderClient.actionResolve(SliderClient.java:3345)
 194 | at org.apache.slider.client.SliderClient.exec(SliderClient.java:431)
 195 | at org.apache.slider.client.SliderClient.runService(SliderClient.java:323)
 196 | at org.apache.slider.core.main.ServiceLauncher.launchService(ServiceLauncher.java:188)
 197 | at org.apache.slider.core.main.ServiceLauncher.launchServiceRobustly(ServiceLauncher.java:475)
 198 | at org.apache.slider.core.main.ServiceLauncher.launchServiceAndExit(ServiceLauncher.java:403)
 199 | at org.apache.slider.core.main.ServiceLauncher.serviceMain(ServiceLauncher.java:630)
 200 | at org.apache.slider.Slider.main(Slider.java:49)
 201 | Caused by: org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /registry
 202 | at org.apache.zookeeper.KeeperException.create(KeeperException.java:99)
 203 | at org.apache.zookeeper.KeeperException.create(KeeperException.java:51)
 204 | at org.apache.zookeeper.ZooKeeper.getChildren(ZooKeeper.java:1590)
 205 | at org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call(GetChildrenBuilderImpl.java:214)
 206 | at
org.apache.curator.framework.imps.GetChildrenBuilderImpl$3.call(GetChildrenBuilderImpl.java:203) 207 | at org.apache.curator.RetryLoop.callWithRetry(RetryLoop.java:107) 208 | at org.apache.curator.framework.imps.GetChildrenBuilderImpl.pathInForeground(GetChildrenBuilderImpl.java:200) 209 | at org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(GetChildrenBuilderImpl.java:191) 210 | at org.apache.curator.framework.imps.GetChildrenBuilderImpl.forPath(GetChildrenBuilderImpl.java:38) 211 | at org.apache.hadoop.registry.client.impl.zk.CuratorService.zkList(CuratorService.java:698) 212 | ... 10 more 213 | ``` 214 | 215 | If you can telnet into the ZK host & port then ZK is up, but rejecting authenticated calls. 216 | 217 | You need to go to the server logs (e.g. `/var/log/zookeeper/zookeeper.out`) to see what actually went wrong: 218 | 219 | ``` 220 | 2015-12-15 13:56:30,995 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxnFactory@197] - Accepted socket connection from /192.168.56.1:55882 221 | 2015-12-15 13:56:31,004 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@868] - Client attempting to establish new session at /192.168.56.1:55882 222 | 2015-12-15 13:56:31,031 - INFO [SyncThread:0:ZooKeeperServer@617] - Established session 0x151a5e1345d0003 with negotiated timeout 40000 for client /192.168.56.1:55882 223 | 2015-12-15 13:56:31,181 - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@969] - Client failed to SASL authenticate: javax.security.sasl.SaslException: 224 | GSS initiate failed [Caused by GSSException: Failure unspecified at GSS-API level (Mechanism level: Specified version of key is not available (44))] 225 | 2015-12-15 13:56:31,181 - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@975] - Closing client connection due to SASL authentication failure. 
226 | 2015-12-15 13:56:31,182 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@1007] - Closed socket connection for client /192.168.56.1:55882 which had 227 | sessionid 0x151a5e1345d0003 228 | 2015-12-15 13:56:31,182 - ERROR [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@178] - Unexpected Exception: 229 | java.nio.channels.CancelledKeyException 230 | at sun.nio.ch.SelectionKeyImpl.ensureValid(SelectionKeyImpl.java:73) 231 | at sun.nio.ch.SelectionKeyImpl.interestOps(SelectionKeyImpl.java:77) 232 | at org.apache.zookeeper.server.NIOServerCnxn.sendBuffer(NIOServerCnxn.java:151) 233 | at org.apache.zookeeper.server.NIOServerCnxn.sendResponse(NIOServerCnxn.java:1081) 234 | at org.apache.zookeeper.server.ZooKeeperServer.processPacket(ZooKeeperServer.java:936) 235 | at org.apache.zookeeper.server.NIOServerCnxn.readRequest(NIOServerCnxn.java:373) 236 | at org.apache.zookeeper.server.NIOServerCnxn.readPayload(NIOServerCnxn.java:200) 237 | at org.apache.zookeeper.server.NIOServerCnxn.doIO(NIOServerCnxn.java:244) 238 | at org.apache.zookeeper.server.NIOServerCnxnFactory.run(NIOServerCnxnFactory.java:208) 239 | at java.lang.Thread.run(Thread.java:745) 240 | 2015-12-15 13:56:31,186 - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@346] - Exception causing close of session 0x151a5e1345d0003 241 | due to java.nio.channels.CancelledKeyException 242 | 2015-12-15 13:56:32,540 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxnFactory@197] - Accepted socket connection from /192.168.56.1:55883 243 | 2015-12-15 13:56:32,542 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@861] - Client attempting to renew session 0x151a5e1345d0003 at /192.168.56.1:55883 244 | 2015-12-15 13:56:32,543 - INFO [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@617] - Established session 0x151a5e1345d0003 with negotiated timeout 40000 for 245 | clie nt /192.168.56.1:55883 246 | 2015-12-15 13:56:32,547 - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:ZooKeeperServer@969] - Client failed to SASL authenticate: javax.security.sasl.SaslException: 247 | GSS initiate failed [Caused by GSSException: Failure unspecified at GSS-API level (Mechanism level: Specified version of key is not available (44))] 248 | ``` 249 | 250 | 251 | ## Troubleshooting ZK 252 | 253 | There's a nice list from Jeremy Custenborder of what to do to troubleshoot ZK 254 | on [ZOOKEEPER-2345](https://issues.apache.org/jira/browse/ZOOKEEPER-2345?focusedCommentId=15134725&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15134725) 255 | 256 | -------------------------------------------------------------------------------- /src/uml/auth_token.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | 20 | @startuml 21 | autonumber 22 | actor Client 23 | 24 | 25 | Title: Requesting an Authentication Token 26 | 27 | @enduml 28 | 29 | 30 | -------------------------------------------------------------------------------- /src/uml/hdfs_uml.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | see: http://plantuml.com/sequence.html 20 | 21 | 22 | @startuml 23 | 24 | autonumber 25 | 26 | actor Client 27 | database NN 28 | database DN 29 | database LocalFS 30 | database fsEditLog 31 | 32 | Title HDFS Startup 33 | 34 | == Initialization == 35 | 36 | NN -> LocalFS: load keytab 37 | LocalFS --> NN: keytab 38 | NN -> KDC: authenticate(hdfs@namenode) 39 | KDC --> NN: TGT 40 | 41 | NN -> fsEditLog : replay 42 | fsEditLog --> NN : history 43 | 44 | rnote over NN 45 | rebuilds state: all delegation token 46 | events rebuild delegation token tables. 47 | endrnote 48 | 49 | == Datanode == 50 | 51 | DN -> LocalFS: load keytab 52 | LocalFS --> DN: keytab 53 | DN -> KDC: authenticate(hdfs@datanode) 54 | KDC --> DN: TGT 55 | 56 | DN -> LocalFS: load block metadata 57 | LocalFS --> DN: all block information, including BlockKeys 58 | 59 | rnote over DN 60 | DN init BlockTokenSecretManager 61 | endrnote 62 | 63 | DN -> NN : RPC.open 64 | NN --> DN : authenticate(hdfs@namenode) 65 | DN -> KDC : request-ticket(hdfs@namenode, TGT) 66 | KDC -> DN : ticket(hdfs@namenode, hdfs@datanode) 67 | DN -> NN: ticket(hdfs@namenode, hdfs@datanode) 68 | DN -> NN : heartbeat (block info, block keys) 69 | 70 | rnote over NN 71 | NN init BlockTokenSecretManager 72 | endrnote 73 | 74 | 75 | @enduml 76 | -------------------------------------------------------------------------------- /src/uml/index.md: -------------------------------------------------------------------------------- 1 | 14 | 15 | # Where PlantUML files will go. 16 | 17 | See [PlantUML](http://plantuml.com/) 18 | 19 | 1. [Kerberos Login](kerberos_login.txt) 20 | -------------------------------------------------------------------------------- /src/uml/kerberos_login.txt: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. 
You may obtain a copy of the License at
 9 | *
 10 | *    http://www.apache.org/licenses/LICENSE-2.0
 11 | *
 12 | * Unless required by applicable law or agreed to in writing, software
 13 | * distributed under the License is distributed on an "AS IS" BASIS,
 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 | * See the License for the specific language governing permissions and
 16 | * limitations under the License.
 17 | */
 18 |
 19 | see: http://plantuml.com/sequence.html
 20 | see: Coulouris & Dollimore, 3rd edition, p.295
 21 |
 22 | @startuml
 23 | autonumber
 24 | actor Client
 25 | database "Auth Server"
 26 |
 27 | Title: Authentication with Kerberos Authentication Server
 28 |
 29 | Client -> "Auth Server" : request Ticket(Client, T, n)
 30 |
 31 | rnote over "Auth Server"
 32 | Ticket(C,T) = T, C, t1, t2, KCT
 33 | end note
 34 |
 35 | "Auth Server" --> Client : {KCT, n}KC, {ticket(C,T)}KT
 36 |
 37 | rnote over Client
 38 | Decodes {KCT, n}KC
 39 | via password or keytab
 40 | 'n' demonstrates response from recipient of #1
 41 | KCT is session key with TGT
 42 | endrnote
 43 |
 44 | legend left
 45 | From Coulouris & Dollimore, 3rd edition, p.295
 46 | endlegend
 47 |
 48 | newpage Obtaining a ticket for server S from the TGS
 49 |
 50 | Client -> TGS : {C, t}KCT {ticket(C,T)}KT, S, n
 51 | rnote over TGS
 52 | Decodes ticket
 53 | Validates request against ticket
 54 | Generates new session key KCS
 55 | endrnote
 56 |
 57 | TGS --> Client: {KCS, n}KCT, {ticket(C,S)}KS
 58 |
 59 | rnote over Client
 60 | Decodes response with session key
 61 | Verifies nonce
 62 | saves KCS & {ticket(C,S)}KS
 63 | endrnote
 64 |
 65 | newpage Request of Server
 66 |
 67 | Client -> S : {C, t}KCS, {ticket(C,S)}KS, request
 68 | S -> Client: {n}KCS