├── .gitignore ├── README.md ├── pom.xml └── tex ├── a_zero_rename_committer.tex ├── bibliography.bib ├── commit-protocol.png ├── commit-protocol.puml ├── improvements-to-the-commit-protocols.tex ├── notes.tex ├── spark-protocol.png └── spark-protocol.puml /.gitignore: -------------------------------------------------------------------------------- 1 | ## Core latex/pdflatex auxiliary files: 2 | *.aux 3 | *.lof 4 | *.log 5 | *.lot 6 | *.fls 7 | *.out 8 | *.toc 9 | *.fmt 10 | *.fot 11 | *.cb 12 | *.cb2 13 | 14 | ## Intermediate documents: 15 | *.dvi 16 | *-converted-to.* 17 | # these rules might exclude image files for figures etc. 18 | # *.ps 19 | # *.eps 20 | # *.pdf 21 | 22 | ## Generated if empty string is given at "Please type another file name for output:" 23 | .pdf 24 | 25 | ## Bibliography auxiliary files (bibtex/biblatex/biber): 26 | *.bbl 27 | *.bcf 28 | *.blg 29 | *-blx.aux 30 | *-blx.bib 31 | *.run.xml 32 | 33 | ## Build tool auxiliary files: 34 | *.fdb_latexmk 35 | *.synctex 36 | *.synctex(busy) 37 | *.synctex.gz 38 | *.synctex.gz(busy) 39 | *.pdfsync 40 | 41 | ## Auxiliary and intermediate files from other packages: 42 | # algorithms 43 | *.alg 44 | *.loa 45 | 46 | # achemso 47 | acs-*.bib 48 | 49 | # amsthm 50 | *.thm 51 | 52 | # beamer 53 | *.nav 54 | *.pre 55 | *.snm 56 | *.vrb 57 | 58 | # changes 59 | *.soc 60 | 61 | # cprotect 62 | *.cpt 63 | 64 | # elsarticle (documentclass of Elsevier journals) 65 | *.spl 66 | 67 | # endnotes 68 | *.ent 69 | 70 | # fixme 71 | *.lox 72 | 73 | # feynmf/feynmp 74 | *.mf 75 | *.mp 76 | *.t[1-9] 77 | *.t[1-9][0-9] 78 | *.tfm 79 | 80 | #(r)(e)ledmac/(r)(e)ledpar 81 | *.end 82 | *.?end 83 | *.[1-9] 84 | *.[1-9][0-9] 85 | *.[1-9][0-9][0-9] 86 | *.[1-9]R 87 | *.[1-9][0-9]R 88 | *.[1-9][0-9][0-9]R 89 | *.eledsec[1-9] 90 | *.eledsec[1-9]R 91 | *.eledsec[1-9][0-9] 92 | *.eledsec[1-9][0-9]R 93 | *.eledsec[1-9][0-9][0-9] 94 | *.eledsec[1-9][0-9][0-9]R 95 | 96 | # glossaries 97 | *.acn 98 | *.acr 99 | *.glg 100 | *.glo 101 | *.gls 102 | *.glsdefs 103 | 104 | # gnuplottex 105 | *-gnuplottex-* 106 | 107 | # gregoriotex 108 | *.gaux 109 | *.gtex 110 | 111 | # hyperref 112 | *.brf 113 | 114 | # knitr 115 | *-concordance.tex 116 | # TODO Comment the next line if you want to keep your tikz graphics files 117 | *.tikz 118 | *-tikzDictionary 119 | 120 | # listings 121 | *.lol 122 | 123 | # makeidx 124 | *.idx 125 | *.ilg 126 | *.ind 127 | *.ist 128 | 129 | # minitoc 130 | *.maf 131 | *.mlf 132 | *.mlt 133 | *.mtc[0-9]* 134 | *.slf[0-9]* 135 | *.slt[0-9]* 136 | *.stc[0-9]* 137 | 138 | # minted 139 | _minted* 140 | *.pyg 141 | 142 | # morewrites 143 | *.mw 144 | 145 | # nomencl 146 | *.nlo 147 | 148 | # pax 149 | *.pax 150 | 151 | # pdfpcnotes 152 | *.pdfpc 153 | 154 | # sagetex 155 | *.sagetex.sage 156 | *.sagetex.py 157 | *.sagetex.scmd 158 | 159 | # scrwfile 160 | *.wrt 161 | 162 | # sympy 163 | *.sout 164 | *.sympy 165 | sympy-plots-for-*.tex/ 166 | 167 | # pdfcomment 168 | *.upa 169 | *.upb 170 | 171 | # pythontex 172 | *.pytxcode 173 | pythontex-files-*/ 174 | 175 | # thmtools 176 | *.loe 177 | 178 | # TikZ & PGF 179 | *.dpth 180 | *.md5 181 | *.auxlock 182 | 183 | # todonotes 184 | *.tdo 185 | 186 | # easy-todo 187 | *.lod 188 | 189 | # xindy 190 | *.xdy 191 | 192 | # xypic precompiled matrices 193 | *.xyc 194 | 195 | # endfloat 196 | *.ttt 197 | *.fff 198 | 199 | # Latexian 200 | TSWLatexianTemp* 201 | 202 | ## Editors: 203 | # WinEdt 204 | *.bak 205 | *.sav 206 | 207 | # Texpad 208 | .texpadtmp 209 | 210 | # Kile 211 | *.backup 212 | 213 | # KBibTeX 
214 | *~[0-9]* 215 | 216 | # auto folder when using emacs and auctex 217 | /auto/* 218 | 219 | # expex forward references with \gathertags 220 | *-tags.tex 221 | 222 | target/ 223 | out/ 224 | 225 | .classpath 226 | .project 227 | .settings 228 | .settings/ 229 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Zero-Rename Committer: Object-storage as a destination for Apache Hadoop and Spark 2 | 3 | This is a LaTeX formatted paper on the new S3A committers [shipped in Hadoop 3.1.](https://hadoop.apache.org/docs/r3.1.1/hadoop-aws/tools/hadoop-aws/committers.html) 4 | 5 | 6 | 7 | ## Building 8 | 9 | `mvn package` should do all but the image rendering. For that, `plantuml` is 10 | doing the rendering, set up to monitor the directory `tex/`. 11 | As usual, you need to run `bibtex` sporadically. 12 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 15 | 19 | 4.0.0 20 | com.github.steveloughran.papers 21 | s3a-committers 22 | 1.0-SNAPSHOT 23 | Maven to build the PDF/DVIs of the project 24 | S3A Committers 25 | jar 26 | 27 | 28 | 29 | maven-latex-plugin-repo 30 | http://akquinet.github.com/maven-latex-plugin/maven2/ 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | de.akquinet.jbosscc.latex 40 | maven-latex-plugin 41 | 1.2 42 | 43 | 44 | 45 | 46 | 47 | 48 | de.akquinet.maven 49 | maven-latex-plugin 50 | 1.1 51 | false 52 | 53 | 54 | compile 55 | 56 | 57 | tex 58 | pdf 59 | 60 | bibtex 61 | 62 | 63 | 64 | latex 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /tex/a_zero_rename_committer.tex: -------------------------------------------------------------------------------- 1 | \documentclass[format=acmsmall, screen=true, review=false]{acmart} 2 | 3 | % _ _____ ____ 4 | % / \ |__ /___ _ __ ___ | _ \ ___ _ __ __ _ _ __ ___ ___ 5 | % / _ \ / // _ \ '__/ _ \ _____| |_) / _ \ '_ \ / _` | '_ ` _ \ / _ \ 6 | % / ___ \ / /| __/ | | (_) |_____| _ < __/ | | | (_| | | | | | | __/ 7 | % /_/ \_\ /____\___|_| \___/ |_| \_\___|_| |_|\__,_|_| |_| |_|\___| 8 | % 9 | % ____ _ _ _ 10 | % / ___|___ _ __ ___ _ __ ___ (_) |_| |_ ___ _ __ 11 | % | | / _ \| '_ ` _ \| '_ ` _ \| | __| __/ _ \ '__| 12 | % | |__| (_) | | | | | | | | | | | | |_| || __/ | 13 | % \____\___/|_| |_| |_|_| |_| |_|_|\__|\__\___|_| 14 | 15 | %\usepackage{babel} 16 | \usepackage{graphicx} 17 | \usepackage{color} 18 | \usepackage{cite} 19 | %\usepackage{algorithmic} 20 | %\usepackage{algorithmicx} 21 | \usepackage[ruled,vlined,boxed]{algorithm2e} 22 | \usepackage{listings} 23 | %\usepackage{minted} 24 | \usepackage{underscore} 25 | \usepackage{multicol} 26 | \usepackage{float} 27 | \usepackage{checkend} 28 | \usepackage{enumitem} 29 | 30 | % ======================================================================== 31 | % commands 32 | 33 | 34 | \newcommand{\SUCCESS}{\texttt{\_SUCCESS}\ } 35 | 36 | % add a todo marker. We can turn this off when we don't want to see it. 
37 | \newcommand{\TODO}{\emph{TODO}\ }
38 | \newcommand{\FOC}{\texttt{FileOutputCommitter}\ }
39 |
40 |
41 | % ========================================================================
42 |
43 |
44 | \title{ A Zero-Rename Committer}
45 |
46 |
47 | \subtitle{Object-storage as a Destination for Apache Hadoop and Spark}
48 | % Yes, this titling is broken
49 | \author{
50 | Loughran, Steve
51 | \and
52 | Blue, Ryan
53 | \and
54 | Radia, Sanjay
55 | \and
56 | Demoor, Thomas
57 | }
58 | %\author{
59 | % Loughran, Steve
60 | % \texttt{stevel@apache.org}
61 | %\and
62 | % Blue, Ryan
63 | % \texttt{rblue@netflix.com}
64 | %\and
65 | % Radia, Sanjay
66 | % \texttt{sradia@apache.org}
67 | %\and
68 | % Demoor, Thomas
69 | % \texttt{thomas.demoor@wdc.com}
70 | %}
71 |
72 | \date{December 2017}
73 |
74 | % ========================================================================
75 |
76 | \begin{document}
77 |
78 |
79 | \maketitle
80 |
81 | % ========================================================================
82 |
83 | \begin{abstract}
84 |
85 | We introduce new \emph{committers} for Apache Hadoop, which allow
86 | the Amazon S3 Object Store to be safely used as a direct destination of output generated
87 | by Hadoop MapReduce and Apache Spark.
88 |
89 | By using the operations directly exported by the store,
90 | most critically the multipart-upload mechanism, tasks within a distributed
91 | query can upload their output to the final destination,
92 | yet not materialize this data until the overall job is committed.
93 | As a result, the committers meet the core requirement of the Hadoop and Spark commit
94 | protocols: the output of the job is complete and consistent: it contains
95 | all the output of the successful ``committed'' work, and none of the output of
96 | tasks which were not committed.
97 | That this mechanism permits highly-performant commit operations is an added benefit.
98 |
99 | We also document the commit protocols of Hadoop and Spark, and show how the classic committer
100 | implementation's requirements of atomic file creation and rename operations mean that they
101 | cannot be safely used with Amazon S3.
102 |
103 | We introduce the two committers, ``Staging'' and ``Magic'', exploring their differences.
104 | The Staging committer stages all generated output to the local filesystem of
105 | worker nodes, uploading this data when a task is committed.
106 | The Magic committer streams data directly to the object store, relying on the
107 | object store client to recognise some output paths as special (``magic''), and
108 | so translating writes to these paths into the initiation of a delayed-completion write
109 | to a calculated final destination.
110 |
111 | In order to evaluate the correctness of our work, we provide a definition
112 | of \emph{Correctness} ---the requirements of a committer for safe integration
113 | with Hadoop MapReduce and Spark.
114 |
115 | The requirements are:
116 |
117 | \begin{labeling}{continuity of correctness}
118 | \item [complete] the output includes the work of all successful tasks.
119 | \item [exclusive] the output of unsuccessful tasks is not present.
120 | \item [concurrent] multiple tasks can be committed in parallel.
121 | \item [abortable] jobs and tasks may be aborted, after which their output is not visible.
122 | \item [continuity of correctness] once a job is committed, the output
123 | of any failed, aborted, or unsuccessful task must not appear at some point in the future.
124 | \end{labeling}
125 |
126 |
127 |
128 | As well as demonstrating that our new committers meet these requirements,
129 | we assess existing committers against them.
130 | Evaluating other object-store-specific committers, IBM's Stocator\ \cite{Stocator}
131 | meets the requirements, while Amazon's EMRFS S3-optimized Committer does not
132 | appear to do so \ \cite{AWS-EMR-committer}.
133 | Equally notably, one of the committers which ships in Hadoop and is broadly used,
134 | ``The V2 committer'' does not meet these correctness criteria either.
135 | In both the EMR and Hadoop V2 committers, tasks publish their output directly
136 | into the destination during their task commit operations, through a sequence of
137 | operations.
138 | Any failure during such non-atomic task commits could leave the
139 | destination in an undefined state.
140 |
141 | The obvious mitigation technique is ``avoid these'', but it is also
142 | possible for the commit protocols to be extended to allow the committers
143 | to declare whether or not a failure during task commit is recoverable.
144 | The application could then use that information to react to a failure
145 | in a stricter way, such as failing the job, or restarting it entirely.
146 |
147 | \end{abstract}
148 |
149 | % ========================================================================
150 |
151 | \section{Introduction}
152 | \label{sec:introduction}
153 |
154 | It has long been a core requirement of ``Big Data'' computation platforms that
155 | the source and destination of data be a fully consistent distributed filesystem.
156 |
157 | Distributed, because data needs to be readable and writable by the distributed
158 | processes executing a single query across the cluster of computers.
159 | Consistent, because all machines across the cluster need to be able to
160 | list and read data written by any of the others.
161 | As for ``Filesystem'', that is the model and API for distributed storage with which
162 | developers are familiar.
163 |
164 |
165 | The full semantics of a POSIX filesystem are not always necessary;
166 | random write access to a file being an oft-omitted feature of the stores,
167 | forcing the persistence formats to rely purely on appending data.
168 |
169 | What has been a critical part of the required semantics is that the filesystem
170 | presents a model of directories and files with consistent operations to list and
171 | read those directories and their contents, with at least four atomic operations:
172 |
173 | \begin{itemize}
174 | \item Rename of a single file to another location within the same volume.
175 | \item Rename of a directory and its contents to another location within the same volume.
176 | \item Create a file iff a file or directory does not exist at that path.
177 | \item Recursive delete of a directory.
178 | \end{itemize}
179 |
180 | These operations are regularly used by applications as the foundational operators of higher-level
181 | co-ordination and commit protocols.
182 |
183 | For example, the \texttt{create()} operation can be used to obtain a lock on a resource:
184 | the first process to create a file can consider itself to have exclusive access to it,
185 | and so implicitly consider itself to have acquired the resource.
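
As an illustration, the sketch below renders this pattern against the Hadoop
\texttt{FileSystem} API; it is a minimal example rather than production code, and the
\texttt{CreateFileLock} class is invented purely for the illustration.
On HDFS the exclusive \texttt{create()} is atomic; as later sections show, the same
cannot be assumed of every object store.

\begin{lstlisting}[language=Java]
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Illustrative sketch: treat exclusive file creation as lock acquisition. */
public final class CreateFileLock {

  /** Returns true iff this process created the lock file and so "owns" the resource. */
  public static boolean tryAcquire(FileSystem fs, Path lockFile) throws IOException {
    try (FSDataOutputStream out = fs.create(lockFile, /* overwrite = */ false)) {
      // create(path, false) fails if the path already exists; on a filesystem
      // with atomic exclusive create (such as HDFS) only one caller can succeed.
      return true;
    } catch (FileAlreadyExistsException alreadyLocked) {
      // Another process created the file first, and therefore holds the lock.
      return false;
    }
  }
}
\end{lstlisting}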
186 | 187 | The \texttt{rename()} operation is generally critical to providing atomic promotion 188 | of output: a single \texttt{rename()} call can promote all in-progress output 189 | of a worker to become completed work, simply by moving all the output to a well known path. 190 | And, when the job completes, its final output may be renamed to a new location to become 191 | publicly visible. 192 | 193 | As covered in the original MapReduce paper\ \cite{MapReduce}: 194 | 195 | \begin{quote} 196 | We rely on the atomic rename operation provided by the underlying file system 197 | to guarantee that the final file system state contains just the data produced 198 | by one execution of the reduce task. 199 | \end{quote} 200 | 201 | 202 | Apache Hadoop was written with its own filesystem, Hadoop Distributed File System 203 | (HDFS)\ \cite{Chansler2011}. 204 | 205 | It is self-admittedly sub-POSIX as data can only be 206 | appended directly to the end of the current file. 207 | What it does offer is the classic filesystem model of 208 | a tree of directories and files, 209 | and the atomic operations needed by MapReduce to safely use HDFS 210 | as a destination of work. 211 | As will be shown, some object stores do not provide the same guarantees, 212 | and so cannot be safely used as a destination with the standard protocol, 213 | even if everything \emph{appears} to work. 214 | 215 | 216 | % ======================================================================== 217 | 218 | \section{The Hadoop MapReduce Commit Protocol} 219 | \label{sec:hadoop-mr-commit} 220 | 221 | Before the challenge and solution of using an object store as a destination 222 | of work can be covered, the problem of outputting data from a distributed 223 | query itself must be covered, along with the existing protocols and algorithms. 224 | 225 | 226 | \subsubsection{Terminology} 227 | 228 | First, some terminology needs to be introduced to describe 229 | the protocols. 230 | 231 | 232 | \textbf{Query}. 233 | One or more transformations of source data to a result; 234 | data presented or saved in some form. 235 | The query may be described in procedural source code, 236 | or declaratively in a form such as SQL\@. 237 | 238 | 239 | \textbf{Job}. 240 | A parallelized query, composed of one or more distributed \emph{tasks}. 241 | The output of a Job is made visible to other stages in a larger operation 242 | sequence or other applications iff the job \emph{completes successfully}. 243 | A complex query may consist of a chain of Jobs, either executing in sequence 244 | or as a DAG of jobs. 245 | 246 | \textbf{Job Attempt}. 247 | A single attempt at executing a job. 248 | 249 | \textbf{Task}. 250 | Part of a job, such as a single Map or Reduce transformation applied to a fraction 251 | of the input data. 252 | 253 | 254 | \textbf{Task Attempt}. 255 | A single attempt to complete a task on a single process running on a single host 256 | in the cluster. 257 | A task attempt is \emph{successful} if it generates all its output without 258 | failing in some way. 259 | A task attempt has \emph{failed} if the execution raises an exception, or 260 | if the process executing the task attempt stops communicating with 261 | the process managing the job. 262 | 263 | Multiple attempts may be made to execute a task; 264 | sequentially, if addressing task failure, or in parallel when task attempts are 265 | executed speculatively. 266 | It is critical that only one task attempt's output is propagated 267 | to the final output of a job. 
268 |
269 |
270 | \textbf{Job Manager}.
271 | The application which schedules task attempt execution, tracks success/failures,
272 | determines when a job has been completed and publishes the results.
273 | It may also determine that a job has failed and cannot be recovered,
274 | in which case the job is aborted.
275 |
276 | \textbf{Executor}.
277 | A process capable of executing work, as directed by the Job Manager.
278 | In Hadoop MapReduce, a unique executor is created for each partition
279 | of the data, and destroyed when the processing is completed.
280 | In Spark, executors are long lived and can be allocated task attempts from multiple
281 | jobs to execute, often simultaneously.
282 |
283 | \textbf{Job Output Directory}.
284 | The directory into which the output of a job writing to the filesystem is placed
285 | so as to be visible.
286 | After a successful job completion, the data MUST be visible in the destination
287 | directory.
288 |
289 | \textbf{Task Working Directory}.
290 | A directory for exclusive access by a single task attempt, into which uncommitted
291 | work may be placed.
292 | All data written in and under this directory is considered the output of
293 | that task attempt.
294 |
295 |
296 | \textbf{Task Commit}.
297 | The act of taking the output of a task attempt
298 | and promoting it to become part of the final output of the active job
299 | attempt.
300 | When the output is written to a filesystem, this consists of moving the files
301 | under the Task Working Directory to the Job Output Directory,
302 | preserving the hierarchy of subdirectories.
303 |
304 |
305 | \textbf{Job Commit}.
306 | The act of taking the output of all committed tasks of a job attempt,
307 | and generating the final output.
308 | This normally consists of publishing this output in an aggregate form;
309 | it can also include generating extra summary data.
310 | As this is often a serialized operation at the end of a job attempt,
311 | its performance can be a bottleneck.
312 |
313 | \textbf{Task Abort}.
314 | To cancel a task such that its data is not committed.
315 |
316 | \textbf{Job Abort}.
317 | To cancel all work in a job attempt: no task's work is committed.
318 |
319 |
320 | \textbf{Job Context}.
321 | An instance of the Java class \texttt{org.apache.hadoop.mapreduce.JobContext},
322 | which provides a read-only view of the Job for the Job Driver and tasks.
323 |
324 | \textbf{Task Attempt Context}.
325 | An instance of the class
326 | \texttt{org.apache.hadoop.mapreduce.TaskAttemptContext},
327 | which provides operations for tasks, such as getting and setting status,
328 | progress and counter values.
329 |
330 |
331 | \subsection{Requirements of a Commitment Protocol}
332 | \label{subsec:commit-protocol-requirements}
333 |
334 | Apache Hadoop's MapReduce implementation is designed to support long-lived
335 | large-scale queries taking minutes to hours to complete.
336 | Its requirements include the following:
337 |
338 | \begin{enumerate}
339 |
340 | \item Support for thousands to tens of thousands of individually scheduled $tasks$
341 | within a single $job$.
342 |
343 | \item Support different destinations of work, such as databases and
344 | distributed filesystems.
345 |
346 | \item ``Correctly'' propagate the output of individual tasks to the final
347 | aggregate of the job.
348 | What constitutes correctness is covered in\ \ref{sec:correctness}.
349 |
350 | \item Recover from the failure of a task attempt by rescheduling the task;
351 | a new task attempt may be executed anywhere within the cluster.
352 |
353 | \item Support speculative execution of task attempts as a means of compensating for the
354 | delay caused by \emph{stragglers} in the execution.
355 |
356 | \item Potentially: recover from a failure of a job attempt, using all the committed
357 | task output from the previous, failed attempt.
358 |
359 | \item Be resilient to network failures/partitions of tasks, and of the job manager
360 | itself becoming isolated from other parts of the system (and hence: a second
361 | attempt at the job being started).
362 |
363 | \end{enumerate}
364 |
365 | This leads to some specific requirements of an implementation, requirements
366 | which can be used to assess its correctness.
367 |
368 | \textbf{Independent.}
369 | Individual tasks must be able to write their output without directly
370 | co-ordinating that write with those of other tasks.
371 |
372 | \textbf{Speculative tasks until committed.}
373 | Multiple task attempts must be able to simultaneously execute on the same input
374 | source, to generate the required output of that part of the input.
375 | This is required for recovery, and for speculation.
376 | Non-requirement: idempotent output;
377 | that is left to the implementors of the operations executed in the tasks.
378 |
379 | \textbf{Scalable communication protocol.}
380 | The commit protocol communications between task and job manager
381 | must support tens of thousands of simultaneous tasks.
382 |
383 | \textbf{Abortable.}
384 | It must be possible to abort an uncommitted task or job.
385 | There should be no leftover output.
386 |
387 | \textbf{Recoverable or restartable job.}
388 | A committer can declare whether or not it supports job recovery;
389 | if it does, it must implement recovery.
390 | If not, the job must be restartable from the beginning.
391 |
392 | \begin{figure*}
393 | \centering
394 | \includegraphics[width=.8\textwidth]{commit-protocol.png}
395 | \caption{Hadoop commit protocol (excluding Job recovery)}
396 | \label{fig:commit-protocol}
397 | \end{figure*}
398 |
399 | A UML sequence diagram of the core commit protocol is
400 | shown in\ \ref{fig:commit-protocol}.
401 |
402 | The commit algorithm is designed to work on the YARN cluster scheduler
403 | \ \cite{Vavilapalli2013}.
404 |
405 | On each node in the YARN cluster, a \emph{Node Manager} has the responsibility
406 | of launching the applications, usually within a memory-and-CPU-bounded
407 | environment.
408 | A central \emph{Resource Manager} manages the scheduling and liveness monitoring
409 | of individual applications.
410 | When an application is submitted for execution, the Resource Manager schedules
411 | its root process, the \emph{Application Master}.
412 | This communicates with the Resource Manager via an umbilical protocol
413 | which is explicitly used by the application for requesting new processes
414 | across the cluster, and implicitly used by the Resource Manager
415 | as a liveness probe of the application.
416 |
417 | When a launched process terminates, the process exit code
418 | is passed to the \emph{ResourceManager} within the regular status heartbeats
419 | between each NodeManager and the ResourceManager.
420 | If it is the Application Master itself which has terminated, unless it explicitly
421 | declared itself to be finished, the application is considered to have failed.
422 | All worker processes will be terminated (by default), and a new instance
423 | of the Application Master scheduled for execution.
424 | If it was a worker process, the Application Master chooses how to react.
425 |
426 |
427 | In MapReduce, the YARN Application Master is the Job Manager,
428 | with every individual task attempt executed in a unique worker process, termed
429 | an ``executor'' in this paper.
430 | A direct RPC protocol between the Job Manager and the executors is used to manage
431 | the commit operation.
432 | Excluding process failures, all liveness detection must be performed by the
433 | Job Manager, which is done based on timeouts of this direct RPC protocol.
434 |
435 | % -----------------------------------------------------------------------
436 |
437 | \subsection{Recoverable Failure Modes}
438 | \label{subsec:optionalRecoverableFailureModes}
439 |
440 | \subsubsection{Job Recovery}
441 |
442 | When YARN perceives the Job Manager process to have failed, it instantiates
443 | a new instance of the process somewhere within the cluster.
444 | This new Job Manager creates an instance of the specified Job Committer,
445 | and queries it as to whether job recovery is supported.
446 |
447 | If a committer does support recovery,
448 | the state of the previous job attempt is rebuilt from reading the
449 | ``Job History'' file.
450 | This file is a log of the events saved during the execution, including
451 | a record of all tasks which successfully committed, and which could
452 | therefore be recovered.
453 | The Job Committer's \texttt{recoverTask(TaskAttempt)} method is called
454 | for each of these tasks.
455 | All unexecuted, uncommitted or unrecoverable tasks are scheduled for execution.
456 |
457 | If job recovery is not supported, the entire job is re-executed.
458 |
459 | As the probability of the Job Manager failing is, excluding bugs in the code itself,
460 | a function of job time, rather than scale, recovering from job failure is more
461 | important on long-lived jobs ---those which last many hours.
462 |
463 | \subsubsection{Bad Records}
464 |
465 | To avoid an entire task, and hence job, failing due to a single unprocessable record,
466 | task attempts may skip records whose processing raises an exception.
467 | If the number of skipped records in a task attempt is below some
468 | threshold, these records will not result in a task attempt reporting itself as
469 | having failed.
470 | This is not of direct relevance to the commit protocol except as a
471 | reason for a task attempt to fail.
472 |
473 | \section{Hadoop's FileOutputCommitter}
474 | \label{sec:fileoutputcommitter}
475 |
476 |
477 | The operations to commit the work, the Task Commit and the Job Commit,
478 | are all implemented in the same class, an implementation of \texttt{OutputCommitter}.
479 | For writing to HDFS, this is done in the \texttt{FileOutputCommitter} class.
480 |
481 | This actually implements two separate algorithms for committing work, each
482 | with different performance and scalability characteristics.
483 |
484 | The ``v1'' algorithm was designed to handle failure and recovery with an
485 | atomic task commit and the ability to explicitly recover the output generated
486 | by the committed tasks of a failed job attempt.
487 |
488 | The ``v2'' algorithm was added in 2015, as its predecessor was found
489 | to have scalability problems for jobs with tens of thousands of files
490 | \ \cite{MAPREDUCE-4815}.
491 | While the v2 algorithm can deliver better performance, it comes at the price of
492 | reduced isolation of output.
493 |
494 | \textbf{v1.}
495 | When a task attempt is committed, its task working directory is renamed into
496 | the job attempt directory.
497 | When the job attempt is committed, all committed task directories are merged
498 | (serially) into the job output directory.
499 | A restarted job moves the directories of committed tasks from the previous
500 | attempt, so recovering their output.
501 |
502 |
503 | \textbf{v2.}
504 |
505 | When a task attempt is committed, its output is immediately merged into the
506 | job output directory;
507 | the job commit operation does nothing but create a marker file.
508 | This is faster, but intermediate work is visible.
509 | The task commit operation is no longer atomic, changing failure modes.
510 |
511 | %\begin{table}
512 | % \caption{Attributes of the \texttt{FileOutputCommitter} algorithms}
513 | % \begin{tabular}{ l c c }
514 | % \hline
515 | % & \textbf{v1} & \textbf{v2} \\
516 | % Independent & True & True \\
517 | % Speculative Tasks & True & True \\
518 | % Recoverable Job & True & False \\
519 | % Abortable Job & True & Delete output directory \\
520 | % Observable & False & True \\
521 | % Atomic Task Commit & True & False \\
522 | % Idempotent Task Commit & True & False \\
523 | % Atomic Job Commit & False & True \\
524 | % Idempotent Job Commit & False & True \\
525 | % \hline
526 | % \end{tabular}
527 | % \label{tab:file-committer-attributes}
528 | %\end{table}
529 |
530 | \subsection{Common Variables}
531 | \label{subsec:common-variables}
532 |
533 |
534 | \begin{table}
535 | \caption{Variables used in the algorithms}
536 | \begin{tabular}{ l l }
537 | \hline
538 | \textbf{name} & \textbf{meaning} \\
539 | $fs$ & Destination filesystem \\
540 | $destPath$ & Job Output Directory in the destination filesystem. \\
541 | $jobId$ & Numeric Job ID $\geq$ 0; expected to be unique for all application instances in the cluster. \\
542 | $jobAttemptId$ & $jobId_\$counter$; the counter starts at 0 for a job and increments on each attempt.\\
543 | $jobAttemptPath$ & a path under which a job attempt may store any data.\\
544 | $partId$ & Numeric value of partition of data to be allocated to a task.\\
545 | $taskId$ & $jobAttemptId_\$partId$; the task which works on part $partId$ in the job attempt.\\
546 | $taskAttemptId$ & $taskId_\$counter$; a single attempt to execute a task.\\
547 | $taskAttemptPath$ & a path under $jobAttemptPath$ into which a task attempt may write uncommitted data.\\
548 | $taskCommittedPath$ & a path under $jobAttemptPath$ where the contents of $taskAttemptPath$ are moved when that attempt is committed. \\
549 | \hline
550 | \end{tabular}
551 | \label{tab:variables}
552 | \end{table}
553 |
554 | For a Job Attempt $jobAttemptId$ to be successful, all parts of the dataset
555 | must be processed in one or more successful tasks.
556 | The output of exactly one task attempt for each task must be in the final dataset.
557 |
558 | The function of a commit algorithm, then, is to guarantee that this condition
559 | is met, even in the presence of failures.
560 | It is not a requirement for an algorithm to be able to recover from all
561 | failures;
562 | it may react to some failure conditions by failing the entire job.
563 |
564 | It is also not a general requirement that if a job fails, the job output directory
565 | must be unchanged.
566 | Together, this implies that at-most-once semantics are required, 567 | and that the task of handling job failures is to be handled 568 | by a higher-level workflow. 569 | 570 | \subsection{Hadoop V1 commit algorithm} 571 | \label{subsec:hadoop-v1-commit-algorithm} 572 | 573 | %% Define the standard commit variables 574 | \newcommand{\FileOutputCommitVars}{ 575 | \DontPrintSemicolon 576 | \SetKwData{fs}{$fs$} 577 | \SetKwData{dest}{$destPath$} 578 | \SetKwData{jobAttemptPath}{$jobAttemptPath$} 579 | \SetKwData{jobAttemptId}{$jobAttemptId$} 580 | \SetKwData{SUCCESS}{_$SUCCESS$} 581 | \SetKwData{taskAttemptId}{$taskAttemptId$} 582 | \SetKwData{taskAttemptPath}{$taskAttemptPath$} 583 | \SetKwData{taskCommittedPath}{$taskCommittedPath$} 584 | \SetKwData{temp}{_$temporary$}} 585 | 586 | \newcommand{\true}{ $true$ } 587 | \newcommand{\false}{ $false$ } 588 | 589 | 590 | \textbf{Job Setup} 591 | 592 | This creates the path \emph{jobAttemptPath}, under the 593 | directory \texttt{\_temporary} of the output directory 594 | \emph{destPath}. 595 | 596 | \begin{procedure} 597 | \FileOutputCommitVars 598 | % Operations, which are defined for all subsequent procedures/functions 599 | % ALL functions must go in here, in alphabetical order 600 | % Macros cannot have numbers in them, though their values can. 601 | \SetKwFunction{abortUpload}{abortUpload} 602 | \SetKwFunction{checkForConflicts}{checkForConflicts} 603 | \SetKwFunction{commitJob}{commitJob} 604 | \SetKwFunction{completeUpload}{completeUpload} 605 | \SetKwFunction{delete}{delete} 606 | \SetKwFunction{exists}{exists} 607 | \SetKwFunction{Exception}{Exception} 608 | \SetKwFunction{getFileStatus}{getFileStatus} 609 | \SetKwFunction{getJobAttemptPath}{getJobAttemptPath} 610 | \SetKwFunction{getUsername}{getUsername} 611 | \SetKwFunction{isDirectory}{isDirectory} 612 | \SetKwFunction{isFile}{isFile} 613 | \SetKwFunction{listFiles}{listFiles} 614 | \SetKwFunction{listPendingUploads}{listPendingUploads} 615 | \SetKwFunction{loadPendingFile}{loadPendingFile} 616 | \SetKwFunction{loadPendingSet}{loadPendingSet} 617 | \SetKwFunction{mergePathsA}{mergePathsV1} 618 | \SetKwFunction{mergePathsB}{mergePathsV2} 619 | \SetKwFunction{mkdir}{mkdir} 620 | \SetKwFunction{mkdirs}{mkdirs} 621 | \SetKwFunction{newUUID}{newUUID} 622 | \SetKwFunction{rename}{rename} 623 | \SetKwFunction{return}{return} 624 | \SetKwFunction{savePendingSet}{savePendingSet} 625 | \SetKwFunction{tempDirForStaging}{tempDirForStaging} 626 | \SetKwFunction{throw}{throw} 627 | \SetKwFunction{touch}{touch} 628 | \SetKwFunction{uniquePath}{uniquePath} 629 | \SetKwFunction{uploadFileToPendingCommit}{uploadFileToPendingCommit} 630 | 631 | \jobAttemptPath $\longleftarrow$ \dest/\temp/\jobAttemptId\; 632 | \mkdir(\fs, \jobAttemptPath)\; 633 | 634 | \caption{setupJob()} 635 | \label{alg:FileOutputCommitter.setupJob} 636 | \end{procedure} 637 | 638 | Note Hadoop has a convention that all paths starting with ``_'' are not considered 639 | ``visible''; 640 | everything under this directory is excluded from normal 641 | listings of the destination path. 642 | Creating all intermediate files in a subdirectory of the destination 643 | directory provides an implicit guarantee that the data is created in the 644 | same volume (in a multi-volume filesystem), and in the same encryption zone, 645 | for any HDFS cluster with encryption enabled. 646 | 647 | 648 | \textbf{Task Setup} 649 | 650 | The task attempt is given a directory under the job attempt path 651 | as its task working directory. 
652 |
653 | \begin{procedure}
654 | \FileOutputCommitVars
655 |
656 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
657 |
658 | \caption{setupTask()}
659 | \label{alg:FileOutputCommitter.setupTask}
660 | \end{procedure}
661 |
662 | The actual directories are created on demand.
663 |
664 |
665 | \textbf{Needs Task Commit}
666 |
667 | A commit is required iff data was generated.
668 |
669 | \begin{function}
670 | \FileOutputCommitVars
671 |
672 | \exists(\fs, \taskAttemptPath)\;
673 |
674 | \caption{needsTaskCommit()}
675 | \label{alg:FileOutputCommitter.needsTaskCommit}
676 | \end{function}
677 |
678 | This is one place where eventual consistency of object listings in an object
679 | store can generate a false ``no data to commit'' negative result.
680 |
681 | \textbf{Task Commit}
682 |
683 | A task attempt is committed simply by renaming the task attempt working directory
684 | into the job attempt directory.
685 |
686 | \begin{procedure}
687 | \FileOutputCommitVars
688 |
689 | \If{\exists(\fs, \taskAttemptPath)} {
690 | \delete(\fs, \taskCommittedPath, $recursive$)\;
691 | \rename(\fs, \taskAttemptPath, \taskCommittedPath)\;
692 | }
693 |
694 | \caption{commitTask()}
695 | \label{alg:FileOutputCommitter.commitTask}
696 | \end{procedure}
697 |
698 |
699 | In a true file system, the rename is an $O(1)$ atomic operation.
700 | %Even if the task fails to report to the Job Manager that the
701 | %commit operation was completed, the existence of the \texttt{taskCommittedPath}
702 | %is an implicit confirmation that the task was committed.
703 | %Its absence is not a guarantee that the task has failed ---it could just
704 | %be taking slow to execute the operation.
705 | %However, the Job Manager can assume that the task has failed,
706 | %and reschedule another attempt at that task.
707 | %Whichever of the rescheduled or original (delayed/partitioned) task
708 | %last renames their task attempt path to the task committed path is considered
709 | %the successful committer.
710 |
711 | \textbf{Task Abort}
712 |
713 | Abort a task attempt by deleting its task attempt path.
714 |
715 | \begin{procedure}
716 | \FileOutputCommitVars
717 |
718 | \delete(\fs, \taskAttemptPath, $recursive$)\;
719 |
720 | \caption{abortTask()}
721 | \label{alg:FileOutputCommitter.abortTask}
722 | \end{procedure}
723 |
724 |
725 | On a genuine filesystem this is an $O(1)$ operation.
726 | %On an object store, usually $O(files)$.
727 |
728 | \textbf{Job Commit}
729 |
730 | A Job is committed by merging all files/directories from all the task
731 | committed paths into the final job output directory.
732 |
733 | Optionally, it can create a zero-byte \SUCCESS file in the output directory.
734 |
735 | \begin{procedure*}
736 | \FileOutputCommitVars
737 |
738 | \For { committedTask $\in$ listFiles(\fs, \jobAttemptPath) } {
739 | \mergePathsA(\fs, committedTask, \dest)\;
740 | }
741 | \touch(\fs, \dest/\SUCCESS)\;
742 | \delete(\fs, \temp)\;
743 |
744 | \caption{commitJob()}
745 | \label{alg:FileOutputCommitter.commitJob}
746 | \end{procedure*}
747 |
748 |
749 | The \texttt{mergePathsV1(FileSystem, Path, Path)} procedure is
750 | a recursive move of all the output of a committed task into/underneath
751 | the destination directory.
752 |
753 | % ------------------------------------------------------------
754 | \begin{procedure*}
755 | \FileOutputCommitVars
756 |
757 | \eIf { \isFile(\fs, $src$) } {
758 | \If { \exists(\fs, $dest$) } {
759 | \delete(\fs, $dest$, $recursive$)\;
760 | }
761 | \rename(\fs, $src$, $dest$)\;
762 | } {
763 | \eIf { \exists(\fs, $dest$) } {
764 | \eIf { \isFile(\fs, $dest$) } {
765 | \delete(\fs, $dest$, $recursive$)\;
766 | \rename(\fs, $src$, $dest$)\;
767 | } {
768 | \For { f $\in$ \listFiles(\fs, $src$) } {
769 | \mergePathsA(\fs, f, $dest$ + f.name)\;
770 | }
771 | }
772 | } {
773 | \rename(\fs, $src$, $dest$)\;
774 | }
775 | }
776 |
777 | \caption{mergePathsV1(fs, src, dest)}
778 | \label{alg:mergePathsV1}
779 | \end{procedure*}
780 | % ------------------------------------------------------------
781 |
782 | All the files and directories are promoted to the destination directory.
783 |
784 | \begin{enumerate}
785 | \item If the calculated destination path of a source file or directory does
786 | not exist, the source file/directory is renamed.
787 | \item If the destination path does exist and is a file, it is deleted and then
788 | the source file/directory is renamed.
789 | \item If the destination path exists and is a directory, and the source
790 | is also a directory, then \texttt{mergePathsV1} is applied to the child
791 | entries of the source path.
792 | \end{enumerate}
793 |
794 | Together, it forms a depth-first overwrite of the destination tree by the source
795 | tree, specifically merging the contents of all directories.
796 |
797 | %This is clearly not an atomic operation;
798 | %it is performing a sequence of operations on the distributed filesystem,
799 | %potentially including recursive operations down a directory tree.
800 | The time to execute the merge depends on the number of source entries
801 | and the state of the destination directory.
802 |
803 | If it fails, the state of the operation is unknown: it cannot simply
804 | be repeated.
805 |
806 |
807 | \begin{function}
808 |
809 | \false
810 |
811 | \caption{v1.isCommitJobRepeatable()}
812 | \label{v1.isCommitJobRepeatable}
813 |
814 | \end{function}
815 |
816 | Accordingly: if a job attempt fails during the commit process, it is
817 | unrecoverable: the subsequent attempt reports an error and aborts.
818 |
819 | % Abort Job v1
820 | \begin{procedure}
821 | \FileOutputCommitVars
822 |
823 | \delete(\fs, \jobAttemptPath, $recursive$)\;
824 |
825 | \caption{v1.abortJob()}
826 | \label{alg:v1.abortJob}
827 |
828 | \end{procedure}
829 |
830 | A job can be cleaned up by deleting the output of all job attempts which
831 | may have been made.
832 | This can be achieved by deleting the entire \texttt{_temporary} directory
833 | under the destination directory.
834 |
835 | % cleanup job v1
836 | \begin{procedure}
837 | \FileOutputCommitVars
838 |
839 | \delete(\fs, \dest/\temp, $recursive$)\;
840 |
841 | \caption{cleanupJob()}
842 | \label{alg:v1.cleanupJob}
843 | \end{procedure}
844 |
845 | This would break any other ongoing job which is writing to the
846 | same destination directory.
847 |
848 | It is a requirement that only one job may be actively writing
849 | to a specific destination, something which is checked for during job submission.
850 |
851 |
852 | \textbf{Job Recovery}
853 |
854 | The v1 committer can recover from a failed job attempt, with the
855 | second attempt being able to reuse the output of all committed tasks from the previous attempt.
856 |
857 | This whole job attempt recovery process is a complex one;
858 | from the perspective of the committer, if the task attempt was committed
859 | in the previous job attempt, the $taskCommittedPath$ of the previous attempt
860 | can be moved under the $jobAttemptPath$ of the new job attempt.
861 |
862 |
863 | \begin{function}
864 | \FileOutputCommitVars
865 |
866 | \true
867 |
868 | \caption{isRecoverySupported()}
869 | \label{alg:v1.isRecoverySupported}
870 | \end{function}
871 |
872 |
873 | \begin{procedure}
874 | \FileOutputCommitVars
875 |
876 | $previousJobAttemptId$ $\longleftarrow$ $jobAttemptId - 1$\;
877 | $previousTaskCommittedPath$ $\longleftarrow$ \getJobAttemptPath($previousJobAttemptId$)/$taskId$\;
878 | \If{\exists(\fs, $previousTaskCommittedPath$)} { \rename(\fs, $previousTaskCommittedPath$, \taskCommittedPath)\; }
879 |
880 | \caption{recoverTask(TaskAttemptContext)}
881 | \label{alg:v1.recoverTask}
882 | \end{procedure}
883 |
884 |
885 | The only lost work is that of all in-progress task attempts ---those which had generated
886 | data but were not yet committed.
887 |
888 | When working with HDFS, the main limitation of this algorithm is
889 | one of scale: job commit is an $O(tasks)$ operation, with the time for
890 | each task's merge being a function of the number of files and the depth of
891 | the directory tree.
892 |
893 | As this is serialized at the end of the job, irrespective of how many workers
894 | there were, the job commit is a single point of delay, and of failure.
895 | The more tasks, the more work to commit, the longer the commit, and the
896 | higher risk of that failure.
897 |
898 | \subsection{Hadoop V2 commit algorithm}
899 | \label{subsec:hadoop-v2-commit-algorithm}
900 |
901 |
902 | The v2 commit algorithm propagates each task attempt's output into the job's
903 | output directory in the task commit.
904 | This is done in a variant \texttt{mergePaths()} algorithm,\ \ref{alg:mergePathsV2},
905 | designed to support parallel writers to the output directory.
906 | In the Hadoop source the two algorithms are intermixed within a pair of
907 | co-recursive procedures;
908 | they have been isolated here for clarity.
909 |
910 |
911 | % ------------------------------------------------------------
912 | % V2 commit algorithm
913 | \begin{procedure*}
914 | \FileOutputCommitVars
915 |
916 | \eIf {\isFile(\fs, $src$)} {
917 | \If {\exists(\fs, $dest$)} {
918 | \delete(\fs, $dest$, $recursive$)\;
919 | }
920 | \rename(\fs, $src$, $dest$)\;
921 | } {
922 | \eIf {\exists(\fs, $dest$)} {
923 | \eIf {\isFile(\fs, $dest$)} {
924 | \delete(\fs, $dest$, $recursive$)\;
925 | \mkdirs(\fs, $dest$)\;
926 | \For {c $\in$ \listFiles(\fs, $src$)} {
927 | \mergePathsB(\fs, c, $dest$ + c.name)\;
928 | }
929 | } {
930 | \For {f $\in$ \listFiles(\fs, $src$)} {
931 | \mergePathsB(\fs, f, $dest$ + f.name)\;
932 | }
933 | }
934 | } {
935 | \mkdirs(\fs, $dest$)\;
936 | \For {f $\in$ \listFiles(\fs, $src$)} {
937 | \mergePathsB(\fs, f, $dest$ + f.name)\;
938 | }
939 | }
940 | }
941 |
942 | \label{alg:mergePathsV2}
943 | \caption{mergePathsV2(fs, src, dest)}
944 |
945 | \end{procedure*}
946 | % ------------------------------------------------------------
947 |
948 | Here, the \texttt{rename()} operation is restricted to committing
949 | a single file: whenever a directory is to be committed, it is done
950 | as a recursive merge.
951 | This is necessary because multiple tasks may be committing simultaneously, tasks which
952 | may be writing to the same destination.
953 | The atomic exclusivity of a directory rename is precisely what is not
954 | wanted when trying to support multiple tasks merging their output into the
955 | same directory tree.
956 |
957 | Performance-wise, the \texttt{mergePathsV2} operation is slower
958 | than the v1 algorithm whenever there are directories to commit.
959 | Yet, because these operations take place in task commits, the work is parallelized
960 | across the cluster, and often does not directly slow down the overall job.
961 |
962 | With the file propagation taking place in the tasks, the job commit
963 | operation is reduced to creating the \SUCCESS file and cleaning up
964 | working directories:
965 |
966 | \begin{procedure*}
967 | \FileOutputCommitVars
968 |
969 | \touch(\fs, \dest/\SUCCESS)\;
970 | \delete(\fs, \temp)\;
971 |
972 | \label{alg:commitJobV2}
973 |
974 | \caption{v2 commitJob()}
975 | \end{procedure*}
976 |
977 | As a result, the time to commit a job is barely measurable.
978 |
979 | In terms of failure resilience, the v2 algorithm is weaker than the v1 algorithm.
980 | Task commit is now a non-atomic operation;
981 | it is therefore not possible to safely recover from the failure or loss of a task attempt
982 | while it is committing work.
983 |
984 | Because the output of committed tasks is immediately visible,
985 | if the job fails, the contents of all previously committed tasks remain visible.
986 |
987 | This commit algorithm has chosen speed over resilience.
988 |
989 | This is often a valid decision to make; however, the callers of the committers
990 | need to be aware that this decision has been made, and that failures in
991 | certain parts of the process, specifically task commit, are not recoverable.
992 |
993 | \subsection{Limitations of the Hadoop MapReduce Commit Protocol}
994 | \label{subsec:hadoop-commit-protocol}
995 |
996 | Alongside some implementation details, such as the fact that a task process
997 | will exit without calling \texttt{cleanupTask()} once it is informed that it
998 | is unknown, we have to consider: are there any fundamental issues with
999 | the Hadoop commit protocol?
1000 |
1001 | A key weakness is that the job committer is not passed the list of task
1002 | attempts considered successful, and, from those committed tasks, their
1003 | lists of files which were committed.
1004 |
1005 | The committers themselves have to implement some mechanism to enumerate those
1006 | committed tasks.
1007 |
1008 | The File Output Committer does this through the filesystem, relying on consistent
1009 | directory listings to enumerate task output to merge, and, for the v1 algorithm,
1010 | to enumerate the set of committed tasks whose output must be published during job commit.
1011 | This places a requirement for the filesystem metadata listings to be consistent,
1012 | a requirement not met by all object stores.
1013 |
1014 | As no list of completed tasks is directly passed to the \texttt{commitJob} operation,
1015 | the job committer cannot determine whether the actual committed output
1016 | in the filesystem is correct.
1017 |
1018 | There also appears to be a race condition between
1019 | verification that the destination directory does not exist in the
1020 | client-side job submission, and the creation of that directory during
1021 | the \texttt{setupJob()} operation.
1022 | In a busy cluster there can be a delay between the scheduling of the job and
1023 | its Application Master actually starting to execute;
1024 | a second, conflicting job may also be scheduled at this point.
1025 | If the destination directory were to be created during job submission,
1026 | this window would be nearly completely eliminated.
1027 |
1028 | % ========================================================================
1029 |
1030 | \section{The Spark Commit Protocol}
1031 | \label{sec:spark-commit-protocol}
1032 |
1033 | Apache Spark's execution model is significantly different from
1034 | that of Hadoop.
1035 | Rather than dedicating a single process to executing a single operation
1036 | across a subset of the source data, Spark creates a set of \emph{Executors},
1037 | each of which can execute task attempts across a number of threads.
1038 | As a result, a single executor may be executing many task attempts
1039 | simultaneously, with each task's commit operations being centrally managed
1040 | by the single Job Manager.
1041 |
1042 | \begin{figure*}
1043 | \centering
1044 | \includegraphics[width=.8\textwidth]{spark-protocol.png}
1045 | \caption{Spark commit protocol}
1046 | \label{fig:spark-protocol}
1047 | \end{figure*}
1048 |
1049 |
1050 | When a failure of an executor is detected by loss of its heartbeat,
1051 | all active tasks will be rescheduled.
1052 | As the failure may be a network partition, multiple task attempts may be active
1053 | simultaneously.
1054 | It is therefore a requirement that no data is promoted until a task attempt is actually
1055 | committed.
1056 |
1057 |
1058 | Spark can use the Hadoop Committers within its commit protocol,
1059 | which is usually done whenever writing data to HDFS or another cluster filesystem.
1060 |
1061 | Spark manages its requirement of ``only one task attempt may be committed''
1062 | in its \texttt{OutputCommitCoordinator} class;
1063 | an instance of this in the driver tracks the state of all active task attempts
1064 | and grants or denies permission to commit.
1065 |
1066 | A task attempt is only granted permission to commit if a set of conditions
1067 | is met\footnote{see: \texttt{OutputCommitCoordinator.handleAskPermissionToCommit()}}:
1068 |
1069 | \begin{enumerate}
1070 | \item The task attempt is not recorded as having failed.
1071 | \item The task attempt is in the set of known-to-be-active tasks.
1072 | \item Either the requesting task attempt has already been granted this permission,
1073 | no task attempt has been granted permission to commit, or a previous task
1074 | attempt was granted permission, but it is considered to have failed.
1075 |
1076 | \end{enumerate}
1077 |
1078 | That is: it must be a valid task attempt, and no other task attempt can be
1079 | actively committing or have committed this task; a simplified sketch of this decision logic is given below.
1080 |
1081 | The Executor requests this permission to commit via an RPC call and will
1082 | proceed with the commit when it receives a successful message.
1083 | A timeout on the RPC channel or a denial of commit permission will result
1084 | in \texttt{abortTask()} being invoked.
1085 |
1086 | Once a task attempt has been granted permission to commit, then no other
1087 | attempt will be granted unless the first attempt is reported as having failed.
1088 |
1089 | \TODO: \texttt{OutputCommitCoordinator} reacts to task events from the scheduler, but
1090 | does that cover executor failure?
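
The sketch below paraphrases the rules listed above in Java; Spark's actual implementation
is the Scala \texttt{OutputCommitCoordinator}, and the class and field names used here are
purely illustrative.

\begin{lstlisting}[language=Java]
import java.util.HashMap;
import java.util.Map;

/** Illustrative sketch of the driver-side "may I commit?" decision. */
final class CommitCoordinatorSketch {
  enum AttemptState { RUNNING, FAILED }

  /** Attempt currently authorized to commit each partition, if any. */
  private final Map<Integer, Integer> authorized = new HashMap<>();
  /** Last known state of each attempt, keyed by "partition_attempt". */
  private final Map<String, AttemptState> states = new HashMap<>();

  synchronized boolean handleAskPermissionToCommit(int partition, int attempt) {
    AttemptState state = states.get(partition + "_" + attempt);
    if (state == null || state == AttemptState.FAILED) {
      return false;                        // unknown or already-failed attempt
    }
    Integer holder = authorized.get(partition);
    if (holder == null || holder == attempt) {
      authorized.put(partition, attempt);  // first claimant, or a repeated request
      return true;
    }
    // Another attempt holds the permission; re-grant only if that one has failed.
    if (states.get(partition + "_" + holder) == AttemptState.FAILED) {
      authorized.put(partition, attempt);
      return true;
    }
    return false;
  }
}
\end{lstlisting}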
1091 |
1092 | Spark makes no attempt to recover from a failed Job Manager;
1093 | its mechanism for recovering from a failed job is ``rerun the entire query''.
1094 |
1095 | One area where Spark goes beyond Hadoop's protocol is that
1096 | it adds a new operation to request a file with an absolute path,
1097 | \texttt{newTaskTempFileAbsPath()}.
1098 | It is needed to address the special case of Apache Hive, wherein
1099 | some parts of the dataset are written to different locations than
1100 | under the destination directory of a job.
1101 | The operation, having calculated the absolute destination of the output,
1102 | requests a temporary file which will be placed in the final destination
1103 | directory on a job commit.
1104 |
1105 | Spark implements this operation atop the standard \texttt{FileOutputCommitter}
1106 | as follows:
1107 |
1108 | \begin{enumerate}
1109 | \item An ``absolute path staging directory'' is created under the job output
1110 | directory;
1111 | this is \texttt{_temporary-\$jobId}.
1112 | \item When \texttt{newTaskTempFileAbsPath()} is invoked, a path under this
1113 | directory is generated, with a UUID in the filename.
1114 | \item The mapping of absolute path to temporary file is stored in a map in the Task Committer.
1115 | \item In the \texttt{commitTask()} operation, the map of all files to rename is passed back.
1116 | \item In \texttt{commitJob()}, after invoking the Hadoop committer's \texttt{commitJob()}
1117 | call, the aggregate map of files to rename to absolute paths is iterated through;
1118 | each file is renamed to its final path in turn.
1119 | \item Task abort will delete the files of that task, while Job abort will delete
1120 | the whole absolute path staging directory.
1121 | \end{enumerate}
1122 |
1123 | This extra operation is currently only used in that specific use case,
1124 | ``Hive table with partitions elsewhere in the same filesystem as the active job''.
1125 | This is not a common use case, at least with data stored in object stores.
1126 | Accordingly, our new committers do not support this operation
1127 | \footnote{It is possible to support this, but it would complicate cleaning up
1128 | after tasks, especially failed ones, and failed jobs.}.
1129 |
1130 | Spark is more flexible in its commit protocol, because
1131 | the name for a file is generated by the committer, not the application,
1132 | and because successful task committers can pass arbitrary serialized data back
1133 | to the driver, for use in the Job Commit operation.
1134 | This could potentially be used as the sole mechanism for passing a list
1135 | of written files from the task attempts to the job committer.
1136 | Being able to generate names (albeit while preserving a sort order) could
1137 | also be potentially useful.
1138 | We have initially chosen to not explore this as a commit strategy;
1139 | others may wish to do so.
1140 |
1141 |
1142 |
1143 | \subsubsection{Limitations of the Spark Commit Protocol}
1144 |
1145 | The standard commit coordination in the Spark Driver is with the
1146 | \texttt{OutputCommitCoordinator}.
1147 | This class's state includes tracking whether or not a task attempt
1148 | has been granted permission to commit its work.
1149 | Once one task attempt has been granted permission to commit,
1150 | all other task attempts for the same task will be denied.
1151 | However, if the task attempt granted permission to commit its work fails
1152 | for any reason, the attempt is considered a failure, and
1153 | another attempt will be granted permission to commit its work.
1154 |
1155 | This strategy works, provided task commit is a repeatable operation,
1156 | even if the first attempt has failed or become partitioned from the
1157 | Spark Driver.
1158 | That requirement is met by the \texttt{FileOutputCommitter} v1 algorithm,
1159 | but possibly not by the v2 algorithm, or, potentially, others.
1160 | If committers could declare their ability to recover from
1161 | failed task commits, along with other aspects of their operation,
1162 | the \texttt{OutputCommitCoordinator} would be able to decide whether
1163 | a repeated attempt is permitted, or whether failing the Job is the
1164 | safer outcome.
1165 |
1166 |
1167 | Unlike the Hadoop protocol, there is no requirement for the Spark Driver
1168 | to have received a recent liveness check from the cluster scheduler.
1169 | Unless the Spark Driver process exits once it determines that it has been
1170 | isolated from any underlying cluster scheduler, there is a risk that
1171 | a partitioned Spark cluster may commit a job to the same destination
1172 | as a cluster instantiated as a replacement.
1173 | Careful review of the YARN and Mesos integration code is required to be
1174 | confident that this risk does not exist.
1175 |
1176 | Spark's commit protocol permits task committers to return data to
1177 | the Job Committer in the Spark Driver;
1178 | it would be possible to use this to validate the output of the tasks.
1179 | The current committer implementations do not do this, but at least the underlying
1180 | protocol makes such an improvement possible.
1181 |
1182 |
1183 |
1184 |
1185 | % ========================================================================
1186 |
1187 | \section{The Challenge of Object Stores}
1188 | \label{sec:object-stores}
1189 |
1190 | Having introduced the classic filesystem and the commit protocols and algorithms
1191 | used to commit the output of distributed computation, let us consider
1192 | Object Stores such as Amazon S3, Google Cloud Storage and
1193 | Windows Azure Storage\ \cite{AWS-S3-intro,Calder11}.
1194 |
1195 | % As all filesystem
1196 | %operations are via the NameNode, all clients get a consistent view of the filesystem.
1197 | %And, as the
1198 |
1199 |
1200 | The most salient point is this: Object Stores are not filesystems.
1201 | Rather than the classic hierarchical view of directories, subdirectories
1202 | and paths, object stores store a set of objects, each with a unique key:
1203 | a sequence of characters provided when the object was created.
1204 | Classic path separators ``\texttt{/}'' are invariably part of the set of valid
1205 | characters, so allowing objects to be created which have the appearance
1206 | of files in a directory.
1207 |
1208 | As examples, the following are all valid keys on the Amazon, Google and Microsoft
1209 | stores:
1210 |
1211 | \begin{verbatim}
1212 | /entry
1213 | /path1/path2/path3
1214 | /path1/
1215 | /path1
1216 | \end{verbatim}
1217 |
1218 | More subtly, it is valid for an object store container (on S3, a ``bucket'')
1219 | to have objects with all of these names simultaneously.
1220 | It is not an error to have an object whose key would make it appear to be
1221 | ``under'' another object, nor for a key to explicitly contain path separators.
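
Client libraries create the illusion of directories on top of such keys by issuing prefix
queries, conventionally using ``\texttt{/}'' as a delimiter.
For the keys above, a listing request with the prefix \texttt{/path1/} and the delimiter
\texttt{/} would return the object \texttt{/path1/} together with the ``common prefix''
\texttt{/path1/path2/}, which a client can then present as a subdirectory:

\begin{verbatim}
LIST ?prefix=/path1/&delimiter=/
  objects:         /path1/
  common prefixes: /path1/path2/
\end{verbatim}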
1222 | 1223 | Objects cannot generally be appended to once created, or renamed. 1224 | They can be replaced by new objects or deleted. 1225 | Some form of copy operation permits an object to be duplicated, creating 1226 | a new object with a different key. 1227 | Such copy operations take place within the storage infrastructure with a 1228 | copy time measurable in megabytes/second. 1229 | 1230 | 1231 | The set of operations offered are normally an extended set of HTTP verbs: 1232 | 1233 | \begin{description}[leftmargin=8em, style=nextline] 1234 | \item[PUT] Atomic write of an object 1235 | \item[GET] retrieve all or part of an object 1236 | \item[HEAD] retrieve the object metadata 1237 | \item[LIST] list all objects starting with a given prefix 1238 | \item[COPY] copy a single object within the store, possibly from other containers. 1239 | \item[DELETE] Delete an object 1240 | \end{description} 1241 | 1242 | There are usually two extra operations to address scale: 1243 | a bulk delete call which may have partial failures, 1244 | and \emph{Multipart Upload}; a way to upload an object larger than the 1245 | 5GB which a single HTTP POST can support. 1246 | The exact nature of multipart uploads varies from store to store. 1247 | For Amazon this is initiated as a sequence of POST calls, one to initiate, 1248 | one or more POST calls with data, and a final POST listing the (ordered) 1249 | etags of the uploaded object parts. 1250 | All but the last upload in the object must be 5 MB or larger 1251 | 1252 | 1253 | Object store implementations can display different levels of inconsistency. 1254 | Windows Azure Storage is fully consistent; 1255 | Amazon S3 offers create consistency on new objects, but not updated or deleted ones. 1256 | It also exhibits listing inconsistency, wherein a newly created object 1257 | may not be visible in the results of a \texttt{LIST} call, or a newly deleted 1258 | object still be listed as present. 1259 | 1260 | 1261 | Despite the clear mismatch between the capabilities and APIs of object storage, 1262 | and that expected of a Hadoop filesystem, they have one key thing in common: 1263 | they can store Petabytes of data. 1264 | For that reason, all the popular cloud storage infrastructures have connectors 1265 | from Hadoop, and thus transitively application such as Apache Hive, Apache HBase 1266 | and Apache Spark. 1267 | Many of these are developed within the Apache Software Foundation's own 1268 | source repository, including the Azure ``wasb'' connector and the ``s3a'' connector 1269 | to Amazon S3. 1270 | Others are maintained externally ---particularly Amazon EMR's own ``EMRFS'', 1271 | known by the \texttt{s3} URL schema, and the Google Cloud Storage connector, 1272 | \texttt{gcs}. 1273 | 1274 | Irrespective of where they are implemented, they all share a common objective: 1275 | trying to maintain the filesystem metaphor atop an object store. 1276 | 1277 | As an example of a simple case, the \texttt{getFileStatus()} call mimics 1278 | a directory, in conjunction with zero-byte ``empty directory'' markers, 1279 | so must look for a file, then a marker and the most expensive operation, a path listing. 1280 | 1281 | \begin{verbatim} 1282 | GET path 1283 | GET path/ 1284 | LIST path/ 1285 | \end{verbatim} 1286 | 1287 | The performance differences of every HTTPS request slows 1288 | down all RPC operations, even with pooled collections: even this simple probe 1289 | can for a file take hundreds of milliseconds. 1290 | Other mimicked operations have similar costs. 
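
As a sketch of how such a probe might look when issued directly against the S3 API
(using the AWS SDK for Java; this is purely illustrative, not the S3A
implementation, and the helper shown is hypothetical):

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;

public class StatusProbe {
  // Probe for a "file", then an empty-directory marker, then a listing.
  static boolean exists(AmazonS3 s3, String bucket, String key) {
    if (s3.doesObjectExist(bucket, key)) {          // HEAD key
      return true;                                  // a "file" exists
    }
    if (s3.doesObjectExist(bucket, key + "/")) {    // HEAD key/
      return true;                                  // an empty directory marker
    }
    ObjectListing listing = s3.listObjects(         // LIST key/ (most expensive)
        new ListObjectsRequest()
            .withBucketName(bucket)
            .withPrefix(key + "/")
            .withMaxKeys(1));
    return !listing.getObjectSummaries().isEmpty();
  }
}
\end{verbatim}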
1291 | 1292 | Operations upon directories are mimicked by listing all objects under that path, 1293 | and acting upon those objects individually. 1294 | A recursive delete is implemented as a listing of the maximum number of files 1295 | returned in one HTTP request (5000 or a similar value), then either issuing 1296 | bulk DELETE operations, where supported, or falling back to individual DELETE 1297 | calls. 1298 | Bulk LIST/DELETE operations have a cost of one HTTP request/page size, such 1299 | as $O(1 + descendants/5000)$; if sequential delete operations must be issued, then 1300 | the cost is at least $O(1+ descendants)$, with the ``at least'' qualifier being 1301 | added because request throttling can slow down the requests even further. 1302 | 1303 | File and directory renaming is even more expensive. 1304 | A file rename is implemented as a copy of the original data to a new path, 1305 | followed by a delete of the original data. 1306 | This makes the time to copy a single file an $O(length(file))$ operation 1307 | \footnote{Third party implementations of the S3 protocol do generally offer an $O(1)$ rename operation}. 1308 | 1309 | Directory rename is a paged listing of all children, and a copy and delete for 1310 | each, which makes its duration a function the number of files and total amount of data. 1311 | 1312 | These are the tangible performance issues, the ones which are most visible 1313 | to users. 1314 | However it is the fact that the atomicity behaviors of a POSIX filesystem 1315 | are not provided which are most dangerous. 1316 | 1317 | The \texttt{rename()} call is no longer atomic: two clients may start renaming 1318 | into to the same destination directory. 1319 | Furthermore, if any rename fails, the state of the source and destination 1320 | directory is unknown: the data may be spread across both locations. 1321 | Finally, because the files to be copied is determined from a LIST call, 1322 | if the object store is not consistent, the listing can be incomplete or out of 1323 | date. 1324 | Newly created files may not be visible, so not copied as part of the rename 1325 | operation. 1326 | 1327 | \emph{Directory rename cannot be used in a commit algorithm which 1328 | requires atomic, exclusive or consistent renames}. 1329 | 1330 | The \texttt{create(path, overwrite=false)} operation is also flawed. 1331 | This is expected to be an atomic operation to immediately create a file iff there is 1332 | no entry at that path; 1333 | Instead may be mimicked by a sequence of the \texttt{getFileStatus()} call 1334 | and the creation of a buffer on the client side for the output: the data 1335 | will not be visible until the data is completely written and the stream 1336 | closed. 1337 | As a result, it is impossible to use file creation as a means of creating any 1338 | form of lock or exclusive access in such a store. 1339 | 1340 | 1341 | Returning to the MapReduce v1 and v2 commit algorithms, they are unsuitable for 1342 | use in any object store without atomic renames (v1), consistent 1343 | directory listings and existence checks (v1 and v2). 1344 | 1345 | As a result, neither can be used directly against Amazon S3 today. 1346 | With a consistent metadata layer such as S3mper or S3Guard, the v2 algorithm 1347 | can be used, though its task commit time will be $O(data)$\ \cite{S3mper,HADOOP-13345}. 
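
To make the \texttt{create(path, overwrite=false)} hazard concrete, consider this
minimal sketch against the Hadoop \texttt{FileSystem} API (the lock path and bucket
name are hypothetical). Against HDFS the \texttt{overwrite=false} check is atomic;
against S3 it is a client-side probe, and the object only appears once the stream
is closed, so two processes can both believe they hold the ``lock'':

\begin{verbatim}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class LockByCreate {
  public static void main(String[] args) throws Exception {
    // Hypothetical lock file used to claim exclusive access.
    Path lock = new Path("s3a://example-bucket/job/_LOCK");
    FileSystem fs = lock.getFileSystem(new Configuration());
    // Intended: fail if the file already exists.
    // Against an object store this check-then-create is not atomic.
    try (FSDataOutputStream out = fs.create(lock, false)) {
      out.writeUTF("owner");
    }
  }
}
\end{verbatim}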

Providing a safe, performant output committer for object stores forces
us to leave the metaphor of a filesystem behind, and embrace
the capabilities of object stores themselves.

% ========================================================================

\section{The new S3A Committers: working with S3 from the outset}
\label{sec:new-committers}


Given that S3 does not deliver the safe and performant operations
which the file committers expect, how can Hadoop and Spark
jobs safely use it as a destination of their work?

This is the problem solved by the new ``S3A committers''.
These are so named because they are closely integrated with Hadoop's S3A connector to S3,
using the multipart upload operation to decouple writing the data from
manifesting it at its final destination.

Multipart upload is already used for writing large files to the object store.
When a file is written, it is initially buffered to disk or memory;
when the buffer size reaches some threshold, the upload is initiated and the
first block uploaded in a \texttt{POST} operation.
S3's response to the POST operation is an MD5-checksum of the
uploaded data, the ``entity tag'', as used in existing HTTP operations.
After all the blocks of a stream have been uploaded, the ordered list
of entity tags is POSTed to S3 in a final request completing the MPU.
It is only after this final POST that the uploaded object is manifest in S3.
If this final POST operation can be used to commit the output of a task,
then the committer has an atomic and effectively $O(1)$ operation for each file.

The challenge for an S3 committer then becomes: how to have
user code write to the destination directory, while preserving and propagating
the lists of MPUs to finally commit in the job commit operation?

That is the challenge addressed in the two committers.

Underneath, they both use the same methods offered by the S3A connector,
and the same persistent data formats to propagate the lists of pending uploads.
Where they differ is how tasks write data, and how the lists are passed
to the job committer.

In the ``Staging Committer'', each task attempt writes its data into the local
filesystem of the server on which the attempt is executed.
When a task attempt is committed, its data is uploaded to the final
paths on S3.
The manifest of the pending MPUs is passed to the job committer via
a shared consistent cluster filesystem (usually HDFS), \emph{using the v1
File Output Committer}.
When the Hadoop or Spark job is committed, the Staging committer reads in
from HDFS the manifests written by the committed task attempts, and
completes the uploads listed therein.

Performance-wise, all the data is uploaded to its final destination in the task commit, with the
job commit being the time to execute the v1 commit operation within HDFS, followed
by that of a POST call per uploaded file.


The ``Magic Committer'' works within the S3A filesystem connector, changing
how files are incrementally written to S3.
1409 | Rather than completing a multipart upload when the output stream being written 1410 | by a task is closed, in the process doing the writing, the magic 1411 | committer delays the final POST until the job is committed. 1412 | Instead it writes a manifest describing the upload to S3. 1413 | When the task is committed, all the single file manifests of that attempt 1414 | are aggregated into a single manifest for the task attempt, which is then 1415 | PUT to S3 in the directory of completed tasks. 1416 | The Job commit process is one of reading in the manifests of all committed 1417 | tasks, and as with the Staging Committer, completing their uploads. 1418 | 1419 | Because of its incremental upload of blocks of the output data, the magic committer promises 1420 | faster uploads of larger datasets: there is no need to postpone the upload 1421 | to S3 until the task is actually committed. 1422 | Because it does not buffer any data other than the yet-to-be-written blocks, 1423 | the amount of local storage is reduced, so potentially avoiding running 1424 | out of local disk capacity\footnote{and/or allow for VMs with less virtual disk to be used}. 1425 | 1426 | 1427 | \subsubsection{The Staging Committer} 1428 | 1429 | The staging committer declares the working directory of a task 1430 | attempt to be in the local filesystem, the directory \texttt{workPath}. 1431 | It is this which is returned in the method \texttt{PathOutputCommitter.getWorkPath()}, 1432 | which is then used in \texttt{FileOutputFormat} to provide the paths which 1433 | callers use when creating files in a task attempt. 1434 | 1435 | 1436 | 1437 | \begin{table} 1438 | \caption{Extra variables used by the staging committer} 1439 | \begin{tabular}{ l l } 1440 | \hline 1441 | \textbf{name} & \textbf{meaning} \\ 1442 | $localfs$ & The local ``file:'' filesystem \\ 1443 | $localAttemptPath$ & A local filesystem path \\ 1444 | $clusterfs$ & The cluster filesystem \\ 1445 | $wrappedCommitter$ & The committer for the cluster filesystem. \\ 1446 | $clusterJobAttemptPath$ & the job attempt path of $wrappedCommitter$ \\ 1447 | $clusterTaskAttemptPath$ & the job attempt path of $wrappedCommitter$ \\ 1448 | \hline 1449 | \end{tabular} 1450 | \label{tab:StagingCommitter.variables} 1451 | \end{table} 1452 | 1453 | %% Define the extra variables for the staging committer 1454 | \newcommand{\StagingVars}{ 1455 | \FileOutputCommitVars 1456 | \SetKwData{clusterfs}{$clusterfs$} 1457 | \SetKwData{wrappedCommitter}{$wrappedCommitter$} 1458 | \SetKwData{clusterJobAttemptPath}{$clusterJobAttemptPath$} 1459 | \SetKwData{clusterTaskAttemptPath}{$clusterTaskAttemptPath$} 1460 | \SetKwData{jobUUID}{$jobUUID$} 1461 | \SetKwData{localfs}{$localfs$} 1462 | \SetKwData{localAttemptPath}{$localAttemptPath$} 1463 | \SetKwData{temp}{_$temporary$} 1464 | } 1465 | 1466 | 1467 | \textbf{Job Setup} 1468 | 1469 | The cluster-filesystem committer, \texttt{wrappedCommitter}. 1470 | is created and initialized, configured to use a unique path within the 1471 | cluster filesystem as its $clusterJobAttemptPath$ output directory. 1472 | This committer will have its own job attempt and task attempt directories. 1473 | This committer is set to use the v1 commit algorithm,. 
1474 | 1475 | %% StagingCommitter.setupJob() 1476 | \begin{procedure} 1477 | \StagingVars 1478 | 1479 | \jobUUID $\longleftarrow$ \newUUID \; 1480 | \clusterJobAttemptPath $\longleftarrow$ \tempDirForStaging + \getUsername + \jobUUID \; 1481 | 1482 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\; 1483 | 1484 | \wrappedCommitter.setupJob(\clusterJobAttemptPath)\; 1485 | \caption{StagingCommitter.setupJob()} 1486 | \label{alg:StagingCommitter.setupJob} 1487 | \end{procedure} 1488 | 1489 | %If the staging committer is configured to fail if the destination exists, 1490 | %this setup will also include a check for the destination path, raising 1491 | %an exception if it is present. 1492 | 1493 | 1494 | % ----------------------------------------------------------------- 1495 | 1496 | \textbf{Task Setup} 1497 | 1498 | 1499 | %% StagingCommitter.setupTask() 1500 | \begin{procedure} 1501 | \StagingVars 1502 | 1503 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\; 1504 | \localAttemptPath $\longleftarrow$ uniquePath(\localfs, \taskAttemptId)\; 1505 | \clusterTaskAttemptPath $\longleftarrow$ \clusterJobAttemptPath + \taskAttemptId)\; 1506 | 1507 | \wrappedCommitter.setupTask(\clusterTaskAttemptPath)\; 1508 | 1509 | \caption{StagingCommitter.setupTask()} 1510 | \label{alg:StagingCommitter.setupTask} 1511 | 1512 | \end{procedure} 1513 | 1514 | The function \texttt{uniquePath(filesystem, taskAttemptId)} is required to return 1515 | a unique path in the local filesystem for a task attempt. 1516 | It does this under the local \texttt{/tmp} directory, which is where 1517 | large intermediate datafiles are stored during MapReduce operations. 1518 | A well managed Hadoop cluster has this temporary data stored on a non-root 1519 | volume, along with a regularly scheduled job to delete old temporary files. 1520 | 1521 | This local filesystem is returned by the committer's \texttt{getWorkPath()} method. 1522 | 1523 | \begin{function} 1524 | \StagingVars 1525 | \return \localAttemptPath\; 1526 | 1527 | \caption{StagingCommitter.getWorkPath()} 1528 | \label{alg:StagingCommitter.getWorkPath} 1529 | \end{function} 1530 | 1531 | This is the crux of the algorithm. 1532 | 1533 | The working path returned to the task attempt execution code in MapReduce and Spark 1534 | is a \texttt{file://}-prefixed local directory, not one in the object store. 1535 | The task attempt commit process is where these will be uploaded, and the job 1536 | commit where the uploads are materialized. 1537 | 1538 | % ----------------------------------------------------------------- 1539 | \textbf{Needs Task Commit} 1540 | 1541 | A commit is required iff data has been generated in the local filesystem. 1542 | 1543 | \begin{function} 1544 | \StagingVars 1545 | 1546 | \return \exists(\localfs, \localAttemptPath)\; 1547 | 1548 | \caption{StagingCommitter.needsTaskCommit()} 1549 | \label{alg:StagingCommitter.needsTaskCommit} 1550 | 1551 | \end{function} 1552 | 1553 | 1554 | \textbf{Task Abort} 1555 | 1556 | A task attempt is aborted by deleting all staged data, and aborting the wrapped committer's 1557 | task. 
1558 | 1559 | \begin{procedure} 1560 | \StagingVars 1561 | 1562 | \delete(\localfs, \localAttemptPath, $recursive$)\; 1563 | \wrappedCommitter.abortTask()\; 1564 | 1565 | \caption{StagingCommitter.abortTask()} 1566 | \label{alg:StagingCommitter.abortTask} 1567 | \end{procedure} 1568 | 1569 | 1570 | 1571 | \textbf{Task Commit} 1572 | 1573 | If a task attempt is given permission to commit its output, it does 1574 | so by initiating multipart uploads of all files under \texttt{localAttemptPath} 1575 | to the final destination directory, uploading the data, but not completing 1576 | the operation. 1577 | 1578 | % commit task 1579 | \begin{procedure} 1580 | \StagingVars 1581 | 1582 | \wrappedCommitter.commitTask()\; 1583 | 1584 | $U \longleftarrow \emptyset$\; 1585 | \For{$f\in $ \listFiles(\localfs, \localAttemptPath) } { 1586 | $U \longleftarrow U + \{$ \uploadFileToPendingCommit($f$, \dest) $\}$\; 1587 | } 1588 | \savePendingSet(\clusterfs, \clusterTaskAttemptPath, $U$)\; 1589 | 1590 | \caption{StagingCommitter.commitTask()} 1591 | \label{alg:StagingCommitter.commitTask} 1592 | 1593 | \end{procedure} 1594 | 1595 | The information needed to complete these pending uploads are then saved as 1596 | a manifest file to \texttt{clusterTaskAttemptPath}, after which the wrapped committer has 1597 | its \texttt{commitTask()} operation called. 1598 | This will rename the saved file into the job attempt directory with the 1599 | filename of the actual task., that is \texttt{\$clusterJobAttemptPath/\$taskId}. 1600 | 1601 | 1602 | \textbf{Job Commit} 1603 | 1604 | The Job commit process manifests the pending uploads. 1605 | 1606 | The list of uploads is found by listing the files in the cluster job attempt path 1607 | This is the directory into which the pending set files of task attempts are 1608 | renamed during their task commits. 1609 | 1610 | % commit job 1611 | \begin{procedure} 1612 | \StagingVars 1613 | 1614 | $Pending \longleftarrow \emptyset$\; 1615 | \For{$f \in $ \listFiles(\clusterfs, \clusterJobAttemptPath)} { 1616 | $Pending \longleftarrow Pending + $ \loadPendingSet(\clusterfs, $f$)\; 1617 | } 1618 | \checkForConflicts(\fs, $Pending$) \; 1619 | \For{$p \in Pending$} { 1620 | \completeUpload($p$)\; 1621 | } 1622 | 1623 | \caption{StagingCommitter.commitJob()} 1624 | \label{alg:StagingCommitter.commitJob} 1625 | \end{procedure} 1626 | 1627 | The \texttt{completeUpload()} operation completes the upload of a file by POST-ing 1628 | a complete-multipart-upload requesting list the ordered MD5 checksums of every block 1629 | previously uploaded. 1630 | 1631 | Note that \texttt{wrappedCommitter.commitJob()} is not invoked; 1632 | because the location of the pending set files of this job attempt is known, 1633 | they can be read directly. 1634 | This is a minor optimization. 1635 | 1636 | % Abort Job 1637 | 1638 | \textbf{Job Abort} 1639 | 1640 | To abort an entire job, the set of pending uploads must be enumerated as 1641 | per job commit, only now the jobs are aborted. 
1642 | 1643 | \begin{procedure} 1644 | \StagingVars 1645 | 1646 | $Pending \longleftarrow \emptyset$\; 1647 | \For{$f \in $ \listFiles(\clusterfs, \clusterJobAttemptPath)} { 1648 | $Pending \longleftarrow Pending + $ \loadPendingSet(\clusterfs, $f$)\; 1649 | } 1650 | \For{$p \in Pending$} { 1651 | \abortUpload($p$)\; 1652 | } 1653 | \wrappedCommitter.abortJob()\; 1654 | 1655 | \caption{StagingCommitter.abortJob()} 1656 | \label{alg:StagingCommitter.abortJob} 1657 | \end{procedure} 1658 | 1659 | % Cleanup 1660 | 1661 | \textbf{Job Cleanup} 1662 | 1663 | To clean up a job all incomplete uploads targeted at or under 1664 | the output directory must be enumerated and aborted, which can be done 1665 | with a POST to S3 to list the outstanding uploads, and another POST per 1666 | upload to abort. 1667 | 1668 | Local task attempt directories must be deleted, as well as those in the shared cluster. 1669 | 1670 | \begin{procedure} 1671 | \StagingVars 1672 | 1673 | \For{$f \in $ \listPendingUploads(\dest)} { 1674 | \abortUpload($f$)\; 1675 | } 1676 | 1677 | \delete(\localfs, $local directories for job$, recursive)\; 1678 | \wrappedCommitter.cleanupJob()\; 1679 | 1680 | \caption{StagingCommitter.cleanupJob()} 1681 | \label{alg:StagingCommitter.cleanupJob} 1682 | \end{procedure} 1683 | 1684 | 1685 | As those local task attempt directories are local to the nodes executing 1686 | individual tasks, they will not be deleted in the job cleanup, except for 1687 | those tasks which were executed on the same host as that where the 1688 | \texttt{cleanupJob()} operation is invoked. 1689 | 1690 | 1691 | \subsubsection{Enhancing conflict resolution for a zero-rename workflow} 1692 | 1693 | One aspect of the commit algorithm omitted is how this committer resolves 1694 | conflict with existing files. 1695 | The \texttt{FileOutputCommitter} algorithms fail if there is destination data; 1696 | they are required to have an empty output directory. 1697 | 1698 | The Staging Committer supports alternative policies, and may be configured 1699 | to overwrite or add to data in the destination directory. 1700 | To guarantee that newly added files have unique names, the 1701 | uploaded files can have a unique ID inserted in their filenames. 1702 | 1703 | One conflict resolution option is targeted explicitly at Spark SQL queries writing 1704 | output in the layout structure popularized by Apache Hive, wherein 1705 | different levels in the directory tree are used to partition data. 1706 | 1707 | For example, data could be partitioned by year, month and day, such as 1708 | \texttt{/data/YEAR=2017/MONTH=12/DAY=21/}. 1709 | Partitioning increases query performance where only select field ranges are used; 1710 | any query of December 2017 only needs to look under all subdirectories of 1711 | \texttt{/data/YEAR=2017/MONTH=12/}, ignoring all adjacent directories. 1712 | 1713 | Often large datasets like these are built up over time, with nightly or hourly 1714 | results being added. 1715 | In a traditional workflow, this is normally by done by 1716 | executing the new query into an empty directory, then, once the job has succeeded, 1717 | moving the new data into the aggregate dataset through `rename()` operations. 1718 | This generate-then-rename strategy ensures that if a job fails, no matter when 1719 | it happened, original dataset is unchanged, 1720 | and that applications can continue to use the current set of files. 
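
A sketch of that traditional promotion step, using the Hadoop \texttt{FileSystem}
API (the paths shown are illustrative only):

\begin{verbatim}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PromoteNewPartition {
  public static void main(String[] args) throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    // 1. The query has written its output into an isolated directory.
    Path staging = new Path("/work/incoming/DAY=21");
    // 2. Only after the job succeeds is it promoted into the shared dataset.
    Path published = new Path("/data/YEAR=2017/MONTH=12/DAY=21");
    fs.mkdirs(published.getParent());
    if (!fs.rename(staging, published)) {
      throw new IOException("failed to promote " + staging);
    }
  }
}
\end{verbatim}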

In an object store, that rename operation is, of course, another expensive copy
operation, with its own failure modes.

What to do?

The solution as developed and utilized at Netflix is to have a special mode
of the committer, ``Partitioned'', which expects all data to be written
into one or more subdirectories of a partitioned dataset, a dataset which
may already exist in the destination directory.

Conflict resolution is scoped purely to the destination partitions,
ignoring all other partitions in the existing dataset.
In the Job Commit operation, the ``fail'' conflict option will only fail if there
are existing files in the partitions to which new files are added;
the ``overwrite'' option will cause the existing files in the destination
partitions to be deleted.

Thus, the ``Partitioned Staging Committer''
permits jobs to be run with their destination set to the actively
shared dataset, while existing queries can continue to run across the data.
By eliminating the need to copy data at the end of an isolated query,
it can speed up an execute-then-rename workflow.


\subsection{The Magic Committer}
\label{subsec:magic-committer}

Rather than stage data in the local filesystem, the magic committer
allows task attempts to write directly to the object store as
delayed multipart uploads.

One challenge of the committer is: how to determine when a client wants to initiate
a delayed-visibility write operation?


Whenever a file is written to a directory under the path \texttt{__magic},
it is considered to be a delayed write operation.
The relative path under this directory is mapped as being relative to the
job's destination directory ---the parent directory of the \texttt{__magic} path.

To support multiple job and task attempts, the output of every task attempt
must be written so as to be relative to the Job's destination directory.
Accordingly, whenever a directory with the name \texttt{__base} is
encountered, it declares that its contents must be mapped relative to the destination
directory.

\begin{table}
\caption{Example magic path mappings}
\begin{tabular}{ l l }
\hline
\textbf{original} & \textbf{final} \\
\texttt{dest} & \texttt{dest} \\
\texttt{dest/__magic/1} & \texttt{dest/1} \\
\texttt{dest/__magic/1/2} & \texttt{dest/1/2} \\
\texttt{dest/__magic/job1/task003/__base/3} & \texttt{dest/3} \\
\texttt{dest/__magic/job2/task004/__base/4/5} & \texttt{dest/4/5} \\
\texttt{dest/__magic/1/2.pending} & \texttt{dest/__magic/1/2.pending} \\
\texttt{dest/__magic/job1/task003.pendingset} & \texttt{dest/__magic/job1/task003.pendingset} \\
\hline
\end{tabular}
\label{tab:magic-paths}
\end{table}


When a magic output stream is closed, the manifest of the single upload is saved
to a \texttt{.pending}-suffixed file under the \texttt{__magic} path,
along with a 0-byte marker file at the original path.
The latter is required to satisfy applications which verify the existence
of their written file.
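
The remapping rules of Table~\ref{tab:magic-paths} amount to a simple path
transformation; the following is an illustrative sketch of that mapping only
(the real S3A code also handles the \texttt{.pending} and \texttt{.pendingset}
suffixes, which are never remapped):

\begin{verbatim}
public class MagicPathMapper {
  // Map a path written under __magic to its final destination,
  // following the example mappings in the table above.
  static String finalDestination(String path) {
    int magic = path.indexOf("/__magic/");
    if (magic < 0) {
      return path;                              // not a magic path: unchanged
    }
    String parent = path.substring(0, magic);   // the job destination directory
    String relative = path.substring(magic + "/__magic/".length());
    int base = relative.indexOf("/__base/");
    if (base >= 0) {
      // Everything before __base names the job/task attempt; everything
      // after it is relative to the destination directory.
      relative = relative.substring(base + "/__base/".length());
    }
    return parent + "/" + relative;
  }

  public static void main(String[] args) {
    System.out.println(finalDestination("dest/__magic/1/2"));                   // dest/1/2
    System.out.println(finalDestination("dest/__magic/job1/task003/__base/3")); // dest/3
  }
}
\end{verbatim}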
1791 | 1792 | When a task attempt is committed, all \texttt{.pending} files under its task attempt directory are 1793 | listed and saved into a single \texttt{.pendingset} file into the job attempt directory. 1794 | 1795 | When the job is committed, all \texttt{.pendingset} files in its job attempt 1796 | directory are loaded, and the outstanding uploads listed therein committed. 1797 | 1798 | Because of its use of list operations to enumerate uploads to commit, this 1799 | committer needs consistent metadata listings of the object store. 1800 | This is provided by the S3Guard extension to S3A\ \cite{HADOOP-13345}, 1801 | which uses Amazon's DynamoDB database for the consistent metadata view. 1802 | This significantly speeds up the listing operations, so speeding up the task 1803 | and job commit operations. 1804 | 1805 | %% Define the extra variables for the magic committer 1806 | \newcommand{\MagicVars}{ 1807 | \FileOutputCommitVars 1808 | \SetKwData{temp}{_$temporary$} 1809 | \SetKwData{magic}{\_\_magic} 1810 | \SetKwData{magicPath}{$magicPath$} 1811 | } 1812 | 1813 | 1814 | \begin{table} 1815 | \caption{Extra variables used by the magic committer} 1816 | \begin{tabular}{ l l } 1817 | \hline 1818 | \textbf{name} & \textbf{meaning} \\ 1819 | 1820 | $magicPath$ & The magic directory \\ 1821 | \hline 1822 | \end{tabular} 1823 | \label{tab:MagicCommitter.variables} 1824 | \end{table} 1825 | 1826 | 1827 | \textbf{Job Setup} 1828 | 1829 | 1830 | %% StagingCommitter.setupJob( 1831 | \begin{procedure} 1832 | \MagicVars 1833 | 1834 | \magicPath $\longleftarrow$ \dest + __magic\; 1835 | \jobAttemptPath $\longleftarrow$ \magicPath + + \jobAttemptId\; 1836 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\; 1837 | \mkdirs(\fs, \jobAttemptPath)\; 1838 | \caption{MagicCommitter.setupJob()} 1839 | \label{alg:MagicCommitter.setupJob} 1840 | 1841 | \end{procedure} 1842 | 1843 | \textbf{Task Setup} 1844 | 1845 | 1846 | %% MagicCommitter.setupTask() 1847 | \begin{procedure} 1848 | \MagicVars 1849 | 1850 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\; 1851 | \mkdirs(\fs, \taskAttemptPath)\; 1852 | 1853 | \caption{MagicCommitter.setupTask()} 1854 | \label{alg:MagicCommitter.setupTask} 1855 | 1856 | \end{procedure} 1857 | 1858 | \textbf{Needs Task Commit} 1859 | 1860 | A commit is required iff files are pending, which is true if there are 1861 | files to upload. 1862 | 1863 | \begin{function} 1864 | \StagingVars 1865 | 1866 | \return \exists(\fs, \taskAttemptPath)\; 1867 | 1868 | \caption{MagicCommitter.needsTaskCommit()} 1869 | \label{alg:MagicCommitter.needsTaskCommit} 1870 | 1871 | \end{function} 1872 | 1873 | This will return true even if there are no \texttt{.pending} files under the task attempt 1874 | path. 1875 | A full path listing could determine this, but as this itself is potentially a slow 1876 | operation, we have omitted it, relying on the task commit process to 1877 | handle the case of no output being generated. 1878 | 1879 | % commit task 1880 | 1881 | A task attempt is committed by listing all the single \texttt{.pending} files 1882 | under a the task attempt directory, reading in the contents and merging it 1883 | into the set of all pending uploads initiated by this task attempt. 1884 | This file is then saved as a \texttt{.pendingset} file into the job attempt directory, 1885 | which is still in the \texttt{__magic} directory. 

\begin{procedure}
\MagicVars

$Pending \longleftarrow \emptyset$\;
\For{$f \in $ \listFiles(\fs, \taskAttemptPath, $recursive$)} {
$Pending \longleftarrow Pending + \{$ \loadPendingFile(\fs, $f$) $\}$\;
}

\savePendingSet(\fs, \jobAttemptPath + $taskId$, $Pending$)\;

\caption{MagicCommitter.commitTask()}
\label{alg:MagicCommitter.commitTask}

\end{procedure}

Because the \texttt{.pendingset} file is written in a single atomic PUT, the
commit of an individual task attempt is atomic.

If there are no \texttt{.pending} files, the saved \texttt{.pendingset} file
will simply contain an empty list of pending uploads.


\textbf{Task Abort}

A task is aborted by listing all \texttt{.pending} files in the task attempt directory,
then aborting the upload associated with each.

\begin{procedure}
\MagicVars

\For{$f \in $ \listFiles(\fs, \taskAttemptPath, $recursive$)} {
\abortUpload(\loadPendingFile(\fs, $f$))\;
}

\caption{MagicCommitter.abortTask()}
\label{alg:MagicCommitter.abortTask}
\end{procedure}


\textbf{Job Commit}

The Job commit operation is very similar to that of the Staging Committer, because
they are doing nearly the same operation: loading in the \texttt{.pendingset} files
from a directory and completing the uploads listed within.

% commit job
\begin{procedure}
\MagicVars

$Pending \longleftarrow \emptyset$\;
\For {$f \in $ \listFiles(\fs, \jobAttemptPath)} {
$Pending \longleftarrow Pending + $ \loadPendingSet(\fs, $f$)\;
}
\For {$p \in Pending$} {
\completeUpload($p$)\;
}

\caption{MagicCommitter.commitJob()}
\label{alg:MagicCommitter.commitJob}
\end{procedure}


This committer does not currently support the ``partitioned commit'' conflict
resolution mechanism, so omits the conflict handling operation.
Otherwise it is identical, and has similar performance and (non-)atomicity
characteristics.


% Abort Job

\textbf{Job Abort / Job Cleanup}

A job is aborted and/or cleaned up by aborting all outstanding uploads
pending against the destination directory.

\begin{procedure}
\MagicVars

\For {$f \in $ \listPendingUploads(\dest)} {
\abortUpload($f$)\;
}

\caption{MagicCommitter.abortJob()}
\label{alg:MagicCommitter.abortJob}
\end{procedure}


No attempt is made here to list any \texttt{.pending} or \texttt{.pendingset} files.
The committer cannot rely on those to enumerate all uploaded files,
specifically those of failed task attempts, where the information about
the pending uploads may not have been saved.
Asking the S3 store to enumerate all pending uploads, and then aborting each one,
guarantees that all incomplete uploads will be aborted.
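
A sketch of this enumerate-and-abort cleanup, expressed directly against the AWS
SDK for Java (the bucket and prefix are illustrative; the committers issue the
equivalent calls through the S3A connector):

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import com.amazonaws.services.s3.model.ListMultipartUploadsRequest;
import com.amazonaws.services.s3.model.MultipartUpload;
import com.amazonaws.services.s3.model.MultipartUploadListing;

public class AbortPendingUploads {
  public static void main(String[] args) {
    AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
    String bucket = "example-bucket";            // hypothetical
    String destPrefix = "output/dataset1/";      // the job destination
    ListMultipartUploadsRequest request =
        new ListMultipartUploadsRequest(bucket).withPrefix(destPrefix);
    MultipartUploadListing listing;
    do {
      listing = s3.listMultipartUploads(request);
      for (MultipartUpload upload : listing.getMultipartUploads()) {
        // Abort every upload still pending under the destination.
        s3.abortMultipartUpload(new AbortMultipartUploadRequest(
            bucket, upload.getKey(), upload.getUploadId()));
      }
      // Continue from the pagination markers if the listing was truncated.
      request.setKeyMarker(listing.getNextKeyMarker());
      request.setUploadIdMarker(listing.getNextUploadIdMarker());
    } while (listing.isTruncated());
  }
}
\end{verbatim}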



% ========================================================================
\section{Integration with Hadoop and Spark}
\label{sec:integration}

A major challenge with this work is integrating the committers with MapReduce
and Spark, without making changes to their commit protocols themselves.
This is complicated by the fact that the choice of committer to use is
not made directly by their commit engines, but, when the Hadoop file output
formats are used, returned by the method \texttt{OutputFormat.getOutputCommitter}.
Hadoop's file output formats all extend a common \texttt{FileOutputFormat},
so return its committer, the \texttt{FileOutputCommitter}, or a custom subclass thereof.

How to switch all the existing subclasses of \texttt{FileOutputFormat} to using
a new committer when using an object store as a destination?

This was achieved by modifying \texttt{FileOutputFormat}, so that rather than
only working with the standard \texttt{FileOutputCommitter}, it was possible to declare
a different committer factory for different filesystem schemas\ \cite{MAPREDUCE-6823}.
The \texttt{s3a:} schema is configured to refer to an S3A-specific factory, which
returns the specific S3A committer chosen in the job configuration.

An alternative strategy would have been to retrofit an ``algorithm 3'' inside
the \texttt{FileOutputCommitter}, which would have implemented the plugin point.
This would have permitted the new committers to be inserted underneath any
subclass, so retrofitting them to classes such as Parquet's \texttt{ParquetOutputCommitter}.
We chose not to do this for several reasons:

\begin{enumerate}
\item The existing code is complex, containing two intermixed co-recursive
algorithms.
\item Our changes could unintentionally break the correctness of the existing committer.
\item Subclasses of the existing committer may have been implemented to extend
the protocol, perhaps by summarizing the output, writing extra files, etc.
Changing the superclass behavior to not create output files until job commit
ran the risk of breaking all this code.
\end{enumerate}

The factory design eliminated these risks at the expense of complicating
Spark/Parquet integration.

To address this, we ultimately implemented two binding classes.
The \texttt{PathOutputCommitProtocol} extends Spark's
\texttt{HadoopMapReduceCommitProtocol} class, relaxing the requirement that a
committer be a subclass of \texttt{FileOutputCommitter}.

The \texttt{BindingParquetOutputCommitter} then extends Parquet's
\texttt{ParquetOutputCommitter} class, relaying
all commit operations to whichever committer was dynamically created
through the factory mechanism.
This allows Spark's requirement ``ParquetFileFormat requires a ParquetOutputCommitter''
to be satisfied with any of the factory-created committers.

% During the development of the committers, a change in Spark caused the
% tests to fail.
% Spark was enhanced to measure the amount of data created by a task, by
% measuring the length of the written file\ \cite{SPARK-21669}.
% With the Magic Committer, there is no written file, not until the job is committed.
2040 | % Accordingly: the probe failed, so resulting in a task and, transitively a job, failure. 2041 | % The Magic Committer was extended to create a zero-byte file in the expected path, 2042 | % so guaranteed that the existence check will hold. 2043 | % It does mean, however, that the statistics collected by Spark will not measure 2044 | % the amount of data written. 2045 | 2046 | 2047 | 2048 | % ======================================================================== 2049 | \section{Correctness} 2050 | \label{sec:correctness} 2051 | 2052 | The two new committers implement variants of the same concept: delaying 2053 | manifesting of multipart uploads. 2054 | Do the new algorithms actually \emph{work}? 2055 | 2056 | 2057 | \subsubsection{Defining Correctness of Committed work} 2058 | 2059 | First, a definition of correct behavior must be defined. 2060 | 2061 | \begin{paragraph} 2062 | \textbf{Completeness of job output.} 2063 | After a successful invocation of \texttt{commitJob()}, 2064 | the destination directory tree will contain all files written under the output directory 2065 | of all task attempts which successfully returned from an invocation of \texttt{commitTask()}. 2066 | The contents of these files will contain exactly the data written by the user code. 2067 | \emph{``You get what was committed''} 2068 | \end{paragraph} 2069 | 2070 | \begin{paragraph} 2071 | \textbf{Exclusivity of output.} 2072 | After a successful invocation of \texttt{commitJob()}, 2073 | the destination directory tree must only contain the output of successfully 2074 | committed tasks. 2075 | \emph{``And not what wasn't''}. 2076 | \end{paragraph} 2077 | 2078 | \begin{paragraph} 2079 | \textbf{Consistency of the commit.} 2080 | The task or job must be able to reliably commit the work, even in the presence 2081 | of inconsistent listings. 2082 | This could be addressed, for example, by using a consistent store for some operations, 2083 | or a manifest mechanism and a reliance on create consistency. 2084 | Consistency with subsequent queries in a workflow is encouraged, else a ``sufficient'' 2085 | delay is needed for the listings to become consistent. 2086 | \emph{``Addresses store inconsistencies, somehow''} 2087 | \end{paragraph} 2088 | 2089 | \begin{paragraph} 2090 | \textbf{Concurrent.} 2091 | Multiple tasks in the same job must be able to commit concurrently. 2092 | A job must be able to commit its work while other jobs are committing 2093 | their work \emph{to different destinations in the store}. 2094 | \end{paragraph} 2095 | 2096 | \begin{paragraph} 2097 | \textbf{Ability to abort.} 2098 | If a job attempt is aborted before \texttt{commitJob()}, is invoked, and 2099 | \texttt{cleanupJob()} called, then the output of the attempt will not appear in the 2100 | destination directory at any point in the future. 2101 | \emph{``An aborted/cleaned up job no longer exists''} 2102 | \end{paragraph} 2103 | 2104 | 2105 | \begin{paragraph} 2106 | \textbf{Continuity of correctness.} 2107 | After a job has been successfully committed, no outstanding task may promote 2108 | output into the destination directory. 2109 | That is: if a task attempt has not ``failed'' mid-commit, merely proceeded at a slow rate, 2110 | its output will not contaminate the directory of the already-successful job. 2111 | \emph{``A dead task attempt stays dead''} 2112 | \end{paragraph} 2113 | 2114 | 2115 | The \emph{continuity-of-correctness} requirement excludes that of a failed job. 
We depend here upon the restriction that a job will not commit its work unless
a heartbeat has been received from the YARN ResourceManager within a predefined time interval.
Assuming all clocks move forward at approximately the same rate, if a job has
not received or responded to heartbeats within that interval,
we can conclude that the process will no longer commit work.
This failure to respond to heartbeats triggers YARN rescheduling a new
instance of the Job Manager and an attempt to kill the previous attempt.
A second job attempt may conclude from the very fact that it has been launched
that the previous job attempt will not attempt to commit its work
\footnote{the monotonically increasing YARN attempt ID value implicitly
informs the Job whether or not it is the first attempt}.

This definition of correctness omits some constraints:

\begin{itemize}
\item That the output of committed tasks is not present in the output directory
until the job is committed.
Rationale: It is the final state of the job which matters, not intermediate states.

\item That the task commit operation is atomic.
Rationale: The v2 commit algorithm does not meet this requirement.

\item That the job commit operation is atomic.
Rationale: The v1 commit algorithm does not meet this requirement.

\item That concurrent jobs writing to the same destination will succeed and
produce output equivalent to a serialized commit of the jobs.
Rationale: none of the original commit algorithms offers such guarantees.
\end{itemize}

The implication of not requiring these constraints is that the higher-level
commit protocol must react to failures or timeouts of the task and job
commit operations.

%Concurrency cannot easily be handled in the commit protocol except through
%some mechanism of obtaining an exclusive lock of operations
%on the destination path, one shared with all applications which may write
%to that path.
%Directory existence may be such an option for filesystems
%supporting an atomic check-and-create \tt{mkdir()} call, though it is not
%a check which the \tt{FileOutputCommitter} directly performs.


We do not attempt to provide a formal proof of the correctness of the algorithms.
A TLA+ specification of the behavior of a consistent object store was created
during the process; however, we have not completed it with any
algorithm specifications\ \cite{s3-tla}.
Modelling an eventually consistent store is ``somewhat challenging''.
At the same time, it is fundamentally impossible to demonstrate through testing
that the algorithms are correct in the presence of inconsistency ---which argues
strongly for such a correctness proof.

In the absence of proofs,
here are our informal assertions about the correctness of the two algorithms.

\subsubsection{Correctness of the Staging Committer}

All task attempt output is written to the local filesystem;
it is implicitly not in the destination object store until task commit.

In task commit, the contents of the local attempt directory are uploaded to the
destination, as incomplete uploads.
Hence: not visible until an operation completes the multipart upload.
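
The essence of such an incomplete upload, sketched against the AWS SDK for Java
(the bucket, key and local file are illustrative; the committers drive this
through the S3A connector rather than calling the SDK directly):

\begin{verbatim}
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.UploadPartRequest;

public class StagePendingUpload {
  public static void main(String[] args) {
    AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
    String bucket = "example-bucket";                       // hypothetical
    String key = "output/part-0000";                        // final destination
    File staged = new File("/tmp/attempt_000/part-0000");   // locally staged data

    // 1. Initiate the multipart upload; nothing is visible at the key yet.
    String uploadId = s3.initiateMultipartUpload(
        new InitiateMultipartUploadRequest(bucket, key)).getUploadId();

    // 2. Upload the data as one or more parts, retaining the part etags.
    List<PartETag> parts = new ArrayList<>();
    parts.add(s3.uploadPart(new UploadPartRequest()
        .withBucketName(bucket).withKey(key)
        .withUploadId(uploadId).withPartNumber(1)
        .withFile(staged).withPartSize(staged.length())).getPartETag());

    // 3. Deliberately do NOT complete the upload here. The tuple
    //    (bucket, key, uploadId, parts) is what the manifest records,
    //    for the job commit to complete or an abort to cancel.
  }
}
\end{verbatim}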
2179 | 2180 | A task attempt's manifest of uploads to complete is saved to the cluster filesystem, 2181 | where the v1 commit algorithm is used to commit this file. 2182 | Thus the commitment of upload data has the same correctness as the 2183 | v1 commit algorithm on a consistent HDFS filesystem. 2184 | 2185 | In the job commit, the v1 commit algorithm ensures that the contents 2186 | of $clusterJobAttemptPath$ contains only the manifest of committed tasks. 2187 | 2188 | As the v1 algorithm satisfies the completeness and exclusivity requirements, 2189 | we can be confident that reading in these lists will build an aggregate list 2190 | of files to commit, a list which is, transitively, complete and exclusive. 2191 | 2192 | The subsequent job commit action is to complete these uploads, 2193 | then cancel all other multipart uploads pending against the directory tree. 2194 | This will cancel the pending work of all tasks attempts which have uploaded staged 2195 | data, but which were somehow not included in the list of committed tasks. 2196 | That is: they failed during the task commit process. 2197 | 2198 | Because HDFS is used to propagate the lists of uncommitted files from 2199 | committed task attempts to the job committer, there is no requirement for 2200 | a consistent view of the object store during the actual job. 2201 | For the results of a successful job to be safely used by another application, 2202 | something must still present a consistent view of the results, or 2203 | the interval between the publishing of the results and their use must be sufficient 2204 | for the users to be \emph{confident} that the store is now consistent, 2205 | or at least optimistic\ \cite{Bermbach:2014:BEC:2624303.2624662}. 2206 | 2207 | 2208 | 2209 | \subsubsection{Correctness of the Magic Committer} 2210 | 2211 | This is harder to demonstrate, and depends on consistent directory 2212 | listings of the object stores, that is: all files created under a path 2213 | in the object store are visible to the LIST operation. 2214 | For Amazon S3, this requires a consistency layer, such as S3mper or S3Guard 2215 | \ \cite{S3mper,HADOOP-13345}. 2216 | Implementations of the S3 Store API which offer consistent listings are not 2217 | at risk. 2218 | 2219 | All task attempt output is written to the object store, to the final (calculated) 2220 | destination. 2221 | However, the writes are not made visible until the job is committed. 2222 | 2223 | The requirements of completeness and exclusivity must be met by 2224 | having the lists of pending uploads generated by committed task attempts propagated 2225 | to the Job Commit phase, and the list of pending uploads from uncommitted 2226 | attempts not propagated to the Job Commit. 2227 | 2228 | That is: 2229 | 2230 | \begin{enumerate} 2231 | \item All pending uploads written by a committed task attempt must be 2232 | included in the final list of uploads for the job to commit. 2233 | \item No pending uploads by a failed task attempt must be included in this list. 2234 | \item A partitioned task attempt's uploads must never become visible, 2235 | even if uploaded after task or job commit. 2236 | \end{enumerate} 2237 | 2238 | 2239 | Reviewing this code, there appears to be a small race condition in job commit, 2240 | wherein a task attempt partitioned from the Job Manager during task commit 2241 | can still complete its PUT of its list of uploads to commit, the ``pending set'', 2242 | overwriting that of the task attempt which had considered itself successful. 
2243 | 2244 | We cannot defend against that with the traditional strategy of creating 2245 | a file with overwrite=false, because against S3, there is no atomic 2246 | ``create-no-overwrite'' operation. 2247 | 2248 | Instead we rely on the higher level requirement that any committed task attempt must 2249 | constitute a valid outcome, and argue that the pending set from either attempt 2250 | must constitute a valid result of a task. 2251 | 2252 | 2253 | It's notable that this process could be improved were the job commit 2254 | operation supplied with a list of successful task attempts; 2255 | this would avoid inferring this state from the filesystem, except in 2256 | the case of job recovery from a commit algorithm capable of 2257 | rebuilding its state from a directory listing (i.e.\ the v1 committer). 2258 | Spark's protocol already permits this, but not Hadoop's. 2259 | 2260 | Regarding the requirement to abort safely, the fact that all writes are 2261 | not manifest until job commit means that the any writes from failed tasks 2262 | will remain ``pending''. 2263 | 2264 | Data in this state is still billed by byte, so must not be neglected. 2265 | After the job commits all successful tasks it lists all outstanding 2266 | uploads against the destination directory and cancels them. 2267 | We implemented a command line tools to list and cancel pending uploads for 2268 | scheduled workloads, and, finally, 2269 | it is possible to set a rule on an S3 bucket whereby uncompleted 2270 | pending uploads are deleted a specific time interval after their creation. 2271 | Our documentation recommends an interval of twenty-four hours here, to 2272 | clean out old data yet without affecting jobs ---assuming that all jobs 2273 | take less than a day to complete. 2274 | 2275 | 2276 | \subsection{Testing} 2277 | \label{subsec:testing} 2278 | 2279 | Confidence in the correctness of the algorithms notwithstanding, there 2280 | is still the issue of the correctness of the implementation. 2281 | 2282 | 2283 | This was done through testing: 2284 | 2285 | \begin{enumerate} 2286 | \item Functional tests of the underlying IO operations against Amazon S3. 2287 | \item Tests of the commit operation against a mock S3 service endpoint. 2288 | \item Invocations of the commit protocols in the normal and failing sequences of operations. 2289 | \item Integration tests on a single host MapReduce cluster. 2290 | \item Single-host integration tests of Spark integration, tests derived from Spark's own SQL test suites. 2291 | \item Large scale integration tests in virtual test clusters. 2292 | \item Peer review. 2293 | \end{enumerate} 2294 | 2295 | To aid in demonstrating resilience to metadata inconsistency 2296 | operations and transient network failures, Hadoop's \texttt{hadoop-aws} module 2297 | now contains a special fault-injecting S3 connector: idempotent throttling errors and 2298 | delayed consistency can both be simulated in the downstream tests; 2299 | this was used in integration testing. 2300 | 2301 | The large-scale integration tests have not, at the time of writing, highlighted any problems; 2302 | the simpler test suites were co-developed with the code, and exposing issues and 2303 | being expanded as new issues were discovered. 2304 | One bug the integration tests did show that our committers' cleanup code was 2305 | over-aggressive in listing and cancelling all outstanding uploads pending 2306 | on the destination directory. 



The \texttt{cleanupJob()} procedure used the existing S3A client command
\texttt{listMultipartUploads(directory)} to enumerate the uploads,
which were then cancelled.
A detailed review of this code while trying to identify an intermittent problem
made clear that this existing routine had a long-standing bug in it.
Rather than just list all uploads under a directory, it also included
all uploads in directories whose paths began with the same string.
That is, listing and cancelling pending work in the directory \texttt{/output/dataset1}
would also cancel the pending output in \texttt{/output/dataset10}, \texttt{/output/dataset11/work},
and so on.
We are fortunate that this was found before the product shipped.
This does, however, highlight our implementation's dependencies on the correctness
of the existing codebase, and how hard it is to imagine test cases which
can demonstrate the existence of bugs.
Who would have expected a test running in \texttt{/output/dataset1} to
break an independent test in \texttt{/output/dataset10}, if and only if the two test
executions overlapped and the first test executed its \texttt{cleanupJob()}
operation while the second had committed at least one task but not yet committed
the final job?


Peer review is an integral part of the development process;
it was invaluable to have other developers interested in this problem
and willing to contribute time reviewing the code and testing it
in their own environments, including a commercial S3-compatible
storage system.


% ========================================================================

\section{Results}
\label{sec:results}


The performance benefit of the new committers is not visible with small amounts
of data, as the number of HTTP requests is the dominant factor.
As the amount of data increases, the elimination of the copy operations
delivers a significant speedup to the new committers.
With a measured in-S3 copy rate of approximately 6--10~MB/s, the saving is on the
order of 1 second per 10~MB of data committed.

Comparing the staging and magic committers is interesting.

The Staging committer writes all data locally, at the write bandwidth
of the local (usually virtual) disk.
In task commit, this data must be read and uploaded to the S3 service.
Usually it is the bandwidth between the server and S3 which is the bottleneck,
though as S3 throttles requests to specific shards, having many servers trying
to write to the same destination directory tree can slow down the write, irrespective
of bandwidth\ \cite{AWS-S3-throttling}.
If a single task has generated many files, or many tasks of the same job are
committing nearly simultaneously, this may be observed\footnote{Throttling can also
be observed on read operations;
in such situations adding more workers is counterproductive.}.

Job commit is a matter of reading the small \texttt{.pendingset} files saved in the
cluster filesystem (HDFS), and then issuing the relevant POSTs: one per uploaded
object.
This is parallelized, and not constrained by bandwidth.
Capacity in a local pool of HTTP/1.1 connections, the time to create more,
and potentially throttling are the primary limits on IO performance at this point.

The Magic Committer uploads data in blocks as it is written: the larger
the amount of data created by a single task, the greater the performance
benefit over the Staging committer's task-commit-time upload.
However, task commit does list the task attempt directory and read all \texttt{.pending}
files within, an operation which can take a few hundred milliseconds per file,
and which, again, is potentially throttled.
With only a single summary file written back to S3, task commit is never
bandwidth constrained.

Job commit time is that of the Staging Committer, preceded by a listing
of, and reading in of, the pending files of every committed task.
This is again a few hundred milliseconds per file, though parallelization
can reduce the delay.

Ignoring throttling, the Magic Committer is best when tasks create
large amounts of data in each task attempt.
As well as avoiding the upload in the task commit, the reduced
amount of storage needed in the virtual machine means that VM and container instances
with smaller amounts of storage can be requested, or simply more tasks executed
per VM: computation, RAM and network bandwidth become the bottlenecks.

In production use, we have found that the default size of the HTTP thread
pool becomes a bottleneck in the job commit phase for any query
containing many thousands of files.
The small-payload POST requests are executed in parallel for higher
throughput, but the default limit on the number of HTTP connections, 15,
limits that parallelization.
Increasing this value to a larger number, such as 50 to 100, significantly
speeds up this phase of a query.

One final feature to highlight is the ``partitioned committer'' variant
of the Staging Committer, which is designed to update an existing
dataset in place, only considering conflict with existing data in
those partitions for which data is actually generated.
This supports workflows where large datasets are updated on a daily basis,
without the need for any post-job copy of the new day's data into the
final dataset.
If the existing workflow for maintaining such large datasets involved
moving the new data into the aggregated dataset, those renames themselves
suffer from the performance constraints of the store's COPY operation.
Here, then, the speedup comes from the overall workflow, rather than
simply the query.


% ========================================================================

\section{Limitations}
\label{sec:limitations}

A key criticism of the new committers is that the job commit operation is not atomic;
it is an $O(files)$ operation which may fail partway through.
We respond that Hadoop's MapReduce v1 commit algorithm is itself non-atomic in job commit;
the Job Manager commit protocol detects failures in the job commits
of previous attempts, and either recovers or fails, according to the actions
offered by the committer.
2427 | A more subtle issue is that the volume of POST requests required by a sufficiently large job,
2428 | all issued against a specific shard of the S3 store, can trigger HTTP throttling.
2429 | This reduces the benefit of parallelized issuing of the POST requests.
2430 | 
2431 | A Hadoop task process may exit without \texttt{abortTask()} being invoked.
2432 | Specifically, it exits immediately during the ping/response
2433 | heartbeat process if any of the following conditions are met.
2434 | This is probably a bug in Hadoop ---and straightforward to correct.
2435 | 
2436 | \begin{enumerate}
2437 | \item Predefined task limits are exceeded
2438 | (currently an optional limit on the number of bytes written to the local filesystem).
2439 | \item Communications with the Job Manager have failed beyond configured limits.
2440 | \item The response to the \texttt{ping()} call is \texttt{false}, indicating the current
2441 | Job Manager does not consider the task to be part of its set of active tasks.
2442 | \end{enumerate}
2443 | 
2444 | The first check is a defense against an errant process filling the local
2445 | filesystem with data;
2446 | the latter two are symptoms of ---and reactions to--- different failures: loss of the manager or of the network,
2447 | and a restarted manager with no knowledge of the active task, respectively.
2448 | There are also the without-warning failures triggered by the operating system
2449 | when limits on the execution environment are exceeded: usually memory allocation.
2450 | 
2451 | While OS-level failures can occur without warning, it would be useful if the
2452 | ``managed'' system exits triggered in the heartbeat thread were to invoke
2453 | an emergency task cleanup operation.
2454 | For the S3A committers, this would consist of aborting all pending uploads, and
2455 | deleting any local data.
2456 | While the Job committer's \texttt{cleanupJob()} operation is expected to clean up
2457 | the output of all task attempts, active participation of the tasks would
2458 | reduce the time incomplete uploads were pending (reducing costs) and
2459 | potentially free up local disk storage.
2460 | 
2461 | This appears to us to be an enhancement to the commit protocol which could
2462 | be considered.
2463 | 
2464 | 
2465 | One problem which may manifest itself in cloud-based deployments
2466 | is that the Hadoop commit protocol assumes that time increases monotonically
2467 | on individual machines in the cluster.
2468 | The job manager and workers use the interval between the last successful heartbeat
2469 | and the current time as the means by which they consider themselves to have lost
2470 | contact with each other and with system services.
2471 | In cloud environments clocks may stutter, proceed at significantly different rates,
2472 | and indeed may even proceed backwards, especially if the VMs are moved between
2473 | physical cluster nodes.
2474 | We hope that Amazon's newly introduced \emph{Time Sync Service}
2475 | can address this on well-configured systems\ \cite{AWS-clock-service}.
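To make the proposed emergency cleanup concrete, the following is a sketch (a hypothetical helper, not the shipped code) which aborts every multipart upload pending under a destination directory, using the AWS SDK for Java v1. Note the trailing \texttt{/} forced onto the listing prefix: omitting it reintroduces exactly the \texttt{/output/dataset1} versus \texttt{/output/dataset10} bug described earlier.

\begin{lstlisting}[language=Java]
// Sketch: enumerate and abort all multipart uploads pending under a
// directory, paging through the listing as S3 requires.
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import com.amazonaws.services.s3.model.ListMultipartUploadsRequest;
import com.amazonaws.services.s3.model.MultipartUpload;
import com.amazonaws.services.s3.model.MultipartUploadListing;

class PendingUploadCleanup {
  static void abortAllUnder(AmazonS3 s3, String bucket, String directory) {
    // Force a directory-style prefix so that "dataset1" does not also
    // match "dataset10", "dataset11", ...
    String prefix = directory.endsWith("/") ? directory : directory + "/";
    ListMultipartUploadsRequest request =
        new ListMultipartUploadsRequest(bucket).withPrefix(prefix);
    MultipartUploadListing listing;
    do {
      listing = s3.listMultipartUploads(request);
      for (MultipartUpload upload : listing.getMultipartUploads()) {
        s3.abortMultipartUpload(new AbortMultipartUploadRequest(
            bucket, upload.getKey(), upload.getUploadId()));
      }
      // continue from the markers returned with this page of results
      request.setKeyMarker(listing.getNextKeyMarker());
      request.setUploadIdMarker(listing.getNextUploadIdMarker());
    } while (listing.isTruncated());
  }
}
\end{lstlisting}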
2476 | 
2477 | 
2478 | % ========================================================================
2479 | 
2480 | \section{Improvements to the Commit Protocols}
2481 | \label{sec:improvements-to-commit-protocols}
2482 | 
2483 | 
2484 | This work has highlighted some of the existing limitations of the commit protocols,
2485 | specifically:
2486 | 
2487 | \begin{itemize}
2488 | \item The lack of information returned by task attempts as to what output they have committed
2489 | prevents that output from being validated in the job commit.
2490 | \item The lack of information declared by the committers as to whether they can
2491 | recover from a failure during the task commit.
2492 | \item A general expectation in the execution engines that job and task abort do
2493 | not fail, or at least succeed in a bounded time, and log rather than propagate
2494 | failures.
2495 | \end{itemize}
2496 | 
2497 | These can be addressed;
2498 | the Spark commit protocol is already partway to doing so, as task attempts
2499 | can propagate information to the job commit phase.
2500 | 
2501 | We recommend extending the underlying output committers to provide the
2502 | information needed to enhance the protocols' robustness.
2503 | An extended \texttt{commitTask()} operation should return the (opaque) information
2504 | needed for the \texttt{commitJob()} method to validate the correctness and
2505 | completeness of the operation;
2506 | the job committer can then validate the content of the final output directory.
2507 | 
2508 | Were the committers also to declare their ability to recover from a failed task commit,
2509 | as they do for job recovery, then query engines would be able to choose the safe
2510 | actions following the failure/timeout of a task attempt commit.
2511 | 
2512 | We should also have the object store connectors declare their consistency policy,
2513 | so that the file committers can fail fast when executed against an inconsistent store.
2514 | This can be implemented at the filesystem and committer layers.
2515 | 
2516 | As for the task abort issues:
2517 | these can be addressed with a straightforward hardening of the abort operations,
2518 | \emph{and} of their use.
2519 | 
2520 | Finally, the committers and the underlying storage infrastructures are instrumented;
2521 | they can and do collect statistics about their operations, information
2522 | which can be useful in identifying performance and scale problems.
2523 | Again, this could be propagated back from the committers to the
2524 | query engine.
2525 | Our new committers do collect this information, and aggregate it in the
2526 | job commit process, but only to publish it in the \texttt{_SUCCESS} file;
2527 | it is not integrated with the applications themselves.
2528 | 
2529 | All these problems are tractable, and addressing them will improve confidence in the ability
2530 | of the query engines to safely interact with alternate data stores and with
2531 | commit algorithms written to work with them.
2532 | 
2533 | 
2534 | 
2535 | \section{Related Work}
2536 | \label{sec:related-work}
2537 | 
2538 | \subsection{Spark's Direct Output Committer}
2539 | \label{subsec:direct-output}
2540 | 
2541 | Apache Spark (briefly) offered a zero-rename committer,
2542 | the \emph{Direct Output Committer}\ \cite{SPARK-6352}.
2543 | With this committer, output was written directly to the destination directory;
2544 | both task and job commit operations were reduced to no-ops.
2545 | To avoid concurrency issues, speculative execution of tasks was automatically
2546 | disabled when this committer was used.
2547 | Unfortunately, the committer was still not resilient to failure: a failed
2548 | task could not be repeated, as its output was unknown.
2549 | For this reason it was removed\ \cite{SPARK-10063}.
2550 | 
2551 | Its absence is now noted by users, showing how much a zero-rename committer
2552 | was valued, even one which failed to offer the complete semantics
2553 | of a commit protocol.
2554 | Alternatively: performance is observable, whereas consistency and failures
2555 | are not considered important until they surface in production systems.
2556 | 
2557 | \subsection{IBM's Stocator}
2558 | \label{subsec:stocator}
2559 | 
2560 | 
2561 | IBM's Stocator eliminates renames by also writing directly to the
2562 | destination\ \cite{Stocator}.
2563 | As with the \emph{Magic Committer}, it modifies the semantics of write
2564 | operations into the temporary work directories, here the standard
2565 | \texttt{\_temporary} directory used by the classic \texttt{FileOutputCommitter}.
2566 | To avoid the failure semantics of Spark's \texttt{Direct Output Committer},
2567 | every remapped file is given a name which inserts the job and task attempt IDs,
2568 | while still preserving the sort order.
2569 | Failed and aborted tasks and jobs can then be cleaned up by their successors.
2570 | Stocator also generates a JSON-formatted \SUCCESS file, which offers
2571 | the ability to obtain a consistent list of the final files committed by a job,
2572 | even in the presence of listing inconsistency.
2573 | 
2574 | With this design, Stocator makes the output of work immediately visible;
2575 | there is no task commit, and the job commit is a matter of writing
2576 | the \SUCCESS file.
2577 | 
2578 | The actual implementation is achieved by misleading the classic committer,
2579 | changing the semantics of file creation under the task attempt directories
2580 | under the \texttt{\_temporary} path.
2581 | The committer believes that its tasks are writing files to a temporary destination
2582 | and renaming them, when in fact they are being written directly to the final destination directory,
2583 | with a task-attempt-specific filename.
2584 | 
2585 | The filesystem \texttt{rename()} operations of the committer are then implicitly
2586 | omitted: there is no work to rename.
2587 | 
2588 | 
2589 | Stocator's task commit operation becomes a no-op, thus trivially repeatable.
2590 | Job commit is a listing of the output and generation of the manifest;
2591 | as the manifest PUT is atomic, the job commit itself is atomic.
2592 | 
2593 | 
2594 | What is critical for Stocator is that the output of all failed tasks
2595 | is cleaned up, \TODO \emph{WHERE?}.
2596 | This cannot be guaranteed in the failure case where a partitioned task attempt
2597 | continues to execute and write new files.
2598 | When that task attempt attempts to commit, it will fail to be granted permission,
2599 | and presumably clean up.
2600 | \TODO: verify commit $\rightarrow$ fail triggers cleanup
2601 | Before that commit and cleanup phase, the destination directory will contain
2602 | data from the ongoing, uncommitted task.
2603 | 
2604 | Compared to the other designs, this is unique in that it retrofits an
2605 | object-store-optimized committer under the MapReduce V1 and V2 commit algorithms.
2606 | Thus existing applications can switch to the new committer without needing
2607 | explicit changes.
2608 | This makes it significantly easier to adopt.
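The S3A committers, by contrast, are enabled through configuration: a job binds the committer factory for the \texttt{s3a} scheme and names the committer to use. The keys below follow the Hadoop 3.1 S3A committer documentation; treat the exact names and values as illustrative and check them against the release in use.

\begin{lstlisting}[language=Java]
// Sketch of opting a job in to the S3A "magic" committer via Hadoop
// configuration; key names follow the Hadoop 3.1 documentation.
import org.apache.hadoop.conf.Configuration;

class CommitterSelection {
  static Configuration withMagicCommitter(Configuration conf) {
    // Route committer creation for s3a:// destinations through the
    // S3A committer factory instead of the classic FileOutputCommitter.
    conf.set("mapreduce.outputcommitter.factory.scheme.s3a",
        "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory");
    // Select which S3A committer to use: "magic", "directory" (staging)
    // or "partitioned".
    conf.set("fs.s3a.committer.name", "magic");
    // The magic committer also needs the "magic paths" support enabled
    // on the S3A filesystem client.
    conf.setBoolean("fs.s3a.committer.magic.enabled", true);
    return conf;
  }
}
\end{lstlisting}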
2609 | 
2610 | 
2611 | The closer of the two S3A committers is the Magic Committer.
2612 | It too modifies the object store connector to write the output to a
2613 | different destination than the path requested
2614 | in the user's code \texttt{createFile(path)} call.
2615 | 
2616 | The Magic Committer does not attempt to work underneath the existing committer;
2617 | instead we provide our own store-aware committer
2618 | which ensures that output is not actually manifested until
2619 | the final job is committed.
2620 | Thus it provides the standard semantics of task and job commit: no data is
2621 | visible until the job is committed, and partitioned task attempts will
2622 | never make changes to the visible file set.
2623 | 
2624 | \subsection{Amazon's EMRFS S3-optimized Committer}
2625 | \label{subsec:emrfs-committer}
2626 | 
2627 | In November 2018, Amazon announced they had implemented their own S3-specific
2628 | committer for Apache Spark\ \cite{AWS-EMR-committer}, with an article
2629 | in March 2019 providing some details on the algorithm\ \cite{AWS-EMR-committer-blog}.
2630 | 
2631 | \begin{quote}
2632 | \emph{
2633 | The EMRFS S3-optimized committer is used for Spark jobs that use Spark SQL,
2634 | DataFrames, or Datasets to write Parquet files.}
2635 | \end{quote}
2636 | 
2637 | The documentation contains three assertions:
2638 | 
2639 | \begin{itemize}
2640 | \item ``Improves application performance by avoiding list and rename operations''
2641 | \item ``Lets you safely enable the speculative execution of idempotent tasks in Spark jobs to help reduce the performance impact of task stragglers.''
2642 | \item ``Avoids issues that can occur with Amazon S3 eventual consistency during job and task commit phases, and helps improve job correctness under task failure conditions.''
2643 | \end{itemize}
2644 | 
2645 | 
2646 | Without the source being available to examine, we can only infer aspects
2647 | of its behaviour from the documentation and blog post.
2648 | 
2649 | The 2019 article is the most informative, supplemented by some hints in the tuning documentation\ \cite{AWS-EMR-committer-tuning}.
2650 | 
2651 | \begin{itemize}
2652 | \item Data is uploaded as multipart uploads, which are not made visible at the time of writing.
2653 | \item The list of pending uploads is built up by the task committers, hence consumes memory in the worker process \cite{AWS-EMR-committer-tuning}.
2654 | \item The manifestation of the files, by completing the multipart upload, is performed in the individual task commit operations.
2655 | \item The job commit operation simply becomes one of writing the \texttt{_SUCCESS} file, and, hopefully, listing and aborting any incomplete
2656 | multipart uploads in progress under the destination path.
2657 | 
2658 | \end{itemize}
2659 | 
2660 | Because no data is made visible until the task commit is executed, the output
2661 | of any in-progress tasks is not visible: speculative execution is safe,
2662 | as are reattempts of any task which failed up to the point of the task commit.
2663 | 
2664 | However, because the uploads are completed in the task commit operation,
2665 | the output of each committed task is visible.
2666 | Furthermore, because the commit operation is not atomic, the operation may
2667 | fail partway through, which will trigger a new task attempt, which
2668 | will then commit its work into a destination of unknown state.
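The mechanism underneath all of these S3-specific committers is the same, and is worth showing in isolation. The sketch below uses the AWS SDK for Java v1 directly (the committers themselves go through the S3A or EMRFS connectors): the data is durably uploaded by \texttt{uploadPart()}, yet nothing is visible at the destination key until the small, final \texttt{completeMultipartUpload()} call, which can be issued later, and from a different process.

\begin{lstlisting}[language=Java]
// Sketch of the multipart upload lifecycle: upload now, manifest later.
import java.io.File;
import java.util.Collections;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.amazonaws.services.s3.model.UploadPartResult;

class DeferredManifestation {
  static void upload(AmazonS3 s3, String bucket, String key, File data) {
    String uploadId = s3.initiateMultipartUpload(
        new InitiateMultipartUploadRequest(bucket, key)).getUploadId();

    // The bytes are now persisted in the store, but a GET or LIST of the
    // destination key will not show them.
    UploadPartResult part = s3.uploadPart(new UploadPartRequest()
        .withBucketName(bucket).withKey(key).withUploadId(uploadId)
        .withPartNumber(1).withFile(data).withPartSize(data.length()));

    // Only this call manifests the object at its final path; deferring it
    // is what separates "upload" from "commit".
    s3.completeMultipartUpload(new CompleteMultipartUploadRequest(
        bucket, key, uploadId, Collections.singletonList(part.getPartETag())));
  }
}
\end{lstlisting}

Where that final call is issued (task commit for the EMR committer, job commit for the S3A committers) is what determines the failure semantics discussed here.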
2669 | 
2670 | It is disappointing that this approach has been chosen, given that passing a
2671 | list of files to commit to the application master is straightforward in the
2672 | Spark protocol, and also that the Netflix and Apache prior art showed what
2673 | could be done.
2674 | 
2675 | As discussed previously, extending the committer API/protocol to allow tasks
2676 | to declare when a task commit is unrecoverable would at least let Spark
2677 | know when a task commit failure must trigger a job failure, rather than
2678 | the rescheduling of another task attempt.
2679 | 
2680 | 
2681 | \begin{table}
2682 | \begin{tabular}{ l c c c c }
2683 | \hline
2684 |  & \textbf{Direct} & \textbf{Stocator} & \textbf{S3A} & \textbf{EMR}\\
2685 | Speculative Tasks & False & True & True & True \\
2686 | Recoverable Job & False & False & False & True \\
2687 | Abortable Task & False & True & True & True \\
2688 | Abortable Job & True & True & True & True \\
2689 | Uncommitted task output observable & True & True & False & False \\
2690 | Committed task output observable & True & True & False & True\\
2691 | Atomic Task Commit & True & True & True & False \\
2692 | Atomic Job Commit & True & True & False & True \\
2693 | Partitioned Executor resilience & False & False & True & False\\
2694 | \hline
2695 | \end{tabular}
2696 | \caption{Attributes of the different committer algorithms}
2697 | \label{tab:other-committer-attributes}
2698 | \end{table}
2699 | 
2700 | 
2701 | The classic file output committers postpone the manifestation of their output until task (v2) or job (v1)
2702 | commit, and use rename as the low-cost operation to promote the files.
2703 | 
2704 | All these object-store-optimized committers focus on eliminating renames,
2705 | and are left with the challenge of finding alternative algorithms to
2706 | allow distributed queries to complete successfully in the presence
2707 | of failures of individual worker processes.
2708 | 
2709 | 
2710 | The Direct Committer fails at the foundational requirement: the ability to support
2711 | speculative or restarted task attempts.
2712 | This is why it was removed from the ASF codebase.
2713 | 
2714 | Stocator also writes to the destination directory, but by embedding the task attempt IDs in the output
2715 | filenames it retains the ability to clean up the output of uncommitted tasks.
2716 | It does, however, fail to meet our requirement of ``Continuity of correctness'':
2717 | a task which is still in progress after the job commit may generate output
2718 | into the destination directory.
2719 | 
2720 | 
2721 | Neither committer performs any operation in task or job commit other than creating
2722 | a \SUCCESS marker, an operation which is both atomic and repeatable; their job commit
2723 | operations are therefore both fast and safe.
2724 | 
2725 | The EMR Committer has adopted the same multipart upload mechanism to postpone
2726 | manifesting files written to their ultimate paths, but has chosen to materialize
2727 | those files in the task commit rather than postponing it until the job is committed.
2728 | This will deliver performance benefits in larger applications ---at the cost
2729 | of resilience to failures during that task commit operation.
2730 | Without modifying Spark to fail the entire job in such an event, it places
2731 | users at risk of corrupted output.
2732 | 
2733 | 
2734 | One recurrent theme here is that the output of a job is defined as
2735 | ``the contents of the job output directory'', thus all committers are
2736 | forced to output data ``somewhere'' and manifest it in the commit process.
2737 | It is also the reason that eventually consistent metadata operations are
2738 | dangerous: even when the directory tree is valid, a listing may be incorrect.
2739 | 
2740 | If applications moved towards declaring the output of a job in
2741 | a manifest file, rather than implicitly defining it as ``all files in the directory
2742 | tree which do not begin with '.' or '\_''', then the writing/renaming
2743 | of this manifest would be all that is needed to commit a job.
2744 | 
2745 | The S3A committers and Stocator already generate manifest data in the
2746 | standard \SUCCESS file.
2747 | For our committers, this was done initially for testing;
2748 | later it included the filesystem statistics of the process, thereby helping to
2749 | collect data on IO costs.
2750 | However, it is present, and it could perhaps be used as an alternative to
2751 | a directory listing.
2752 | 
2753 | Provided all relevant applications agree to use a single, shared manifest
2754 | format, it may be possible to move to a simpler structure of
2755 | output being written straight to the destination, and the atomic PUT of the
2756 | manifest defining the output.
2757 | 
2758 | This is essentially one aspect of the incubating Apache Iceberg project,
2759 | which uses manifest files to describe a data source, amongst its other
2760 | features designed to support efficient operation in S3 and elsewhere\ \cite{iceberg-asf, iceberg}.
2761 | 
2762 | \section{Conclusions and Further Work}
2763 | \label{sec:conclusions}
2764 | 
2765 | Object Stores are becoming a common source and destination of data analyzed
2766 | through Apache Hadoop and Spark.
2767 | The client connectors make the stores resemble filesystems in
2768 | terms of the API exposed to applications, so enabling existing code to
2769 | interact with the stores without modification.
2770 | However, the core semantics required by conventional commit algorithms, particularly
2771 | that of an $O(1)$ atomic rename, are not always met.
2772 | While the existing Hadoop/Spark commit algorithms appear to work, they lack
2773 | both the performance and correctness delivered when used with a ``real'' filesystem.
2774 | 
2775 | We have demonstrated that the use of object-store-specific operations ---here
2776 | the multipart PUT with its ability to complete the upload from a different host---
2777 | allows object-store-aware commit algorithms to be implemented,
2778 | algorithms which do meet these requirements.
2779 | 
2780 | The new committers are implemented in Apache Hadoop 3.1, with a small bridging
2781 | library to aid integration with Apache Spark\ \cite{HADOOP-13786}.
2782 | 
2783 | 
2784 | These committers have shown that the metaphor presented to applications,
2785 | \emph{Object Stores are File Systems}, cannot be sustained.
2786 | As a means of allowing existing applications to use stores as a source
2787 | of data, the mimicking of directories and files works, albeit sometimes
2788 | inefficiently\ \cite{HADOOP-13208}.
2789 | What does not work is code which expects the strict semantics
2790 | offered by HDFS and other filesystems ---atomic creation and rename operations.
2791 | The commit algorithm is one key example of such a failure point, as
2792 | is any other algorithm attempting to use a shared filesystem
2793 | as a coordination mechanism between processes.
2794 | 
2795 | The Hadoop project has long discussed the merits of explicitly
2796 | exposing an API for object stores, offering only the limited
2797 | set of verbs such stores present\ \cite{HADOOP-9565}.
2798 | However, we have been unable to progress this because of the nuanced differences
2799 | between the different stores\ \cite{S3, WASB, ADL, GCS}.
2800 | It is these nuances which prove critical in safely implementing
2801 | commit protocols and suchlike: any API which offered only a lowest common denominator
2802 | would likely prove inadequate.
2803 | 
2804 | The integration with the Hadoop and Spark commit protocols is intended
2805 | to support different committers for different destination filesystems.
2806 | We hope to see committers supporting other object stores, each
2807 | able to use store-specific operations.
2808 | What can be offered is common code for much of each implementation,
2809 | knowledge of the new algorithms needed, and
2810 | the suites of tests used to validate their functionality.
2811 | 
2812 | One recurrent issue which this work has shown is that using the
2813 | filesystem or object store to communicate state from task attempts
2814 | to the job committer, and from the job committer to successor
2815 | applications, is brittle.
2816 | 
2817 | There is no reason why the job committer cannot be passed the list of
2818 | successful task attempts by the job manager, as well as, ideally,
2819 | the list of failed attempts.
2820 | This can be used for the creation of a manifest, and for aiding cleanup
2821 | of failed task attempts.
2822 | The Spark commit protocol does permit committed task attempts to pass data
2823 | to the Spark committer;
2824 | use of this should be explored.
2825 | 
2826 | 
2827 | Finally, we note that the Hadoop commit protocols are woefully under-documented;
2828 | understanding them involved stepping through tests with a debugger and
2829 | some deliberate fault injection to see what happened.
2830 | Given how critical the correctness of the protocol and committer implementations
2831 | is, and how other projects also depend on the same code, there
2832 | are opportunities to better specify the protocol and APIs, and to review
2833 | their use.
2834 | We hope this document is a start, while warning readers that it is non-normative.
2835 | 
2836 | % ========================================================================
2837 | 
2838 | \section*{Acknowledgements}
2839 | \label{sec:acknowledgements}
2840 | 
2841 | We are grateful for the contributions of all reviewers and testers, especially
2842 | Aaron Fabbri and Ewan Higgs.
2843 | We must also highlight the contributions of our QE teams: it is through
2844 | their efforts that this work is ready for others to use.
2845 | 
2846 | % ========================================================================
2847 | 
2848 | \section{References}
2849 | \label{sec:references}
2850 | 
2851 | % Bibliography. Include
2852 | 
2853 | \bibliographystyle{IEEEtran}
2854 | \bibliography{bibliography}
2855 | 
2856 | 
2857 | \end{document}
2858 | 
--------------------------------------------------------------------------------
/tex/bibliography.bib:
--------------------------------------------------------------------------------
1 | %% This BibTeX bibliography file was created using BibDesk.
2 | %% http://bibdesk.sourceforge.net/ 3 | 4 | %% Created for Loughran, Steve at 2017-12-06 20:15:33 +0000 5 | 6 | 7 | %% Saved with string encoding Unicode (UTF-8) 8 | 9 | 10 | 11 | @url{AWS-S3-intro, 12 | Author = {Amazon}, 13 | Title = {{Introduction to Amazon S3}}, 14 | Url = {http://docs.aws.amazon.com/AmazonS3/latest/dev/Introduction.html} } 15 | 16 | 17 | @url{AWS-S3-throttling, 18 | Author = {Amazon}, 19 | Date-Added = {2017-12-06 20:12:58 +0000}, 20 | Date-Modified = {2017-12-06 20:14:27 +0000}, 21 | Title = {{Request Rate and Performance Considerations}}, 22 | Url = {http://docs.aws.amazon.com/AmazonS3/latest/dev/request-rate-perf-considerations.html} 23 | } 24 | 25 | @url{AWS-clock-service, 26 | Author = {Hunt, Randall}, 27 | Date-Added = {2017-12-06 20:12:58 +0000}, 28 | Date-Modified = {2017-12-06 20:14:27 +0000}, 29 | Title = {{Keeping Time With Amazon Time Sync Service}}, 30 | Url = {https://aws.amazon.com/blogs/aws/keeping-time-with-amazon-time-sync-service/}, 31 | Urldate = {2017-11-27}, 32 | Year = {2017} 33 | } 34 | 35 | @url{AWS-EMR-committer, 36 | Author = {Amazon}, 37 | Title = {{ Using the EMRFS S3-optimized Committer }}, 38 | Url = {https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-s3-optimized-committer.html}, 39 | Urldate = {2018-11}, 40 | Year = {2018} 41 | } 42 | 43 | @url{AWS-EMR-committer-tuning, 44 | Author = {Amazon}, 45 | Title = {{ EMRFS S3-optimized Committer: Job Tuning Considerations }}, 46 | Url = {https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-committer-tuning.html}, 47 | Urldate = {2018-11}, 48 | Year = {2018} 49 | } 50 | 51 | @url{AWS-EMR-committer-blog, 52 | Author = {Kelly, Jonathan and Slawski, Peter }, 53 | Title = {{ Improve Apache Spark write performance on Apache Parquet formats with the EMRFS S3-optimized committer }}, 54 | Url = {https://aws.amazon.com/blogs/big-data/improve-apache-spark-write-performance-on-apache-parquet-formats-with-the-emrfs-s3-optimized-committer/}, 55 | Urldate = {2019-03}, 56 | Year = {2019} 57 | } 58 | 59 | 60 | @url{S3mper, 61 | Author = {Weeks, Daniel C.}, 62 | Title = {{ S3mper: Consistency in the Cloud}}, 63 | Url = {https://medium.com/netflix-techblog/s3mper-consistency-in-the-cloud-b6a1076aa4f8}, 64 | Year = {2014}, 65 | } 66 | 67 | @inproceedings{Calder11, 68 | author = {Calder, Brad and Wang, Ju and Ogus, Aaron and Nilakantan, Niranjan and Skjolsvold, Arild and McKelvie, Sam and Xu, Yikang and Srivastav, Shashwat and Wu, Jiesheng and Simitci, Huseyin and Haridas, Jaidev and Uddaraju, Chakravarthy and Khatri, Hemal and Edwards, Andrew and Bedekar, Vaman and Mainali, Shane and Abbasi, Rafay and Agarwal, Arpit and Haq, Mian Fahim ul and Haq, Muhammad Ikram ul and Bhardwaj, Deepali and Dayanand, Sowmya and Adusumilli, Anitha and McNett, Marvin and Sankaran, Sriram and Manivannan, Kavitha and Rigas, Leonidas}, 69 | title = {Windows Azure Storage: A Highly Available Cloud Storage Service with Strong Consistency}, 70 | booktitle = {Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles}, 71 | series = {SOSP '11}, 72 | year = {2011}, 73 | isbn = {978-1-4503-0977-6}, 74 | location = {Cascais, Portugal}, 75 | pages = {143--157}, 76 | numpages = {15}, 77 | url = {http://doi.acm.org/10.1145/2043556.2043571}, 78 | doi = {10.1145/2043556.2043571}, 79 | acmid = {2043571}, 80 | publisher = {ACM}, 81 | address = {New York, NY, USA}, 82 | keywords = {Windows Azure, cloud storage, distributed storage systems}, 83 | } 84 | 85 | @url{HADOOP-S3A-Committers, 86 | Author = {Apache 
Software Foundation}, 87 | Title = {{ ``Committing work to S3 with the “S3A Committers'' }}, 88 | Url = {https://hadoop.apache.org/docs/r3.1.1/hadoop-aws/tools/hadoop-aws/committers.html}, 89 | Year = {2018} 90 | } 91 | 92 | @url{HADOOP-9565, 93 | Author = {Loughran, Steve}, 94 | Title = {{ HADOOP-9565. Add a Blobstore interface to add to blobstore FileSystems }}, 95 | Url = {https://issues.apache.org/jira/browse/HADOOP-9565}, 96 | Year = {2013} 97 | } 98 | 99 | @url{HADOOP-10400, 100 | Author = {Mendelson Jordan}, 101 | Title = {{ HADOOP-10400. Incorporate new S3A FileSystem implementation }}, 102 | Url = {https://issues.apache.org/jira/browse/HADOOP-10400}, 103 | Year = {2014} 104 | } 105 | 106 | @url{HADOOP-13208, 107 | Author = {Loughran, Steve}, 108 | Date-Added = {2017-12-06 20:12:58 +0000}, 109 | Date-Modified = {2017-12-06 20:14:27 +0000}, 110 | Title = {{HADOOP-13208 listFiles(recursive=true) to do a bulk listObjects}}, 111 | Url = {https://issues.apache.org/jira/browse/HADOOP-13208}, 112 | Urldate = {2016-05-17}, 113 | Year = {2016} } 114 | 115 | 116 | @url{HADOOP-13345, 117 | Author = {Nauroth, Chris and Liu, Minglaing and Fabbri, Aaron and Mackrory, Sean and Loughran, Steve}, 118 | Date-Added = {2017-12-06 20:12:58 +0000}, 119 | Date-Modified = {2017-12-06 20:14:27 +0000}, 120 | Title = {{HADOOP-13345. S3Guard: Improved Consistency for S3A }}, 121 | Url = {https://issues.apache.org/jira/browse/HADOOP-13345}, 122 | Urldate = {2016-07-06}, 123 | Year = {2016} } 124 | 125 | @url{HADOOP-13786, 126 | Author = {Loughran, Steve and Ryan Blue}, 127 | Date-Added = {2017-12-06 20:12:58 +0000}, 128 | Date-Modified = {2017-12-06 20:14:27 +0000}, 129 | Title = {{HADOOP-13786. Add S3A committer for zero-rename commits to S3 endpoints}}, 130 | Url = {https://issues.apache.org/jira/browse/HADOOP-13786}, 131 | Urldate = {2016-11-02}, 132 | Year = {2016} } 133 | 134 | @url{HADOOP-14161, 135 | Author = {Miner, Luke}, 136 | Title = {{ HADOOP-14161. Failed to rename file in S3A during FileOutputFormat commitTask }}, 137 | Url = {https://issues.apache.org/jira/browse/HADOOP-14161}, 138 | Year = {2017} } 139 | 140 | @url{iceberg-asf, 141 | Author = {Apache Software Foundation}, 142 | Title = {{ Apache Iceberg (incubating}}, 143 | Url = {https://iceberg.apache.org/}, 144 | Year = {2018} } 145 | 146 | @url{iceberg, 147 | Author = {Ryan Blue}, 148 | Title = {{ Iceberg: a fast table format for S3 }}, 149 | Url = {https://www.slideshare.net/Hadoop_Summit/iceberg-a-fast-table-format-for-s3-103201179}, 150 | Year = {2018} } 151 | 152 | @url{MAPREDUCE-4815, 153 | Author = {Li, Siqi}, 154 | Title = {{MAPREDUCE-4815. Speed up FileOutputCommitter.commitJob for many output files}}, 155 | Url = {https://issues.apache.org/jira/browse/MAPREDUCE-4815}, 156 | Urldate = {2015-02-15}, 157 | Year = {2015} } 158 | 159 | @url{MAPREDUCE-6823, 160 | Author = {Loughran, Steve}, 161 | Date-Added = {2017-12-06 20:04:47 +0000}, 162 | Date-Modified = {2017-12-06 20:11:52 +0000}, 163 | Title = {{MAPREDUCE-6823. 
FileOutputFormat to support configurable PathOutputCommitter factory}}, 164 | Url = {https://issues.apache.org/jira/browse/MAPREDUCE-6823}, 165 | Urldate = {2016-12-14}, 166 | Year = {2016} } 167 | 168 | @url{SPARK-6352, 169 | Author = {Lee, Pei-Lun}, 170 | Date-Added = {2017-12-06 20:08:54 +0000}, 171 | Date-Modified = {2017-12-06 20:11:35 +0000}, 172 | Title = {{SPARK-6352 Add DirectParquetOutputCommitter}}, 173 | Url = {https://issues.apache.org/jira/browse/SPARK-6352}, 174 | Urldate = {2015-04-28}, 175 | Year = {2015} } 176 | 177 | @url{SPARK-8029, 178 | Author = {Liu, Davies}, 179 | Date-Added = {2017-12-06 20:08:54 +0000}, 180 | Date-Modified = {2017-12-06 20:11:35 +0000}, 181 | Title = {{SPARK-8029. ShuffleMapTasks must be robust to concurrent attempts on the same executor}}, 182 | Url = {https://issues.apache.org/jira/browse/SPARK-8029}, 183 | Urldate = {2015-07-02}, 184 | Year = {2015} } 185 | 186 | @url{SPARK-8413, 187 | Author = {Kim, Mingyu}, 188 | Date-Added = {2017-12-06 20:08:54 +0000}, 189 | Date-Modified = {2017-12-06 20:11:35 +0000}, 190 | Title = {{ SPARK-8413. DirectParquetOutputCommitter doesn't clean up the file on task failure}}, 191 | Url = {https://issues.apache.org/jira/browse/SPARK-8413}, 192 | Urldate = {2015-07-15}, 193 | Year = {2015} } 194 | 195 | @url{SPARK-10063, 196 | Author = {Xin, Reynold}, 197 | Date-Added = {2017-12-06 20:04:47 +0000}, 198 | Date-Modified = {2017-12-06 20:11:52 +0000}, 199 | Title = {{SPARK-10063. Remove DirectParquetOutputCommitter }}, 200 | Url = {https://issues.apache.org/jira/browse/SPARK-10063}, 201 | Urldate = {2016-04-07}, 202 | Year = {2016} } 203 | 204 | @url{SPARK-18512, 205 | Author = {Bonaccorso, Giuseppe }, 206 | Title = {{SPARK-18512. FileNotFoundException on _temporary directory with Spark Streaming 2.0.1 and S3A}}, 207 | Url = {https://issues.apache.org/jira/browse/SPARK-18512}, 208 | Urldate = {2016-12-15}, 209 | Year = {2016} } 210 | 211 | @url{SPARK-21669, 212 | Author = {Adrian Ionescu}, 213 | Title = {{ SPARK-21669. Internal API for collecting metrics/stats during FileFormatWriter jobs}}, 214 | Url = {https://issues.apache.org/jira/browse/SPARK-21669}, 215 | Urldate = {2017-08-17}, 216 | Year = {2017} } 217 | 218 | @url{SPARK-21762, 219 | Author = {Loughran, Steve}, 220 | Title = {{ SPARK-21762. FileFormatWriter/BasicWriteTaskStatsTracker metrics collection fails if a new file isn't yet visible}}, 221 | Url = {https://issues.apache.org/jira/browse/SPARK-21762}, 222 | Urldate = {2017-08-17}, 223 | Year = {2017} } 224 | 225 | @url{SPARK-22217, 226 | Author = {Loughran, Steve}, 227 | Title = {{ SPARK-22217. ParquetFileFormat to support arbitrary OutputCommitters }}, 228 | Url = {https://issues.apache.org/jira/browse/SPARK-22217}, 229 | Urldate = {2017-10-13}, 230 | Year = {2017} } 231 | 232 | @url{s3-tla, 233 | Author = {Loughran, Steve}, 234 | Title = {{ TLA+ Specification of a consistent S3 object store }}, 235 | Url = {https://github.com/steveloughran/formality/releases/download/tag_blobstore_0.3/objectstore.pdf}, 236 | Year = {2017} } 237 | 238 | @article{Stocator, 239 | Archiveprefix = {arXiv}, 240 | Title = {{ Stocator: A High Performance Object Store Connector for Spark}}, 241 | Author = {Vernik, Gil and Factor, Michael and Kolodner, Elliot K. 
and Michiardi, Pietro and Ofer, Effi and Pace, Francesco}, 242 | Bibsource = {dblp computer science bibliography, http://dblp.org}, 243 | Biburl = {http://dblp.org/rec/bib/journals/corr/abs-1709-01812}, 244 | Date-Added = {2017-11-28 14:34:59 +0000}, 245 | Date-Modified = {2017-11-28 14:34:59 +0000}, 246 | Eprint = {1709.01812}, 247 | Journal = {CoRR}, 248 | Timestamp = {Thu, 05 Oct 2017 09:42:54 +0200}, 249 | Url = {http://arxiv.org/abs/1709.01812}, 250 | Volume = {abs/1709.01812}, 251 | Year = {2017}, 252 | Bdsk-Url-1 = {http://arxiv.org/abs/1709.01812} 253 | } 254 | 255 | @inproceedings{MapReduce, 256 | Acmid = {1251264}, 257 | Address = {Berkeley, CA, USA}, 258 | Author = {Dean, Jeffrey and Ghemawat, Sanjay}, 259 | Booktitle = {Proceedings of the 6th Conference on Symposium on Operating Systems Design and Implementation - Volume 6}, 260 | Date-Added = {2017-11-28 14:23:50 +0000}, 261 | Date-Modified = {2017-11-28 14:23:50 +0000}, 262 | Location = {San Francisco, CA}, 263 | Numpages = {1}, 264 | Pages = {10--10}, 265 | Publisher = {USENIX Association}, 266 | Series = {OSDI'04}, 267 | Title = {{MapReduce: Simplified Data Processing on Large Clusters}}, 268 | Url = {http://dl.acm.org/citation.cfm?id=1251254.1251264}, 269 | Year = {2004}, 270 | Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=1251254.1251264} 271 | } 272 | 273 | 274 | @book{2011_AOSA, 275 | Title = {{ The Architecture of Open Source Applications }}, 276 | Author = {{ Brown, Amy and Wilson, Greg }}, 277 | Isbn = {9781257638017}, 278 | Month = jun, 279 | Posted-At = {2011-07-14 10:06:08}, 280 | Priority = {5}, 281 | Publisher = {CreativeCommons}, 282 | Url = {http://aosabook.org/en/index.html}, 283 | Year = {2011}, 284 | Bdsk-Url-1 = {http://www.worldcat.org/isbn/9781257638017} 285 | } 286 | 287 | @inproceedings{Bermbach:2014:BEC:2624303.2624662, 288 | author = {Bermbach, David and Tai, Stefan}, 289 | title = {Benchmarking Eventual Consistency: Lessons Learned from Long-Term Experimental Studies}, 290 | booktitle = {Proceedings of the 2014 IEEE International Conference on Cloud Engineering}, 291 | series = {IC2E '14}, 292 | year = {2014}, 293 | isbn = {978-1-4799-3766-0}, 294 | pages = {47--56}, 295 | numpages = {10}, 296 | url = {https://doi.org/10.1109/IC2E.2014.37}, 297 | doi = {10.1109/IC2E.2014.37}, 298 | acmid = {2624662}, 299 | publisher = {IEEE Computer Society}, 300 | address = {Washington, DC, USA}, 301 | } 302 | 303 | @incollection{Chansler2011, 304 | author = {{Chansler, Robert and Kuang, Hairong and Radia, Sanjay and Shvachko, Konstantin and Srinivas, Suresh}}, 305 | title = {{The Hadoop Distributed File System}}, 306 | editor = {{ Brown, Amy and Wilson, Greg }}, 307 | booktitle = {{ The Architecture of Open Source Applications }}, 308 | Isbn = {9781257638017}, 309 | chapter = {8}, 310 | Publisher = {CreativeCommons}, 311 | Url = {http://aosabook.org/en/index.html}, 312 | Year = {2011}, 313 | Bdsk-Url-1 = {http://www.worldcat.org/isbn/9781257638017} 314 | } 315 | 316 | @inproceedings{Vavilapalli2013, 317 | author = {Vavilapalli, Vinod Kumar and Murthy, Arun C. 
and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and Saha, Bikas and Curino, Carlo and O'Malley, Owen and Radia, Sanjay and Reed, Benjamin and Baldeschwieler, Eric}, 318 | title = {{Apache Hadoop YARN: Yet Another Resource Negotiator}}, 319 | booktitle = {{Proceedings of the 4th Annual Symposium on Cloud Computing}}, 320 | series = {SOCC '13}, 321 | year = {2013}, 322 | isbn = {978-1-4503-2428-1}, 323 | location = {Santa Clara, California}, 324 | pages = {5:1--5:16}, 325 | articleno = {5}, 326 | numpages = {16}, 327 | url = {http://doi.acm.org/10.1145/2523616.2523633}, 328 | doi = {10.1145/2523616.2523633}, 329 | acmid = {2523633}, 330 | publisher = {ACM}, 331 | address = {New York, NY, USA}, 332 | } 333 | 334 | @article{Satya89, 335 | author = {M. Satyanarayanan}, 336 | title = {A Survey of Distributed File Systems}, 337 | institution = {Department of Computer Science Carnegie Mellon University}, 338 | year = {1989}, 339 | doi = {0.1146/annurev.cs.04.060190.000445}, 340 | journal = {Annual Review of Computer Science}, 341 | pages = {73-104}, 342 | notes = {Vol. 4:73-104 (Volume publication date June 1990)} 343 | } 344 | 345 | -------------------------------------------------------------------------------- /tex/commit-protocol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steveloughran/zero-rename-committer/fe31fbf8f474c8580a1471a08eec3946b642268c/tex/commit-protocol.png -------------------------------------------------------------------------------- /tex/commit-protocol.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | 'define icons 4 | control "Job Manager" 5 | participant "Job Committer" 6 | control Executor 7 | participant "Task Committer" 8 | participant Operation 9 | control "YARN NodeManager" 10 | control "YARN ResourceManager" 11 | 12 | ' protocol 13 | '== Initialization == 14 | 15 | "Job Manager" -> "Job Committer": setupJob() 16 | 17 | == For Each Task Attempt == 18 | "Job Manager" -> Executor: execute work 19 | Executor -> "Task Committer": setupTask() 20 | Executor -> "Task Committer": getWorkPath() 21 | Executor <-- "Task Committer": taskAttemptDirectory 22 | Executor --> Operation: map/reduce + taskAttemptDirectory 23 | 24 | loop Until execution is complete... 25 | Executor -> "Job Manager" : ping? 26 | "Job Manager" -> Executor : task is known 27 | alt task is unknown/timeout/limits-exceeded 28 | Executor -> Executor: exit! 29 | Executor -> "YARN NodeManager": (exited) 30 | "YARN NodeManager" -> "YARN ResourceManager": process exited 31 | end 32 | "Job Manager" -> "YARN ResourceManager": heartbeat request 33 | "Job Manager" <-- "YARN ResourceManager": heartbeat response [exited tasks]* 34 | end 35 | Executor <-- Operation: map/reduce finished 36 | 37 | 'TODO: MR failure 38 | 39 | Executor -> "Task Committer": needsTaskCommit()? 40 | Executor <-- "Task Committer": 41 | 42 | 43 | alt needs task commit 44 | Executor -> "Job Manager": can commit? 45 | Executor <-- "Job Manager": 46 | alt action = commit 47 | Executor -> "Job Manager": commitPending 48 | Executor -> "Task Committer": commitTask() 49 | Executor <-- "Task Committer": committed task 50 | else action = abort 51 | Executor -> "Task Committer": abortTask() 52 | Executor <-- "Task Committer": aborted task 53 | end 54 | end 55 | Executor -> "Job Manager" : done 56 | Executor -> Executor: exit! 
57 | Executor -> "YARN NodeManager": (exited)
58 | "YARN NodeManager" -> "YARN ResourceManager": process exited
59 | 
60 | alt done-not-received-in time interval
61 | "Job Manager" -> "Job Committer": cleanupTask()
62 | "Job Manager" -> "Job Manager": reschedule task
63 | end
64 | 
65 | 
66 | == Job Commit==
67 | 
68 | "Job Manager" -> "YARN ResourceManager": heartbeat request
69 | "Job Manager" <-- "YARN ResourceManager": heartbeat response [exited tasks]*
70 | 
71 | "Job Manager" -> "Job Committer": commitJob()
72 | "Job Manager" <-- "Job Committer": committed job
73 | "Job Manager" -> "Job Committer": cleanupJob()
74 | "Job Manager" -> "YARN ResourceManager": finished
75 | "Job Manager" -> "Job Manager": exit!
76 | "Job Manager" -> "YARN NodeManager": (exited)
77 | "YARN NodeManager" -> "YARN ResourceManager": process exited
78 | 
79 | 
80 | @enduml
81 | 
--------------------------------------------------------------------------------
/tex/improvements-to-the-commit-protocols.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | % ========================================================================
3 | \begin{document}
4 | 
5 | What can be done to improve commit operations.
6 | 
7 | \section{Hadoop}\label{sec:hadoop}
8 | 
9 | Add a check for the dest dir existing before job setup?
10 | Currently there is a window between job submit and OutputCommitter.setupJob()
11 | where no attempt to create the dest dir exists.
12 | 
13 | The \emph{first} attempt could reject if the dest dir is there;
14 | later ones should expect it.
15 | 
16 | \section{Spark}\label{sec:spark}
17 | 
18 | Handling of task attempt commit timeouts:
19 | 
20 | \begin{enumerate}
21 | \item Choose whether to commit another attempt on task commit timeout based on committer attributes.
22 | \item Short term: if FileOutputCommitter, check the algorithm and reject restart if v2.
23 | \item Long term: add a predicate to PathOutputCommitter.
24 | \end{enumerate}
25 | 
26 | Avoid duplicate job attempt commit after a partitioned Spark driver.
27 | 
28 | Q1: is this actually a problem on YARN?
29 | 
30 | How would a test simulate this?
31 | 
32 | Q2: what about on Mesos, or YARN with the driver running off-cluster?
33 | 
34 | Q3: How to address this in an infra-neutral way?
35 | Need to query the infrastructure for the last time a heartbeat took place and wait until
36 | it is within a time range before committing.
37 | 
38 | 
39 | \texttt{newTaskTempFileAbsPath()} is, well, trouble for the new committers.
40 | It could be supported in the staging committers again through an upload in task commit.
41 | 
42 | For the magic committer, we could add a new prefix under \texttt{__magic},
43 | \texttt{__root}, alongside \texttt{__base}.
44 | The magic path mapping logic would then be: ``when writing to a magic path
45 | \texttt{dest/__magic/jobAttemptId/taskAttemptID/__root/abspath}, the destination of the write
46 | actually becomes \texttt{abspath}''.
47 | The existing commit algorithm would automatically incorporate the .pending files
48 | created in the write, and so commit them along with the relative paths.
49 | There is one sole complication: with the writes scattered through the S3 bucket,
50 | a full cleanup would require listing and aborting all outstanding MPU
51 | writes throughout the bucket.
52 | This could not be done without breaking all other active jobs writing to
53 | the same bucket.
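A sketch of that proposed mapping follows; the \texttt{__root} marker does not exist in the current committer, and the helper is purely illustrative.

\begin{verbatim}
// Sketch only: map dest/__magic/jobAttemptId/taskAttemptId/__root/a/b/c
// to the absolute destination path /a/b/c; return null for anything else.
class MagicRootPathMapping {
  static String finalDestination(String magicPath) {
    int marker = magicPath.indexOf("/__root/");
    if (marker < 0 || !magicPath.contains("/__magic/")) {
      return null; // not an absolute-path magic write
    }
    return "/" + magicPath.substring(marker + "/__root/".length());
  }
}
\end{verbatim}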
54 | 
55 | 
56 | Committer resilience/reliability could be enhanced by tracking the list of
57 | files to commit: tasks could record the files they had provisionally committed
58 | (more specifically, the final destination files) and return that list to the job committer.
59 | The job committer could then verify that, after job commit, they were all there.
60 | This is a safety check which could be done at the commit layer; the Spark protocols
61 | have room for this.
62 | 
63 | 
64 | \section{Statistic collection}
65 | 
66 | A long-term fix would be to radically improve the means by which statistics are
67 | collected by tasks and then aggregated in the job committer.
68 | The latest versions of the HDFS and object store connectors are heavily instrumented,
69 | collecting lots of information on filesystem client use.
70 | For S3A this includes: the number of times an HTTP/1.1 connection was aborted on a file read,
71 | the number of times an S3 request was throttled and retried,
72 | and even some latency statistics.
73 | All of these would be useful if they could be collected by the job, aggregated
74 | appropriately, and included in the job execution history.
75 | It is possible to collect this information, and then pass it back.
76 | Indeed, our committers do collect this data themselves, saving it in the
77 | manifest files listing pending uploads, combining this into the \SUCCESS file
78 | in job commit.
79 | This data is not yet extracted by the job committer and returned to the execution
80 | environment.
81 | This can be fixed.
82 | 
83 | \section{Committer Capabilities}
84 | 
85 | Committers should be able to declare that they have certain attributes, such
86 | as repeatable task commit operations (currently only job commit is declared).
87 | 
88 | Rather than propose a continually growing list of predicates, we'd argue for
89 | implementing the \texttt{org.apache.hadoop.fs.StreamCapabilities} interface
90 | and its \texttt{boolean hasCapability(String capability)} predicate, with capabilities
91 | chosen so that the default return value, false, is the pessimistic outcome.
92 | By reusing the capabilities probe already built into Hadoop 2.9+, it's easier
93 | to adopt the checks.
94 | 
95 | 
96 | 
97 | \section{S3A Committers}
98 | 
99 | 
100 | \section{Validation}
101 | 
102 | What about an option of a post-job-commit verification that all committed files
103 | are really there?
104 | That is done in the integration tests by reading the data in \_SUCCESS;
105 | it could be added into the Job Committer itself.
106 | It is potentially expensive in cost and time at a HEAD request per file;
107 | a listFiles(dest, recursive) call could be used instead, with, on the partitioned
108 | committer, only the generated partitions checked.
109 | 
110 | This is really just a safety check, but it could be useful in testing
111 | and diagnostics, because you wouldn't need to add specific logic to read in
112 | the (unstable) \_SUCCESS JSON data and check there.
113 | 
114 | \subsubsection{Magic committer}
115 | 
116 | Make sure that an empty directory can still be committed and generates
117 | an empty pendingset file, and that when loaded, it is harmless.
118 | 
119 | 
120 | 
121 | \section{Aborts and cleanup}
122 | 
123 | Everything needs to assume that the committers' abort and cleanup operations may raise exceptions.
124 | 
125 | When invoked, they should be surrounded by try/catch blocks. When subclassed, the subclasses
126 | should do their own cleanup operations in a try/finally clause.
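A sketch of that defensive invocation (the logging and helper class are illustrative; the committer calls are the standard Hadoop \texttt{OutputCommitter} ones):

\begin{verbatim}
// Best-effort abort: never let a failed abort hide the failure which
// triggered it.
import java.io.IOException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

class BestEffortAbort {
  private static final Logger LOG =
      LoggerFactory.getLogger(BestEffortAbort.class);

  static void abortQuietly(OutputCommitter committer, JobContext job) {
    try {
      committer.abortJob(job, JobStatus.State.FAILED);
    } catch (IOException | RuntimeException e) {
      LOG.warn("Job abort failed; output may need manual cleanup", e);
    }
  }
}
\end{verbatim}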
127 | 128 | Executors/drivers should call these operations on all failure codepaths; 129 | Hadoop MR doesn't appear to do this. 130 | 131 | Current committers should do that try/catch themselves, even though the method 132 | signature allows them to raise IOEs. 133 | This avoids them having to wait for new releases of the execution engines 134 | for the add resilience. 135 | 136 | 137 | \end{document} 138 | -------------------------------------------------------------------------------- /tex/notes.tex: -------------------------------------------------------------------------------- 1 | %% 2 | %% Author: stevel 3 | %% 03/01/2018 4 | %% 5 | 6 | % Preamble 7 | \documentclass[11pt]{article} 8 | 9 | % Packages 10 | \usepackage{a4wide} 11 | 12 | % Document 13 | \begin{document} 14 | 15 | 16 | 17 | The check for a dest dir existing happens in job submission, thus there is a 18 | small race condition where >1 job may target the same directory. 19 | 20 | \begin{verbatim} 21 | 22 | 23 | Tests run: 1, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 10.82 s <<< FAILURE! - in org.apache.hadoop.fs.s3a.commit.staging.integration.ITStagingCommitMRJobBadDest[ERROR] testMRJob(org.apache.hadoop.fs.s3a.commit.staging.integration.ITStagingCommitMRJobBadDest) Time elapsed: 3.138 s <<< ERROR!org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory s3a://hwdev-steve-ireland-new/test/DELAY_LISTING_ME/testMRJob already exists 24 | at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:164) 25 | at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:280) 26 | at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:146) 27 | at org.apache.hadoop.mapreduce.Job\$11.run(Job.java:1570) 28 | at org.apache.hadoop.mapreduce.Job\$11.run(Job.java:1567) 29 | at java.security.AccessController.doPrivileged(Native Method) 30 | at javax.security.auth.Subject.doAs(Subject.java:422) 31 | at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1965) 32 | at org.apache.hadoop.mapreduce.Job.submit(Job.java:1567) 33 | at org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob.testMRJob(AbstractITCommitMRJob.java:206) 34 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) 35 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 36 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 37 | at java.lang.reflect.Method.invoke(Method.java:498) 38 | at org.junit.runners.model.FrameworkMethod\$1.runReflectiveCall(FrameworkMethod.java:47) 39 | at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) 40 | at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44) 41 | at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) 42 | at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26) 43 | at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27) 44 | at org.junit.rules.ExternalResource\$1.evaluate(ExternalResource.java:48) 45 | at org.junit.rules.TestWatcher\$1.evaluate(TestWatcher.java:55) 46 | at org.junit.internal.runners.statements.FailOnTimeout\$StatementThread.run(FailOnTimeout.java:74) 47 | 48 | \end{verbatim} 49 | 50 | \end{document} 51 | 52 | -------------------------------------------------------------------------------- /tex/spark-protocol.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/steveloughran/zero-rename-committer/fe31fbf8f474c8580a1471a08eec3946b642268c/tex/spark-protocol.png -------------------------------------------------------------------------------- /tex/spark-protocol.puml: -------------------------------------------------------------------------------- 1 | @startuml 2 | 3 | 'define icons 4 | control Driver 5 | participant "Job Committer" 6 | control Executor 7 | participant "Task Committer" 8 | participant Operation 9 | 10 | ' protocol 11 | '== Initialization == 12 | 13 | Driver -> "Job Committer": setupJob() 14 | 15 | == For Each Task Attempt == 16 | Driver -> Executor: execute work 17 | Executor -> "Task Committer": setupTask() 18 | Executor --> Operation: execute 19 | "Task Committer" <-- Operation: newTaskTempFile 20 | "Task Committer" --> Operation: tempFile 21 | 22 | "Task Committer" <-- Operation: newTaskTempFileAbsPath 23 | "Task Committer" --> Operation: tempFileAbsPath 24 | 25 | Executor <-- Operation: finished 26 | 27 | ' now look at the commit protocol 28 | Executor -> "Task Committer": needsTaskCommit()? 29 | Executor <-- "Task Committer": 30 | 31 | alt needsTaskCommit == true 32 | Executor -> "Driver": AskPermissionToCommitOutput 33 | alt permission to commit granted 34 | Executor -> "Task Committer": commitTask() 35 | Executor <-- "Task Committer": TaskCommitMessage 36 | Executor -> Driver : Success + TaskCommitMessage 37 | else 38 | Executor -> Driver : TaskCommitDenied 39 | end 40 | else needsTaskCommit == false 41 | Executor -> "Task Committer": abortTask() 42 | Executor <-- "Task Committer": aborted task 43 | Executor -> Driver : Success 44 | end 45 | 46 | 47 | == Job Commit== 48 | 49 | 50 | Driver -> "Job Committer": commitJob(TaskCommitMessage+) 51 | Driver <-- "Job Committer": committed job 52 | Driver -> "Job Committer": cleanupJob() 53 | 54 | 55 | @enduml 56 | --------------------------------------------------------------------------------