├── .gitignore
├── README.md
├── pom.xml
└── tex
├── a_zero_rename_committer.tex
├── bibliography.bib
├── commit-protocol.png
├── commit-protocol.puml
├── improvements-to-the-commit-protocols.tex
├── notes.tex
├── spark-protocol.png
└── spark-protocol.puml
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Core latex/pdflatex auxiliary files:
2 | *.aux
3 | *.lof
4 | *.log
5 | *.lot
6 | *.fls
7 | *.out
8 | *.toc
9 | *.fmt
10 | *.fot
11 | *.cb
12 | *.cb2
13 |
14 | ## Intermediate documents:
15 | *.dvi
16 | *-converted-to.*
17 | # these rules might exclude image files for figures etc.
18 | # *.ps
19 | # *.eps
20 | # *.pdf
21 |
22 | ## Generated if empty string is given at "Please type another file name for output:"
23 | .pdf
24 |
25 | ## Bibliography auxiliary files (bibtex/biblatex/biber):
26 | *.bbl
27 | *.bcf
28 | *.blg
29 | *-blx.aux
30 | *-blx.bib
31 | *.run.xml
32 |
33 | ## Build tool auxiliary files:
34 | *.fdb_latexmk
35 | *.synctex
36 | *.synctex(busy)
37 | *.synctex.gz
38 | *.synctex.gz(busy)
39 | *.pdfsync
40 |
41 | ## Auxiliary and intermediate files from other packages:
42 | # algorithms
43 | *.alg
44 | *.loa
45 |
46 | # achemso
47 | acs-*.bib
48 |
49 | # amsthm
50 | *.thm
51 |
52 | # beamer
53 | *.nav
54 | *.pre
55 | *.snm
56 | *.vrb
57 |
58 | # changes
59 | *.soc
60 |
61 | # cprotect
62 | *.cpt
63 |
64 | # elsarticle (documentclass of Elsevier journals)
65 | *.spl
66 |
67 | # endnotes
68 | *.ent
69 |
70 | # fixme
71 | *.lox
72 |
73 | # feynmf/feynmp
74 | *.mf
75 | *.mp
76 | *.t[1-9]
77 | *.t[1-9][0-9]
78 | *.tfm
79 |
80 | #(r)(e)ledmac/(r)(e)ledpar
81 | *.end
82 | *.?end
83 | *.[1-9]
84 | *.[1-9][0-9]
85 | *.[1-9][0-9][0-9]
86 | *.[1-9]R
87 | *.[1-9][0-9]R
88 | *.[1-9][0-9][0-9]R
89 | *.eledsec[1-9]
90 | *.eledsec[1-9]R
91 | *.eledsec[1-9][0-9]
92 | *.eledsec[1-9][0-9]R
93 | *.eledsec[1-9][0-9][0-9]
94 | *.eledsec[1-9][0-9][0-9]R
95 |
96 | # glossaries
97 | *.acn
98 | *.acr
99 | *.glg
100 | *.glo
101 | *.gls
102 | *.glsdefs
103 |
104 | # gnuplottex
105 | *-gnuplottex-*
106 |
107 | # gregoriotex
108 | *.gaux
109 | *.gtex
110 |
111 | # hyperref
112 | *.brf
113 |
114 | # knitr
115 | *-concordance.tex
116 | # TODO Comment the next line if you want to keep your tikz graphics files
117 | *.tikz
118 | *-tikzDictionary
119 |
120 | # listings
121 | *.lol
122 |
123 | # makeidx
124 | *.idx
125 | *.ilg
126 | *.ind
127 | *.ist
128 |
129 | # minitoc
130 | *.maf
131 | *.mlf
132 | *.mlt
133 | *.mtc[0-9]*
134 | *.slf[0-9]*
135 | *.slt[0-9]*
136 | *.stc[0-9]*
137 |
138 | # minted
139 | _minted*
140 | *.pyg
141 |
142 | # morewrites
143 | *.mw
144 |
145 | # nomencl
146 | *.nlo
147 |
148 | # pax
149 | *.pax
150 |
151 | # pdfpcnotes
152 | *.pdfpc
153 |
154 | # sagetex
155 | *.sagetex.sage
156 | *.sagetex.py
157 | *.sagetex.scmd
158 |
159 | # scrwfile
160 | *.wrt
161 |
162 | # sympy
163 | *.sout
164 | *.sympy
165 | sympy-plots-for-*.tex/
166 |
167 | # pdfcomment
168 | *.upa
169 | *.upb
170 |
171 | # pythontex
172 | *.pytxcode
173 | pythontex-files-*/
174 |
175 | # thmtools
176 | *.loe
177 |
178 | # TikZ & PGF
179 | *.dpth
180 | *.md5
181 | *.auxlock
182 |
183 | # todonotes
184 | *.tdo
185 |
186 | # easy-todo
187 | *.lod
188 |
189 | # xindy
190 | *.xdy
191 |
192 | # xypic precompiled matrices
193 | *.xyc
194 |
195 | # endfloat
196 | *.ttt
197 | *.fff
198 |
199 | # Latexian
200 | TSWLatexianTemp*
201 |
202 | ## Editors:
203 | # WinEdt
204 | *.bak
205 | *.sav
206 |
207 | # Texpad
208 | .texpadtmp
209 |
210 | # Kile
211 | *.backup
212 |
213 | # KBibTeX
214 | *~[0-9]*
215 |
216 | # auto folder when using emacs and auctex
217 | /auto/*
218 |
219 | # expex forward references with \gathertags
220 | *-tags.tex
221 |
222 | target/
223 | out/
224 |
225 | .classpath
226 | .project
227 | .settings
228 | .settings/
229 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Zero-Rename Committer: Object-storage as a destination for Apache Hadoop and Spark
2 |
3 | This is a LaTeX-formatted paper on the new S3A committers [shipped in Hadoop 3.1](https://hadoop.apache.org/docs/r3.1.1/hadoop-aws/tools/hadoop-aws/committers.html).
4 |
5 |
6 |
7 | ## Building
8 |
9 | `mvn package` should do all but the image rendering. For that, `plantuml` does
10 | the rendering; it is set up to monitor the directory `tex/`.
11 | As usual, you need to run `bibtex` sporadically.
12 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |
6 |   <modelVersion>4.0.0</modelVersion>
7 |   <groupId>com.github.steveloughran.papers</groupId>
8 |   <artifactId>s3a-committers</artifactId>
9 |   <version>1.0-SNAPSHOT</version>
10 |   <description>Maven to build the PDF/DVIs of the project</description>
11 |   <name>S3A Committers</name>
12 |   <packaging>jar</packaging>
13 |
14 |   <pluginRepositories>
15 |     <pluginRepository>
16 |       <id>maven-latex-plugin-repo</id>
17 |       <url>http://akquinet.github.com/maven-latex-plugin/maven2/</url>
18 |       <releases>
19 |         <enabled>true</enabled>
20 |       </releases>
21 |     </pluginRepository>
22 |   </pluginRepositories>
23 |
24 |   <build>
25 |     <pluginManagement>
26 |       <plugins>
27 |         <plugin>
28 |           <groupId>de.akquinet.jbosscc.latex</groupId>
29 |           <artifactId>maven-latex-plugin</artifactId>
30 |           <version>1.2</version>
31 |         </plugin>
32 |       </plugins>
33 |     </pluginManagement>
34 |     <plugins>
35 |       <plugin>
36 |         <groupId>de.akquinet.maven</groupId>
37 |         <artifactId>maven-latex-plugin</artifactId>
38 |         <version>1.1</version>
39 |         <inherited>false</inherited>
40 |         <executions>
41 |           <execution>
42 |             <phase>compile</phase>
43 |             <configuration>
44 |               <!-- settings: source directory "tex", output format "pdf", bibtex enabled -->
45 |             </configuration>
46 |             <goals>
47 |               <goal>latex</goal>
48 |             </goals>
49 |           </execution>
50 |         </executions>
51 |       </plugin>
52 |     </plugins>
53 |   </build>
54 | </project>
--------------------------------------------------------------------------------
/tex/a_zero_rename_committer.tex:
--------------------------------------------------------------------------------
1 | \documentclass[format=acmsmall, screen=true, review=false]{acmart}
2 |
3 | % _ _____ ____
4 | % / \ |__ /___ _ __ ___ | _ \ ___ _ __ __ _ _ __ ___ ___
5 | % / _ \ / // _ \ '__/ _ \ _____| |_) / _ \ '_ \ / _` | '_ ` _ \ / _ \
6 | % / ___ \ / /| __/ | | (_) |_____| _ < __/ | | | (_| | | | | | | __/
7 | % /_/ \_\ /____\___|_| \___/ |_| \_\___|_| |_|\__,_|_| |_| |_|\___|
8 | %
9 | % ____ _ _ _
10 | % / ___|___ _ __ ___ _ __ ___ (_) |_| |_ ___ _ __
11 | % | | / _ \| '_ ` _ \| '_ ` _ \| | __| __/ _ \ '__|
12 | % | |__| (_) | | | | | | | | | | | | |_| || __/ |
13 | % \____\___/|_| |_| |_|_| |_| |_|_|\__|\__\___|_|
14 |
15 | %\usepackage{babel}
16 | \usepackage{graphicx}
17 | \usepackage{color}
18 | \usepackage{cite}
19 | %\usepackage{algorithmic}
20 | %\usepackage{algorithmicx}
21 | \usepackage[ruled,vlined,boxed]{algorithm2e}
22 | \usepackage{listings}
23 | %\usepackage{minted}
24 | \usepackage{underscore}
25 | \usepackage{multicol}
26 | \usepackage{float}
27 | \usepackage{checkend}
28 | \usepackage{enumitem}
29 |
30 | % ========================================================================
31 | % commands
32 |
33 |
34 | \newcommand{\SUCCESS}{\texttt{\_SUCCESS}\ }
35 |
36 | % add a todo marker. We can turn this off when we don't want to see it.
37 | \newcommand{\TODO}{\emph{TODO}\ }
38 | \newcommand{\FOC}{\texttt{FileOutputCommitter}\ }
39 |
40 |
41 | % ========================================================================
42 |
43 |
44 | \title{ A Zero-Rename Committer}
45 |
46 |
47 | \subtitle{Object-storage as a Destination for Apache Hadoop and Spark}
48 | % Yes, this titling is broken
49 | \author{
50 | Loughran, Steve
51 | \and
52 | Blue, Ryan
53 | \and
54 | Radia, Sanjay
55 | \and
56 | Demoor, Thomas
57 | }
58 | %\author{
59 | % Loughran, Steve
60 | % \texttt{stevel@apache.org}
61 | %\and
62 | % Blue, Ryan
63 | % \texttt{rblue@netflix.com}
64 | %\and
65 | % Radia, Sanjay
66 | % \texttt{sradia@apache.org}
67 | %\and
68 | % Demoor, Thomas
69 | % \texttt{thomas.demoor@wdc.com}
70 | %}
71 |
72 | \date{December 2017}
73 |
74 | % ========================================================================
75 |
76 | \begin{document}
77 |
78 |
79 | \maketitle
80 |
81 | % ========================================================================
82 |
83 | \begin{abstract}
84 |
85 | We introduce new \emph{committers} for Apache Hadoop, which allow
86 | the Amazon S3 Object Store to be safely used as a direct destination of output generated
87 | by Hadoop MapReduce and Apache Spark.
88 |
89 | By using the operations directly exported by the store,
90 | most critically the multipart-upload mechanism, tasks within a distributed
91 | query can upload their output to the final destination,
92 | yet not materialize this data until the overall job is committed.
93 | As a result, the committers meet the core requirement of the Hadoop and Spark commit
94 | protocols: the output of the job is complete and consistent: it contains
95 | all the output of the successful ``committed'' work, and none of the output of
96 | tasks which were not committed.
97 | That this mechanism permits highly-performant commit operations is an added benefit.
98 |
99 | We also document the commit protocols of Hadoop and Spark, and show how the classic committer
100 | implementation's requirements of atomic file creation and rename operations mean that they
101 | cannot be safely used with Amazon S3.
102 |
103 | We introduce the two committers, ``Staging'' and ``Magic'', exploring their differences.
104 | The Staging committer stages all generated output to the local filesystem of
105 | worker nodes, uploading this data when a task is committed.
106 | The Magic committer streams data directly to the object store, relying on the
107 | object store client to recognise some output paths as special (``magic''), and
108 | so translating writes to these paths as initiating a delayed-completion write
109 | to a calculated final destination.
110 |
111 | In order to evaluate the correctness of our work, we provide a definition
112 | of \emph{Correctness} ---the requirements of a committer for safe integration
113 | with Hadoop MapReduce and Spark.
114 |
115 | The requirements are
116 |
117 | \begin{labeling}{continuity of correctness}
118 | \item [complete] the output includes the work of all successful tasks.
119 | \item [exclusive] the output of unsuccessful tasks is not present.
120 | \item [concurrent] multiple tasks can be committed in parallel.
121 | \item [abortable] jobs and tasks may be aborted, after which their output is not visible.
122 | \item [continuity of correctness] once a job is committed, the output
123 | of any failed, aborted, or unsuccessful task must not appear at some point in the future.
124 | \end{labeling}
125 |
126 |
127 |
128 | As well as demonstrating that our new committers meet these requirements,
129 | we evaluate other committers against them.
130 | Of the object-store-specific committers, IBM's Stocator\ \cite{Stocator}
131 | meets the requirements, while Amazon's EMRFS S3-optimized Committer does not
132 | appear to do so\ \cite{AWS-EMR-committer}.
133 | Equally notably, one of the committers which ships in Hadoop and is broadly used,
134 | ``the V2 committer'', does not meet these correctness criteria either.
135 | In both the EMR and Hadoop V2 committers, tasks publish their output directly
136 | into the destination during their task commit operations, through a sequence of
137 | operations.
138 | Any failure during such non-atomic task commits could leave the
139 | destination in an undefined state.
140 |
141 | The obvious mitigation technique is ``avoid these'', but it is also
142 | possible for the commit protocols to be extended to allow the committers
143 | to declare whether or not a failure during task commit is recoverable.
144 | The application could then use that information to react to a failure
145 | in a stricter way, such as failing the job, or restarting it entirely.
146 |
147 | \end{abstract}
148 |
149 | % ========================================================================
150 |
151 | \section{Introduction}
152 | \label{sec:introduction}
153 |
154 | It has long been a core requirement of ``Big Data'' computation platforms that
155 | the source and destination of data be a fully consistent distributed filesystem.
156 |
157 | Distributed, because data needs to be readable and writable by the distributed
158 | processes executing a single query across the cluster of computers.
159 | Consistent, because all machines across the cluster need to be able to
160 | list and read data written by any of the others.
161 | As for ``Filesystem'', that's the model and API for distributed storage which
162 | developers are familiar with.
163 |
164 |
165 | The full semantics of a POSIX filesystem are not always necessary;
166 | random write access to a file being an oft-omitted feature of the stores,
167 | forcing the persistence formats to rely purely on appending data.
168 |
169 | What has been a critical part of the required semantics has been that the filesystem
170 | presents a model of directories and files with consistent operations to list and
171 | read those directories and their contents, with at least four atomic operations:
172 |
173 | \begin{itemize}
174 | \item Rename of a single file to another location within the same volume.
175 | \item Rename of a directory and its contents to another location within the same volume.
176 | \item Create a file iff a file or directory does not exist at that path.
177 | \item Recursive delete of a directory.
178 | \end{itemize}
179 |
180 | These operations are regularly used by applications as the foundational operators of higher-level
181 | co-ordination and commit protocols.
182 |
183 | For example, the \texttt{create()} operation can be used to obtain a lock on a resource:
184 | the first process to create a file can consider itself having exclusive access to it,
185 | and so implicitly consider itself to have acquired the resource.
186 |
187 | The \texttt{rename()} operation is generally critical to providing atomic promotion
188 | of output: a single \texttt{rename()} call can promote all in-progress output
189 | of a worker to become completed work, simply by moving all the output to a well known path.
190 | And, when the job completes, its final output may be renamed to a new location to become
191 | publicly visible.
192 |
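As a minimal sketch of these two idioms against Hadoop's
\texttt{org.apache.hadoop.fs.FileSystem} API (the paths and error handling here
are illustrative, not taken from any committer):

\begin{lstlisting}[language=Java]
import java.io.IOException;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CommitIdioms {

  /** Acquire a "lock" by atomically creating a file iff it does not yet exist. */
  static boolean tryAcquireLock(FileSystem fs, Path lockFile) throws IOException {
    try {
      fs.create(lockFile, false).close();   // overwrite=false: create-if-absent
      return true;
    } catch (FileAlreadyExistsException e) {
      return false;                         // another process got there first
    }
  }

  /** Promote a worker's completed output to its well-known, visible location. */
  static boolean promote(FileSystem fs, Path workDir, Path finalDir) throws IOException {
    return fs.rename(workDir, finalDir);    // atomic on HDFS: all output appears at once
  }
}
\end{lstlisting}
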
193 | As covered in the original MapReduce paper\ \cite{MapReduce}:
194 |
195 | \begin{quote}
196 | We rely on the atomic rename operation provided by the underlying file system
197 | to guarantee that the final file system state contains just the data produced
198 | by one execution of the reduce task.
199 | \end{quote}
200 |
201 |
202 | Apache Hadoop was written with its own filesystem, Hadoop Distributed File System
203 | (HDFS)\ \cite{Chansler2011}.
204 |
205 | It is self-admittedly sub-POSIX as data can only be
206 | appended directly to the end of the current file.
207 | What it does offer is the classic filesystem model of
208 | a tree of directories and files,
209 | and the atomic operations needed by MapReduce to safely use HDFS
210 | as a destination of work.
211 | As will be shown, some object stores do not provide the same guarantees,
212 | and so cannot be safely used as a destination with the standard protocol,
213 | even if everything \emph{appears} to work.
214 |
215 |
216 | % ========================================================================
217 |
218 | \section{The Hadoop MapReduce Commit Protocol}
219 | \label{sec:hadoop-mr-commit}
220 |
221 | Before the challenge and solution of using an object store as a destination
222 | of work can be covered, the problem of outputting data from a distributed
223 | query itself must be covered, along with the existing protocols and algorithms.
224 |
225 |
226 | \subsection{Terminology}
227 |
228 | First, some terminology needs to be introduced to describe
229 | the protocols.
230 |
231 |
232 | \textbf{Query}.
233 | One or more transformations of source data to a result;
234 | data presented or saved in some form.
235 | The query may be described in procedural source code,
236 | or declaratively in a form such as SQL\@.
237 |
238 |
239 | \textbf{Job}.
240 | A parallelized query, composed of one or more distributed \emph{tasks}.
241 | The output of a Job is made visible to other stages in a larger operation
242 | sequence or other applications iff the job \emph{completes successfully}.
243 | A complex query may consist of a chain of Jobs, either executing in sequence
244 | or as a DAG of jobs.
245 |
246 | \textbf{Job Attempt}.
247 | A single attempt at executing a job.
248 |
249 | \textbf{Task}.
250 | Part of a job, such as a single Map or Reduce transformation applied to a fraction
251 | of the input data.
252 |
253 |
254 | \textbf{Task Attempt}.
255 | A single attempt to complete a task on a single process running on a single host
256 | in the cluster.
257 | A task attempt is \emph{successful} if it generates all its output without
258 | failing in some way.
259 | A task attempt has \emph{failed} if the execution raises an exception, or
260 | if the process executing the task attempt stops communicating with
261 | the process managing the job.
262 |
263 | Multiple attempts may be made to execute a task;
264 | sequentially, if addressing task failure, or in parallel when task attempts are
265 | executed speculatively.
266 | It is critical that only one task attempt's output is propagated
267 | to the final output of a job.
268 |
269 |
270 | \textbf{Job Manager}.
271 | The application which schedules task attempt execution, tracks success/failures,
272 | determines when a job has been completed and publishes the results.
273 | It may also determine that a job has failed and cannot be recovered,
274 | in which case the job is aborted.
275 |
276 | \textbf{Executor}.
277 | A process capable of executing work, as directed by the Job Manager.
278 | In Hadoop MapReduce, a unique executor is created for each partition
279 | of the data, and destroyed when its processing is completed.
280 | In Spark, executors are long lived and can be allocated task attempts from multiple
281 | jobs to execute, often simultaneously.
282 |
283 | \textbf{Job Output Directory}.
284 | The directory into which the output of a job writing to the filesystem is placed
285 | so as to be visible.
286 | After a successful job completion, the data MUST be visible in the destination
287 | directory.
288 |
289 | \textbf{Task Working Directory}.
290 | A directory for exclusive access by a single task attempt, into which uncommitted
291 | work may be placed.
292 | All data written in and under this directory is considered the output of
293 | that task attempt.
294 |
295 |
296 | \textbf{Task Commit}.
297 | The act of taking the output of a task attempt
298 | and promoting it to become part of the final output of the active job
299 | attempt.
300 | When the output is a filesystem, this consists of moving the files
301 | under the Task Working Directory to the Job Output Directory,
302 | preserving the hierarchy of subdirectories.
303 |
304 |
305 | \textbf{Job Commit}.
306 | The act of taking the output of all committed tasks of a job attempt,
307 | and generating the final output.
308 | This normally consists of publishing this output in an aggregate form;
309 | it can also include generating extra summary data.
310 | As this is often a serialized operation at the end of a job attempt,
311 | its performance can be a bottleneck.
312 |
313 | \textbf{Task Abort}.
314 | To cancel a task such that its data is not committed.
315 |
316 | \textbf{Job Abort}.
317 | To cancel all work in a job attempt: no task's work is committed.
318 |
319 |
320 | \textbf{Job Context}.
321 | An instance of the Java class \texttt{org.apache.hadoop.mapreduce.JobContext},
322 | which provides a read-only view of the Job for the Job Driver and tasks.
323 |
324 | \textbf{Task Attempt Context}.
325 | An instance of the class
326 | \texttt{org.apache.hadoop.mapreduce.TaskAttemptContext},
327 | which provides operations for tasks, such as getting and setting status,
328 | progress and counter values.
329 |
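As a sketch, the parts of these two Java types most relevant to the commit
protocol are shown below; the listing is abbreviated and omits most members.

\begin{lstlisting}[language=Java]
// Abbreviated view of the context types in org.apache.hadoop.mapreduce;
// most members are omitted.
public interface JobContext {
  Configuration getConfiguration();   // job configuration, including the output path
  JobID getJobID();                   // unique identifier of the job
  String getJobName();
}

public interface TaskAttemptContext extends JobContext, Progressable {
  TaskAttemptID getTaskAttemptID();   // identifies the task and this attempt at it
  void setStatus(String msg);         // report status back to the Job Manager
  String getStatus();
  void progress();                    // liveness/progress signal
}
\end{lstlisting}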
330 |
331 | \subsection{Requirements of a Commitment Protocol}
332 | \label{subsec:commit-protocol-requirements}
333 |
334 | Apache Hadoop's MapReduce implementation is designed to support long-lived
335 | large-scale queries taking minutes to hours to complete.
336 | Its requirements include the following:
337 |
338 | \begin{enumerate}
339 |
340 | \item Support for thousands to tens of thousands of individually scheduled $tasks$
341 | within a single $job$.
342 |
343 | \item Support different destinations of work, such as databases and
344 | distributed filesystems.
345 |
346 | \item ``Correctly'' propagate the output of individual tasks to the final
347 | aggregate of the job.
348 | What constitutes correctness is covered in\ \ref{sec:correctness}.
349 |
350 | \item Recover from the failure of a task attempt by rescheduling the task;
351 | a new task attempt may be executed anywhere within the cluster.
352 |
353 | \item Support speculative execution of task attempts as a means of compensating for the
354 | delay caused by \emph{stragglers} in the execution.
355 |
356 | \item Potentially: recover from a failure of a job attempt, using all the committed
357 | task output from the previous, failed attempt.
358 |
359 | \item Be resilient to network failures/partitions of tasks, and of the job manager
360 | itself becoming isolated from other parts of the system (and hence: a second
361 | attempt at the job being started).
362 |
363 | \end{enumerate}
364 |
365 | This leads to some specific requirements of an implementation, requirements
366 | which can be used to assess its correctness.
367 |
368 | \textbf{Independent.}
369 | Individual tasks must be able to output without directly
370 | co-ordinating that write with those of other tasks.
371 |
372 | \textbf{Speculative tasks until committed.}
373 | Multiple tasks must be able to simultaneously execute on the same input
374 | source, to generate the required output of that part of the input.
375 | This is required for recovery, and for speculation.
376 | Non requirement: idempotent output;
377 | that is left to the implementors of the operations executed in the tasks.
378 |
379 | \textbf{Scaleable communication protocol.}
380 | The commit protocol communications between task and job manager
381 | must support tens of thousands of simultaneous tasks.
382 |
383 | \textbf{Abortable.}
384 | It must be possible to abort an uncommitted task or job.
385 | There should be no leftover output.
386 |
387 | \textbf{Recoverable or restartable job.}
388 | A committer can declare whether or not it supports job recovery;
389 | if it does, it must implement recovery.
390 | If not, the job must be restartable from the beginning.
391 |
392 | \begin{figure*}
393 | \centering
394 | \includegraphics[width=.8\textwidth]{commit-protocol.png}
395 | \caption{Hadoop commit protocol (excluding Job recovery)}
396 | \label{fig:commit-protocol}
397 | \end{figure*}
398 |
399 | A UML sequence diagram of the core commit protocol is
400 | shown in\ \ref{fig:commit-protocol}.
401 |
402 | The commit algorithm is designed to work on the YARN cluster scheduler
403 | \ \cite{Vavilapalli2013}.
404 |
405 | On each node in the YARN cluster, a \emph{Node Manager} has the responsibility
406 | of launching the applications, usually within a memory-and-CPU-bounded
407 | environment.
408 | A central \emph{Resource Manager} manages the scheduling and liveness monitoring
409 | of individual applications.
410 | When an application is submitted for execution, the Resource Manager schedules
411 | its root process, the \emph{Application Master}.
412 | This communicates with the Resource Manager via an umbilical protocol
413 | which is explicitly used by the application for requesting new processes
414 | across the cluster, and implicitly used by the Resource Manager
415 | as a liveness probe of the application.
416 |
417 | When a launched process terminates, the process exit code
418 | is passed to the \emph{ResourceManager} within the regular status heartbeats
419 | between each NodeManager and the ResourceManager.
420 | If it is the Application Master itself which has terminated, unless it explicitly
421 | declared itself to be finished, the application is considered to have failed.
422 | All worker processes will be terminated (by default), and a new instance
423 | of the Application Master scheduled for execution.
424 | If it was a worker process, the Application Master chooses how to react.
425 |
426 |
427 | In MapReduce, the YARN Application Master is the Job Manager,
428 | with every individual task attempt executed in a unique worker process, termed
429 | an ``executor'' in this paper.
430 | A direct RPC protocol between the Job Manager and the executors is used to manage
431 | the commit operation.
432 | Excluding process failures, all liveness detection must be performed by the
433 | Job Manager, which is done based on timeouts of this direct RPC protocol.
434 |
435 | % -----------------------------------------------------------------------
436 |
437 | \subsection{Recoverable Failure Modes}
438 | \label{subsec:optionalRecoverableFailureModes}
439 |
440 | \subsubsection{Job Recovery}
441 |
442 | When YARN perceives the Job Manager process to have failed, it instantiates
443 | a new instance of the process somewhere within the cluster.
444 | This new Job Manager creates an instance of the specified Job Committer,
445 | and queries it as to whether job recovery is supported.
446 |
447 | If a committer does support recovery,
448 | the state of the previous job attempt is rebuilt from reading the
449 | ``Job History'' file.
450 | This file is a log of the events saved during the execution, including
451 | a record of all tasks which successfully committed, and which could
452 | therefore be recovered.
453 | The Job Committer's \texttt{recoverTask(TaskAttempt)} method is called
454 | for each of these tasks.
455 | All unexecuted, uncommitted or unrecoverable tasks are scheduled for execution.
456 |
457 | If job recovery is not supported, the entire job is re-executed.
458 |
459 | As the probability of the Job Manager failing is, excluding bugs in the code itself,
460 | a function of job time, rather than scale, recovering from job failure is more
461 | important on long-lived jobs ---those which last many hours.
462 |
463 | \subsubsection{Bad Records}
464 |
465 | To avoid an entire task, and hence job, failing due to a single unprocessable record,
466 | task attempts may skip records whose processing raises an exception.
467 | If the number of skipped records in a task attempt is below some threshold,
468 | these records will not result in a task attempt reporting itself as
469 | having failed.
470 | This is not of direct relevance to the commit protocol except as a
471 | reason for a task attempt to fail.
472 |
473 | \section{Hadoop's FileOutputCommitter}
474 | \label{sec:fileoutputcommitter}
475 |
476 |
477 | The operations to commit the work, the Task Commit and the Job Commit,
478 | are all implemented in the same class, an implementation of \texttt{OutputCommitter}.
479 | For writing to HDFS, this is done in the \texttt{FileOutputCommitter} class.
480 |
481 | This actually implements two separate algorithms for committing work: each
482 | with different performance and scalability characteristics.
483 |
484 | The ``v1'' algorithm was designed to handle failure and recovery with an
485 | atomic task commit and the ability to explicitly recover the output generated
486 | by the committed tasks of a failed job attempt.
487 |
488 | The ``v2'' algorithm was added in 2015, as its predecessor was found
489 | to have scalability problems for jobs with tens of thousands of files
490 | \ \cite{MAPREDUCE-4815}.
491 | While the v2 algorithm can deliver better performance, it comes at the price of
492 | reduced isolation of output.
493 |
494 | \textbf{v1.}
495 | When a task attempt is committed, its task working directory is renamed into
496 | the job attempt directory.
497 | When the job attempt is committed, all committed task directories are merged
498 | (serially) into the job output directory.
499 | A restarted job moves the directories of committed tasks from the previous
500 | attempt, so recovering their output.
501 |
502 |
503 | \textbf{v2.}
504 |
505 | When a task attempt is committed, its output is immediately merged into the
506 | job output directory;
507 | the job commit operation does nothing but create a marker file.
508 | This is faster, but intermediate work is visible.
509 | The task commit operation is no longer atomic, changing failure modes.
510 |
511 | %\begin{table}
512 | % \caption{Attributes of the \texttt{FileOutputCommitter} algorithms}
513 | % \begin{tabular}{ l c c }
514 | % \hline
515 | % & \textbf{v1} & \textbf{v2} \\
516 | % Independent & True & True \\
517 | % Speculative Tasks & True & True \\
518 | % Recoverable Job & True & False \\
519 | % Abortable Job & True & Delete output directory \\
520 | % Observable & False & True \\
521 | % Atomic Task Commit & True & False \\
522 | % Idempotent Task Commit & True & False \\
523 | % Atomic Job Commit & False & True \\
524 | % Idempotent Job Commit & False & True \\
525 | % \hline
526 | % \end{tabular}
527 | % \label{tab:file-committer-attributes}
528 | %\end{table}
529 |
530 | \subsection{Common Variables}
531 | \label{subsec:common-variables}
532 |
533 |
534 | \begin{table}
535 | \caption{Variables used in the algorithms}
536 | \begin{tabular}{ l l l }
537 | \hline
538 | \textbf{name} & \textbf{meaning} \\
539 | $fs$ & Destination filesystem \\
540 | $destPath$ & Job Output Directory in the destination filesystem. \\
541 | $jobId$ & Numeric Job ID $\geq$ 0; expected to be unique for all application instances in the cluster. \\
542 | $jobAttemptId$ & $jobId\_counter$; the counter starts at 0 for a job and increments on each attempt.\\
543 | $jobAttemptPath$ & a path under which a job attempt may store any data.\\
544 | $partId$ & Numeric value of partition of data to be allocated to a task.\\
545 | $taskId$ & $jobAttemptId\_partId$; the task which works on part $partId$ in the job attempt.\\
546 | $taskAttemptId$ & $taskId\_counter$; a single attempt to execute a task.\\
547 | $taskAttemptPath$ & a path under $jobAttemptPath$ into which a task attempt may write uncommitted data.\\
548 | $taskCommittedPath$ & a path under $jobAttemptPath$ where the contents of $taskAttemptPath$ are moved when that attempt is committed. \\
549 | \hline
550 | \end{tabular}
551 | \label{tab:variables}
552 | \end{table}
553 |
554 | For a Job Attempt $jobAttemptId$ to be successful, all parts of the dataset
555 | must be processed in one or more successful tasks.
556 | The output of exactly one task attempt for each task must be in the final dataset,
557 |
558 | The function of a commit algorithm, then, is to guarantee that this condition
559 | is met, even in the presence of failures.
560 | It is not a requirement for an algorithm to be able to recover from all
561 | failures;
562 | it may react to some failure conditions by failing the entire job.
563 |
564 | It is also not a general requirement that if a job fails, the job output directory
565 | must be unchanged.
566 | Together, this implies that at-most-once semantics are required,
567 | and that the task of handling job failures is to be handled
568 | by a higher-level workflow.
569 |
570 | \subsection{Hadoop V1 commit algorithm}
571 | \label{subsec:hadoop-v1-commit-algorithm}
572 |
573 | %% Define the standard commit variables
574 | \newcommand{\FileOutputCommitVars}{
575 | \DontPrintSemicolon
576 | \SetKwData{fs}{$fs$}
577 | \SetKwData{dest}{$destPath$}
578 | \SetKwData{jobAttemptPath}{$jobAttemptPath$}
579 | \SetKwData{jobAttemptId}{$jobAttemptId$}
580 | \SetKwData{SUCCESS}{_$SUCCESS$}
581 | \SetKwData{taskAttemptId}{$taskAttemptId$}
582 | \SetKwData{taskAttemptPath}{$taskAttemptPath$}
583 | \SetKwData{taskCommittedPath}{$taskCommittedPath$}
584 | \SetKwData{temp}{_$temporary$}}
585 |
586 | \newcommand{\true}{ $true$ }
587 | \newcommand{\false}{ $false$ }
588 |
589 |
590 | \textbf{Job Setup}
591 |
592 | This creates the path \emph{jobAttemptPath}, under the
593 | directory \texttt{\_temporary} of the output directory
594 | \emph{destPath}.
595 |
596 | \begin{procedure}
597 | \FileOutputCommitVars
598 | % Operations, which are defined for all subsequent procedures/functions
599 | % ALL functions must go in here, in alphabetical order
600 | % Macros cannot have numbers in them, though their values can.
601 | \SetKwFunction{abortUpload}{abortUpload}
602 | \SetKwFunction{checkForConflicts}{checkForConflicts}
603 | \SetKwFunction{commitJob}{commitJob}
604 | \SetKwFunction{completeUpload}{completeUpload}
605 | \SetKwFunction{delete}{delete}
606 | \SetKwFunction{exists}{exists}
607 | \SetKwFunction{Exception}{Exception}
608 | \SetKwFunction{getFileStatus}{getFileStatus}
609 | \SetKwFunction{getJobAttemptPath}{getJobAttemptPath}
610 | \SetKwFunction{getUsername}{getUsername}
611 | \SetKwFunction{isDirectory}{isDirectory}
612 | \SetKwFunction{isFile}{isFile}
613 | \SetKwFunction{listFiles}{listFiles}
614 | \SetKwFunction{listPendingUploads}{listPendingUploads}
615 | \SetKwFunction{loadPendingFile}{loadPendingFile}
616 | \SetKwFunction{loadPendingSet}{loadPendingSet}
617 | \SetKwFunction{mergePathsA}{mergePathsV1}
618 | \SetKwFunction{mergePathsB}{mergePathsV2}
619 | \SetKwFunction{mkdir}{mkdir}
620 | \SetKwFunction{mkdirs}{mkdirs}
621 | \SetKwFunction{newUUID}{newUUID}
622 | \SetKwFunction{rename}{rename}
623 | \SetKwFunction{return}{return}
624 | \SetKwFunction{savePendingSet}{savePendingSet}
625 | \SetKwFunction{tempDirForStaging}{tempDirForStaging}
626 | \SetKwFunction{throw}{throw}
627 | \SetKwFunction{touch}{touch}
628 | \SetKwFunction{uniquePath}{uniquePath}
629 | \SetKwFunction{uploadFileToPendingCommit}{uploadFileToPendingCommit}
630 |
631 | \jobAttemptPath $\longleftarrow$ \dest/\temp/\jobAttemptId\;
632 | \mkdir(\fs, \jobAttemptPath)\;
633 |
634 | \caption{setupJob()}
635 | \label{alg:FileOutputCommitter.setupJob}
636 | \end{procedure}
637 |
638 | Note that Hadoop has a convention that all paths whose names start with ``_'' are not considered
639 | ``visible'';
640 | everything under the \texttt{\_temporary} directory is therefore excluded from normal
641 | listings of the destination path.
642 | Creating all intermediate files in a subdirectory of the destination
643 | directory provides an implicit guarantee that the data is created in the
644 | same volume (in a multi-volume filesystem), and in the same encryption zone,
645 | for any HDFS cluster with encryption enabled.
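A sketch of the kind of filter which implements the ``\_'' convention above
(Hadoop's input layers apply something similar; this exact lambda is illustrative):

\begin{lstlisting}[language=Java]
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class HiddenPaths {
  // Treat paths whose names begin with "_" or "." as hidden/invisible.
  public static final PathFilter VISIBLE_ONLY = (Path p) -> {
    String name = p.getName();
    return !name.startsWith("_") && !name.startsWith(".");
  };
}
\end{lstlisting}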
646 |
647 |
648 | \textbf{Task Setup}
649 |
650 | The task attempt is given a directory under the job attempt path
651 | as its task working directory.
652 |
653 | \begin{procedure}
654 | \FileOutputCommitVars
655 |
656 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
657 |
658 | \caption{setupTask()}
659 | \label{alg:FileOutputCommitter.setupTask}
660 | \end{procedure}
661 |
662 | The actual directories are created on demand.
663 |
664 |
665 | \textbf{Needs Task Commit}
666 |
667 | A commit is required iff data was generated.
668 |
669 | \begin{function}
670 | \FileOutputCommitVars
671 |
672 | \exists(\fs, \taskAttemptPath)\;
673 |
674 | \caption{needsTaskCommit()}
675 | \label{alg:FileOutputCommitter.needsTaskCommit}
676 | \end{function}
677 |
678 | This is one place where eventual consistency of object listings in an object
679 | store can generate a false ``no data to commit'' result.
680 |
681 | \textbf{Task Commit}
682 |
683 | A task attempt is committed simply by renaming the task attempt working directory
684 | into the job attempt directory.
685 |
686 | \begin{procedure}
687 | \FileOutputCommitVars
688 |
689 | \If{\exists(\fs, \taskAttemptPath)} {
690 | \delete(\fs, \taskCommittedPath, $recursive$)\;
691 | \rename(\fs, \taskAttemptPath, \taskCommittedPath)\;
692 | }
693 |
694 | \caption{commitTask()}
695 | \label{alg:FileOutputCommitter.commitTask}
696 | \end{procedure}
697 |
698 |
699 | In a true file system, the rename is an $O(1)$ atomic operation.
700 | %Even if the task fails to report to the Job Manager that the
701 | %commit operation was completed, the existence of the \texttt{taskCommittedPath}
702 | %is an implicit confirmation that the task was committed.
703 | %Its absence is not a guarantee that the task has failed ---it could just
704 | %be taking slow to execute the operation.
705 | %However, the Job Manager can assume that the task has failed,
706 | %and reschedule another attempt at that task.
707 | %Whichever of the rescheduled or original (delayed/partitioned) task
708 | %last renames their task attempt path to the task committed path is considered
709 | %the successful committer.
710 |
711 | \textbf{Task Abort}
712 |
713 | Abort a task attempt by deleting its task attempt path.
714 |
715 | \begin{procedure}
716 | \FileOutputCommitVars
717 |
718 | \delete(\fs, \taskAttemptPath, $recursive$)\;
719 |
720 | \caption{abortTask()}
721 | \label{alg:FileOutputCommitter.abortTask}
722 | \end{procedure}
723 |
724 |
725 | On a genuine filesystem this is an $O(1)$ operation.
726 | %On an object store, usually $O(files)$.
727 |
728 | \textbf{Job Commit}
729 |
730 | A Job is committed by merging all files/directories from all the task
731 | committed paths into the final job output directory.
732 |
733 | Optionally, it can create a zero-byte \SUCCESS file in the output directory.
734 |
735 | \begin{procedure*}
736 | \FileOutputCommitVars
737 |
738 | \For { committedTask $\in$ listFiles(\fs, \jobAttemptPath) } {
739 | \mergePathsA(\fs, committedTask, \dest)\;
740 | }
741 | \touch(\fs, \dest/\SUCCESS)\;
742 | \delete(\fs, \temp)\;
743 |
744 | \caption{commitJob()}
745 | \label{alg:FileOutputCommitter.commitJob}
746 | \end{procedure*}
747 |
748 |
749 | The \texttt{mergePathsV1(FileSystem, Path, Path)} procedure is
750 | a recursive move of all the output of a committed task into/underneath
751 | the destination directory.
752 |
753 | % ------------------------------------------------------------
754 | \begin{procedure*}
755 | \FileOutputCommitVars
756 |
757 | \eIf { \isFile(\fs, $src$) } {
758 | \If { \exists(\fs, $dest$) } {
759 | \delete(\fs, $dest$, $recursive$)\;
760 | }
761 | \rename(\fs, $src$, $dest$)\;
762 | } {
763 | \eIf { \exists(\fs, $dest$) } {
764 | \eIf { \isFile(\fs, $dest$) } {
765 | \delete(\fs, $dest$, recursive)\;
766 | \rename(\fs, $src$, $dest$)\;
767 | } {
768 | \For { f $\in$ \listFiles(\fs, $src$) } {
769 | \mergePathsA(\fs, f, $dest$ + f.name)\;
770 | }
771 | }
772 | } {
773 | \rename(\fs, $src$, $dest$)\;
774 | }
775 | }
776 |
777 | \caption{mergePathsV1(fs, src, dest)}
778 | \label{alg:mergePathsV1}
779 | \end{procedure*}
780 | % ------------------------------------------------------------
781 |
782 | All the files and directories are promoted to the destination directory.
783 |
784 | \begin{enumerate}
785 | \item If the calculated destination path of a source file or directory does
786 | not exist, the source file/directory is renamed to it.
787 | \item If the destination path does exist and is a file, it is deleted and then
788 | the source file/directory is renamed.
789 | \item If the destination path exists and is a directory, and the source
790 | is also a directory, then \texttt{mergePathsV1} is applied to the child
791 | entries of the source path.
792 | \end{enumerate}
793 |
794 | Together, it forms a depth-first overwrite of the destination tree by the source
795 | tree, specifically merging the contents of all directories.
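The same merge can be sketched in Java against the Hadoop \texttt{FileSystem}
API; this follows the pseudocode above rather than the exact Hadoop source.

\begin{lstlisting}[language=Java]
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MergeV1 {
  /** Recursively promote src into dest, overwriting conflicting destination entries. */
  static void mergePathsV1(FileSystem fs, FileStatus src, Path dest) throws IOException {
    if (src.isFile()) {
      if (fs.exists(dest)) {
        fs.delete(dest, true);           // clobber whatever is at the destination
      }
      fs.rename(src.getPath(), dest);    // single-file promotion
    } else if (fs.exists(dest)) {
      if (fs.isFile(dest)) {
        fs.delete(dest, true);           // a file is in the way of a directory
        fs.rename(src.getPath(), dest);
      } else {
        // both are directories: merge the children, depth first
        for (FileStatus child : fs.listStatus(src.getPath())) {
          mergePathsV1(fs, child, new Path(dest, child.getPath().getName()));
        }
      }
    } else {
      fs.rename(src.getPath(), dest);    // directory rename: O(1) on a real filesystem
    }
  }
}
\end{lstlisting}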
796 |
797 | %This is clearly not an atomic operation;
798 | %it is performing a sequence of operations on the distributed filesystem,
799 | %potentially including recursive operations down a directory tree.
800 | The time to execute the merge depends on the number of source entries
801 | and the state of the destination directory.
802 |
803 | If it fails, the state of the operation is unknown: it cannot simply
804 | be repeated.
805 |
806 |
807 | \begin{function}
808 |
809 | \false
810 |
811 | \caption{v1.isCommitJobRepeatable()}
812 | \label{v1.isCommitJobRepeatable}
813 |
814 | \end{function}
815 |
816 | Accordingly: if a job attempt fails during the commit process, it is
817 | unrecoverable: the subsequent attempt reports an error and aborts.
818 |
819 | % Abort Job v1
820 | \begin{procedure}
821 | \FileOutputCommitVars
822 |
823 | \delete(\fs, \jobAttemptPath, $recursive$)\;
824 |
825 | \caption{v1.abortJob()}
826 | \label{alg:v1.abortJob}
827 |
828 | \end{procedure}
829 |
830 | A job attempt can be cleaned up by deleting the output of all job attempts which
831 | may have been made.
832 | This can be achieved by deleting the entire \texttt{_temporary} directory
833 | under the destination directory.
834 |
835 | % cleanup job v1
836 | \begin{procedure}
837 | \FileOutputCommitVars
838 |
839 | \delete(\fs, \dest/\temp, $recursive$)\;
840 |
841 | \caption{cleanupJob()}
842 | \label{alg:v1.cleanupJob}
843 | \end{procedure}
844 |
845 | This would break any other ongoing job which is writing to the
846 | same destination directory.
847 | It is a requirement that only one job may be actively writing
848 | to a specific destination, something which is checked for during job submission.
849 |
850 |
851 | \textbf{Job Recovery}
852 |
853 | The v1 committer can recover from a failed job attempt, with the
854 | second attempt being able to reuse the output of all committed tasks from
855 | the previous attempt.
856 |
857 | This whole job attempt recovery process is a complex one;
858 | from the perspective of the committer, if the task attempt was committed
859 | in the previous job attempt, the $taskCommittedPath$ of the previous attempt
860 | can be moved under the $jobAttemptPath$ of the new job attempt.
861 |
862 |
863 | \begin{procedure}
864 | \FileOutputCommitVars
865 |
866 | \true
867 |
868 | \caption{isRecoverySupported()}
869 | \label{alg:v1.isRecoverySupported}
870 | \end{procedure}
871 |
872 |
873 | \begin{procedure}
874 | \FileOutputCommitVars
875 |
876 | $previousJobAttemptPath$ $\longleftarrow$ \getJobAttemptPath(\jobAttemptId $-$ 1)\;
877 | $previousTaskCommittedPath$ $\longleftarrow$ $previousJobAttemptPath$/$taskId$\;
878 | \If{\exists(\fs, $previousTaskCommittedPath$)}{\rename(\fs, $previousTaskCommittedPath$, \taskCommittedPath)\;}
879 |
880 | \caption{recoverTask(TaskAttemptContext)}
881 | \label{alg:v1.recoverTask}
882 | \end{procedure}
883 |
884 |
885 | The only lost work is that of all in-progress task attempts ---those which had generated
886 | data but were not yet committed.
887 |
888 | When working with HDFS, the main limitation of this algorithm is
889 | one of scale: job commit is an $O(tasks)$ operation, with the time for
890 | each task's merge being a function of the number of files and the depth of
891 | the directory tree.
892 |
893 | As this is serialized at the end of the job, irrespective of how many workers
894 | there were, the job commit is a single point of delay, and of failure.
895 | The more tasks, the more work to commit, the longer the commit, and the
896 | higher risk of that failure.
897 |
898 | \subsection{Hadoop V2 commit algorithm}
899 | \label{subsec:hadoop-v2-commit-algorithm}
900 |
901 |
902 | The v2 commit algorithm propagates each task attempt's output into the job's
903 | output directory during task commit.
904 | This is done in a variant \texttt{mergePaths()} algorithm,\ \ref{alg:mergePathsV2},
905 | designed to support parallel writers to the output directory.
906 | In the Hadoop source the two algorithms are intermixed within a pair of
907 | co-recursive procedures;
908 | they have been isolated here for clarity.
909 |
910 |
911 | % ------------------------------------------------------------
912 | % V2 commit algorithm
913 | \begin{procedure*}
914 | \FileOutputCommitVars
915 |
916 | \eIf {\isFile(\fs, $src$)} {
917 | \If {\exists(\fs, $dest$)} {
918 | \delete(\fs, $dest$, $recursive$)\;
919 | }
920 | \rename(\fs, $src$, $dest$)\;
921 | } {
922 | \eIf {\exists(\fs, $dest$)} {
923 | \eIf {\isFile(\fs, $dest$)} {
924 | \delete(\fs, $dest$, $recursive$)\;
925 | \mkdirs(\fs, $dest$)\;
926 | \For {c $\in$ \listFiles(\fs, $src$)} {
927 | \mergePathsB(\fs, c, $dest$ + c.name)\;
928 | }
929 | } {
930 | \For {f $\in$ \listFiles(\fs, $src$)} {
931 | \mergePathsB(\fs, f, $dest$ + f.name)\;
932 | }
933 | }
934 | } {
935 | \mkdirs(\fs, $dest$)\;
936 | \For {f $\in$ \listFiles(\fs, $src$)} {
937 | \mergePathsB(\fs, f, $dest$ + f.name)\;
938 | }
939 | }
940 | }
941 |
942 | \label{alg:mergePathsV2}
943 | \caption{mergePathsV2(fs, rc, dest)}
944 |
945 | \end{procedure*}
946 | % ------------------------------------------------------------
947 |
948 | Here, the \texttt{rename()} operation is restricted to committing
949 | a single file: whenever a directory is to be committed, it is done
950 | as a recursive merge.
951 | This is necessary because multiple tasks may be committing simultaneously, tasks which
952 | may be writing to the same destination.
953 | The atomic exclusivity of a directory rename is precisely what is not
954 | wanted when trying to support multiple tasks merging their output into the
955 | same directory tree.
956 |
957 | Performance-wise, the \texttt{mergePathsV2} operation is slower
958 | than the v1 algorithm whenever there are directories to commit.
959 | Yet, because these operations are taking place in task commits, work is parallelized
960 | across the cluster, and, often, not directly slowing down the overall job.
961 |
962 | With the file propagation taking place in the tasks, the job commit
963 | operation is reduced to creating the \SUCCESS file and cleaning up
964 | working directories:
965 |
966 | \begin{procedure*}
967 | \FileOutputCommitVars
968 |
969 | \touch(\fs, \dest/\SUCCESS)\;
970 | \delete(\fs, \temp)\;
971 |
972 | \label{alg:commitJobV2}
973 |
974 | \caption{v2 commitJob()}
975 | \end{procedure*}
976 |
977 | As a result, the time to commit a job is barely measurable.
978 |
979 | In terms of failure resilience, the v2 algorithm is weaker than the v1 algorithm.
980 | Task commit is now a non-atomic operation;
981 | it is therefore not possible to safely recover from the failure or loss of a task attempt
982 | while it is committing work.
983 |
984 | Because the output of committed tasks is immediately visible,
985 | if the job fails, the output of all previously committed tasks remains visible.
986 |
987 | This commit algorithm has chosen speed over resilience.
988 |
989 | This is often a valid decision to make; however, the callers of the committers
990 | need to be aware that this decision has been made, and that failures in
991 | certain parts of the process, specifically task commit, are not recoverable.
992 |
993 | \subsection{Limitations of the Hadoop MapReduce Commit Protocol}
994 | \label{subsec:hadoop-commit-protocol}
995 |
996 | Alongside some implementation details, such as the fact that a task process
997 | will exit without calling \texttt{cleanupTask()} once it is informed that it
998 | is unknown, we have to consider: are there any fundamental issues with
999 | the Hadoop commit protocol?
1000 |
1001 | A key weakness is that the job committer is not passed the list of task
1002 | attempts considered successful, and, from those committed tasks, their
1003 | lists of files which were committed.
1004 |
1005 | The committers themselves have to implement some mechanism to enumerate those
1006 | committed tasks.
1007 |
1008 | The File Output Committer does this through the filesystem, relying on consistent
1009 | directory listings to enumerate task output to merge, and, for the v1 algorithm,
1010 | to enumerate the set of committed tasks whose output must be published during job commit.
1011 | This places a requirement for the filesystem metadata listings to be consistent,
1012 | a requirement not met by all object stores.
1013 |
1014 | As no list of completed tasks is directly passed to the \texttt{commitJob} operation,
1015 | the job committer cannot determine whether the actual committed output
1016 | in the filesystem is correct.
1017 |
1018 | There also appears to be a race condition between
1019 | verification that the destination directory does not exist in
1020 | the client-side job submission, and the creation of that directory during
1021 | the \texttt{setupJob()} operation.
1022 | In a busy cluster there can be a delay between the scheduling of the job and
1023 | its application manager actually starting to execute\ldots
1024 | a second, conflicting, job may also be scheduled at this point.
1025 | If the destination directory were to be created during job submission,
1026 | this window would be nearly completely eliminated.
1027 |
1028 | % ========================================================================
1029 |
1030 | \section{The Spark Commit Protocol}
1031 | \label{sec:spark-commit-protocol}
1032 |
1033 | Apache Spark's execution model is significantly different from
1034 | that of Hadoop.
1035 | Rather than dedicating a single process to executing a single operation
1036 | across a subset of the source data, Spark creates a set of \emph{Executors},
1037 | each of which can execute task attempts across a number of threads.
1038 | As a result, a single executor may be executing many task attempts
1039 | simultaneously, with each task's commit operations being centrally managed
1040 | by the single Job Manager.
1041 |
1042 | \begin{figure*}
1043 | \centering
1044 | \includegraphics[width=.8\textwidth]{spark-protocol.png}
1045 | \caption{Spark commit protocol}
1046 | \label{fig:spark-protocol}
1047 | \end{figure*}
1048 |
1049 |
1050 | When a failure of an executor is detected by loss of its heartbeat,
1051 | all active tasks will be rescheduled.
1052 | As the failure may be a network partition, multiple task attempts may be active
1053 | simultaneously.
1054 | It is therefore a requirement that no data is promoted until a task attempt is actually
1055 | committed.
1056 |
1057 |
1058 | Spark can use the Hadoop Committers within its commit protocol,
1059 | which is usually done whenever writing data to HDFS or other cluster filesystem.
1060 |
1061 | Spark manages its requirement of ``only one task attempt may be committed''
1062 | in its \texttt{OutputCommitCoordinator} class;
1063 | an instance of this in the driver tracks the state of all active task attempts
1064 | and grants or denies permission to commit.
1065 |
1066 | A task attempt is only granted permission to commit if a set of conditions
1067 | are met\footnote{see: \texttt{OutputCommitCoordinator.handleAskPermissionToCommit()}}:
1068 |
1069 | \begin{enumerate}
1070 | \item The task attempt is not recorded as having failed.
1071 | \item The task attempt is in the set of known-to-be-active tasks.
1072 | \item Either the requesting task attempt has already been granted this permission,
1073 | no task attempt has been granted permission to commit, or a previous task
1074 | was granted permission, but it is considered to have failed.
1075 |
1076 | \end{enumerate}
1077 |
1078 | That is: it must be a valid task attempt and no other task attempt can be
1079 | actively committing or have committed this task.
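The decision rule can be sketched as follows; this is an illustrative
reformulation in Java of the conditions above, not Spark's (Scala)
implementation, and the class and field names are ours.

\begin{lstlisting}[language=Java]
import java.util.HashSet;
import java.util.Set;

// Illustrative only: one partition's commit-permission state.
final class CommitPermission {
  private Integer authorizedAttempt;             // attempt allowed to commit, if any
  private final Set<Integer> failedAttempts = new HashSet<>();
  private final Set<Integer> activeAttempts = new HashSet<>();

  synchronized boolean handleAskPermissionToCommit(int attempt) {
    if (failedAttempts.contains(attempt)) {
      return false;                              // a failed attempt may never commit
    }
    if (!activeAttempts.contains(attempt)) {
      return false;                              // unknown or stale attempt
    }
    if (authorizedAttempt == null
        || authorizedAttempt == attempt
        || failedAttempts.contains(authorizedAttempt)) {
      authorizedAttempt = attempt;               // grant (or re-grant) permission
      return true;
    }
    return false;                                // another attempt is committing
  }
}
\end{lstlisting}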
1080 |
1081 | The Executor requests this permission to commit via an RPC call and will
1082 | proceed with the commit when it receives a successful message.
1083 | A timeout on the RPC channel or a denial of commit permission will result
1084 | in \texttt{abortTask()} being invoked.
1085 |
1086 | Once a task attempt has been granted permission to commit, then no other
1087 | attempt will be granted unless the first attempt is reported as having failed.
1088 |
1089 | \TODO: \texttt{OutputCommitCoordinator} reacts to task events from the scheduler, but
1090 | does that cover executor failure?
1091 |
1092 | Spark makes no attempt to recover from a failed Job Manager;
1093 | its mechanism for recovering from a failed job is ``rerun the entire query''.
1094 |
1095 | One area where Spark goes beyond Hadoop's protocol is that
1096 | it adds a new operation to request a file with an absolute path,
1097 | \texttt{newTaskTempFileAbsPath()}.
1098 | It is needed to address the special case of Apache Hive, wherein
1099 | some parts of the dataset are written to different locations than
1100 | under the destination directory of a job.
1101 | The operation, having calculated the absolute destination of the output,
1102 | requests a temporary file which will be placed in the final destination
1103 | directory on a job commit.
1104 |
1105 | Spark implements this operation atop the standard \texttt{FileOutputCommitter}
1106 | as follows:
1107 |
1108 | \begin{enumerate}
1109 | \item An ``absolute path staging directory'' is created under the job output
1110 | directory;
1111 | this is \texttt{_temporary-\$jobId}.
1112 | \item When a \texttt{newTaskTempFileAbsPath()} is invoked, a path under this
1113 | directory is generated, with a UUID in the filename.
1114 | \item The mapping of absolute path to temporary file is stored in a map in the Task Committer.
1115 | \item In the \texttt{commitTask()} operation, the map of all files to rename is passed back.
1116 | \item In \texttt{commitJob()}, after invoking the Hadoop committer's \texttt{commitJob()}
1117 | call, the aggregate map of files to rename to absolute paths is iterated through.
1118 | Each file is renamed to its final path, in turn.
1119 | \item Task abort will delete files of that task, while Job abort will delete
1120 | the whole absolute path staging directory.
1121 | \end{enumerate}
1122 |
1123 | This extra operation is currently only used in that specific use case,
1124 | ``Hive table with partitions elsewhere in the same filesystem as the active job''.
1125 | This is not a common use case, at least with data stored in object stores.
1126 | Accordingly, our new committers do not support this operation
1127 | \footnote{It is possible to support this, but it would complicate cleaning up
1128 | after tasks, especially failed ones, and failed jobs.}.
1129 |
1130 | Spark is more flexible in its commit protocol, because
1131 | the name for a file is generated by the committer, not the application,
1132 | and because successful task committers can pass arbitrary serialized data back
1133 | to the driver, for use in the Job Commit operation.
1134 | This could potentially be used as the sole mechanism for passing a list
1135 | of written files from the task attempts to the job committer.
1136 | Being able to generate names (albeit while preserving a sort order) could
1137 | also be potentially useful.
1138 | We have initially chosen to not explore this as a commitment strategy;
1139 | others may wish to do so.
1140 |
1141 |
1142 |
1143 | \subsection{Limitations of the Spark Commit Protocol}
1144 |
1145 | The standard commit coordination in the Spark Driver is performed by the
1146 | \texttt{OutputCommitCoordinator}.
1147 | This class's state includes tracking whether or not a task attempt
1148 | has been granted permission to commit its work.
1149 | Once one task attempt has been granted permission to commit,
1150 | all other task attempts for the same task will be denied.
1151 | However, if the task attempt granted permission to commit its work fails
1152 | for any reason, the attempt is considered a failure, and
1153 | another attempt will be granted permission to commit its work.
1154 |
1155 | This strategy works, provided task commit is a repeatable operation,
1156 | even if the first attempt has failed or become partitioned from the
1157 | Spark Driver.
1158 | That requirement is met for the \texttt{FileOutputCommitter} v1 algorithm,
1159 | but possibly not by the v2 algorithm, or, potentially, others.
1160 | If committers could declare their ability to recover from
1161 | failed task commits, along with other aspects of their operation,
1162 | the \texttt{OutputCommitCoordinator} would be able to decide whether
1163 | a repeated attempt were permitted, or whether failing the Job was the
1164 | safer outcome.
1165 |
1166 |
1167 | Unlike the Hadoop protocol, there is no requirement for the Spark Driver
1168 | to have received a recent liveness check from the cluster scheduler.
1169 | Unless the Spark Driver process exits once it determines that it has been
1170 | isolated from any underlying cluster scheduler, there is a risk that
1171 | a partitioned Spark cluster may commit a job to the same destination
1172 | as a cluster instantiated as a replacement.
1173 | Careful review of the YARN and Mesos integration code is required to be
1174 | confident that this risk does not exist.
1175 |
1176 | Spark's commit protocol permits task committers to return data to
1177 | the Job Committer in the Spark Driver;
1178 | it would be possible to use this to validate the output of the tasks.
1179 | The current committer implementations do not do this, but at least the underlying
1180 | protocol makes such an improvement possible.
1181 |
1182 |
1183 |
1184 |
1185 | % ========================================================================
1186 |
1187 | \section{The Challenge of Object Stores}
1188 | \label{sec:object-stores}
1189 |
1190 | Having introduced the classic filesystem and the commit protocols and algorithms
1191 | used to commit the output of distributed computation, let us consider
1192 | Object Stores such as Amazon S3, Google Cloud Storage and
1193 | Windows Azure Storage\ \cite{AWS-S3-intro,Calder11}.
1194 |
1195 | % As all filesystem
1196 | %operations are via the NameNode, all clients get a consistent view of the filesystem.
1197 | %And, as the
1198 |
1199 |
The most salient point is this: Object Stores are not filesystems.
1201 | Rather than the classic hierarchical view of directories, subdirectories
1202 | and paths, object stores store a set of objects, each with a unique key;
1203 | a sequence of characters provided when the object was created.
1204 | Classic path separators ``\texttt{/}'' are invariably part of the set of valid
1205 | characters, so allowing objects to be created which have the appearance
1206 | of files in a directory.
1207 |
As examples, the following are all valid keys on the Amazon, Google and Microsoft
stores:
1210 |
1211 | \begin{verbatim}
1212 | /entry
1213 | /path1/path2/path3
1214 | /path1/
1215 | /path1
1216 | \end{verbatim}
1217 |
More subtly, it is valid for an object store container (on S3, a ``bucket'')
to have objects with all of these names simultaneously.
It is not an error to have an object whose key would make it appear to be
``under'' another object, nor for a key to explicitly contain path separators.
1222 |
1223 | Objects cannot generally be appended to once created, or renamed.
1224 | They can be replaced by new objects or deleted.
1225 | Some form of copy operation permits an object to be duplicated, creating
1226 | a new object with a different key.
Such copy operations take place within the storage infrastructure, at a
rate measurable in megabytes per second.
1229 |
1230 |
The set of operations offered is normally an extended set of HTTP verbs:
1232 |
1233 | \begin{description}[leftmargin=8em, style=nextline]
\item[PUT] Atomic write of an object
\item[GET] Retrieve all or part of an object
\item[HEAD] Retrieve the object metadata
\item[LIST] List all objects starting with a given prefix
\item[COPY] Copy a single object within the store, possibly from other containers
\item[DELETE] Delete an object
1240 | \end{description}
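To make this concrete, here is a minimal sketch of these verbs as seen through
the AWS SDK for Java; the bucket and key names are purely illustrative and
error handling is omitted.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import java.io.File;

public class ObjectStoreVerbs {
  public static void main(String[] args) throws Exception {
    AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
    String bucket = "example-bucket";            // illustrative name
    // PUT: atomic write of an object
    s3.putObject(bucket, "path1/path2/path3", new File("local.dat"));
    // HEAD: retrieve the object metadata
    ObjectMetadata meta = s3.getObjectMetadata(bucket, "path1/path2/path3");
    System.out.println("length = " + meta.getContentLength());
    // GET: retrieve all (or, with a range, part) of the object
    try (S3Object obj = s3.getObject(bucket, "path1/path2/path3")) {
      System.out.println("etag = " + obj.getObjectMetadata().getETag());
    }
    // LIST: enumerate objects starting with a given prefix
    for (S3ObjectSummary s
        : s3.listObjects(bucket, "path1/").getObjectSummaries()) {
      System.out.println(s.getKey());
    }
    // COPY: duplicate the object under a new key, server side
    s3.copyObject(bucket, "path1/path2/path3", bucket, "path1/copy");
    // DELETE: remove the original object
    s3.deleteObject(bucket, "path1/path2/path3");
  }
}
\end{verbatim}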
1241 |
There are usually two extra operations to address scale:
a bulk delete call, which may have partial failures,
and \emph{Multipart Upload}: a way to upload an object larger than the
5 GB which a single HTTP PUT can support.
The exact nature of multipart uploads varies from store to store.
For Amazon this is initiated as a sequence of POST calls: one to initiate,
one or more POST calls with data, and a final POST listing the (ordered)
etags of the uploaded object parts.
All parts but the last must be 5 MB or larger.
1251 |
1252 |
1253 | Object store implementations can display different levels of inconsistency.
1254 | Windows Azure Storage is fully consistent;
1255 | Amazon S3 offers create consistency on new objects, but not updated or deleted ones.
It also exhibits listing inconsistency, wherein a newly created object
may not be visible in the results of a \texttt{LIST} call, or a newly deleted
object may still be listed as present.
1259 |
1260 |
Despite the clear mismatch between the capabilities and APIs of object storage
and those expected of a Hadoop filesystem, they have one key thing in common:
they can store petabytes of data.
For that reason, all the popular cloud storage infrastructures have connectors
for Hadoop, and thus, transitively, applications such as Apache Hive, Apache HBase
and Apache Spark.
1267 | Many of these are developed within the Apache Software Foundation's own
1268 | source repository, including the Azure ``wasb'' connector and the ``s3a'' connector
1269 | to Amazon S3.
Others are maintained externally ---particularly Amazon EMR's own ``EMRFS'',
known by the \texttt{s3} URL scheme, and the Google Cloud Storage connector,
\texttt{gcs}.
1273 |
1274 | Irrespective of where they are implemented, they all share a common objective:
1275 | trying to maintain the filesystem metaphor atop an object store.
1276 |
As a simple example, the \texttt{getFileStatus()} call mimics directories
in conjunction with zero-byte ``empty directory'' markers, so must look for
a file, then a marker, and finally perform the most expensive operation, a path listing:
1280 |
1281 | \begin{verbatim}
1282 | GET path
1283 | GET path/
1284 | LIST path/
1285 | \end{verbatim}
1286 |
The overhead of every HTTPS request slows
down all remote operations, even with pooled connections: even this simple probe
for a file can take hundreds of milliseconds.
Other mimicked operations have similar costs.
1291 |
1292 | Operations upon directories are mimicked by listing all objects under that path,
1293 | and acting upon those objects individually.
1294 | A recursive delete is implemented as a listing of the maximum number of files
1295 | returned in one HTTP request (5000 or a similar value), then either issuing
1296 | bulk DELETE operations, where supported, or falling back to individual DELETE
1297 | calls.
Bulk LIST/DELETE operations have a cost of one HTTP request per page of results,
that is, $O(1 + descendants/5000)$; if sequential delete operations must be issued, then
the cost is at least $O(1 + descendants)$, with the ``at least'' qualifier
added because request throttling can slow the requests down even further.
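As an illustration, here is a hedged sketch of how such a recursive delete can
be mimicked through the AWS SDK for Java: a paged listing under the prefix,
followed by one bulk delete per page. The bucket and prefix names are
illustrative.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.ListObjectsV2Result;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import java.util.ArrayList;
import java.util.List;

public class RecursiveDelete {
  // Delete every object under "prefix": one LIST per page of results,
  // then one bulk DELETE per page.
  public static void deleteUnder(AmazonS3 s3, String bucket, String prefix) {
    ListObjectsV2Request listRequest = new ListObjectsV2Request()
        .withBucketName(bucket)
        .withPrefix(prefix);
    ListObjectsV2Result page;
    do {
      page = s3.listObjectsV2(listRequest);
      List<DeleteObjectsRequest.KeyVersion> keys = new ArrayList<>();
      for (S3ObjectSummary summary : page.getObjectSummaries()) {
        keys.add(new DeleteObjectsRequest.KeyVersion(summary.getKey()));
      }
      if (!keys.isEmpty()) {
        s3.deleteObjects(new DeleteObjectsRequest(bucket).withKeys(keys));
      }
      listRequest.setContinuationToken(page.getNextContinuationToken());
    } while (page.isTruncated());
  }

  public static void main(String[] args) {
    deleteUnder(AmazonS3ClientBuilder.defaultClient(),
        "example-bucket", "output/dataset1/");
  }
}
\end{verbatim}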
1302 |
1303 | File and directory renaming is even more expensive.
1304 | A file rename is implemented as a copy of the original data to a new path,
1305 | followed by a delete of the original data.
This makes the time to rename a single file an $O(length(file))$ operation
1307 | \footnote{Third party implementations of the S3 protocol do generally offer an $O(1)$ rename operation}.
1308 |
Directory rename is a paged listing of all children, and a copy and delete for
each, which makes its duration a function of the number of files and the total amount of data.
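A hedged sketch of the underlying operations for a single-file ``rename'',
again via the AWS SDK for Java; the real connectors add multipart copies for
large objects, error handling and retries.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;

public class MimickedRename {
  // "Rename" srcKey to dstKey: a server-side copy followed by a delete.
  // The copy takes O(length(file)) time; the operation is not atomic,
  // and a failure can leave the object present under both keys.
  public static void rename(AmazonS3 s3, String bucket,
      String srcKey, String dstKey) {
    s3.copyObject(bucket, srcKey, bucket, dstKey);
    s3.deleteObject(bucket, srcKey);
  }
}
\end{verbatim}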
1311 |
1312 | These are the tangible performance issues, the ones which are most visible
1313 | to users.
However, it is the absence of the atomicity behaviors of a POSIX filesystem
which is most dangerous.
1316 |
The \texttt{rename()} call is no longer atomic: two clients may start renaming
into the same destination directory.
1319 | Furthermore, if any rename fails, the state of the source and destination
1320 | directory is unknown: the data may be spread across both locations.
Finally, because the set of files to be copied is determined from a LIST call,
if the object store is not consistent, the listing can be incomplete or out of
date.
1324 | Newly created files may not be visible, so not copied as part of the rename
1325 | operation.
1326 |
1327 | \emph{Directory rename cannot be used in a commit algorithm which
1328 | requires atomic, exclusive or consistent renames}.
1329 |
The \texttt{create(path, overwrite=false)} operation is also flawed.
This is expected to be an atomic operation which immediately creates a file iff there is
no entry at that path.
Instead it may be mimicked by a sequence of a \texttt{getFileStatus()} call
and the creation of a buffer on the client side for the output: the data
will not be visible until it is completely written and the stream
closed.
1337 | As a result, it is impossible to use file creation as a means of creating any
1338 | form of lock or exclusive access in such a store.
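A small sketch of why this matters to callers, using Hadoop's
\texttt{FileSystem} API; the bucket and path are illustrative. Even if the
probe succeeds, another client can create the same object in the window
between the probe and the eventual PUT, so the ``lock file'' pattern offers no
exclusivity.

\begin{verbatim}
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class NotALock {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create("s3a://example-bucket/"), conf);
    Path lock = new Path("s3a://example-bucket/output/_LOCK");
    try (FSDataOutputStream out = fs.create(lock, false)) {
      // The existence check is a separate HEAD/LIST probe;
      // the object itself only appears when the stream is closed.
      out.writeUTF("owner");
    } catch (FileAlreadyExistsException e) {
      // Another client may still have "created" the same path
      // concurrently; this cannot be relied on for mutual exclusion.
    }
  }
}
\end{verbatim}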
1339 |
1340 |
Returning to the MapReduce v1 and v2 commit algorithms: they are unsuitable for
use in any object store which lacks atomic renames (v1) or consistent
directory listings and existence checks (v1 and v2).
1344 |
1345 | As a result, neither can be used directly against Amazon S3 today.
1346 | With a consistent metadata layer such as S3mper or S3Guard, the v2 algorithm
1347 | can be used, though its task commit time will be $O(data)$\ \cite{S3mper,HADOOP-13345}.
1348 |
1349 | Providing a safe, performant output committer for object stores forces
1350 | us to leave the metaphor of a filesystem behind, and embrace
1351 | the capabilities of object stores themselves.
1352 |
1353 | % ========================================================================
1354 |
1355 | \section{The new S3A Committers: working with S3 from the outset}
1356 | \label{sec:new-committers}
1357 |
1358 |
1359 | Given that S3 does not deliver the safe and performant operations
1360 | which the file committers expect, how can Hadoop and Spark
1361 | jobs safely use it as a destination of their work?
1362 |
1363 | This is the problem solved by the new ``S3A committers''.
They are so named because they are closely integrated with Hadoop's S3A connector to S3,
1365 | using the multipart upload operation to decouple writing the data from
1366 | manifesting it at its final destination.
1367 |
1368 | Multipart upload is already used for writing large files to the object store.
When a file is written, it is initially buffered to disk or memory; when the
buffer size reaches some threshold, the upload is initiated and the first block uploaded
in a \texttt{POST} operation.
S3's response to the POST operation is an MD5-checksum of the
uploaded data, the ``entity tag'', as used in existing HTTP operations.
After all the blocks of a stream have been uploaded, the ordered list
of entity tags is POSTed to S3 in a final request completing the MPU.
It is only after this final POST that the uploaded object is manifest in S3.
If this final POST operation can be used to commit the output of a task,
then the committer has an atomic and effectively $O(1)$ operation for each file.
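To ground the description, here is a minimal sketch of the three phases of an
Amazon multipart upload through the AWS SDK for Java; the committers exploit
the gap between the part uploads and the final completion call. The bucket,
key and file names are illustrative.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.*;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

public class DelayedCommitUpload {
  public static void main(String[] args) {
    AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
    String bucket = "example-bucket";
    String key = "output/part-0000";
    File data = new File("local-block.bin");

    // 1. Initiate the upload; nothing is visible at "key" yet.
    InitiateMultipartUploadResult init = s3.initiateMultipartUpload(
        new InitiateMultipartUploadRequest(bucket, key));

    // 2. Upload one or more parts; each response carries an entity tag.
    List<PartETag> etags = new ArrayList<>();
    UploadPartResult part = s3.uploadPart(new UploadPartRequest()
        .withBucketName(bucket)
        .withKey(key)
        .withUploadId(init.getUploadId())
        .withPartNumber(1)
        .withFile(data)
        .withPartSize(data.length()));
    etags.add(part.getPartETag());

    // 3. Only this final call makes the object visible; a committer
    //    can save (uploadId, etags) and issue it later, in job commit.
    s3.completeMultipartUpload(new CompleteMultipartUploadRequest(
        bucket, key, init.getUploadId(), etags));
  }
}
\end{verbatim}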
1379 |
The challenge for an S3 committer then becomes: how to have
user code write to the destination directory, while preserving and propagating
the lists of pending MPUs to finally complete in the job commit operation?
1383 |
1384 | That is the challenge addressed in the two committers.
1385 |
1386 | Underneath, they both use the same methods offered by the S3A connector,
1387 | and the same persistent data formats to propagate the lists of pending uploads.
1388 | Where they differ is how tasks write data, and how the lists are passed
1389 | to the job committer.
1390 |
1391 | In the ``Staging Committer'', each task attempt writes its data into the local
1392 | filesystem of the server on which the attempt is executed.
1393 | When a task attempt is committed, its data is uploaded to the final
1394 | paths on S3.
The manifest of the pending MPUs is passed to the job committer via
a shared consistent cluster filesystem (usually HDFS), \emph{using the v1
File Output Committer}.
When the Hadoop or Spark job is committed, the Staging Committer reads
from HDFS the manifests written by the committed task attempts, and
completes the uploads listed therein.
1401 |
1402 | Performance-wise, all the data is uploaded to its final destination in the task commit, with the
1403 | job commit being the time to execute the v1 commit operation within HDFS, followed
1404 | by that of a POST call per uploaded file.
1405 |
1406 |
1407 | The ``Magic Committer'' works within the S3A filesystem connector, changing
1408 | how files are incrementally written to S3.
Rather than completing a multipart upload when the output stream written
by a task is closed, the magic
committer delays the final POST until the job is committed.
Instead, on closing the stream, it writes a manifest describing the upload to S3.
1413 | When the task is committed, all the single file manifests of that attempt
1414 | are aggregated into a single manifest for the task attempt, which is then
1415 | PUT to S3 in the directory of completed tasks.
1416 | The Job commit process is one of reading in the manifests of all committed
1417 | tasks, and as with the Staging Committer, completing their uploads.
1418 |
1419 | Because of its incremental upload of blocks of the output data, the magic committer promises
1420 | faster uploads of larger datasets: there is no need to postpone the upload
1421 | to S3 until the task is actually committed.
1422 | Because it does not buffer any data other than the yet-to-be-written blocks,
1423 | the amount of local storage is reduced, so potentially avoiding running
out of local disk capacity\footnote{and/or allowing VMs with less virtual disk to be used}.
1425 |
1426 |
1427 | \subsubsection{The Staging Committer}
1428 |
1429 | The staging committer declares the working directory of a task
1430 | attempt to be in the local filesystem, the directory \texttt{workPath}.
1431 | It is this which is returned in the method \texttt{PathOutputCommitter.getWorkPath()},
1432 | which is then used in \texttt{FileOutputFormat} to provide the paths which
1433 | callers use when creating files in a task attempt.
1434 |
1435 |
1436 |
1437 | \begin{table}
1438 | \caption{Extra variables used by the staging committer}
1439 | \begin{tabular}{ l l }
1440 | \hline
1441 | \textbf{name} & \textbf{meaning} \\
1442 | $localfs$ & The local ``file:'' filesystem \\
1443 | $localAttemptPath$ & A local filesystem path \\
1444 | $clusterfs$ & The cluster filesystem \\
1445 | $wrappedCommitter$ & The committer for the cluster filesystem. \\
1446 | $clusterJobAttemptPath$ & the job attempt path of $wrappedCommitter$ \\
$clusterTaskAttemptPath$ & the task attempt path of $wrappedCommitter$ \\
1448 | \hline
1449 | \end{tabular}
1450 | \label{tab:StagingCommitter.variables}
1451 | \end{table}
1452 |
1453 | %% Define the extra variables for the staging committer
1454 | \newcommand{\StagingVars}{
1455 | \FileOutputCommitVars
1456 | \SetKwData{clusterfs}{$clusterfs$}
1457 | \SetKwData{wrappedCommitter}{$wrappedCommitter$}
1458 | \SetKwData{clusterJobAttemptPath}{$clusterJobAttemptPath$}
1459 | \SetKwData{clusterTaskAttemptPath}{$clusterTaskAttemptPath$}
1460 | \SetKwData{jobUUID}{$jobUUID$}
1461 | \SetKwData{localfs}{$localfs$}
1462 | \SetKwData{localAttemptPath}{$localAttemptPath$}
1463 | \SetKwData{temp}{_$temporary$}
1464 | }
1465 |
1466 |
1467 | \textbf{Job Setup}
1468 |
The cluster-filesystem committer, \texttt{wrappedCommitter},
is created and initialized, configured to use a unique path within the
cluster filesystem, $clusterJobAttemptPath$, as its output directory.
This committer will have its own job attempt and task attempt directories.
It is set to use the v1 commit algorithm.
1474 |
1475 | %% StagingCommitter.setupJob()
1476 | \begin{procedure}
1477 | \StagingVars
1478 |
1479 | \jobUUID $\longleftarrow$ \newUUID \;
1480 | \clusterJobAttemptPath $\longleftarrow$ \tempDirForStaging + \getUsername + \jobUUID \;
1481 |
1482 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
1483 |
1484 | \wrappedCommitter.setupJob(\clusterJobAttemptPath)\;
1485 | \caption{StagingCommitter.setupJob()}
1486 | \label{alg:StagingCommitter.setupJob}
1487 | \end{procedure}
1488 |
1489 | %If the staging committer is configured to fail if the destination exists,
1490 | %this setup will also include a check for the destination path, raising
1491 | %an exception if it is present.
1492 |
1493 |
1494 | % -----------------------------------------------------------------
1495 |
1496 | \textbf{Task Setup}
1497 |
1498 |
1499 | %% StagingCommitter.setupTask()
1500 | \begin{procedure}
1501 | \StagingVars
1502 |
1503 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
1504 | \localAttemptPath $\longleftarrow$ uniquePath(\localfs, \taskAttemptId)\;
\clusterTaskAttemptPath $\longleftarrow$ \clusterJobAttemptPath + \taskAttemptId\;
1506 |
1507 | \wrappedCommitter.setupTask(\clusterTaskAttemptPath)\;
1508 |
1509 | \caption{StagingCommitter.setupTask()}
1510 | \label{alg:StagingCommitter.setupTask}
1511 |
1512 | \end{procedure}
1513 |
1514 | The function \texttt{uniquePath(filesystem, taskAttemptId)} is required to return
1515 | a unique path in the local filesystem for a task attempt.
1516 | It does this under the local \texttt{/tmp} directory, which is where
1517 | large intermediate datafiles are stored during MapReduce operations.
A well-managed Hadoop cluster has this temporary data stored on a non-root
1519 | volume, along with a regularly scheduled job to delete old temporary files.
1520 |
This local filesystem path is returned by the committer's \texttt{getWorkPath()} method.
1522 |
1523 | \begin{function}
1524 | \StagingVars
1525 | \return \localAttemptPath\;
1526 |
1527 | \caption{StagingCommitter.getWorkPath()}
1528 | \label{alg:StagingCommitter.getWorkPath}
1529 | \end{function}
1530 |
1531 | This is the crux of the algorithm.
1532 |
1533 | The working path returned to the task attempt execution code in MapReduce and Spark
1534 | is a \texttt{file://}-prefixed local directory, not one in the object store.
The task attempt commit is where these local files are uploaded, and the job
commit where the uploads are materialized.
1537 |
1538 | % -----------------------------------------------------------------
1539 | \textbf{Needs Task Commit}
1540 |
1541 | A commit is required iff data has been generated in the local filesystem.
1542 |
1543 | \begin{function}
1544 | \StagingVars
1545 |
1546 | \return \exists(\localfs, \localAttemptPath)\;
1547 |
1548 | \caption{StagingCommitter.needsTaskCommit()}
1549 | \label{alg:StagingCommitter.needsTaskCommit}
1550 |
1551 | \end{function}
1552 |
1553 |
1554 | \textbf{Task Abort}
1555 |
1556 | A task attempt is aborted by deleting all staged data, and aborting the wrapped committer's
1557 | task.
1558 |
1559 | \begin{procedure}
1560 | \StagingVars
1561 |
1562 | \delete(\localfs, \localAttemptPath, $recursive$)\;
1563 | \wrappedCommitter.abortTask()\;
1564 |
1565 | \caption{StagingCommitter.abortTask()}
1566 | \label{alg:StagingCommitter.abortTask}
1567 | \end{procedure}
1568 |
1569 |
1570 |
1571 | \textbf{Task Commit}
1572 |
1573 | If a task attempt is given permission to commit its output, it does
1574 | so by initiating multipart uploads of all files under \texttt{localAttemptPath}
1575 | to the final destination directory, uploading the data, but not completing
1576 | the operation.
1577 |
1578 | % commit task
1579 | \begin{procedure}
1580 | \StagingVars
1581 |
$U \longleftarrow \emptyset$\;
\For{$f\in $ \listFiles(\localfs, \localAttemptPath) } {
  $U \longleftarrow U + \{$ \uploadFileToPendingCommit($f$, \dest) $\}$\;
}
\savePendingSet(\clusterfs, \clusterTaskAttemptPath, $U$)\;

\wrappedCommitter.commitTask()\;
1589 |
1590 | \caption{StagingCommitter.commitTask()}
1591 | \label{alg:StagingCommitter.commitTask}
1592 |
1593 | \end{procedure}
1594 |
The information needed to complete these pending uploads is then saved as
a manifest file to \texttt{clusterTaskAttemptPath}, after which the wrapped committer has
its \texttt{commitTask()} operation called.
This renames the saved file into the job attempt directory under the
filename of the actual task, that is, \texttt{\$clusterJobAttemptPath/\$taskId}.
1600 |
1601 |
1602 | \textbf{Job Commit}
1603 |
1604 | The Job commit process manifests the pending uploads.
1605 |
The list of uploads is found by listing the files in the cluster job attempt path.
1607 | This is the directory into which the pending set files of task attempts are
1608 | renamed during their task commits.
1609 |
1610 | % commit job
1611 | \begin{procedure}
1612 | \StagingVars
1613 |
1614 | $Pending \longleftarrow \emptyset$\;
1615 | \For{$f \in $ \listFiles(\clusterfs, \clusterJobAttemptPath)} {
1616 | $Pending \longleftarrow Pending + $ \loadPendingSet(\clusterfs, $f$)\;
1617 | }
1618 | \checkForConflicts(\fs, $Pending$) \;
1619 | \For{$p \in Pending$} {
1620 | \completeUpload($p$)\;
1621 | }
1622 |
1623 | \caption{StagingCommitter.commitJob()}
1624 | \label{alg:StagingCommitter.commitJob}
1625 | \end{procedure}
1626 |
The \texttt{completeUpload()} operation completes the upload of a file by POST-ing
a complete-multipart-upload request listing the ordered MD5 checksums of every block
previously uploaded.
1630 |
1631 | Note that \texttt{wrappedCommitter.commitJob()} is not invoked;
1632 | because the location of the pending set files of this job attempt is known,
1633 | they can be read directly.
1634 | This is a minor optimization.
1635 |
1636 | % Abort Job
1637 |
1638 | \textbf{Job Abort}
1639 |
To abort an entire job, the set of pending uploads must be enumerated as in
job commit, only now the uploads are aborted rather than completed.
1642 |
1643 | \begin{procedure}
1644 | \StagingVars
1645 |
1646 | $Pending \longleftarrow \emptyset$\;
1647 | \For{$f \in $ \listFiles(\clusterfs, \clusterJobAttemptPath)} {
1648 | $Pending \longleftarrow Pending + $ \loadPendingSet(\clusterfs, $f$)\;
1649 | }
1650 | \For{$p \in Pending$} {
1651 | \abortUpload($p$)\;
1652 | }
1653 | \wrappedCommitter.abortJob()\;
1654 |
1655 | \caption{StagingCommitter.abortJob()}
1656 | \label{alg:StagingCommitter.abortJob}
1657 | \end{procedure}
1658 |
1659 | % Cleanup
1660 |
1661 | \textbf{Job Cleanup}
1662 |
To clean up a job, all incomplete uploads targeted at or under
the output directory must be enumerated and aborted. This can be done
with a request to S3 to list the outstanding uploads, and another request per
upload to abort it.
1667 |
Local task attempt directories must be deleted, as well as those in the shared cluster filesystem.
1669 |
1670 | \begin{procedure}
1671 | \StagingVars
1672 |
1673 | \For{$f \in $ \listPendingUploads(\dest)} {
1674 | \abortUpload($f$)\;
1675 | }
1676 |
1677 | \delete(\localfs, $local directories for job$, recursive)\;
1678 | \wrappedCommitter.cleanupJob()\;
1679 |
1680 | \caption{StagingCommitter.cleanupJob()}
1681 | \label{alg:StagingCommitter.cleanupJob}
1682 | \end{procedure}
1683 |
1684 |
As those local task attempt directories are local to the nodes executing
individual tasks, they will not be deleted in the job cleanup, except for
those of tasks which were executed on the same host as the one where the
\texttt{cleanupJob()} operation is invoked.
1689 |
1690 |
1691 | \subsubsection{Enhancing conflict resolution for a zero-rename workflow}
1692 |
One aspect of the commit algorithm omitted so far is how this committer resolves
conflicts with existing files.
1695 | The \texttt{FileOutputCommitter} algorithms fail if there is destination data;
1696 | they are required to have an empty output directory.
1697 |
1698 | The Staging Committer supports alternative policies, and may be configured
1699 | to overwrite or add to data in the destination directory.
1700 | To guarantee that newly added files have unique names, the
1701 | uploaded files can have a unique ID inserted in their filenames.
1702 |
1703 | One conflict resolution option is targeted explicitly at Spark SQL queries writing
1704 | output in the layout structure popularized by Apache Hive, wherein
1705 | different levels in the directory tree are used to partition data.
1706 |
1707 | For example, data could be partitioned by year, month and day, such as
1708 | \texttt{/data/YEAR=2017/MONTH=12/DAY=21/}.
1709 | Partitioning increases query performance where only select field ranges are used;
1710 | any query of December 2017 only needs to look under all subdirectories of
1711 | \texttt{/data/YEAR=2017/MONTH=12/}, ignoring all adjacent directories.
1712 |
1713 | Often large datasets like these are built up over time, with nightly or hourly
1714 | results being added.
In a traditional workflow, this is normally done by
executing the new query into an empty directory, then, once the job has succeeded,
moving the new data into the aggregate dataset through \texttt{rename()} operations.
This generate-then-rename strategy ensures that if a job fails, no matter when
the failure happened, the original dataset is unchanged,
and that applications can continue to use the current set of files.
1721 |
In an object store, that rename operation is, of course, another expensive copy
operation, with its own failure modes.
1724 |
1725 | What to do?
1726 |
1727 | The solution as developed and utilized at Netflix is to have a special mode
1728 | of the committer, ``Partitioned'', which expects all data to be written
1729 | into one or more subdirectories of a partitioned dataset, a dataset which
1730 | may already exist in the destination directory.
1731 |
1732 | Conflict resolution is scoped purely to that of the destination partitions,
1733 | ignoring all other partitions in the existing dataset.
1734 | In the Job Commit operation, the ``fail'' conflict option will only fail if there
1735 | are existing files in the partitions to which new files are added;
1736 | the ``overwrite'' option will cause the existing files in the destination
1737 | partitions to be deleted.
1738 |
Thus, the ``Partitioned Staging Committer''
permits jobs to be run with their destination set to the actively
shared dataset, while existing queries can continue to run across the data.
By eliminating the need to copy data at the end of an isolated query,
it speeds up the whole execute-then-rename workflow.
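As an illustration, here is a hedged sketch (in Java) of how such a nightly
partition update might be submitted from Spark. The configuration keys
(\texttt{fs.s3a.committer.name}, \texttt{fs.s3a.committer.staging.conflict-mode})
follow the names documented for the S3A committers; the bucket, path and
column names are purely illustrative, and for Parquet output the Spark-side
bindings described in Section~\ref{sec:integration} are also required.

\begin{verbatim}
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class NightlyPartitionUpdate {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("nightly-partition-update")
        // select the partitioned staging committer, replacing existing
        // files only in the partitions written by this job
        .config("spark.hadoop.fs.s3a.committer.name", "partitioned")
        .config("spark.hadoop.fs.s3a.committer.staging.conflict-mode",
            "replace")
        .getOrCreate();

    Dataset<Row> newDay = spark.read()
        .parquet("s3a://incoming/events/2017-12-21/");

    // Writes only into partitions such as YEAR=2017/MONTH=12/DAY=21/,
    // leaving the rest of the existing dataset untouched.
    newDay.write()
        .partitionBy("YEAR", "MONTH", "DAY")
        .mode(SaveMode.Append)
        .parquet("s3a://warehouse/data/");

    spark.stop();
  }
}
\end{verbatim}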
1744 |
1745 |
1746 | \subsection{The Magic Committer}
1747 | \label{subsec:magic-committer}
1748 |
1749 | Rather than stage data in the local filesystem, the magic committer
1750 | allows task attempts to write directly to the object store as
1751 | delayed multipart uploads.
1752 |
One challenge for the committer is: how to determine when a client wants to initiate
a delayed-visibility write operation?
1755 |
1756 |
Whenever a file is written to a directory under the path \texttt{\_\_magic},
it is considered to be a delayed write operation.
The relative path under this directory is mapped as being relative to the
job's destination directory ---the parent directory of the \texttt{\_\_magic} path.
1761 |
To support multiple job and task attempts, the output of every task attempt
must be written so as to be relative to the job's destination directory.
Accordingly, whenever a directory with the name \texttt{\_\_base} is
encountered, it declares that its contents must be mapped relative to the destination
directory.
1767 |
1768 | \begin{table}
1769 | \caption{Example magic path mappings}
1770 | \begin{tabular}{ l l }
1771 | \hline
1772 | \textbf{original} & \textbf{final} \\
1773 | \texttt{dest} & \texttt{dest} \\
\texttt{dest/\_\_magic/1} & \texttt{dest/1} \\
\texttt{dest/\_\_magic/1/2} & \texttt{dest/1/2} \\
\texttt{dest/\_\_magic/job1/task003/\_\_base/3} & \texttt{dest/3} \\
\texttt{dest/\_\_magic/job2/task004/\_\_base/4/5} & \texttt{dest/4/5} \\
\texttt{dest/\_\_magic/1/2.pending} & \texttt{dest/\_\_magic/1/2.pending} \\
\texttt{dest/\_\_magic/job1/task003.pendingset} & \texttt{dest/\_\_magic/job1/task003.pendingset} \\
1780 | \hline
1781 | \end{tabular}
1782 | \label{tab:magic-paths}
1783 | \end{table}
1784 |
1785 |
When a magic output stream is closed, the manifest of the single upload is saved
to a \texttt{.pending}-suffixed file under the \texttt{\_\_magic} path,
along with a 0-byte marker file at the original path.
The latter is required to satisfy applications which verify the existence
of the file they have written.
1791 |
1792 | When a task attempt is committed, all \texttt{.pending} files under its task attempt directory are
1793 | listed and saved into a single \texttt{.pendingset} file into the job attempt directory.
1794 |
1795 | When the job is committed, all \texttt{.pendingset} files in its job attempt
1796 | directory are loaded, and the outstanding uploads listed therein committed.
1797 |
1798 | Because of its use of list operations to enumerate uploads to commit, this
1799 | committer needs consistent metadata listings of the object store.
1800 | This is provided by the S3Guard extension to S3A\ \cite{HADOOP-13345},
1801 | which uses Amazon's DynamoDB database for the consistent metadata view.
1802 | This significantly speeds up the listing operations, so speeding up the task
1803 | and job commit operations.
1804 |
1805 | %% Define the extra variables for the magic committer
1806 | \newcommand{\MagicVars}{
1807 | \FileOutputCommitVars
1808 | \SetKwData{temp}{_$temporary$}
1809 | \SetKwData{magic}{\_\_magic}
1810 | \SetKwData{magicPath}{$magicPath$}
1811 | }
1812 |
1813 |
1814 | \begin{table}
1815 | \caption{Extra variables used by the magic committer}
1816 | \begin{tabular}{ l l }
1817 | \hline
1818 | \textbf{name} & \textbf{meaning} \\
1819 |
1820 | $magicPath$ & The magic directory \\
1821 | \hline
1822 | \end{tabular}
1823 | \label{tab:MagicCommitter.variables}
1824 | \end{table}
1825 |
1826 |
1827 | \textbf{Job Setup}
1828 |
1829 |
%% MagicCommitter.setupJob()
1831 | \begin{procedure}
1832 | \MagicVars
1833 |
\magicPath $\longleftarrow$ \dest + \magic\;
\jobAttemptPath $\longleftarrow$ \magicPath + \jobAttemptId\;
1836 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
1837 | \mkdirs(\fs, \jobAttemptPath)\;
1838 | \caption{MagicCommitter.setupJob()}
1839 | \label{alg:MagicCommitter.setupJob}
1840 |
1841 | \end{procedure}
1842 |
1843 | \textbf{Task Setup}
1844 |
1845 |
1846 | %% MagicCommitter.setupTask()
1847 | \begin{procedure}
1848 | \MagicVars
1849 |
1850 | \taskAttemptPath $\longleftarrow$ \jobAttemptPath/\taskAttemptId\;
1851 | \mkdirs(\fs, \taskAttemptPath)\;
1852 |
1853 | \caption{MagicCommitter.setupTask()}
1854 | \label{alg:MagicCommitter.setupTask}
1855 |
1856 | \end{procedure}
1857 |
1858 | \textbf{Needs Task Commit}
1859 |
1860 | A commit is required iff files are pending, which is true if there are
1861 | files to upload.
1862 |
1863 | \begin{function}
\MagicVars
1865 |
1866 | \return \exists(\fs, \taskAttemptPath)\;
1867 |
1868 | \caption{MagicCommitter.needsTaskCommit()}
1869 | \label{alg:MagicCommitter.needsTaskCommit}
1870 |
1871 | \end{function}
1872 |
1873 | This will return true even if there are no \texttt{.pending} files under the task attempt
1874 | path.
1875 | A full path listing could determine this, but as this itself is potentially a slow
1876 | operation, we have omitted it, relying on the task commit process to
1877 | handle the case of no output being generated.
1878 |
1879 | % commit task
1880 |
A task attempt is committed by listing all the single \texttt{.pending} files
under the task attempt directory, reading in their contents and merging them
into the set of all pending uploads initiated by this task attempt.
This set is then saved as a \texttt{.pendingset} file in the job attempt directory,
which is still under the \texttt{\_\_magic} directory.
1886 |
1887 | \begin{procedure}
1888 | \MagicVars
1889 |
1890 | $Pending \longleftarrow \emptyset$\;
1891 | \For{$f \in $ \listFiles(\fs, \taskAttemptPath, $recursive$)} {
1892 | $Pending \longleftarrow Pending + \{$ \loadPendingFile(\fs, $f$) $\}$\;
1893 | }
1894 |
1895 | \savePendingSet(\fs, \jobAttemptPath + $taskId$, $Pending$)\;
1896 |
1897 | \caption{MagicCommitter.commitTask()}
1898 | \label{alg:MagicCommitter.commitTask}
1899 |
1900 | \end{procedure}
1901 |
1902 | Because the \texttt{.pendingset} file is written in a single atomic PUT, the
1903 | commit of an individual task attempt is atomic.
1904 |
If there are no \texttt{.pending} files, the saved \texttt{.pendingset} file
will simply contain an empty list of pending uploads.
1907 |
1908 |
1909 | \textbf{Task Abort}
1910 |
A task is aborted by listing all \texttt{.pending} files in the task attempt directory,
then aborting the upload associated with each one.
1913 |
1914 | \begin{procedure}
1915 | \MagicVars
1916 |
1917 | \For{$f \in $ \listFiles(\fs, \taskAttemptPath, $recursive$)} {
1918 | \abortUpload(\loadPendingFile(\fs, $f$))\;
1919 | }
1920 |
1921 | \caption{MagicCommitter.abortTask()}
1922 | \label{alg:MagicCommitter.abortTask}
1923 | \end{procedure}
1924 |
1925 |
1926 | \textbf{Job Commit}
1927 |
1928 | The Job commit operation is very similar to that of the Staging Committer, because
1929 | they are doing nearly the same operation: loading in the \texttt{.pendingset} files
1930 | from a directory and completing the uploads listed within.
1931 |
1932 | % commit job
1933 | \begin{procedure}
1934 | \MagicVars
1935 |
1936 | $Pending \longleftarrow \emptyset$\;
1937 | \For {$f \in $ \listFiles(\fs, \jobAttemptPath)} {
1938 | $Pending \longleftarrow Pending + $ \loadPendingSet(\fs, $f$)\;
1939 | }
1940 | \For {$p \in Pending$} {
1941 | \completeUpload($p$)\;
1942 | }
1943 |
1944 | \caption{MagicCommitter.commitJob()}
1945 | \label{alg:MagicCommitter.commitJob}
1946 | \end{procedure}
1947 |
1948 |
1949 | This committer does not currently support the ``partitioned commit'' conflict
1950 | resolution mechanism, so omits the conflict handling operation.
Otherwise it is identical, and has similar performance and (non-)atomicity
characteristics.
1953 |
1954 |
1955 | % Abort Job
1956 |
1957 | \textbf{Job Abort/ Job Cleanup}
1958 |
1959 | A job is aborted and/or cleaned up by aborting all outstanding uploads
1960 | pending against the destination directory.
1961 |
1962 | \begin{procedure}
1963 | \MagicVars
1964 |
1965 | \For {$f \in $ \listPendingUploads(\dest)} {
1966 | \abortUpload($f$)\;
1967 | }
1968 |
1969 | \caption{MagicCommitter.abortJob()}
1970 | \label{alg:MagicCommitter.abortJob}
1971 | \end{procedure}
1972 |
1973 |
No attempt is made here to list any \texttt{.pending} or \texttt{.pendingset} files.
The committer cannot rely on those to enumerate all uploaded files,
specifically those of failed task attempts, where the information about
the pending uploads may not have been saved.
Asking the S3 store to enumerate all pending uploads, and then aborting each one,
guarantees that all incomplete uploads will be aborted.
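A hedged sketch of this enumerate-and-abort step through the AWS SDK for Java;
the S3A committers use the equivalent calls in the Hadoop S3A client, and the
bucket and prefix names are illustrative.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import com.amazonaws.services.s3.model.ListMultipartUploadsRequest;
import com.amazonaws.services.s3.model.MultipartUpload;
import com.amazonaws.services.s3.model.MultipartUploadListing;

public class AbortPendingUploads {
  // Abort every incomplete multipart upload whose key starts with "prefix".
  public static void abortUnder(AmazonS3 s3, String bucket, String prefix) {
    ListMultipartUploadsRequest request =
        new ListMultipartUploadsRequest(bucket).withPrefix(prefix);
    MultipartUploadListing listing;
    do {
      listing = s3.listMultipartUploads(request);
      for (MultipartUpload upload : listing.getMultipartUploads()) {
        s3.abortMultipartUpload(new AbortMultipartUploadRequest(
            bucket, upload.getKey(), upload.getUploadId()));
      }
      request.setKeyMarker(listing.getNextKeyMarker());
      request.setUploadIdMarker(listing.getNextUploadIdMarker());
    } while (listing.isTruncated());
  }

  public static void main(String[] args) {
    abortUnder(AmazonS3ClientBuilder.defaultClient(),
        "example-bucket", "output/dataset1/");
  }
}
\end{verbatim}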
1980 |
1981 |
1982 |
1983 | % ========================================================================
1984 | \section{Integration with Hadoop and Spark}
1985 | \label{sec:integration}
1986 |
1987 | A major challenge with this work is integrating the committers with MapReduce
1988 | and Spark, without making changes to their commit protocols themselves.
1989 | This is complicated by the fact that the choice of committer to use is
1990 | not made directly by their commit engines, but, when the Hadoop file output
1991 | formats are used, returned by the method \texttt{OutputFormat.getOutputCommitter}.
Hadoop-based output formats all extend a common \texttt{FileOutputFormat},
so return its committer, the \texttt{FileOutputCommitter}, or a custom subclass thereof.
1994 |
How to switch all the existing subclasses of \texttt{FileOutputFormat} to
a new committer when an object store is the destination?
1997 |
This was achieved by modifying \texttt{FileOutputFormat}, so that rather than
only working with the standard \texttt{FileOutputCommitter}, it was possible to declare
a different committer factory for each filesystem scheme\ \cite{MAPREDUCE-6823}.
The \texttt{s3a:} scheme is configured to refer to an S3A-specific factory, which
returns the specific S3A committer selected in the job configuration.
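A hedged sketch of the relevant configuration, expressed through Hadoop's
\texttt{Configuration} API; the key and factory class names are those
documented for the S3A committers, and would normally be set in
\texttt{core-site.xml} or the job configuration rather than in code.

\begin{verbatim}
import org.apache.hadoop.conf.Configuration;

public class CommitterSelection {
  public static Configuration configure(Configuration conf) {
    // Bind the "s3a" filesystem scheme to the S3A committer factory...
    conf.set("mapreduce.outputcommitter.factory.scheme.s3a",
        "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory");
    // ...and choose which S3A committer that factory returns:
    // "directory", "partitioned" (staging variants) or "magic".
    conf.set("fs.s3a.committer.name", "magic");
    // The magic committer also requires magic path support, and a
    // consistent metadata store (S3Guard) for the bucket.
    conf.set("fs.s3a.committer.magic.enabled", "true");
    return conf;
  }
}
\end{verbatim}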
2003 |
An alternative strategy would have been to retrofit an ``algorithm 3'' inside
the \texttt{FileOutputCommitter}, which would have implemented the plugin point.
This would have permitted the new committers to be inserted underneath any
subclass, retrofitting them to classes such as the \texttt{ParquetOutputCommitter}.
We chose not to do this because:
2009 |
2010 | \begin{enumerate}
2011 | \item The existing code is complex, containing two intermixed co-recursive
2012 | algorithms.
2013 | \item Our changes could unintentionally break the correctness of the existing committer.
2014 | \item Subclasses of the existing committer may have been implemented to extend
2015 | the protocol, perhaps by summarizing the output, writing extra files, etc.
2016 | Changing the superclass behavior to not create output files until job commit
2017 | ran the risk of breaking all this code.
2018 | \end{enumerate}
2019 |
The factory design eliminated these risks at the expense of complicating
Spark/Parquet integration.
2022 |
To address this, we ultimately implemented two binding classes.
The \texttt{PathOutputCommitProtocol} extends Spark's
\texttt{HadoopMapReduceCommitProtocol} class, relaxing the requirement that a
committer be a subclass of \texttt{FileOutputCommitter}.
2027 |
The \texttt{BindingParquetOutputCommitter} then extends Parquet's
\texttt{ParquetOutputCommitter} class, relaying
all commit operations to whichever committer was dynamically created
through the factory mechanism.
2032 | This allows Spark's requirement ``ParquetFileFormat requires a ParquetOutputCommitter''
2033 | to be satisfied with any of the factory-created committers.
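A hedged sketch of the Spark-side settings which wire these two classes in;
the property names and the \texttt{org.apache.spark.internal.io.cloud} package
are assumptions about where the bindings are published, and should be checked
against the Spark version in use.

\begin{verbatim}
import org.apache.spark.sql.SparkSession;

public class SparkCommitterBinding {
  public static SparkSession build() {
    return SparkSession.builder()
        .appName("s3a-committer-binding")
        // Use the committer returned by the Hadoop factory mechanism
        // instead of insisting on a FileOutputCommitter subclass.
        .config("spark.sql.sources.commitProtocolClass",
            "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol")
        // Satisfy Parquet's requirement for a ParquetOutputCommitter by
        // relaying to whichever committer the factory created.
        .config("spark.sql.parquet.output.committer.class",
            "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter")
        .getOrCreate();
  }
}
\end{verbatim}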
2034 |
2035 | % During the development of the committers, a change in Spark caused the
2036 | % tests to fail.
2037 | % Spark was enhanced to measure the amount of data created by a task, by
2038 | % measuring the length of the written file\ \cite{SPARK-21669}.
2039 | % With the Magic Committer, there is no written file, not until the job is committed.
2040 | % Accordingly: the probe failed, so resulting in a task and, transitively a job, failure.
2041 | % The Magic Committer was extended to create a zero-byte file in the expected path,
2042 | % so guaranteed that the existence check will hold.
2043 | % It does mean, however, that the statistics collected by Spark will not measure
2044 | % the amount of data written.
2045 |
2046 |
2047 |
2048 | % ========================================================================
2049 | \section{Correctness}
2050 | \label{sec:correctness}
2051 |
The two new committers implement variants of the same concept: delaying the
manifestation of multipart uploads.
2054 | Do the new algorithms actually \emph{work}?
2055 |
2056 |
2057 | \subsubsection{Defining Correctness of Committed work}
2058 |
First, correct behavior must be defined.
2060 |
2061 | \begin{paragraph}
2062 | \textbf{Completeness of job output.}
2063 | After a successful invocation of \texttt{commitJob()},
2064 | the destination directory tree will contain all files written under the output directory
2065 | of all task attempts which successfully returned from an invocation of \texttt{commitTask()}.
2066 | The contents of these files will contain exactly the data written by the user code.
2067 | \emph{``You get what was committed''}
2068 | \end{paragraph}
2069 |
2070 | \begin{paragraph}
2071 | \textbf{Exclusivity of output.}
2072 | After a successful invocation of \texttt{commitJob()},
2073 | the destination directory tree must only contain the output of successfully
2074 | committed tasks.
2075 | \emph{``And not what wasn't''}.
2076 | \end{paragraph}
2077 |
2078 | \begin{paragraph}
2079 | \textbf{Consistency of the commit.}
2080 | The task or job must be able to reliably commit the work, even in the presence
2081 | of inconsistent listings.
2082 | This could be addressed, for example, by using a consistent store for some operations,
2083 | or a manifest mechanism and a reliance on create consistency.
2084 | Consistency with subsequent queries in a workflow is encouraged, else a ``sufficient''
2085 | delay is needed for the listings to become consistent.
2086 | \emph{``Addresses store inconsistencies, somehow''}
2087 | \end{paragraph}
2088 |
2089 | \begin{paragraph}
2090 | \textbf{Concurrent.}
2091 | Multiple tasks in the same job must be able to commit concurrently.
2092 | A job must be able to commit its work while other jobs are committing
2093 | their work \emph{to different destinations in the store}.
2094 | \end{paragraph}
2095 |
2096 | \begin{paragraph}
2097 | \textbf{Ability to abort.}
If a job attempt is aborted before \texttt{commitJob()} is invoked, and
\texttt{cleanupJob()} is called, then the output of the attempt will not appear in the
destination directory at any point in the future.
2101 | \emph{``An aborted/cleaned up job no longer exists''}
2102 | \end{paragraph}
2103 |
2104 |
2105 | \begin{paragraph}
2106 | \textbf{Continuity of correctness.}
2107 | After a job has been successfully committed, no outstanding task may promote
2108 | output into the destination directory.
2109 | That is: if a task attempt has not ``failed'' mid-commit, merely proceeded at a slow rate,
2110 | its output will not contaminate the directory of the already-successful job.
2111 | \emph{``A dead task attempt stays dead''}
2112 | \end{paragraph}
2113 |
2114 |
2115 | The \emph{continuity-of-correctness} requirement excludes that of a failed job.
2116 | We depend here upon the restriction that a job will not commit its work unless
2117 | a heartbeat has been received in a predefined time interval from the YARN ResourceManager.
Assuming all clocks move forward at approximately the same rate, if a job has
not received or responded to heartbeats within that interval,
we can conclude that the process will no longer commit work.
2121 | This failure to respond to heartbeats triggers YARN rescheduling a new
2122 | instance of the Job Manager and an attempt to kill the previous attempt.
2123 | A second job attempt may conclude from the very fact that it has been launched
2124 | that the previous job attempt will not attempt to commit its work
2125 | \footnote{the monotonically increasing YARN attempt ID value implicitly
2126 | informs the Job whether or not it is the first attempt}.
2127 |
2128 | This definition of correctness omits some constraints:
2129 |
2130 | \begin{itemize}
2131 | \item The output of committed tasks not being present in the output directory
2132 | until the job is committed.
2133 | Rationale: It is the final state of the job which matters, not intermediate states.
2134 |
\item That the task commit operation is atomic.
2136 | Rationale: The v2 commit algorithm does not meet this requirement.
2137 |
\item That the job commit operation is atomic.
2139 | Rationale: The v1 commit algorithm does not meet this requirement.
2140 |
\item Concurrent jobs writing to the same destination will succeed and
produce output equivalent to a serialized commit of the jobs.
Rationale: none of the original commit algorithms offers such guarantees.
2144 | \end{itemize}
2145 |
2146 | The implication of not requiring these constraints is that the higher-level
2147 | commit protocol must react to failures or timeouts of the task and job
2148 | commit operations.
2149 |
2150 | %Concurrency cannot easily be handled in the commit protocol except through
2151 | %some mechanism of obtaining an exclusive lock of operations
2152 | %on the destination path, one shared with all applications which may write
2153 | %to that path.
2154 | %Directory existence may be such an option for filesystems
2155 | %supporting an atomic check-and-create \tt{mkdir()} call, though it is not
2156 | %a check which the \tt{FileOutputCommitter} directly performs.
2157 |
2158 |
We do not attempt to provide a formal proof of the correctness of the algorithms.
A TLA+ specification of the behavior of a consistent object store was created
during the process; however, we have not yet complemented it with any
algorithm specifications\ \cite{s3-tla}.
Modelling an eventually consistent store is ``somewhat challenging''.
At the same time, it is fundamentally impossible to demonstrate through testing
that the algorithms are correct in the presence of inconsistency ---which argues
strongly for such a correctness proof.
2167 |
2168 | In the absence of proofs,
2169 | here are our informal assertions about the correctness of the two algorithms.
2170 |
2171 | \subsubsection{Correctness of the Staging Committer}
2172 |
2173 | All task attempt output is written to the local filesystem;
2174 | it is implicitly not in the destination object store until task commit.
2175 |
2176 | In task commit, the contents of the local attempt directory are uploaded to the
2177 | destination, as incomplete uploads.
2178 | Hence: not visible until an operation completes the multipart upload.
2179 |
2180 | A task attempt's manifest of uploads to complete is saved to the cluster filesystem,
2181 | where the v1 commit algorithm is used to commit this file.
2182 | Thus the commitment of upload data has the same correctness as the
2183 | v1 commit algorithm on a consistent HDFS filesystem.
2184 |
In the job commit, the v1 commit algorithm ensures that the contents
of $clusterJobAttemptPath$ contain only the manifests of committed tasks.
2187 |
2188 | As the v1 algorithm satisfies the completeness and exclusivity requirements,
2189 | we can be confident that reading in these lists will build an aggregate list
2190 | of files to commit, a list which is, transitively, complete and exclusive.
2191 |
2192 | The subsequent job commit action is to complete these uploads,
2193 | then cancel all other multipart uploads pending against the directory tree.
This will cancel the pending work of all task attempts which have uploaded staged
2195 | data, but which were somehow not included in the list of committed tasks.
2196 | That is: they failed during the task commit process.
2197 |
Because HDFS is used to propagate the lists of pending uploads from
committed task attempts to the job committer, there is no requirement for
2200 | a consistent view of the object store during the actual job.
2201 | For the results of a successful job to be safely used by another application,
2202 | something must still present a consistent view of the results, or
2203 | the interval between the publishing of the results and their use must be sufficient
2204 | for the users to be \emph{confident} that the store is now consistent,
2205 | or at least optimistic\ \cite{Bermbach:2014:BEC:2624303.2624662}.
2206 |
2207 |
2208 |
2209 | \subsubsection{Correctness of the Magic Committer}
2210 |
2211 | This is harder to demonstrate, and depends on consistent directory
2212 | listings of the object stores, that is: all files created under a path
2213 | in the object store are visible to the LIST operation.
2214 | For Amazon S3, this requires a consistency layer, such as S3mper or S3Guard
2215 | \ \cite{S3mper,HADOOP-13345}.
2216 | Implementations of the S3 Store API which offer consistent listings are not
2217 | at risk.
2218 |
2219 | All task attempt output is written to the object store, to the final (calculated)
2220 | destination.
2221 | However, the writes are not made visible until the job is committed.
2222 |
2223 | The requirements of completeness and exclusivity must be met by
2224 | having the lists of pending uploads generated by committed task attempts propagated
2225 | to the Job Commit phase, and the list of pending uploads from uncommitted
2226 | attempts not propagated to the Job Commit.
2227 |
2228 | That is:
2229 |
2230 | \begin{enumerate}
2231 | \item All pending uploads written by a committed task attempt must be
2232 | included in the final list of uploads for the job to commit.
2233 | \item No pending uploads by a failed task attempt must be included in this list.
2234 | \item A partitioned task attempt's uploads must never become visible,
2235 | even if uploaded after task or job commit.
2236 | \end{enumerate}
2237 |
2238 |
2239 | Reviewing this code, there appears to be a small race condition in job commit,
2240 | wherein a task attempt partitioned from the Job Manager during task commit
2241 | can still complete its PUT of its list of uploads to commit, the ``pending set'',
2242 | overwriting that of the task attempt which had considered itself successful.
2243 |
2244 | We cannot defend against that with the traditional strategy of creating
2245 | a file with overwrite=false, because against S3, there is no atomic
2246 | ``create-no-overwrite'' operation.
2247 |
2248 | Instead we rely on the higher level requirement that any committed task attempt must
2249 | constitute a valid outcome, and argue that the pending set from either attempt
2250 | must constitute a valid result of a task.
2251 |
2252 |
2253 | It's notable that this process could be improved were the job commit
2254 | operation supplied with a list of successful task attempts;
2255 | this would avoid inferring this state from the filesystem, except in
2256 | the case of job recovery from a commit algorithm capable of
2257 | rebuilding its state from a directory listing (i.e.\ the v1 committer).
2258 | Spark's protocol already permits this, but not Hadoop's.
2259 |
2260 | Regarding the requirement to abort safely, the fact that all writes are
not manifest until job commit means that any writes from failed tasks
will remain ``pending''.
2263 |
2264 | Data in this state is still billed by byte, so must not be neglected.
After the job commits all successful tasks, it lists all outstanding
uploads against the destination directory and cancels them.
We implemented a command-line tool to list and cancel pending uploads for
scheduled workloads, and, finally,
it is possible to set a rule on an S3 bucket whereby incomplete
pending uploads are deleted a specific time interval after their creation.
2271 | Our documentation recommends an interval of twenty-four hours here, to
2272 | clean out old data yet without affecting jobs ---assuming that all jobs
2273 | take less than a day to complete.
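A hedged sketch of such a bucket rule, set through the AWS SDK for Java;
the one-day interval matches the recommendation above, and the bucket and
rule names are illustrative.

\begin{verbatim}
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.AbortIncompleteMultipartUpload;
import com.amazonaws.services.s3.model.BucketLifecycleConfiguration;
import com.amazonaws.services.s3.model.lifecycle.LifecycleFilter;
import com.amazonaws.services.s3.model.lifecycle.LifecyclePrefixPredicate;
import java.util.Collections;

public class AbortStaleUploadsRule {
  public static void main(String[] args) {
    AmazonS3 s3 = AmazonS3ClientBuilder.defaultClient();
    BucketLifecycleConfiguration.Rule rule =
        new BucketLifecycleConfiguration.Rule()
            .withId("abort-stale-multipart-uploads")
            .withStatus(BucketLifecycleConfiguration.ENABLED)
            .withFilter(new LifecycleFilter(new LifecyclePrefixPredicate("")))
            // abort any multipart upload still incomplete a day after
            // it was initiated
            .withAbortIncompleteMultipartUpload(
                new AbortIncompleteMultipartUpload()
                    .withDaysAfterInitiation(1));
    s3.setBucketLifecycleConfiguration("example-bucket",
        new BucketLifecycleConfiguration(Collections.singletonList(rule)));
  }
}
\end{verbatim}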
2274 |
2275 |
2276 | \subsection{Testing}
2277 | \label{subsec:testing}
2278 |
2279 | Confidence in the correctness of the algorithms notwithstanding, there
2280 | is still the issue of the correctness of the implementation.
2281 |
2282 |
This was addressed through testing:
2284 |
2285 | \begin{enumerate}
2286 | \item Functional tests of the underlying IO operations against Amazon S3.
2287 | \item Tests of the commit operation against a mock S3 service endpoint.
2288 | \item Invocations of the commit protocols in the normal and failing sequences of operations.
2289 | \item Integration tests on a single host MapReduce cluster.
2290 | \item Single-host integration tests of Spark integration, tests derived from Spark's own SQL test suites.
2291 | \item Large scale integration tests in virtual test clusters.
2292 | \item Peer review.
2293 | \end{enumerate}
2294 |
To aid in demonstrating resilience to metadata inconsistency
and transient network failures, Hadoop's \texttt{hadoop-aws} module
now contains a special fault-injecting S3 connector: throttling errors on
idempotent operations and delayed consistency can both be simulated in the
downstream tests; this was used in integration testing.
2300 |
The large-scale integration tests have not, at the time of writing, highlighted any problems;
the simpler test suites were co-developed with the code, exposing issues and
being expanded as new issues were discovered.
One bug the integration tests did show was that our committers' cleanup code was
over-aggressive in listing and cancelling all outstanding uploads pending
on the destination directory.
2307 |
2308 |
The \texttt{cleanupJob()} procedure used the existing S3A client command
\texttt{listMultipartUploads(directory)} to enumerate the uploads,
which were then cancelled.
A detailed review of this code while trying to identify an intermittent problem
made clear that this existing routine had a long-standing bug in it.
Rather than just list all uploads under a directory, it also included
all uploads in directories whose paths began with the same string.
That is, listing and cancelling pending work in the directory \texttt{/output/dataset1}
would also delete the output in \texttt{/output/dataset10}, \texttt{/output/dataset11/work},
etc.
We are fortunate that this was found before the product shipped.
This does, however, highlight our implementation's dependencies on the correctness
of the existing codebase, and how hard it is to imagine test cases which
can demonstrate the existence of bugs.
Who would have expected a test running in \texttt{/output/dataset1} to
break an independent test in \texttt{/output/dataset10} iff the two test
executions overlapped, and the first test executed its \texttt{cleanupJob()}
operation when the second had committed at least one task but not committed
the final job?
2328 |
2329 |
Peer review is an integral part of the development process.
It was invaluable to have other developers interested in this problem
2332 | and willing to contribute time reviewing the code and testing it
2333 | in their own environments, including a commercial S3-compatible
2334 | storage system.
2335 |
2336 |
2337 | % ========================================================================
2338 |
2339 | \section{Results}
2340 | \label{sec:results}
2341 |
2342 |
The performance benefit of the new committers is not visible with small amounts
of data, as the number of HTTP requests is the dominant factor.
As the amount of data increases, the elimination of the copy operations
delivers a significant speedup to the new committers.
With a measured in-S3 copy rate of $\sim$6--10\,MB/s, the saving is roughly 1 second per 10 MB
of data committed.
2349 |
2350 | Comparing the staging and magic committers is interesting.
2351 |
2352 | The Staging committer writes all data locally, with the write bandwidth
2353 | of the local (usually virtual) disk.
2354 | In task commit, this data must be read and uploaded to the S3 service.
Usually it is the bandwidth between the server and S3 which is the bottleneck,
though as S3 throttles requests to specific shards, having many servers trying
to write to the same destination directory tree can slow down the write, irrespective
of bandwidth\ \cite{AWS-S3-throttling}.
If a single task has generated many files, or many tasks of the same job are
committing nearly simultaneously, this may be observed\footnote{Throttling can
also be observed on read operations;
in such situations adding more workers is counterproductive.}.
2364 |
2365 | Job commit is a matter of reading the small \texttt{.pendingset} files saved in the
2366 | cluster filesystem (HDFS), and then issuing the relevant POSTs: one per uploaded
2367 | object.
2368 | This is parallelized, and not constrained by bandwidth.
2369 | Capacity in a local pool of HTTP1.1 connections, the time to create more,
2370 | and potentially throttling are the primary limits on IO performance at this point.
2371 |
2372 | The Magic Committer uploads data in blocks as it is written: the larger
2373 | the amount of data created by a single task, the greater the performance
2374 | benefit over the Staging committer's task-commit-time upload.
2375 | However, task commit does list the task attempt directory and read all \texttt{.pending}
2376 | files within, an operation which can take a few hundred milliseconds per file,
2377 | and again, potentially throttled.
2378 | With only a single summary file written back to S3, task commit is never
2379 | bandwidth constrained.
2380 |
Job commit time is that of the Staging Committer, preceded by a listing
of, and reading in of, the pending files of every committed task.
2383 | This is again a few hundred milliseconds per file, though parallelization
2384 | can reduce the delay.
2385 |
2386 | Ignoring throttling, the Magic Committer is best with tasks which create
2387 | large amounts of data in each task attempt.
2388 | As well as avoiding the upload in task commit, it reduces the
2389 | amount of storage needed in the virtual machine: VM and container instances
2390 | with smaller amounts of storage can be requested, or simply more tasks executed
2391 | per VM, leaving computation, RAM and network bandwidth as the bottlenecks.
2392 |
2393 | In production use, we have found that the default size of the HTTP thread
2394 | pool becomes a bottleneck in the job commit phase for any queries
2395 | containing many thousands of files.
2396 | The small-payload POST requests are executed in parallel for higher
2397 | throughput, but the default limit on the number of HTTP connections, 15,
2398 | limits that parallelization.
2399 | Increasing this value to a larger number, such as 50 to 100, significantly
2400 | speeds up this phase of a query.
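
As an illustration, the relevant settings can be raised when configuring the
job; the option names below are those used by recent Hadoop releases
(\texttt{fs.s3a.connection.maximum} for the HTTP connection pool and
\texttt{fs.s3a.committer.threads} for the commit thread pool), but they should
be checked against the documentation of the release in use.

\begin{verbatim}
// Sketch: enlarging the S3A connection pool and committer thread pool
// before submitting a query whose job commit must complete many uploads.
import org.apache.hadoop.conf.Configuration;

public class CommitterTuningSketch {
  public static Configuration tuned() {
    Configuration conf = new Configuration();
    conf.setInt("fs.s3a.connection.maximum", 100);  // HTTP connections (default is small)
    conf.setInt("fs.s3a.committer.threads", 64);    // threads completing uploads in commit
    return conf;
  }
}
\end{verbatim}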
2401 |
2402 | One final feature to highlight is the ``partitioned committer'' variant
2403 | of the Staging Committer, which is designed to update an existing
2404 | dataset in-place, only considering conflict with existing data in
2405 | those partitions for which data is actually generated.
2406 | This supports workflows where large datasets are updated on a daily basis,
2407 | without the need for any post-job copy of the new day's data into the
2408 | final dataset.
2409 | If the existing workflow for maintaining such large datasets involves
2410 | moving the new data into the aggregated dataset, those renames themselves
2411 | suffer from the performance constraints of the store's COPY operation.
2412 | Here, then, the speedup comes from the overall workflow, rather than
2413 | simply the query.
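
A sketch of how such a workflow could select this committer follows, using the
option names from the committer documentation\ \cite{HADOOP-S3A-Committers};
treat the exact values as illustrative.

\begin{verbatim}
// Sketch: select the partitioned staging committer and have it replace only
// the partitions into which the new day's data is written.
import org.apache.hadoop.conf.Configuration;

public class PartitionedCommitterSketch {
  public static Configuration dailyUpdate() {
    Configuration conf = new Configuration();
    conf.set("fs.s3a.committer.name", "partitioned");
    // Conflict resolution is evaluated per generated partition, not per dataset.
    conf.set("fs.s3a.committer.staging.conflict-mode", "replace");
    return conf;
  }
}
\end{verbatim}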
2414 |
2415 |
2416 | % ========================================================================
2417 |
2418 | \section{Limitations}
2419 | \label{sec:limitations}
2420 |
2421 | A key criticism of the new committers is that the job commit operation is not atomic;
2422 | it is an $O(files)$ operation which may fail partway through.
2423 | We respond that Hadoop's MapReduce v1 commit algorithm is itself non-atomic in job commit;
2424 | the Job Manager commit protocol detects failures in job commits
2425 | of previous attempts, and either recovers or fails, according to the actions
2426 | offered by the committer.
2427 | A more subtle issue is that the volume of POST requests required by a sufficiently large job,
2428 | all against a specific shard of the S3 store, can trigger HTTP throttling.
2429 | This reduces the benefit of parallelized issuing of the POST requests.
2430 |
2431 | A Hadoop task process may exit without \texttt{abortTask()} being invoked.
2432 | Specifically, it exits immediately during the ping/response
2433 | heartbeat process if any of the following conditions is met.
2434 | This is probably a bug in Hadoop ---and straightforward to correct.
2435 |
2436 | \begin{enumerate}
2437 | \item Predefined task limits are exceeded
2438 | (currently an optional limit on the number of bytes written to the local filesystem).
2439 | \item Communications with the Job Manager have failed beyond configured limits.
2440 | \item The response to the \texttt{ping()} call is \texttt{false}, indicating the current
2441 | Job Manager does not consider the task to be part of its set of active tasks.
2442 | \end{enumerate}
2443 |
2444 | The first check is a defense against an errant process filling the local
2445 | filesystem with data;
2446 | the latter two are symptoms of ---and reactions to--- different failures: loss of the manager or network failure,
2447 | and a restarted manager with no knowledge of the active task, respectively.
2448 | There are also the without-warning failures triggered by the operating system
2449 | when limits on the execution environment are exceeded: usually memory allocation.
2450 |
2451 | While OS-level failures can occur without warning, it would be useful if the
2452 | ``managed'' system exits triggered in the heartbeat thread were to invoke
2453 | an emergency task cleanup operation.
2454 | For the S3A committers, this would consist of aborting all pending uploads, and
2455 | deleting any local data.
2456 | While the Job committer's \texttt{cleanupJob()} operation is expected to clean up
2457 | the output of all task attempts, active participation of the tasks would
2458 | reduce the time incomplete uploads were pending (reducing costs) and
2459 | potentially free up local disk storage.
2460 |
2461 | This appears to us to be an enhancement to the commit protocol which could
2462 | be considered.
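
A minimal sketch of such an emergency cleanup follows, assuming the AWS SDK
for Java and hypothetical bookkeeping of the uploads started by the task
attempt; it is not part of the current committers.

\begin{verbatim}
// Sketch: best-effort cleanup to run on a managed task exit.
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.AbortMultipartUploadRequest;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Stream;

public class EmergencyTaskCleanupSketch {

  /** Hypothetical record of one multipart upload started by this task attempt. */
  public static class StartedUpload {
    String bucket;
    String key;
    String uploadId;
  }

  public static void run(AmazonS3 s3, List<StartedUpload> uploads,
      Path localStagingDir) throws IOException {
    for (StartedUpload u : uploads) {
      try {
        s3.abortMultipartUpload(
            new AbortMultipartUploadRequest(u.bucket, u.key, u.uploadId));
      } catch (RuntimeException e) {
        // best-effort: cleanupJob() remains the backstop
      }
    }
    if (Files.exists(localStagingDir)) {
      try (Stream<Path> paths = Files.walk(localStagingDir)) {
        paths.sorted(Comparator.reverseOrder()).forEach(p -> p.toFile().delete());
      }
    }
  }
}
\end{verbatim}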
2463 |
2464 |
2465 | One problem which may manifest itself in cloud-based deployments
2466 | is that the Hadoop commit protocol assumes that time increases monotonically
2467 | on individual machines in the cluster.
2468 | The job manager and workers use the interval between the last successful heartbeat
2469 | and the current time as the means of deciding that they have lost
2470 | contact with each other and with system services.
2471 | In cloud environments clocks may stutter, proceed at significantly different rates,
2472 | and indeed, may even proceed backwards, especially if the VMs are moved between
2473 | physical cluster nodes.
2474 | We hope that Amazon's newly introduced \emph{Time Sync Service}
2475 | can address this on well-configured systems\ \cite{AWS-clock-service}.
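
As an illustration of the distinction, the sketch below measures the heartbeat
interval with the JVM's monotonic clock, which is unaffected by wall-clock
adjustments; it shows one way such a check could be made more skew-tolerant,
not how the current implementations behave.

\begin{verbatim}
// Sketch: a heartbeat timeout measured with a monotonic clock.
public class HeartbeatMonitorSketch {
  private final long timeoutNanos;
  private long lastHeartbeatNanos = System.nanoTime();

  public HeartbeatMonitorSketch(long timeoutMillis) {
    this.timeoutNanos = timeoutMillis * 1_000_000L;
  }

  /** Record a successful heartbeat. */
  public synchronized void heartbeat() {
    lastHeartbeatNanos = System.nanoTime();
  }

  /**
   * True if contact is considered lost.  System.nanoTime() is monotonic
   * within a JVM, so this check is immune to the wall clock stuttering or
   * stepping backwards; System.currentTimeMillis() would not be.
   */
  public synchronized boolean contactLost() {
    return System.nanoTime() - lastHeartbeatNanos > timeoutNanos;
  }
}
\end{verbatim}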
2476 |
2477 |
2478 | % ========================================================================
2479 |
2480 | \section{Improvements to the Commit Protocols}
2481 | \label{sec:improvements-to-commit-protocols}
2482 |
2483 |
2484 | This work has highlighted some of the existing limitations of the commit protocols,
2485 | specifically:-
2486 |
2487 | \begin{itemize}
2488 | \item The lack of information returned by task attempts as to what output they have committed
2489 | prevents this from being validated in the job commit.
2490 | \item The lack of information declared by the committers as to whether they can
2491 | recover from a failure during the task commit.
2492 | \item A general expectation in the execution engines that job and task abort do
2493 | not fail, or at least succeed in a bounded time and log failures rather than
2494 | propagate them.
2495 | \end{itemize}
2496 |
2497 | These can be addressed;
2498 | the Spark commit protocol is already partway to doing so, as task attempts
2499 | can propagate information to the job commit phase.
2500 |
2501 | We recommend extending the underlying output committers to provide the
2502 | information needed to enhance the protocols' robustness.
2503 | An extended \texttt{commitTask()} operation should return the (opaque) information
2504 | needed for the \texttt{commitJob()} method to validate the correctness and
2505 | completeness of the operation;
2506 | the job committer can then validate the content of the final output directory.
2507 |
2508 | Were the committers also to declare their ability to recover from a failed task commit,
2509 | as they do for job recovery, then query engines would be able to choose the safe
2510 | actions following the failure/timeout of a task attempt commit.
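
A sketch of what such an extension could look like follows; the type and
method names are purely illustrative, not an existing Hadoop or Spark API.

\begin{verbatim}
// Sketch of a hypothetical extension to the committer API.
import java.io.IOException;

public interface ExtendedCommitterSketch {

  /** Opaque summary of what a task attempt committed. */
  interface TaskCommitData { }

  /** Task commit returns a summary which the job committer can later validate. */
  TaskCommitData commitTask() throws IOException;

  /** Job commit validates the task summaries against the final output. */
  void commitJob(Iterable<TaskCommitData> taskData) throws IOException;

  /** Whether a failed or timed-out task commit can safely be retried. */
  boolean isTaskCommitRepeatable();
}
\end{verbatim}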
2511 |
2512 | We should also have the object store connectors declare their consistency policy,
2513 | so that the file committers can fail fast when executed against an inconsistent store.
2514 | This can be implemented at the filesystem and committer layers.
2515 |
2516 | As for the task abort issues,
2517 | these can be addressed with a straightforward hardening of the abort operations,
2518 | \emph{and} of their use.
2519 |
2520 | Finally, the committers and the underlying storage infrastructures are instrumented;
2521 | they can and do collect statistics about their operations, information
2522 | which can be useful in identifying performance and scale problems.
2523 | Again, this could be propagated back from the committers to the
2524 | query engine.
2525 | Our new committers do collect this information, and aggregate it in the
2526 | job commit process, but only to publish it in the \SUCCESS file;
2527 | it is not integrated with the applications themselves.
2528 |
2529 | All these problems are tractable, and addressing them will improve confidence in the ability
2530 | of the query engines to safely interact with alternate data stores and
2531 | with commit algorithms written to work with them.
2532 |
2533 |
2534 |
2535 | \section{Related Work}
2536 | \label{sec:related-work}
2537 |
2538 | \subsection{Spark's Direct Output Committer}
2539 | \label{subsec:direct-output}
2540 |
2541 | Apache Spark (briefly) offered a zero-rename committer,
2542 | the \emph{Direct Output Committer}\ \cite{SPARK-6352}.
2543 | With this committer, output was written directly to the destination directory;
2544 | both task and job commit operations were reduced to no-ops.
2545 | To avoid concurrency issues, speculative execution of tasks was automatically
2546 | disabled when this committer was used.
2547 | Unfortunately, the committer was still not resilient to failure: a failed
2548 | task could not be repeated, as its output was unknown.
2549 | For this reason it was removed\ \cite{SPARK-10063}.
2550 |
2551 | Its absence is now noted by users, showing how much a zero-rename committer
2552 | was valued, even one which failed to offer the complete semantics
2553 | of a commit protocol.
2554 | Alternatively: performance is observable, whereas consistency and failures
2555 | are not considered important until they surface in production systems.
2556 |
2557 | \subsection{IBM's Stocator}
2558 | \label{subsec:stocator}
2559 |
2560 |
2561 | IBM's Stocator eliminates renames by also writing directly to the
2562 | destination\ \cite{Stocator}.
2563 | As with the \emph{Magic Committer}, it modifies the semantics of write
2564 | operations into the temporary directories of work, here the standard
2565 | \texttt{\_temporary} directory used by the classic \texttt{FileOutputCommitter}.
2566 | To avoid the failure semantics of Spark's \texttt{Direct Output Committer},
2567 | every remapped file is given a name into which the job and task attempt IDs are inserted,
2568 | while still preserving the sort order.
2569 | Failed and aborted tasks and jobs can then be cleaned up by their successors.
2570 | Stocator also generates a JSON-formatted \SUCCESS file, which offers
2571 | the ability to obtain a consistent list of the final files committed by a job,
2572 | even in the presence of listing inconsistency.
2573 |
2574 | With this design, Stocator makes the output of work immediately visible;
2575 | there is no task commit, and the job commit is a matter of writing
2576 | the \SUCCESS file.
2577 |
2578 | The actual implementation is achieved by misleading the classic committer,
2579 | changing the semantics of file creation under the task attempt directories
2580 | under the \texttt{\_temporary} path.
2581 | The committer believes that its tasks are writing files to a temporary destination
2582 | and renaming them, when in fact they are being written directly to the final destination directory,
2583 | with a task-attempt-specific filename.
2584 |
2585 | The filesystem \texttt{rename()} operations of the committer are then implicitly
2586 | omitted: there is no work to rename.
2587 |
2588 |
2589 | Stocator's task commit operation becomes a no-op, thus trivially repeatable.
2590 | Job commit is a listing of the output and generation of the manifest;
2591 | as the manifest PUT is atomic, the job commit itself is atomic.
2592 |
2593 |
2594 | What is critical for Stocator is that the output of all failed tasks
2595 | is cleaned up.
2596 | This cannot be guaranteed in the failure case where a partitioned task attempt
2597 | continues to execute and write new files.
2598 | When that task attempt attempts to commit, it will fail to be granted permission,
2599 | and presumably clean up; we have not verified that a denied commit
2600 | does trigger this cleanup.
2601 | Before that commit and cleanup phase, the destination directory will contain
2602 | data from the ongoing, uncommitted task.
2603 |
2604 | Compared to the other designs, this is unique in that it retrofits an
2605 | object-store-optimized committer under the MapReduce V1 and V2 commit algorithms.
2606 | Thus existing applications can switch to the new committer without needing
2607 | explicit changes.
2608 | This makes it significantly easier to adopt.
2609 |
2610 |
2611 | The closest of the two S3A committers is the Magic Committer.
2612 | It too modifies the object store connector to write the output to a
2613 | different destination than the path requested
2614 | in the \texttt{createFile(path)} call made by the user's code.
2615 |
2616 | The Magic Committer does not attempt to work underneath the existing committer;
2617 | instead we provide our own store-aware committer
2618 | which ensures that output is not actually manifested until
2619 | the job is committed.
2620 | Thus it provides the standard semantics of task and job commit: no data is
2621 | visible until the job is committed, and partitioned task attempts will
2622 | never make changes to the visible file set.
2623 |
2624 | \subsection{Amazon's EMRFS S3-optimized Committer}
2625 | \label{subsec:emrfs-committer}
2626 |
2627 | In November 2018, Amazon announced they had implemented their own S3-specific
2628 | committer for Apache Spark\ \cite{AWS-EMR-committer}, with an article
2629 | in March 2019 providing some details on the algorithm\ \cite{AWS-EMR-committer-blog}.
2630 |
2631 | \begin{quote}
2632 | \emph{
2633 | The EMRFS S3-optimized committer is used for Spark jobs that use Spark SQL,
2634 | DataFrames, or Datasets to write Parquet files.}
2635 | \end{quote}
2636 |
2637 | The documentation contains three assertions:-
2638 |
2639 | \begin{itemize}
2640 | \item "Improves application performance by avoiding list and rename operations"
2641 | \item "Lets you safely enable the speculative execution of idempotent tasks in Spark jobs to help reduce the performance impact of task stragglers."
2642 | \item "Avoids issues that can occur with Amazon S3 eventual consistency during job and task commit phases, and helps improve job correctness under task failure conditions.
2643 | \end{itemize}
2644 |
2645 |
2646 | Without the source being available to examine, we can only infer aspects
2647 | of its behaviour from the documentation and blog post.
2648 |
2649 | The 2019 article is the most informative, as are some hints in the tuning documentation\ \cite{AWS-EMR-committer-tuning}.
2650 |
2651 | \begin{itemize}
2652 | \item Data is uploaded as multipart uploads, which are not made visible at the time of writing.
2653 | \item The list of pending uploads is built up by the task committers, hence consumes memory in the worker process\ \cite{AWS-EMR-committer-tuning}.
2654 | \item The manifestation of the files, by completing the multipart upload, is performed in the individual task commit operations.
2655 | \item The job commit operation simply becomes one of writing the \SUCCESS file, and, hopefully, listing and aborting any incomplete
2656 | multipart uploads in progress under the destination path.
2657 |
2658 | \end{itemize}
2659 |
2660 | Because no data is made visible until the task commit is executed, the output
2661 | of any in-progress tasks is not visible: speculative execution is safe,
2662 | as are reattempts of any task which failed before the point of task commit.
2663 |
2664 | However, because the uploads are completed in the task commit operation,
2665 | the output of each committed task is visible.
2666 | Furthermore, because the commit operation is non-atomic, the operation may
2667 | fail partway through, triggering a new task attempt, which
2668 | will then commit its work into a destination in an unknown state.
2669 |
2670 | It's disappointing that this approach has been chosen, given that passing a
2671 | list of files to commit to the application master is straightforward in the
2672 | Spark protocol, and also that the Netflix and Apache prior art showed what
2673 | could be done.
2674 |
2675 | As discussed previously, extending the committer API/protocol to allow tasks
2676 | to declare whether task commit is unrecoverable would at least let Spark
2677 | know when a task commit failure must trigger a job failure, rather than
2678 | the rescheduling of another task attempt.
2679 |
2680 |
2681 | \begin{table}
2682 | \begin{tabular}{ l c c c c }
2683 | \hline
2684 | & \textbf{Direct} & \textbf{Stocator} & \textbf{S3A} & \textbf{EMR}\\
2685 | Speculative Tasks & False & True & True & True \\
2686 | Recoverable Job & False & False & False & True \\
2687 | Abortable Task & False & True & True & True \\
2688 | Abortable Job & True & True & True & True \\
2689 | Uncommitted task output observable & True & True & False & False \\
2690 | Committed task output observable & True & True & False & True\\
2691 | Atomic Task Commit & True & True & True & False \\
2692 | Atomic Job Commit & True & True & False & True \\
2693 | Partitioned Executor resilience & False & False & True & False\\
2694 | \hline
2695 | \end{tabular}
2696 | \caption{Attributes of the different committer algorithms}
2697 | \label{tab:other-committer-attributes}
2698 | \end{table}
2699 |
2700 |
2701 | The classic file output committers postpone making output visible until task (v2) or job (v1)
2702 | commit, and use rename as the low-cost operation to promote the files.
2703 |
2704 | All these object-store-optimized committers focus on eliminating renames,
2705 | and are left with the challenge of finding alternative algorithms to
2706 | allow for distributed queries to complete successfully in the presence
2707 | of failures of individual worker processes.
2708 |
2709 |
2710 | The Direct Committer fails at the foundational requirement: the ability to support
2711 | speculative or restarted task attempts.
2712 | This is why it was removed from the ASF codebase.
2713 |
2714 | Stocator also writes to the destination directory, but by remapping the output
2715 | filenames it retains the ability to clean up the output of uncommitted tasks.
2716 | It does, however, fail to meet our requirement ``Continuity of correctness.''
2717 |
2718 | A task which is still in progress after the job commit may generate output
2719 | into the destination directory.
2720 |
2721 | Neither committer performs any operation in job commit other than creating
2722 | a \SUCCESS marker, which is both atomic and repeatable; therefore their job commit
2723 | operations are both fast and safe.
2724 |
2725 | The EMR Committer has adopted the same multipart upload mechanism to postpone
2726 | manifesting files written to their ultimate paths, but has chosen to materialize
2727 | those files in the task commit rather than postponing it until the job is committed.
2728 | This will deliver performance benefits in larger applications ---at the cost
2729 | of resilience to failures during that task commit operation.
2730 | Without modifying Spark to fail the entire job in such an event, it is placing
2731 | users at risk of corrupted output.
2732 |
2733 |
2734 | One recurrent theme here is that the output of a job is defined as
2735 | ``the contents of the job output directory'', thus all committers are
2736 | forced to output data ``somewhere'' and manifest it in the commit process.
2737 | It is also the reason that eventually consistent metadata operations are
2738 | dangerous: even when the directory tree is valid, a listing may be incorrect.
2739 |
2740 | If applications moved towards declaring the output of a job in
2741 | a manifest file, rather than implicitly defining it as ``all files in the directory
2742 | tree which do not begin with '.' or '\_''', then the writing/renaming
2743 | of this manifest would be all that is needed to commit a job.
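
As a sketch of the idea, assuming the AWS SDK for Java: the job's output is
whatever the manifest lists, and the single PUT of that manifest is the
commit; the manifest format shown is purely illustrative.

\begin{verbatim}
// Sketch: commit a job by PUTting a manifest which lists the final files.
import com.amazonaws.services.s3.AmazonS3;
import java.util.List;

public class ManifestCommitSketch {
  public static void commitJob(AmazonS3 s3, String bucket, String manifestKey,
      List<String> finalFiles) {
    StringBuilder manifest = new StringBuilder("{\"files\":[");
    for (int i = 0; i < finalFiles.size(); i++) {
      if (i > 0) {
        manifest.append(',');
      }
      manifest.append('"').append(finalFiles.get(i)).append('"');
    }
    manifest.append("]}");
    // A PUT of a single object is atomic: readers see either the old
    // manifest or the new one, never a partial listing.
    s3.putObject(bucket, manifestKey, manifest.toString());
  }
}
\end{verbatim}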
2744 |
2745 | The S3A committers and Stocator already generate manifest data in the
2746 | standard \SUCCESS file.
2747 | For our committers, this was done initially for testing;
2748 | later it included the filesystem statistics of the process, helping to
2749 | collect data on IO costs.
2750 | However, it is present, and it could perhaps be used as an alternative to
2751 | a directory listing.
2752 |
2753 | Provided all relevant applications agree to use a single, shared manifest
2754 | format, it may be possible to move to a simpler structure of
2755 | output being written straight to the destination, and the atomic PUT of the
2756 | manifest defining the output.
2757 |
2758 | This is essentially one aspect of the incubating Apache Iceberg project,
2759 | which uses manifest files to describe a data source, amongst its other
2760 | features designed to support efficient operation in S3 and elsewhere\ \cite{iceberg-asf, iceberg}.
2761 |
2762 | \section{Conclusions and Further Work}
2763 | \label{sec:conclusions}
2764 |
2765 | Object Stores are becoming a common source and destination of data analyzed
2766 | through Apache Hadoop and Spark.
2767 | The client connectors make the stores resemble filesystems in
2768 | terms of the API exposed to applications, so enabling existing code to
2769 | interact with the stores without modification.
2770 | However, the core semantics required by conventional commit algorithms, particularly
2771 | that of an $O(1)$ atomic rename, are not always met.
2772 | While the existing Hadoop/Spark commit algorithms appear to work, they lack
2773 | both the performance and the correctness they deliver when used with a ``real'' filesystem.
2774 |
2775 | We have demonstrated that the use of object-store specific operations --here
2776 | the multipart PUT with its ability to complete the upload from a different host--
2777 | allows for object-store aware commit algorithms to be implemented,
2778 | algorithms which do meet these requirements.
2779 |
2780 | The new committers are implemented in Apache Hadoop 3.1, with a small bridging
2781 | library to aid integration with Apache Spark\ \cite{HADOOP-13786}.
2782 |
2783 |
2784 | These committers have shown that the metaphor presented to applications,
2785 | \emph{Object Stores are File Systems}, cannot be sustained.
2786 | As a means of allowing existing applications to use stores as a source
2787 | of data, the mimicking of directories and files works, albeit sometimes
2788 | inefficiently\ \cite{HADOOP-13208}.
2789 | What does not work is code which expects the strict semantics
2790 | offered by HDFS and other filesystems --atomic creation and rename operations.
2791 | This commit algorithm is one key example of a failure point, as
2792 | is any other algorithm attempting to use a shared filesystem
2793 | as a coordination mechanism between processes.
2794 |
2795 | The Hadoop project has long discussed the merits of explicitly
2796 | exposing an API for object stores, offering only the limited
2797 | set of verbs such stores present\ \cite{HADOOP-9565}.
2798 | However, we have been unable to progress because of the nuanced differences
2799 | between the different stores (S3, Azure WASB and ADL, Google GCS)\ \cite{AWS-S3-intro, Calder11}.
2800 | It is these nuances which prove critical in safely implementing
2801 | commit protocols and suchlike: any API which offered a lowest-common-denominator
2802 | would likely prove itself inadequate.
2803 |
2804 | The integration with the Hadoop and Spark commit protocols is intended
2805 | to support different committers for different destination filesystems.
2806 | We hope to see committers supporting other object stores, each
2807 | able to use store-specific operations.
2808 | What can be offered is common code for much of each implementation,
2809 | knowledge of the new algorithms needed, and
2810 | the suites of tests used to validate their functionality.
2811 |
2812 | One recurrent issue which this work has shown is that using the
2813 | filesystem or object store to communicate state from task attempts
2814 | to the job committer, and from the job committer to successor
2815 | applications, is brittle.
2816 |
2817 | There is no reason why the job committer cannot be passed the list of
2818 | successful task attempts from the job manager, as well as, ideally,
2819 | the list of failed attempts.
2820 | This can be used for the creation of a manifest, and for aiding cleanup
2821 | of failed task attempts.
2822 | The Spark commit protocol does permit committed task attempts to pass data
2823 | to the Spark committer;
2824 | use of this should be explored.
2825 |
2826 |
2827 | Finally, we note that the Hadoop commit protocols are woefully under-documented;
2828 | understanding them involved stepping through tests with a debugger and
2829 | some deliberate fault injection to see what happened.
2830 | Given how critical the correctness of the protocol and the committer implementations
2831 | is, and how other projects also depend on and use the same code, there
2832 | are opportunities to better specify the protocol and APIs, and review
2833 | their use.
2834 | We hope this document is a start, while warning readers that it is non-normative.
2835 |
2836 | % ========================================================================
2837 |
2838 | \section*{Acknowledgements}
2839 | \label{sec:acknowledgements}
2840 |
2841 | We are grateful for the contributions of all reviewers and testers, especially
2842 | Aaron Fabbri and Ewan Higgs.
2843 | We must also highlight the contributions of our QE teams: it is through
2844 | their efforts that this work is ready for others to use.
2845 |
2846 | % ========================================================================
2847 |
2848 | \section{References}
2849 | \label{sec:references}
2850 |
2851 | % Bibliography. Include
2852 |
2853 | \bibliographystyle{IEEEtran}
2854 | \bibliography{bibliography}
2855 |
2856 |
2857 | \end{document}
2858 |
--------------------------------------------------------------------------------
/tex/bibliography.bib:
--------------------------------------------------------------------------------
1 | %% This BibTeX bibliography file was created using BibDesk.
2 | %% http://bibdesk.sourceforge.net/
3 |
4 | %% Created for Loughran, Steve at 2017-12-06 20:15:33 +0000
5 |
6 |
7 | %% Saved with string encoding Unicode (UTF-8)
8 |
9 |
10 |
11 | @url{AWS-S3-intro,
12 | Author = {Amazon},
13 | Title = {{Introduction to Amazon S3}},
14 | Url = {http://docs.aws.amazon.com/AmazonS3/latest/dev/Introduction.html} }
15 |
16 |
17 | @url{AWS-S3-throttling,
18 | Author = {Amazon},
19 | Date-Added = {2017-12-06 20:12:58 +0000},
20 | Date-Modified = {2017-12-06 20:14:27 +0000},
21 | Title = {{Request Rate and Performance Considerations}},
22 | Url = {http://docs.aws.amazon.com/AmazonS3/latest/dev/request-rate-perf-considerations.html}
23 | }
24 |
25 | @url{AWS-clock-service,
26 | Author = {Hunt, Randall},
27 | Date-Added = {2017-12-06 20:12:58 +0000},
28 | Date-Modified = {2017-12-06 20:14:27 +0000},
29 | Title = {{Keeping Time With Amazon Time Sync Service}},
30 | Url = {https://aws.amazon.com/blogs/aws/keeping-time-with-amazon-time-sync-service/},
31 | Urldate = {2017-11-27},
32 | Year = {2017}
33 | }
34 |
35 | @url{AWS-EMR-committer,
36 | Author = {Amazon},
37 | Title = {{ Using the EMRFS S3-optimized Committer }},
38 | Url = {https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-s3-optimized-committer.html},
39 | Urldate = {2018-11},
40 | Year = {2018}
41 | }
42 |
43 | @url{AWS-EMR-committer-tuning,
44 | Author = {Amazon},
45 | Title = {{ EMRFS S3-optimized Committer: Job Tuning Considerations }},
46 | Url = {https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-committer-tuning.html},
47 | Urldate = {2018-11},
48 | Year = {2018}
49 | }
50 |
51 | @url{AWS-EMR-committer-blog,
52 | Author = {Kelly, Jonathan and Slawski, Peter },
53 | Title = {{ Improve Apache Spark write performance on Apache Parquet formats with the EMRFS S3-optimized committer }},
54 | Url = {https://aws.amazon.com/blogs/big-data/improve-apache-spark-write-performance-on-apache-parquet-formats-with-the-emrfs-s3-optimized-committer/},
55 | Urldate = {2019-03},
56 | Year = {2019}
57 | }
58 |
59 |
60 | @url{S3mper,
61 | Author = {Weeks, Daniel C.},
62 | Title = {{ S3mper: Consistency in the Cloud}},
63 | Url = {https://medium.com/netflix-techblog/s3mper-consistency-in-the-cloud-b6a1076aa4f8},
64 | Year = {2014},
65 | }
66 |
67 | @inproceedings{Calder11,
68 | author = {Calder, Brad and Wang, Ju and Ogus, Aaron and Nilakantan, Niranjan and Skjolsvold, Arild and McKelvie, Sam and Xu, Yikang and Srivastav, Shashwat and Wu, Jiesheng and Simitci, Huseyin and Haridas, Jaidev and Uddaraju, Chakravarthy and Khatri, Hemal and Edwards, Andrew and Bedekar, Vaman and Mainali, Shane and Abbasi, Rafay and Agarwal, Arpit and Haq, Mian Fahim ul and Haq, Muhammad Ikram ul and Bhardwaj, Deepali and Dayanand, Sowmya and Adusumilli, Anitha and McNett, Marvin and Sankaran, Sriram and Manivannan, Kavitha and Rigas, Leonidas},
69 | title = {Windows Azure Storage: A Highly Available Cloud Storage Service with Strong Consistency},
70 | booktitle = {Proceedings of the Twenty-Third ACM Symposium on Operating Systems Principles},
71 | series = {SOSP '11},
72 | year = {2011},
73 | isbn = {978-1-4503-0977-6},
74 | location = {Cascais, Portugal},
75 | pages = {143--157},
76 | numpages = {15},
77 | url = {http://doi.acm.org/10.1145/2043556.2043571},
78 | doi = {10.1145/2043556.2043571},
79 | acmid = {2043571},
80 | publisher = {ACM},
81 | address = {New York, NY, USA},
82 | keywords = {Windows Azure, cloud storage, distributed storage systems},
83 | }
84 |
85 | @url{HADOOP-S3A-Committers,
86 | Author = {Apache Software Foundation},
87 | Title = {{Committing work to S3 with the S3A Committers}},
88 | Url = {https://hadoop.apache.org/docs/r3.1.1/hadoop-aws/tools/hadoop-aws/committers.html},
89 | Year = {2018}
90 | }
91 |
92 | @url{HADOOP-9565,
93 | Author = {Loughran, Steve},
94 | Title = {{ HADOOP-9565. Add a Blobstore interface to add to blobstore FileSystems }},
95 | Url = {https://issues.apache.org/jira/browse/HADOOP-9565},
96 | Year = {2013}
97 | }
98 |
99 | @url{HADOOP-10400,
100 | Author = {Mendelson, Jordan},
101 | Title = {{ HADOOP-10400. Incorporate new S3A FileSystem implementation }},
102 | Url = {https://issues.apache.org/jira/browse/HADOOP-10400},
103 | Year = {2014}
104 | }
105 |
106 | @url{HADOOP-13208,
107 | Author = {Loughran, Steve},
108 | Date-Added = {2017-12-06 20:12:58 +0000},
109 | Date-Modified = {2017-12-06 20:14:27 +0000},
110 | Title = {{HADOOP-13208 listFiles(recursive=true) to do a bulk listObjects}},
111 | Url = {https://issues.apache.org/jira/browse/HADOOP-13208},
112 | Urldate = {2016-05-17},
113 | Year = {2016} }
114 |
115 |
116 | @url{HADOOP-13345,
117 | Author = {Nauroth, Chris and Liu, Mingliang and Fabbri, Aaron and Mackrory, Sean and Loughran, Steve},
118 | Date-Added = {2017-12-06 20:12:58 +0000},
119 | Date-Modified = {2017-12-06 20:14:27 +0000},
120 | Title = {{HADOOP-13345. S3Guard: Improved Consistency for S3A }},
121 | Url = {https://issues.apache.org/jira/browse/HADOOP-13345},
122 | Urldate = {2016-07-06},
123 | Year = {2016} }
124 |
125 | @url{HADOOP-13786,
126 | Author = {Loughran, Steve and Blue, Ryan},
127 | Date-Added = {2017-12-06 20:12:58 +0000},
128 | Date-Modified = {2017-12-06 20:14:27 +0000},
129 | Title = {{HADOOP-13786. Add S3A committer for zero-rename commits to S3 endpoints}},
130 | Url = {https://issues.apache.org/jira/browse/HADOOP-13786},
131 | Urldate = {2016-11-02},
132 | Year = {2016} }
133 |
134 | @url{HADOOP-14161,
135 | Author = {Miner, Luke},
136 | Title = {{ HADOOP-14161. Failed to rename file in S3A during FileOutputFormat commitTask }},
137 | Url = {https://issues.apache.org/jira/browse/HADOOP-14161},
138 | Year = {2017} }
139 |
140 | @url{iceberg-asf,
141 | Author = {Apache Software Foundation},
142 | Title = {{Apache Iceberg (incubating)}},
143 | Url = {https://iceberg.apache.org/},
144 | Year = {2018} }
145 |
146 | @url{iceberg,
147 | Author = {Blue, Ryan},
148 | Title = {{ Iceberg: a fast table format for S3 }},
149 | Url = {https://www.slideshare.net/Hadoop_Summit/iceberg-a-fast-table-format-for-s3-103201179},
150 | Year = {2018} }
151 |
152 | @url{MAPREDUCE-4815,
153 | Author = {Li, Siqi},
154 | Title = {{MAPREDUCE-4815. Speed up FileOutputCommitter.commitJob for many output files}},
155 | Url = {https://issues.apache.org/jira/browse/MAPREDUCE-4815},
156 | Urldate = {2015-02-15},
157 | Year = {2015} }
158 |
159 | @url{MAPREDUCE-6823,
160 | Author = {Loughran, Steve},
161 | Date-Added = {2017-12-06 20:04:47 +0000},
162 | Date-Modified = {2017-12-06 20:11:52 +0000},
163 | Title = {{MAPREDUCE-6823. FileOutputFormat to support configurable PathOutputCommitter factory}},
164 | Url = {https://issues.apache.org/jira/browse/MAPREDUCE-6823},
165 | Urldate = {2016-12-14},
166 | Year = {2016} }
167 |
168 | @url{SPARK-6352,
169 | Author = {Lee, Pei-Lun},
170 | Date-Added = {2017-12-06 20:08:54 +0000},
171 | Date-Modified = {2017-12-06 20:11:35 +0000},
172 | Title = {{SPARK-6352 Add DirectParquetOutputCommitter}},
173 | Url = {https://issues.apache.org/jira/browse/SPARK-6352},
174 | Urldate = {2015-04-28},
175 | Year = {2015} }
176 |
177 | @url{SPARK-8029,
178 | Author = {Liu, Davies},
179 | Date-Added = {2017-12-06 20:08:54 +0000},
180 | Date-Modified = {2017-12-06 20:11:35 +0000},
181 | Title = {{SPARK-8029. ShuffleMapTasks must be robust to concurrent attempts on the same executor}},
182 | Url = {https://issues.apache.org/jira/browse/SPARK-8029},
183 | Urldate = {2015-07-02},
184 | Year = {2015} }
185 |
186 | @url{SPARK-8413,
187 | Author = {Kim, Mingyu},
188 | Date-Added = {2017-12-06 20:08:54 +0000},
189 | Date-Modified = {2017-12-06 20:11:35 +0000},
190 | Title = {{ SPARK-8413. DirectParquetOutputCommitter doesn't clean up the file on task failure}},
191 | Url = {https://issues.apache.org/jira/browse/SPARK-8413},
192 | Urldate = {2015-07-15},
193 | Year = {2015} }
194 |
195 | @url{SPARK-10063,
196 | Author = {Xin, Reynold},
197 | Date-Added = {2017-12-06 20:04:47 +0000},
198 | Date-Modified = {2017-12-06 20:11:52 +0000},
199 | Title = {{SPARK-10063. Remove DirectParquetOutputCommitter }},
200 | Url = {https://issues.apache.org/jira/browse/SPARK-10063},
201 | Urldate = {2016-04-07},
202 | Year = {2016} }
203 |
204 | @url{SPARK-18512,
205 | Author = {Bonaccorso, Giuseppe },
206 | Title = {{SPARK-18512. FileNotFoundException on _temporary directory with Spark Streaming 2.0.1 and S3A}},
207 | Url = {https://issues.apache.org/jira/browse/SPARK-18512},
208 | Urldate = {2016-12-15},
209 | Year = {2016} }
210 |
211 | @url{SPARK-21669,
212 | Author = {Ionescu, Adrian},
213 | Title = {{ SPARK-21669. Internal API for collecting metrics/stats during FileFormatWriter jobs}},
214 | Url = {https://issues.apache.org/jira/browse/SPARK-21669},
215 | Urldate = {2017-08-17},
216 | Year = {2017} }
217 |
218 | @url{SPARK-21762,
219 | Author = {Loughran, Steve},
220 | Title = {{ SPARK-21762. FileFormatWriter/BasicWriteTaskStatsTracker metrics collection fails if a new file isn't yet visible}},
221 | Url = {https://issues.apache.org/jira/browse/SPARK-21762},
222 | Urldate = {2017-08-17},
223 | Year = {2017} }
224 |
225 | @url{SPARK-22217,
226 | Author = {Loughran, Steve},
227 | Title = {{ SPARK-22217. ParquetFileFormat to support arbitrary OutputCommitters }},
228 | Url = {https://issues.apache.org/jira/browse/SPARK-22217},
229 | Urldate = {2017-10-13},
230 | Year = {2017} }
231 |
232 | @url{s3-tla,
233 | Author = {Loughran, Steve},
234 | Title = {{ TLA+ Specification of a consistent S3 object store }},
235 | Url = {https://github.com/steveloughran/formality/releases/download/tag_blobstore_0.3/objectstore.pdf},
236 | Year = {2017} }
237 |
238 | @article{Stocator,
239 | Archiveprefix = {arXiv},
240 | Title = {{ Stocator: A High Performance Object Store Connector for Spark}},
241 | Author = {Vernik, Gil and Factor, Michael and Kolodner, Elliot K. and Michiardi, Pietro and Ofer, Effi and Pace, Francesco},
242 | Bibsource = {dblp computer science bibliography, http://dblp.org},
243 | Biburl = {http://dblp.org/rec/bib/journals/corr/abs-1709-01812},
244 | Date-Added = {2017-11-28 14:34:59 +0000},
245 | Date-Modified = {2017-11-28 14:34:59 +0000},
246 | Eprint = {1709.01812},
247 | Journal = {CoRR},
248 | Timestamp = {Thu, 05 Oct 2017 09:42:54 +0200},
249 | Url = {http://arxiv.org/abs/1709.01812},
250 | Volume = {abs/1709.01812},
251 | Year = {2017},
252 | Bdsk-Url-1 = {http://arxiv.org/abs/1709.01812}
253 | }
254 |
255 | @inproceedings{MapReduce,
256 | Acmid = {1251264},
257 | Address = {Berkeley, CA, USA},
258 | Author = {Dean, Jeffrey and Ghemawat, Sanjay},
259 | Booktitle = {Proceedings of the 6th Conference on Symposium on Operating Systems Design and Implementation - Volume 6},
260 | Date-Added = {2017-11-28 14:23:50 +0000},
261 | Date-Modified = {2017-11-28 14:23:50 +0000},
262 | Location = {San Francisco, CA},
263 | Numpages = {1},
264 | Pages = {10--10},
265 | Publisher = {USENIX Association},
266 | Series = {OSDI'04},
267 | Title = {{MapReduce: Simplified Data Processing on Large Clusters}},
268 | Url = {http://dl.acm.org/citation.cfm?id=1251254.1251264},
269 | Year = {2004},
270 | Bdsk-Url-1 = {http://dl.acm.org/citation.cfm?id=1251254.1251264}
271 | }
272 |
273 |
274 | @book{2011_AOSA,
275 | Title = {{ The Architecture of Open Source Applications }},
276 | Author = {{ Brown, Amy and Wilson, Greg }},
277 | Isbn = {9781257638017},
278 | Month = jun,
279 | Posted-At = {2011-07-14 10:06:08},
280 | Priority = {5},
281 | Publisher = {CreativeCommons},
282 | Url = {http://aosabook.org/en/index.html},
283 | Year = {2011},
284 | Bdsk-Url-1 = {http://www.worldcat.org/isbn/9781257638017}
285 | }
286 |
287 | @inproceedings{Bermbach:2014:BEC:2624303.2624662,
288 | author = {Bermbach, David and Tai, Stefan},
289 | title = {Benchmarking Eventual Consistency: Lessons Learned from Long-Term Experimental Studies},
290 | booktitle = {Proceedings of the 2014 IEEE International Conference on Cloud Engineering},
291 | series = {IC2E '14},
292 | year = {2014},
293 | isbn = {978-1-4799-3766-0},
294 | pages = {47--56},
295 | numpages = {10},
296 | url = {https://doi.org/10.1109/IC2E.2014.37},
297 | doi = {10.1109/IC2E.2014.37},
298 | acmid = {2624662},
299 | publisher = {IEEE Computer Society},
300 | address = {Washington, DC, USA},
301 | }
302 |
303 | @incollection{Chansler2011,
304 | author = {{Chansler, Robert and Kuang, Hairong and Radia, Sanjay and Shvachko, Konstantin and Srinivas, Suresh}},
305 | title = {{The Hadoop Distributed File System}},
306 | editor = {{ Brown, Amy and Wilson, Greg }},
307 | booktitle = {{ The Architecture of Open Source Applications }},
308 | Isbn = {9781257638017},
309 | chapter = {8},
310 | Publisher = {CreativeCommons},
311 | Url = {http://aosabook.org/en/index.html},
312 | Year = {2011},
313 | Bdsk-Url-1 = {http://www.worldcat.org/isbn/9781257638017}
314 | }
315 |
316 | @inproceedings{Vavilapalli2013,
317 | author = {Vavilapalli, Vinod Kumar and Murthy, Arun C. and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and Saha, Bikas and Curino, Carlo and O'Malley, Owen and Radia, Sanjay and Reed, Benjamin and Baldeschwieler, Eric},
318 | title = {{Apache Hadoop YARN: Yet Another Resource Negotiator}},
319 | booktitle = {{Proceedings of the 4th Annual Symposium on Cloud Computing}},
320 | series = {SOCC '13},
321 | year = {2013},
322 | isbn = {978-1-4503-2428-1},
323 | location = {Santa Clara, California},
324 | pages = {5:1--5:16},
325 | articleno = {5},
326 | numpages = {16},
327 | url = {http://doi.acm.org/10.1145/2523616.2523633},
328 | doi = {10.1145/2523616.2523633},
329 | acmid = {2523633},
330 | publisher = {ACM},
331 | address = {New York, NY, USA},
332 | }
333 |
334 | @article{Satya89,
335 | author = {M. Satyanarayanan},
336 | title = {A Survey of Distributed File Systems},
337 | institution = {Department of Computer Science Carnegie Mellon University},
338 | year = {1989},
339 | doi = {0.1146/annurev.cs.04.060190.000445},
340 | journal = {Annual Review of Computer Science},
341 | pages = {73-104},
342 | notes = {Vol. 4:73-104 (Volume publication date June 1990)}
343 | }
344 |
345 |
--------------------------------------------------------------------------------
/tex/commit-protocol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steveloughran/zero-rename-committer/fe31fbf8f474c8580a1471a08eec3946b642268c/tex/commit-protocol.png
--------------------------------------------------------------------------------
/tex/commit-protocol.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 |
3 | 'define icons
4 | control "Job Manager"
5 | participant "Job Committer"
6 | control Executor
7 | participant "Task Committer"
8 | participant Operation
9 | control "YARN NodeManager"
10 | control "YARN ResourceManager"
11 |
12 | ' protocol
13 | '== Initialization ==
14 |
15 | "Job Manager" -> "Job Committer": setupJob()
16 |
17 | == For Each Task Attempt ==
18 | "Job Manager" -> Executor: execute work
19 | Executor -> "Task Committer": setupTask()
20 | Executor -> "Task Committer": getWorkPath()
21 | Executor <-- "Task Committer": taskAttemptDirectory
22 | Executor --> Operation: map/reduce + taskAttemptDirectory
23 |
24 | loop Until execution is complete...
25 | Executor -> "Job Manager" : ping?
26 | "Job Manager" -> Executor : task is known
27 | alt task is unknown/timeout/limits-exceeded
28 | Executor -> Executor: exit!
29 | Executor -> "YARN NodeManager": (exited)
30 | "YARN NodeManager" -> "YARN ResourceManager": process exited
31 | end
32 | "Job Manager" -> "YARN ResourceManager": heartbeat request
33 | "Job Manager" <-- "YARN ResourceManager": heartbeat response [exited tasks]*
34 | end
35 | Executor <-- Operation: map/reduce finished
36 |
37 | 'TODO: MR failure
38 |
39 | Executor -> "Task Committer": needsTaskCommit()?
40 | Executor <-- "Task Committer":
41 |
42 |
43 | alt needs task commit
44 | Executor -> "Job Manager": can commit?
45 | Executor <-- "Job Manager":
46 | alt action = commit
47 | Executor -> "Job Manager": commitPending
48 | Executor -> "Task Committer": commitTask()
49 | Executor <-- "Task Committer": committed task
50 | else action = abort
51 | Executor -> "Task Committer": abortTask()
52 | Executor <-- "Task Committer": aborted task
53 | end
54 | end
55 | Executor -> "Job Manager" : done
56 | Executor -> Executor: exit!
57 | Executor -> "YARN NodeManager": (exited)
58 | "YARN NodeManager" -> "YARN ResourceManager": process exited
59 |
60 | alt done-not-received-in time interval
61 | "Job Manager" -> "Job Committer": cleanupTask()
62 | "Job Manager" -> "Job Manager": reschedule task
63 | end
64 |
65 |
66 | == Job Commit==
67 |
68 | "Job Manager" -> "YARN ResourceManager": heartbeat request
69 | "Job Manager" <-- "YARN ResourceManager": heartbeat response [exited tasks]*
70 |
71 | "Job Manager" -> "Job Committer": commitJob()
72 | "Job Manager" <-- "Job Committer": committed job
73 | "Job Manager" -> "Job Committer": cleanupJob()
74 | "Job Manager" -> "YARN ResourceManager": finished
75 | "Job Manager" -> "Job Manager": exit!
76 | "Job Manager" -> "YARN NodeManager": (exited)
77 | "YARN NodeManager" -> "YARN ResourceManager": process exited
78 |
79 |
80 | @enduml
81 |
--------------------------------------------------------------------------------
/tex/improvements-to-the-commit-protocols.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | % ========================================================================
3 | \begin{document}
4 |
5 | What can be done for improving commit operations
6 |
7 | \section{Hadoop}\label{sec:hadoop}
8 |
9 | Add check for dest dir existing before job setup?
10 | Currently there's a window between job submit and OutputCommitter.setupJob()
11 | where no attempt to create the dest dir exists.
12 |
13 | The *first* attempt could reject if the dest dir is there;
14 | later ones should expect it.
15 |
16 | \section{Spark}\label{sec:spark}
17 |
18 | Handling on task attempt commit timeout
19 |
20 | \begin{enumerate}
21 | \item choose whether to commit another attempt on task commit timeout based on committer attributes.
22 | \item Short term: if FileOutputCommitter, check algorithm and reject restart if v2
23 | \item Long term: add predicate to PathOutputCommitter.
24 | \end{enumerate}
25 |
26 | Avoid duplicate job attempt commit after a partitioned Spark driver.
27 |
28 | Q1: is this actually a problem on YARN?
29 |
30 | How would a test simulate this?
31 |
32 | Q2: what about on Mesos or YARN with driver running off-cluster?
33 |
34 | Q3: How to address this in an infra-neutral way?
35 | Need to query the infrastructure for the last time a heartbeat took place and wait until
36 | it is within a time range before committing.
37 |
38 |
39 | \texttt{newTaskTempFileAbsPath()} is, well, trouble for the new committers.
40 | It could be supported in the staging committers, again through an upload in task commit.
41 |
42 | For the magic committer, we could add a new prefix under \texttt{__magic},
43 | \texttt{__root}, alongside \texttt{__base}.
44 | The magic path mapping logic would then be ``when writing to a magic path
45 | \texttt{dest/__magic/jobAttemptId/taskAttemptID/__root/abspath}, the destination of the write
46 | actually becomes \texttt{abspath}''.
47 | The existing commit algorithm would automatically incorporate the .pending files
48 | created in the write, and so commit them along with the relative paths.
49 | There is one sole complication: with the writes scattered through the S3 bucket,
50 | a full cleanup would require listing and aborting all outstanding MPUs
51 | writes throughout the bucket.
52 | This could not be done without breaking all other active jobs writing to
53 | the same bucket.
54 |
55 |
56 | Committer resilience/reliability could be enhanced by tracking that list of
57 | files to commit: tasks could record the files they'd provisionally committed and return
58 | that to the job committer. (more specifically, final dest files)
59 | That could then verify that after job commit, they were there.
60 | This is a safety check which could be done at the commit layer; the spark protocols
61 | have room for this
62 |
63 |
64 | \section{Statistic collection}
65 |
66 | A long-term fix would be to radically improve the means by which statistics are
67 | collected by tasks and then aggregated in the job committer.
68 | The latest versions of the HDFS and object store connectors are heavily instrumented,
69 | collecting lots of information on filesystem client use.
70 | For S3A this includes: the number of times an HTTP1.1 connection was aborted on a file read,
71 | the number of times an S3 request was throttled and retried,
72 | even some latency statistics.
73 | All of these would be useful if they could be collected by the job, aggregated
74 | appropriately, and included in the job execution history.
75 | It is possible to collect this information, and then pass this back.
76 | Indeed, our committers do collect this data themselves, saving it in the
77 | manifest files listing pending uploads, combining this into the \SUCCESS file
78 | in job commit.
79 | This data is not yet extracted by the job committer and returned to the execution
80 | environment.
81 | This can be fixed.
82 |
83 | \section{Committer Capabilities}
84 |
85 | Committers should be able to declare that they have certain attributes, such
86 | as repeatable task commit operations (currently only job commit is declared).
87 |
88 | Rather than propose a continually growing list of predicates, we'd argue for
89 | implementing the \texttt{org.apache.hadoop.fs.StreamCapabilities} interface
90 | and its \texttt{boolean hasCapability(String capability)} predicate, with capabilities
91 | chosen so that the default return value, false, is the pessimistic outcome.
92 | By reusing the capabilities probe already built into Hadoop 2.9+, it's easier
93 | to adopt the checks.
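
A sketch of a committer advertising such capabilities follows; the capability
strings are illustrative only.

\begin{verbatim}
// Sketch: a committer advertising optional capabilities via StreamCapabilities.
import org.apache.hadoop.fs.StreamCapabilities;

public class CapabilityAwareCommitterSketch implements StreamCapabilities {

  // Illustrative capability name; any unknown string returns false.
  public static final String REPEATABLE_TASK_COMMIT =
      "committer.task.commit.repeatable";

  @Override
  public boolean hasCapability(String capability) {
    switch (capability) {
      case REPEATABLE_TASK_COMMIT:
        return true;
      default:
        return false;   // pessimistic default
    }
  }
}
\end{verbatim}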
94 |
95 |
96 |
97 | \section{S3a Committers}
98 |
99 |
100 | \section{Validation}
101 |
102 | What about an option of a post Job commit verification that all committed files
103 | are really there?
104 | That's done in the integration tests by reading the data in _SUCCESS;
105 | it could be added into the Job Committer itself.
106 | It's potentially expensive in cost and time at a HEAD/file;
107 | a listFiles(dest, recursive) could be used instead, with, on the partitioned
108 | committer, only generated partitions checked.
109 |
110 | This is really just a safety check, but it could be useful in testing
111 | and diagnostics, because you wouldn't need to add specific logic to read in
112 | the (unstable) _SUCCESS JSON data and check there.
113 |
114 | \subsubsection{Magic committer}
115 |
116 | Make sure that an empty directory can still be committed and generates
117 | an empty pendingset file, and that when loaded, it is harmless.
118 |
119 |
120 |
121 | \section{Aborts and cleanup}
122 |
123 | Everything needs to assume that the committers' abort and cleanup operations may raise exceptions.
124 |
125 | When invoked, they should be surrounded by try/catch blocks. When subclassed, the subclasses
126 | should do their own cleanup operations in a try/finally clause.
127 |
128 | Executors/drivers should call these operations on all failure codepaths;
129 | Hadoop MR doesn't appear to do this.
130 |
131 | Current committers should do that try/catch themselves, even though the method
132 | signature allows them to raise IOEs.
133 | This avoids them having to wait for new releases of the execution engines
134 | for the added resilience.
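
A sketch of the invocation pattern described above; the surrounding engine
code is illustrative.

\begin{verbatim}
// Sketch: never let a failure in abortTask() mask the original task failure.
import java.io.IOException;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class AbortHardeningSketch {
  public static void abortQuietly(OutputCommitter committer,
      TaskAttemptContext context) {
    try {
      committer.abortTask(context);
    } catch (IOException | RuntimeException e) {
      // log and continue; job-level cleanup remains the backstop
    }
  }
}
\end{verbatim}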
135 |
136 |
137 | \end{document}
138 |
--------------------------------------------------------------------------------
/tex/notes.tex:
--------------------------------------------------------------------------------
1 | %%
2 | %% Author: stevel
3 | %% 03/01/2018
4 | %%
5 |
6 | % Preamble
7 | \documentclass[11pt]{article}
8 |
9 | % Packages
10 | \usepackage{a4wide}
11 |
12 | % Document
13 | \begin{document}
14 |
15 |
16 |
17 | The check for a dest dir existing happens in job submission, thus there is a
18 | small race condition where >1 job may target the same directory.
19 |
20 | \begin{verbatim}
21 |
22 |
23 | Tests run: 1, Failures: 0, Errors: 1, Skipped: 0, Time elapsed: 10.82 s <<< FAILURE! - in org.apache.hadoop.fs.s3a.commit.staging.integration.ITStagingCommitMRJobBadDest[ERROR] testMRJob(org.apache.hadoop.fs.s3a.commit.staging.integration.ITStagingCommitMRJobBadDest) Time elapsed: 3.138 s <<< ERROR!org.apache.hadoop.mapred.FileAlreadyExistsException: Output directory s3a://hwdev-steve-ireland-new/test/DELAY_LISTING_ME/testMRJob already exists
24 | at org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.checkOutputSpecs(FileOutputFormat.java:164)
25 | at org.apache.hadoop.mapreduce.JobSubmitter.checkSpecs(JobSubmitter.java:280)
26 | at org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:146)
27 | at org.apache.hadoop.mapreduce.Job\$11.run(Job.java:1570)
28 | at org.apache.hadoop.mapreduce.Job\$11.run(Job.java:1567)
29 | at java.security.AccessController.doPrivileged(Native Method)
30 | at javax.security.auth.Subject.doAs(Subject.java:422)
31 | at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1965)
32 | at org.apache.hadoop.mapreduce.Job.submit(Job.java:1567)
33 | at org.apache.hadoop.fs.s3a.commit.AbstractITCommitMRJob.testMRJob(AbstractITCommitMRJob.java:206)
34 | at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
35 | at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
36 | at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
37 | at java.lang.reflect.Method.invoke(Method.java:498)
38 | at org.junit.runners.model.FrameworkMethod\$1.runReflectiveCall(FrameworkMethod.java:47)
39 | at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
40 | at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:44)
41 | at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
42 | at org.junit.internal.runners.statements.RunBefores.evaluate(RunBefores.java:26)
43 | at org.junit.internal.runners.statements.RunAfters.evaluate(RunAfters.java:27)
44 | at org.junit.rules.ExternalResource\$1.evaluate(ExternalResource.java:48)
45 | at org.junit.rules.TestWatcher\$1.evaluate(TestWatcher.java:55)
46 | at org.junit.internal.runners.statements.FailOnTimeout\$StatementThread.run(FailOnTimeout.java:74)
47 |
48 | \end{verbatim}
49 |
50 | \end{document}
51 |
52 |
--------------------------------------------------------------------------------
/tex/spark-protocol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steveloughran/zero-rename-committer/fe31fbf8f474c8580a1471a08eec3946b642268c/tex/spark-protocol.png
--------------------------------------------------------------------------------
/tex/spark-protocol.puml:
--------------------------------------------------------------------------------
1 | @startuml
2 |
3 | 'define icons
4 | control Driver
5 | participant "Job Committer"
6 | control Executor
7 | participant "Task Committer"
8 | participant Operation
9 |
10 | ' protocol
11 | '== Initialization ==
12 |
13 | Driver -> "Job Committer": setupJob()
14 |
15 | == For Each Task Attempt ==
16 | Driver -> Executor: execute work
17 | Executor -> "Task Committer": setupTask()
18 | Executor --> Operation: execute
19 | "Task Committer" <-- Operation: newTaskTempFile
20 | "Task Committer" --> Operation: tempFile
21 |
22 | "Task Committer" <-- Operation: newTaskTempFileAbsPath
23 | "Task Committer" --> Operation: tempFileAbsPath
24 |
25 | Executor <-- Operation: finished
26 |
27 | ' now look at the commit protocol
28 | Executor -> "Task Committer": needsTaskCommit()?
29 | Executor <-- "Task Committer":
30 |
31 | alt needsTaskCommit == true
32 | Executor -> "Driver": AskPermissionToCommitOutput
33 | alt permission to commit granted
34 | Executor -> "Task Committer": commitTask()
35 | Executor <-- "Task Committer": TaskCommitMessage
36 | Executor -> Driver : Success + TaskCommitMessage
37 | else
38 | Executor -> Driver : TaskCommitDenied
39 | end
40 | else needsTaskCommit == false
41 | Executor -> "Task Committer": abortTask()
42 | Executor <-- "Task Committer": aborted task
43 | Executor -> Driver : Success
44 | end
45 |
46 |
47 | == Job Commit==
48 |
49 |
50 | Driver -> "Job Committer": commitJob(TaskCommitMessage+)
51 | Driver <-- "Job Committer": committed job
52 | Driver -> "Job Committer": cleanupJob()
53 |
54 |
55 | @enduml
56 |
--------------------------------------------------------------------------------