├── pig ├── pig.pdf ├── .gitignore ├── Figures │ ├── opt1.pdf │ ├── opt2.pdf │ ├── opt3.pdf │ ├── cogroup.pdf │ ├── compiler.pdf │ ├── pig_perf.pdf │ ├── example2_mr.pdf │ ├── expressions.pdf │ ├── example2_plan.pdf │ └── pig_overview.pdf ├── pig.tex └── references.bib ├── disa ├── disa.pdf ├── .gitignore ├── Figures │ ├── pr.png │ ├── pairs.png │ ├── pbfs.png │ ├── pr_toy.png │ ├── stripes.png │ ├── combiners.png │ ├── functional.png │ ├── pr_sketch.png │ ├── simple_MR.png │ ├── word_count.png │ └── simple_MR_combiners.png ├── references.bib ├── disa.tex ├── principles.tex ├── graph_algorithms.tex ├── programming_model.tex └── design_patterns.tex ├── .gitignore ├── ccomp └── dscc-4.pdf ├── dstore ├── dscc-1.pdf └── dscc-2.pdf ├── hadoop ├── hadoop.pdf ├── Figures │ ├── hdfs.png │ ├── data2map.png │ ├── map_task.png │ ├── mapreduce.png │ ├── reduce_task.png │ ├── sequencefiles.png │ ├── split_block.png │ ├── hadoop_distance.png │ ├── spill_partition.png │ ├── chain_replication.png │ └── cluster_net_topology.png ├── introduction.tex ├── references.bib ├── hadoop.tex ├── io.tex ├── hdfs.tex ├── deployments.tex └── mapreduce.tex ├── intro ├── course.pdf ├── .gitignore ├── course.tex └── introduction.tex ├── relal ├── relal.pdf ├── .gitignore ├── relal.tex └── relational.tex ├── coordination └── dscc-3.pdf └── README.md /pig/pig.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/pig.pdf -------------------------------------------------------------------------------- /disa/disa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/disa.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .aux 2 | .bbl 3 | .blg 4 | .log 5 | .nav 6 | .out 7 | .snm 8 | .toc 9 | 10 | -------------------------------------------------------------------------------- /ccomp/dscc-4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/ccomp/dscc-4.pdf -------------------------------------------------------------------------------- /dstore/dscc-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/dstore/dscc-1.pdf -------------------------------------------------------------------------------- /dstore/dscc-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/dstore/dscc-2.pdf -------------------------------------------------------------------------------- /hadoop/hadoop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/hadoop.pdf -------------------------------------------------------------------------------- /intro/course.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/intro/course.pdf -------------------------------------------------------------------------------- /relal/relal.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/relal/relal.pdf -------------------------------------------------------------------------------- /disa/.gitignore: -------------------------------------------------------------------------------- 1 | .aux 2 | .bbl 3 | .blg 4 | .log 5 | .nav 6 | .out 7 | .snm 8 | .toc 9 | 10 | -------------------------------------------------------------------------------- /disa/Figures/pr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/pr.png -------------------------------------------------------------------------------- /intro/.gitignore: -------------------------------------------------------------------------------- 1 | .aux 2 | .bbl 3 | .blg 4 | .log 5 | .nav 6 | .out 7 | .snm 8 | .toc 9 | 10 | -------------------------------------------------------------------------------- /pig/.gitignore: -------------------------------------------------------------------------------- 1 | .aux 2 | .bbl 3 | .blg 4 | .log 5 | .nav 6 | .out 7 | .snm 8 | .toc 9 | 10 | -------------------------------------------------------------------------------- /relal/.gitignore: -------------------------------------------------------------------------------- 1 | .aux 2 | .bbl 3 | .blg 4 | .log 5 | .nav 6 | .out 7 | .snm 8 | .toc 9 | 10 | -------------------------------------------------------------------------------- /disa/Figures/pairs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/pairs.png -------------------------------------------------------------------------------- /disa/Figures/pbfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/pbfs.png -------------------------------------------------------------------------------- /pig/Figures/opt1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/opt1.pdf -------------------------------------------------------------------------------- /pig/Figures/opt2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/opt2.pdf -------------------------------------------------------------------------------- /pig/Figures/opt3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/opt3.pdf -------------------------------------------------------------------------------- /coordination/dscc-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/coordination/dscc-3.pdf -------------------------------------------------------------------------------- /disa/Figures/pr_toy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/pr_toy.png -------------------------------------------------------------------------------- /disa/Figures/stripes.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/stripes.png -------------------------------------------------------------------------------- /hadoop/Figures/hdfs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/hdfs.png -------------------------------------------------------------------------------- /pig/Figures/cogroup.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/cogroup.pdf -------------------------------------------------------------------------------- /pig/Figures/compiler.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/compiler.pdf -------------------------------------------------------------------------------- /pig/Figures/pig_perf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/pig_perf.pdf -------------------------------------------------------------------------------- /disa/Figures/combiners.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/combiners.png -------------------------------------------------------------------------------- /disa/Figures/functional.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/functional.png -------------------------------------------------------------------------------- /disa/Figures/pr_sketch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/pr_sketch.png -------------------------------------------------------------------------------- /disa/Figures/simple_MR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/simple_MR.png -------------------------------------------------------------------------------- /disa/Figures/word_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/word_count.png -------------------------------------------------------------------------------- /hadoop/Figures/data2map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/data2map.png -------------------------------------------------------------------------------- /hadoop/Figures/map_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/map_task.png -------------------------------------------------------------------------------- /pig/Figures/example2_mr.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/example2_mr.pdf 
-------------------------------------------------------------------------------- /pig/Figures/expressions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/expressions.pdf -------------------------------------------------------------------------------- /hadoop/Figures/mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/mapreduce.png -------------------------------------------------------------------------------- /pig/Figures/example2_plan.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/example2_plan.pdf -------------------------------------------------------------------------------- /pig/Figures/pig_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/pig/Figures/pig_overview.pdf -------------------------------------------------------------------------------- /hadoop/Figures/reduce_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/reduce_task.png -------------------------------------------------------------------------------- /hadoop/Figures/sequencefiles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/sequencefiles.png -------------------------------------------------------------------------------- /hadoop/Figures/split_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/split_block.png -------------------------------------------------------------------------------- /hadoop/Figures/hadoop_distance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/hadoop_distance.png -------------------------------------------------------------------------------- /hadoop/Figures/spill_partition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/spill_partition.png -------------------------------------------------------------------------------- /disa/Figures/simple_MR_combiners.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/disa/Figures/simple_MR_combiners.png -------------------------------------------------------------------------------- /hadoop/Figures/chain_replication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/chain_replication.png -------------------------------------------------------------------------------- /hadoop/Figures/cluster_net_topology.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fede1024/DISC-CLOUD-COURSE/master/hadoop/Figures/cluster_net_topology.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Data-intensive Scalable Computing and Clouds
2 | =================
3 |
4 | This repo contains the LaTeX sources of some of the lecture notes of the Distributed Systems and Cloud Computing course at Eurecom.
5 |
6 | ### DISC or Distributed Systems?
7 | Actually, a bit of both, but I couldn't figure out a short name for all of that.
8 |
9 | ### Who?
10 | This repo is maintained by [Pietro Michiardi](http://www.eurecom.fr/~michiard).
11 |
12 | I work at [Eurecom](http://www.eurecom.fr).
13 |
--------------------------------------------------------------------------------
/intro/course.tex:
--------------------------------------------------------------------------------
1 | \documentclass{beamer}
2 |
3 | \usepackage[lined,ruled]{algorithm2e}
4 | \usepackage{subfigure}
5 | \usepackage[english]{babel}
6 | \usepackage[latin1]{inputenc}
7 | \usepackage{times}
8 | \usepackage[T1]{fontenc}
9 | \usepackage{color}
10 |
11 | \usetheme[secheader]{Boadilla}
12 | \usefonttheme[onlylarge]{structurebold}
13 | \setbeamerfont*{frametitle}{size=\normalsize,series=\bfseries}
14 | \setbeamertemplate{navigation symbols}{}
15 | \setbeamertemplate{mini frames}[box]
16 | \setbeamertemplate{sections/subsections in toc}[square]
17 | \setbeamertemplate{blocks}[rounded][shadow=true]
18 | \setbeamertemplate{bibliography item}[text]
19 |
20 | \setbeamercolor{lightorange}{fg=black,bg=orange!40}
21 | \setbeamercolor{lightblue}{fg=black,bg=blue!30}
22 |
23 | \newenvironment{colorblock}[2]
24 | {\setbeamercolor{item}{fg=#1,bg=#1}\begin{beamerboxesrounded}[upper=#1,lower=#2,shadow=true]}
25 | {\end{beamerboxesrounded}}
26 |
27 |
28 |
29 | % Setup TikZ
30 |
31 | \usepackage{tikz}
32 | \usetikzlibrary{arrows}
33 | \tikzstyle{block}=[draw opacity=0.7,line width=1.4cm]
34 |
35 |
36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
37 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
38 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
39 |
40 | \newtheorem{observation}[theorem]{Observation}
41 |
42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
44 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
45 |
46 | \title{Data-intensive Scalable Computing}
47 | \subtitle{Introduction}
48 | \author{Pietro Michiardi}
49 | \institute{Eurecom}
50 | \date
51 |
52 |
53 | \begin{document}
54 |
55 | \begin{frame}
56 | \titlepage
57 | \end{frame}
58 |
59 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
60 | \section{Introduction and Motivations}
61 |
62 | \begin{frame}
63 | \begin{colorblock}{blue}{lightblue}{ }
64 | \begin{center}
65 | \Huge \textbf{\texttt{Introduction and Motivations}}
66 | \end{center}
67 | \end{colorblock}
68 | \end{frame}
69 |
70 | \input{./introduction}
71 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
72 |
73 | \end{document}
74 |
--------------------------------------------------------------------------------
/hadoop/introduction.tex:
--------------------------------------------------------------------------------
1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2 | \frame {\frametitle{From Theory to Practice}
3 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4 | \begin{itemize}
5 | \item \textbf{The story so far}
6 | \begin{itemize}
7 | \item Principles behind the MapReduce Framework
8 | \item Programming model
9 | \item Algorithm design and patterns
10 | \end{itemize}
11 |
12 | \vspace{20pt}
13 |
14 | \item \textbf{Hadoop implementation of MapReduce}
15 | \begin{itemize}
16 | \item HDFS in detail
17 | \item Hadoop MapReduce
18 | \begin{itemize}
19 | \item Implementation details
20 | \item Types and Formats
21 | \end{itemize}
22 | \item Hadoop I/O
23 | \end{itemize}
24 |
25 | \vspace{20pt}
26 |
27 | \item \textbf{Hadoop Deployments}
28 | \begin{itemize}
29 | \item The BigFoot platform
30 | \end{itemize}
31 | \end{itemize}
32 | }
33 |
34 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
35 | \frame {\frametitle{Terminology}
36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
37 | \begin{itemize}
38 | \item \textbf{MapReduce:}
39 | \begin{itemize}
40 | \item \texttt{Job}: an execution of a Mapper and Reducer across a
41 | data set
42 | \item \texttt{Task}: an execution of a Mapper or a Reducer on a slice of
43 | data
44 | \item \texttt{Task Attempt}: instance of an attempt to execute a
45 | task
46 | \item \textbf{Example:}
47 | \begin{itemize}
48 | \item Running ``Word Count'' across 20 files is one job
49 | \item 20 files to be mapped = 20 map tasks + some number of
50 | reduce tasks
51 | \item At least 20 attempts will be performed... more if a
52 | machine crashes
53 | \end{itemize}
54 | \end{itemize}
55 |
56 |
57 |
58 | \vspace{20pt}
59 |
60 | \item \textbf{Task Attempts}
61 | \begin{itemize}
62 | \item A task is attempted at least once, possibly more
63 | \item Multiple crashes on the same input imply discarding it
64 | \item Multiple attempts may occur in parallel (a.k.a. speculative execution)
65 | \item Task ID from TaskInProgress is not a unique identifier
66 | \end{itemize}
67 | \end{itemize}
68 | }
69 |
--------------------------------------------------------------------------------
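The ``Word Count'' job used in the Terminology frame above maps directly onto code. Below is a minimal, self-contained sketch against the stock org.apache.hadoop.mapreduce API (class names and I/O paths are illustrative, not taken from this repo): submitted over 20 input files, it forms one job, roughly one map task per file (assuming each file fits in a single split), plus however many task attempts failures and speculative execution add.

```java
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, ONE);          // emit (term, 1) for every token
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable v : values) sum += v.get();  // sum partial counts per term
      context.write(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);  // local aggregation before the shuffle
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));    // e.g. a dir with 20 files
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
```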
/intro/introduction.tex:
--------------------------------------------------------------------------------
1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2 | \frame {\frametitle{What is this Course About}
3 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4 | \begin{itemize}
5 | \item \textbf{The MapReduce Programming Model}
6 | \begin{itemize}
7 | \item Principles of functional programming
8 | \item Scalable algorithm design
9 | \end{itemize}
10 |
11 | \vspace{20pt}
12 |
13 | \item \textbf{In-depth description of Hadoop MapReduce}
14 | \begin{itemize}
15 | \item Architecture internals
16 | \item Software components
17 | \item Cluster deployments
18 | \end{itemize}
19 |
20 | \vspace{20pt}
21 |
22 | \item \textbf{Relational Algebra and High-Level Languages}
23 | \begin{itemize}
24 | \item Basic operators and their equivalence in MapReduce
25 | \item Hadoop Pig and PigLatin
26 | \end{itemize}
27 |
28 | \end{itemize}
29 | }
30 |
31 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
32 | \frame {\frametitle{What is MapReduce?}
33 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
34 | \begin{itemize}
35 | \item \textbf{A programming model}:
36 | \begin{itemize}
37 | \item Inspired by functional programming
38 | \item Parallel computations on massive amounts of data
39 | \end{itemize}
40 |
41 | \vspace{20pt}
42 |
43 | \item \textbf{An execution framework}:
44 | \begin{itemize}
45 | \item Designed for large-scale data processing
46 | \item Designed to run on clusters of commodity hardware
47 | \end{itemize}
48 | \end{itemize}
49 | }
50 |
51 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
52 | \frame {\frametitle{What is Big Data?}
53 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
54 | \begin{itemize}
55 | \item \textbf{Vast repositories of data}
56 | \begin{itemize}
57 | \item The Web
58 | \item Physics
59 | \item Astronomy
60 | \item Finance
61 | \end{itemize}
62 |
63 | \vspace{20pt}
64 |
65 | \item \textbf{Volume, Velocity, Variety}
66 |
67 | \vspace{20pt}
68 |
69 | \item \textbf{It's not the algorithm, it's the data!} \cite{banko01}
70 | \begin{itemize}
71 | \item More data leads to better accuracy
72 | \item With more data, accuracy of different algorithms converges
73 | \end{itemize}
74 | \end{itemize}
75 | }
76 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
77 |
78 |
--------------------------------------------------------------------------------
/pig/pig.tex:
--------------------------------------------------------------------------------
1 | \documentclass{beamer}
2 |
3 | \usepackage{subfigure}
4 | \usepackage[english]{babel}
5 | \usepackage[latin1]{inputenc}
6 | \usepackage{times}
7 | \usepackage[T1]{fontenc}
8 | \usepackage{color}
9 |
10 | \usepackage{algorithm}
11 | \usepackage{algorithmicx}
12 | \usepackage[noend]{algpseudocode}
13 |
14 | \usetheme[secheader]{Boadilla}
15 | \usefonttheme[onlylarge]{structurebold}
16 | \setbeamerfont*{frametitle}{size=\normalsize,series=\bfseries}
17 | \setbeamertemplate{navigation symbols}{}
18 | \setbeamertemplate{mini frames}[box]
19 | \setbeamertemplate{sections/subsections in toc}[square]
20 | \setbeamertemplate{blocks}[rounded][shadow=true]
21 | \setbeamertemplate{bibliography item}[text]
22 |
23 | \setbeamercolor{lightorange}{fg=black,bg=orange!40}
24 | \setbeamercolor{lightblue}{fg=black,bg=blue!30}
25 |
26 | \newenvironment{colorblock}[2]
27 | {\setbeamercolor{item}{fg=#1,bg=#1}\begin{beamerboxesrounded}[upper=#1,lower=#2,shadow=true]}
28 | {\end{beamerboxesrounded}}
29 |
30 |
31 |
32 | % Setup TikZ
33 |
34 | \usepackage{tikz}
35 | \usetikzlibrary{arrows}
36 | \tikzstyle{block}=[draw opacity=0.7,line width=1.4cm]
37 |
38 |
39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
42 |
43 | \newtheorem{observation}[theorem]{Observation}
44 |
45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
48 |
49 | \title{High-level Programming Languages}
50 | \subtitle{Apache Pig and Pig Latin}
51 | \author{Pietro Michiardi}
52 | \institute{Eurecom}
53 | \date
54 |
55 |
56 | \begin{document}
57 |
58 | \begin{frame}
59 | \titlepage
60 | \end{frame}
61 |
62 |
63 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
64 | \section{Apache Pig}
65 |
66 | \begin{frame}
67 | \begin{colorblock}{blue}{lightblue}{ }
68 | \begin{center}
69 | \Huge \textbf{\texttt{Apache Pig}}
70 | \end{center}
71 | \end{colorblock}
72 |
73 | \begin{itemize}
74 | \item[] See also the 4 segments on Pig on Coursera:
75 | \item[] \url{https://www.coursera.org/course/datasci}
76 | \end{itemize}
77 |
78 | \end{frame}
79 |
80 | \input{./pig-overview}
81 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
82 |
83 |
84 |
85 |
86 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
87 | \section{References}
88 |
89 | \begin{frame}
90 | \begin{colorblock}{blue}{lightblue}{ }
91 | \begin{center}
92 | \Huge \textbf{\texttt{References}}
93 | \end{center}
94 | \end{colorblock}
95 | \end{frame}
96 |
97 | \begin{frame}[allowframebreaks]{References}
98 | \bibliographystyle{plain}
99 | \bibliography{references}
100 | \end{frame}
101 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
102 |
103 | \end{document}
104 |
--------------------------------------------------------------------------------
/pig/references.bib:
--------------------------------------------------------------------------------
1 | % This file was created with JabRef 2.7.2.
2 | % Encoding: MacRoman
3 |
4 | @INPROCEEDINGS{Brewer01,
5 |   author = {Eric Brewer},
6 |   title = {Lessons from giant-scale services},
7 |   booktitle = {IEEE Internet Computing},
8 |   year = {2001}
9 | }
10 |
11 | @INPROCEEDINGS{Chang2006,
12 |   author = {Fay Chang and Jeffrey Dean and Sanjay Ghemawat and Wilson C. Hsieh
13 |     and Deborah A. Wallach and Mike Burrows and Tushar Chandra and Andrew
14 |     Fikes and Robert E. Gruber},
15 |   title = {Bigtable: A Distributed Storage System for Structured Data},
16 |   booktitle = {Proc. of USENIX OSDI},
17 |   year = {2006},
18 |   owner = {michiard},
19 |   timestamp = {2012.02.21}
20 | }
21 |
22 | @INPROCEEDINGS{Dean2004,
23 |   author = {Jeffrey Dean and Sanjay Ghemawat},
24 |   title = {MapReduce: Simplified Data Processing on Large Clusters},
25 |   booktitle = {Proc. of USENIX OSDI},
26 |   year = {2004},
27 |   owner = {michiard},
28 |   timestamp = {2012.02.21}
29 | }
30 |
31 | @BOOK{George2011,
32 |   title = {HBase, The Definitive Guide},
33 |   publisher = {O'Reilly},
34 |   year = {2011},
35 |   author = {Lars George},
36 |   owner = {michiard},
37 |   timestamp = {2012.02.21}
38 | }
39 |
40 | @INPROCEEDINGS{Ghemawat2003,
41 |   author = {Sanjay Ghemawat and Howard Gobioff and Shun-Tak Leung},
42 |   title = {The Google File System},
43 |   booktitle = {Proc. of ACM SOSP},
44 |   year = {2003},
45 |   owner = {michiard},
46 |   timestamp = {2012.02.21}
47 | }
48 |
49 | @UNPUBLISHED{ONeil1996,
50 |   author = {Patrick O'Neil and Edward Cheng and Dieter Gawlick and Elizabeth
51 |     O'Neil},
52 |   title = {The Log-Structured Merge-Tree (LSM-Tree)},
53 |   year = {1996},
54 |   owner = {michiard},
55 |   timestamp = {2012.02.21}
56 | }
57 |
58 | @INPROCEEDINGS{Olston2008,
59 |   author = {C. Olston and B. Reed and U. Srivastava and R. Kumar and A. Tomkins},
60 |   title = {Pig Latin: A Not-So-Foreign Language for Data Processing},
61 |   booktitle = {Proc. of ACM SIGMOD},
62 |   year = {2008},
63 |   owner = {michiard},
64 |   timestamp = {2012.03.01}
65 | }
66 |
67 | @MISC{Salmen09,
68 |   author = {D. Salmen},
69 |   title = {Cloud Data Structure Diagramming Techniques and Design Patterns},
70 |   howpublished = {\url{https://www.data-tactics-corp.com/index.php/component/jdownloads/finish/22-white-papers/68-cloud-data-structure-diagramming}},
71 |   year = {2009},
72 |   owner = {michiard},
73 |   timestamp = {2012.02.21}
74 | }
75 |
76 | @BOOK{White2010,
77 |   title = {Hadoop, The Definitive Guide},
78 |   publisher = {O'Reilly, Yahoo},
79 |   year = {2010},
80 |   author = {Tom White},
81 |   owner = {michiard},
82 |   timestamp = {2011.04.29}
83 | }
84 |
85 | @ELECTRONIC{b+tree,
86 |   title = {B+ Tree},
87 |   howpublished = {\url{http://en.wikipedia.org/wiki/B%2B_tree}},
88 |   owner = {michiard},
89 |   timestamp = {2012.02.21}
90 | }
91 |
92 | @MISC{pig,
93 |   title = {Pig Wiki},
94 |   howpublished = {\url{http://wiki.apache.org/pig/}},
95 |   owner = {michiard},
96 |   timestamp = {2012.03.01}
97 | }
98 |
99 |
--------------------------------------------------------------------------------
/disa/references.bib:
--------------------------------------------------------------------------------
1 | % This file was created with JabRef 2.6.
2 | % Encoding: MacRoman
3 |
4 | @INPROCEEDINGS{banko01,
5 |   author = {Michele Banko and Eric Brill},
6 |   title = {Scaling to very very large corpora for natural language disambiguation},
7 |   booktitle = {Proc. of the 39th Annual Meeting of the Association for Computational
8 |     Linguistics (ACL)},
9 |   year = {2001}
10 | }
11 |
12 | @INPROCEEDINGS{barroso09,
13 |   author = {Luiz Andre Barroso and Urs Holzle},
14 |   title = {The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale
15 |     Machines},
16 |   year = {2009},
17 |   publisher = {Morgan \& Claypool Publishers}
18 | }
19 |
20 | @INPROCEEDINGS{Bianchini2005,
21 |   author = {Monica Bianchini and Marco Gori and Franco Scarselli},
22 |   title = {Inside PageRank},
23 |   booktitle = {ACM Transactions on Internet Technology},
24 |   year = {2005},
25 |   owner = {michiard},
26 |   timestamp = {2011.05.11}
27 | }
28 |
29 | @INPROCEEDINGS{hamilton09,
30 |   author = {James Hamilton},
31 |   title = {Cooperative Expendable Micro-Slice Servers (CEMS): Low cost, low
32 |     power servers for Internet-scale services},
33 |   booktitle = {Proc. of the 4th Biennial Conference on Innovative Data Systems Research
34 |     (CIDR)},
35 |   year = {2009}
36 | }
37 |
38 | @INPROCEEDINGS{hey09,
39 |   author = {Tony Hey and Stewart Tansley and Kristin Tolle},
40 |   title = {The Fourth Paradigm: Data-Intensive Scientific Discovery},
41 |   year = {2009},
42 |   publisher = {Microsoft Research}
43 | }
44 |
45 | @INPROCEEDINGS{Lattanzi2011,
46 |   author = {Silvio Lattanzi and Benjamin Moseley and Siddharth Suri and Sergei
47 |     Vassilvitskii},
48 |   title = {Filtering: a Method for Solving Graph Problems in MapReduce},
49 |   booktitle = {Proc. of SPAA},
50 |   year = {2011},
51 |   owner = {michiard},
52 |   timestamp = {2011.05.11}
53 | }
54 |
55 | @INPROCEEDINGS{Leskovec2005,
56 |   author = {Jure Leskovec and Jon Kleinberg and Christos Faloutsos},
57 |   title = {Graphs over time: Densification laws, shrinking diameters and possible
58 |     explanations},
59 |   booktitle = {Proc. of SIGKDD},
60 |   year = {2005},
61 |   owner = {michiard},
62 |   timestamp = {2011.05.11}
63 | }
64 |
65 | @INPROCEEDINGS{Page1999,
66 |   author = {Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd},
67 |   title = {The PageRank citation ranking: Bringing order to the Web},
68 |   booktitle = {Stanford Digital Library Working Paper},
69 |   year = {1999},
70 |   owner = {michiard},
71 |   timestamp = {2011.05.11}
72 | }
73 |
74 | @INPROCEEDINGS{shvachko10,
75 |   author = {Konstantin Shvachko and Hairong Kuang and Sanjay Radia and Robert
76 |     Chansler},
77 |   title = {The Hadoop Distributed File System},
78 |   booktitle = {Proc. of the 26th IEEE Symposium on Massive Storage Systems and Technologies
79 |     (MSST)},
80 |   year = {2010},
81 |   publisher = {IEEE}
82 | }
83 |
84 | @BOOK{hadoop_book,
85 |   title = {Hadoop, The Definitive Guide},
86 |   publisher = {O'Reilly, Yahoo},
87 |   year = {2010},
88 |   author = {Tom White},
89 |   owner = {michiard},
90 |   timestamp = {2011.04.29}
91 | }
92 |
93 | @MISC{AIRWeb,
94 |   title = {Adversarial Information Retrieval Workshop},
95 |   owner = {michiard},
96 |   timestamp = {2011.05.11}
97 | }
98 |
99 |
--------------------------------------------------------------------------------
/hadoop/references.bib:
--------------------------------------------------------------------------------
1 | % This file was created with JabRef 2.6.
2 | % Encoding: MacRoman
3 |
4 | @INPROCEEDINGS{banko01,
5 |   author = {Michele Banko and Eric Brill},
6 |   title = {Scaling to very very large corpora for natural language disambiguation},
7 |   booktitle = {Proc. of the 39th Annual Meeting of the Association for Computational
8 |     Linguistics (ACL)},
9 |   year = {2001}
10 | }
11 |
12 | @INPROCEEDINGS{barroso09,
13 |   author = {Luiz Andre Barroso and Urs Holzle},
14 |   title = {The Datacenter as a Computer: An Introduction to the Design of Warehouse-Scale
15 |     Machines},
16 |   year = {2009},
17 |   publisher = {Morgan \& Claypool Publishers}
18 | }
19 |
20 | @INPROCEEDINGS{Bianchini2005,
21 |   author = {Monica Bianchini and Marco Gori and Franco Scarselli},
22 |   title = {Inside PageRank},
23 |   booktitle = {ACM Transactions on Internet Technology},
24 |   year = {2005},
25 |   owner = {michiard},
26 |   timestamp = {2011.05.11}
27 | }
28 |
29 | @INPROCEEDINGS{hamilton09,
30 |   author = {James Hamilton},
31 |   title = {Cooperative Expendable Micro-Slice Servers (CEMS): Low cost, low
32 |     power servers for Internet-scale services},
33 |   booktitle = {Proc. of the 4th Biennial Conference on Innovative Data Systems Research
34 |     (CIDR)},
35 |   year = {2009}
36 | }
37 |
38 | @INPROCEEDINGS{hey09,
39 |   author = {Tony Hey and Stewart Tansley and Kristin Tolle},
40 |   title = {The Fourth Paradigm: Data-Intensive Scientific Discovery},
41 |   year = {2009},
42 |   publisher = {Microsoft Research}
43 | }
44 |
45 | @INPROCEEDINGS{Lattanzi2011,
46 |   author = {Silvio Lattanzi and Benjamin Moseley and Siddharth Suri and Sergei
47 |     Vassilvitskii},
48 |   title = {Filtering: a Method for Solving Graph Problems in MapReduce},
49 |   booktitle = {Proc. of SPAA},
50 |   year = {2011},
51 |   owner = {michiard},
52 |   timestamp = {2011.05.11}
53 | }
54 |
55 | @INPROCEEDINGS{Leskovec2005,
56 |   author = {Jure Leskovec and Jon Kleinberg and Christos Faloutsos},
57 |   title = {Graphs over time: Densification laws, shrinking diameters and possible
58 |     explanations},
59 |   booktitle = {Proc. of SIGKDD},
60 |   year = {2005},
61 |   owner = {michiard},
62 |   timestamp = {2011.05.11}
63 | }
64 |
65 | @INPROCEEDINGS{Page1999,
66 |   author = {Lawrence Page and Sergey Brin and Rajeev Motwani and Terry Winograd},
67 |   title = {The PageRank citation ranking: Bringing order to the Web},
68 |   booktitle = {Stanford Digital Library Working Paper},
69 |   year = {1999},
70 |   owner = {michiard},
71 |   timestamp = {2011.05.11}
72 | }
73 |
74 | @INPROCEEDINGS{shvachko10,
75 |   author = {Konstantin Shvachko and Hairong Kuang and Sanjay Radia and Robert
76 |     Chansler},
77 |   title = {The Hadoop Distributed File System},
78 |   booktitle = {Proc.
of the 26th IEEE Symposium on Massive Storage Systems and Technologies 79 | (MSST)}, 80 | year = {2010}, 81 | publisher = {IEEE} 82 | } 83 | 84 | @BOOK{hadoop_book, 85 | title = {Hadoop, The Definitive Guide}, 86 | publisher = {O'Reilly, Yahoo}, 87 | year = {2010}, 88 | author = {Tom White}, 89 | owner = {michiard}, 90 | timestamp = {2011.04.29} 91 | } 92 | 93 | @MISC{AIRWeb, 94 | title = {Adversarial Information Retrieval Workshop}, 95 | owner = {michiard}, 96 | timestamp = {2011.05.11} 97 | } 98 | 99 | -------------------------------------------------------------------------------- /relal/relal.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \usepackage{subfigure} 4 | \usepackage[english]{babel} 5 | \usepackage[latin1]{inputenc} 6 | \usepackage{times} 7 | \usepackage[T1]{fontenc} 8 | \usepackage{color} 9 | 10 | \usepackage{algorithm} 11 | \usepackage{algorithmicx} 12 | \usepackage[noend]{algpseudocode} 13 | 14 | \usetheme[secheader]{Boadilla} 15 | \usefonttheme[onlylarge]{structurebold} 16 | \setbeamerfont*{frametitle}{size=\normalsize,series=\bfseries} 17 | \setbeamertemplate{navigation symbols}{} 18 | \setbeamertemplate{mini frames}[box] 19 | \setbeamertemplate{sections/subsections in toc}[square] 20 | \setbeamertemplate{blocks}[rounded][shadow=true] 21 | \setbeamertemplate{bibliography item}[text] 22 | 23 | \setbeamercolor{lightorange}{fg=black,bg=orange!40} 24 | \setbeamercolor{lightblue}{fg=black,bg=blue!30} 25 | 26 | \newenvironment{colorblock}[2] 27 | {\setbeamercolor{item}{fg=#1,bg=#1}\begin{beamerboxesrounded}[upper=#1,lower=#2,shadow=true]} 28 | {\end{beamerboxesrounded}} 29 | 30 | 31 | 32 | % Setup TikZ 33 | 34 | \usepackage{tikz} 35 | \usetikzlibrary{arrows} 36 | \tikzstyle{block}=[draw opacity=0.7,line width=1.4cm] 37 | 38 | 39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | 43 | \newtheorem{observation}[theorem]{Observation} 44 | 45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | 49 | \title{Relational Algebra and MapReduce} 50 | \subtitle{Towards High-level Programming Languages} 51 | \author{Pietro Michiardi} 52 | \institute{Eurecom} 53 | \date 54 | 55 | 56 | \begin{document} 57 | 58 | \begin{frame} 59 | \titlepage 60 | \end{frame} 61 | 62 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 63 | \section{Sources and Acks} 64 | 65 | \begin{frame} 66 | \begin{itemize} 67 | \item Jimmy Lin and Chris Dyer, ``Data-Intensive Text Processing with MapReduce,'' Morgan \& Claypool Publishers, 2010. \url{http://lintool.github.io/MapReduceAlgorithms/} 68 | 69 | \item[] 70 | 71 | \item Tom White, ``Hadoop, The Definitive Guide,'' O'Reilly / Yahoo Press, 2012 72 | 73 | \item[] 74 | 75 | \item Anand Rajaraman, Jeffrey D. 
Ullman, Jure Leskovec, ``Mining of Massive Datasets'', Cambridge University Press, 2013 76 | \end{itemize} 77 | \end{frame} 78 | 79 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 80 | 81 | 82 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 83 | \section{Relational Algebra} 84 | 85 | \begin{frame} 86 | \begin{colorblock}{blue}{lightblue}{ } 87 | \begin{center} 88 | \Huge \textbf{\texttt{Relational Algebra and MapReduce}} 89 | \end{center} 90 | \end{colorblock} 91 | \end{frame} 92 | 93 | \input{./relational} 94 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 95 | 96 | 97 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | % \section{References} 99 | 100 | % \begin{frame} 101 | % \begin{colorblock}{blue}{lightblue}{ } 102 | % \begin{center} 103 | % \Huge \textbf{\texttt{References}} 104 | % \end{center} 105 | % \end{colorblock} 106 | % \end{frame} 107 | 108 | % \begin{frame}[allowframebreaks]{References} 109 | % \bibliographystyle{plain} 110 | % \bibliography{references} 111 | % \end{frame} 112 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 113 | 114 | \end{document} 115 | -------------------------------------------------------------------------------- /hadoop/hadoop.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \usepackage[lined,ruled]{algorithm2e} 4 | \usepackage{subfigure} 5 | \usepackage[english]{babel} 6 | \usepackage[latin1]{inputenc} 7 | \usepackage{times} 8 | \usepackage[T1]{fontenc} 9 | \usepackage{color} 10 | 11 | \usetheme[secheader]{Boadilla} 12 | \usefonttheme[onlylarge]{structurebold} 13 | \setbeamerfont*{frametitle}{size=\normalsize,series=\bfseries} 14 | \setbeamertemplate{navigation symbols}{} 15 | \setbeamertemplate{mini frames}[box] 16 | \setbeamertemplate{sections/subsections in toc}[square] 17 | \setbeamertemplate{blocks}[rounded][shadow=true] 18 | \setbeamertemplate{bibliography item}[text] 19 | 20 | \setbeamercolor{lightorange}{fg=black,bg=orange!40} 21 | \setbeamercolor{lightblue}{fg=black,bg=blue!30} 22 | 23 | \newenvironment{colorblock}[2] 24 | {\setbeamercolor{item}{fg=#1,bg=#1}\begin{beamerboxesrounded}[upper=#1,lower=#2,shadow=true]} 25 | {\end{beamerboxesrounded}} 26 | 27 | 28 | 29 | % Setup TikZ 30 | 31 | \usepackage{tikz} 32 | \usetikzlibrary{arrows} 33 | \tikzstyle{block}=[draw opacity=0.7,line width=1.4cm] 34 | 35 | 36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 37 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 38 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 39 | 40 | \newtheorem{observation}[theorem]{Observation} 41 | 42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 44 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 45 | 46 | \title{Hadoop Internals} 47 | % \subtitle{Introduction} 48 | \author{Pietro Michiardi} 49 | \institute{Eurecom} 50 | \date 51 | 52 | 53 | \begin{document} 54 | 55 | \begin{frame} 56 | \titlepage 57 | \end{frame} 58 | 59 | 60 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 61 | \section{Introduction and Recap} 62 | 63 | \begin{frame} 64 | \begin{colorblock}{blue}{lightblue}{ } 65 | \begin{center} 66 | \Huge \textbf{\texttt{Introduction and Recap}} 67 | \end{center} 68 | \end{colorblock} 69 | \end{frame} 70 | 71 | \input{./introduction} 72 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 73 | 74 | 75 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 76 | \section{HDFS} 77 | 78 | \begin{frame} 79 | \begin{colorblock}{blue}{lightblue}{ } 80 | 
\begin{center} 81 | \Huge \textbf{\texttt{Hadoop Distributed File-System}} 82 | \end{center} 83 | \end{colorblock} 84 | \end{frame} 85 | 86 | \input{./hdfs} 87 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | 89 | 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | \section{Hadoop MapReduce} 92 | 93 | \begin{frame} 94 | \begin{colorblock}{blue}{lightblue}{ } 95 | \begin{center} 96 | \Huge \textbf{\texttt{Hadoop MapReduce}} 97 | \end{center} 98 | \end{colorblock} 99 | \end{frame} 100 | 101 | \input{./mapreduce} 102 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 103 | 104 | 105 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 106 | \section{Hadoop I/O} 107 | 108 | \begin{frame} 109 | \begin{colorblock}{blue}{lightblue}{ } 110 | \begin{center} 111 | \Huge \textbf{\texttt{Hadoop I/O}} 112 | \end{center} 113 | \end{colorblock} 114 | \end{frame} 115 | 116 | \input{./io} 117 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 118 | 119 | 120 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 121 | \section{Hadoop Deployments} 122 | 123 | \begin{frame} 124 | \begin{colorblock}{blue}{lightblue}{ } 125 | \begin{center} 126 | \Huge \textbf{\texttt{Hadoop Deployments}} 127 | \end{center} 128 | \end{colorblock} 129 | \end{frame} 130 | 131 | \input{./deployments} 132 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 133 | 134 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 135 | \section{References} 136 | 137 | \begin{frame} 138 | \begin{colorblock}{blue}{lightblue}{ } 139 | \begin{center} 140 | \Huge \textbf{\texttt{References}} 141 | \end{center} 142 | \end{colorblock} 143 | \end{frame} 144 | 145 | \begin{frame}[allowframebreaks]{References} 146 | \bibliographystyle{plain} 147 | \bibliography{references} 148 | \end{frame} 149 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 150 | 151 | \end{document} 152 | -------------------------------------------------------------------------------- /disa/disa.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | 3 | \usepackage{subfigure} 4 | \usepackage[english]{babel} 5 | \usepackage[latin1]{inputenc} 6 | \usepackage{times} 7 | \usepackage[T1]{fontenc} 8 | \usepackage{color} 9 | 10 | \usepackage{algorithm} 11 | \usepackage{algorithmicx} 12 | \usepackage[noend]{algpseudocode} 13 | 14 | \usetheme[secheader]{Boadilla} 15 | \usefonttheme[onlylarge]{structurebold} 16 | \setbeamerfont*{frametitle}{size=\normalsize,series=\bfseries} 17 | \setbeamertemplate{navigation symbols}{} 18 | \setbeamertemplate{mini frames}[box] 19 | \setbeamertemplate{sections/subsections in toc}[square] 20 | \setbeamertemplate{blocks}[rounded][shadow=true] 21 | \setbeamertemplate{bibliography item}[text] 22 | 23 | \setbeamercolor{lightorange}{fg=black,bg=orange!40} 24 | \setbeamercolor{lightblue}{fg=black,bg=blue!30} 25 | 26 | \newenvironment{colorblock}[2] 27 | {\setbeamercolor{item}{fg=#1,bg=#1}\begin{beamerboxesrounded}[upper=#1,lower=#2,shadow=true]} 28 | {\end{beamerboxesrounded}} 29 | 30 | 31 | 32 | % Setup TikZ 33 | 34 | \usepackage{tikz} 35 | \usetikzlibrary{arrows} 36 | \tikzstyle{block}=[draw opacity=0.7,line width=1.4cm] 37 | 38 | 39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | 43 | \newtheorem{observation}[theorem]{Observation} 44 | 45 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 47 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | 49 | \title{Scalable Algorithm Design} 50 | \subtitle{The MapReduce Programming Model} 51 | \author{Pietro Michiardi} 52 | \institute{Eurecom} 53 | \date 54 | 55 | 56 | \begin{document} 57 | 58 | \begin{frame} 59 | \titlepage 60 | \end{frame} 61 | 62 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 63 | \section{Sources and Acks} 64 | 65 | \begin{frame} 66 | \begin{itemize} 67 | \item Jimmy Lin and Chris Dyer, ``Data-Intensive Text Processing with MapReduce,'' Morgan \& Claypool Publishers, 2010. \url{http://lintool.github.io/MapReduceAlgorithms/} 68 | 69 | \item[] 70 | 71 | \item Tom White, ``Hadoop, The Definitive Guide,'' O'Reilly / Yahoo Press, 2012 72 | 73 | \item[] 74 | 75 | \item Anand Rajaraman, Jeffrey D. Ullman, Jure Leskovec, ``Mining of Massive Datasets'', Cambridge University Press, 2013 76 | \end{itemize} 77 | \end{frame} 78 | 79 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 80 | 81 | 82 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 83 | \section{Key Principles} 84 | 85 | \begin{frame} 86 | \begin{colorblock}{blue}{lightblue}{ } 87 | \begin{center} 88 | \Huge \textbf{\texttt{Key Principles}} 89 | \end{center} 90 | \end{colorblock} 91 | \end{frame} 92 | 93 | \input{./principles} 94 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 95 | 96 | 97 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | \section{The Programming Model} 99 | 100 | \begin{frame} 101 | \begin{colorblock}{blue}{lightblue}{ } 102 | \begin{center} 103 | \Huge \textbf{\texttt{The Programming Model}} 104 | \end{center} 105 | \end{colorblock} 106 | \end{frame} 107 | 108 | \input{./programming_model} 109 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 110 | 111 | 112 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 113 | \section{Basic Design Patterns} 114 | 115 | \begin{frame} 116 | \begin{colorblock}{blue}{lightblue}{ } 117 | \begin{center} 118 | \Huge \textbf{\texttt{Basic Design Patterns}} 119 | \end{center} 120 | \end{colorblock} 121 | \end{frame} 122 | 123 | \input{./design_patterns.tex} 124 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 125 | 126 | 127 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 128 | % \section{Graph Algorithms} 129 | 130 | % \begin{frame} 131 | % \begin{colorblock}{blue}{lightblue}{ } 132 | % \begin{center} 133 | % \Huge \textbf{\texttt{Graph Algorithms [Optional]}} 134 | % \end{center} 135 | % \end{colorblock} 136 | % \end{frame} 137 | 138 | % \input{./graph_algorithms} 139 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 140 | 141 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 142 | \section{References} 143 | 144 | \begin{frame} 145 | \begin{colorblock}{blue}{lightblue}{ } 146 | \begin{center} 147 | \Huge \textbf{\texttt{References}} 148 | \end{center} 149 | \end{colorblock} 150 | \end{frame} 151 | 152 | \begin{frame}[allowframebreaks]{References} 153 | \bibliographystyle{plain} 154 | \bibliography{references} 155 | \end{frame} 156 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 157 | 158 | \end{document} 159 | -------------------------------------------------------------------------------- /hadoop/io.tex: -------------------------------------------------------------------------------- 1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 2 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 3 | \subsection{Technical details} 4 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
5 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
6 |
7 |
8 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
9 | \frame {\frametitle{I/O operations in Hadoop}
10 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
11 | \begin{itemize}
12 | \item \textbf{Reading and writing data}
13 | \begin{itemize}
14 | \item From/to HDFS
15 | \item From/to local disk drives
16 | \item Across machines (inter-process communication)
17 | \end{itemize}
18 |
19 | \vspace{20pt}
20 |
21 | \item \textbf{Customized tools for large amounts of data}
22 | \begin{itemize}
23 | \item Hadoop does not use Java native classes
24 | \item Allows flexibility for dealing with custom data (e.g. binary)
25 | \end{itemize}
26 |
27 | \vspace{20pt}
28 |
29 | \item \textbf{What's next}
30 | \begin{itemize}
31 | \item Overview of what Hadoop offers
32 | \item For in-depth coverage, see \cite{hadoop_book}
33 | \end{itemize}
34 |
35 | \end{itemize}
36 |
37 | }
38 |
39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
40 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
41 | \subsection{Data Integrity}
42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
43 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
44 |
45 |
46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
47 | \frame {\frametitle{Data Integrity}
48 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
49 | \begin{itemize}
50 | \item \textbf{Every I/O operation on disks or the network may
51 | corrupt data}
52 | \begin{itemize}
53 | \item Users expect data not to be corrupted during storage or
54 | processing
55 | \item Data integrity usually achieved with a simple \textbf{checksum} mechanism
56 | \end{itemize}
57 |
58 | \vspace{40pt}
59 |
60 | \item \textbf{HDFS transparently checksums all data during I/O}
61 | \begin{itemize}
62 | \item HDFS makes sure that storage overhead is roughly 1\%
63 | \item \texttt{DataNodes} are in charge of checksumming
64 | \begin{itemize}
65 | \item With replication, the last replica performs the check
66 | \item Checksums are timestamped and logged for
67 | {\color{red}statistics on disks}
68 | \end{itemize}
69 | \item Checksumming is also run periodically in a separate thread
70 | \begin{itemize}
71 | \item Note that thanks to replication, {\color{red}error
72 | correction} is possible in addition to detection
73 | \end{itemize}
74 | \end{itemize}
75 |
76 |
77 | \end{itemize}
78 | }
79 |
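A back-of-the-envelope illustration of the chunk-level checksumming described in the Data Integrity frame above: a 4-byte checksum per 512-byte chunk is what keeps the storage overhead under 1%. The 512-byte chunk size matches the usual dfs.bytes-per-checksum default, but plain CRC32 from the JDK stands in here for the CRC32C that HDFS actually uses, so this is a sketch of the idea rather than HDFS code:

```java
import java.util.zip.CRC32;

public class ChunkChecksum {
  static final int BYTES_PER_CHECKSUM = 512;   // assumed default, as in HDFS

  // Compute one CRC per fixed-size chunk of a data block.
  static long[] checksums(byte[] block) {
    int chunks = (block.length + BYTES_PER_CHECKSUM - 1) / BYTES_PER_CHECKSUM;
    long[] sums = new long[chunks];
    CRC32 crc = new CRC32();
    for (int i = 0; i < chunks; i++) {
      crc.reset();
      int off = i * BYTES_PER_CHECKSUM;
      int len = Math.min(BYTES_PER_CHECKSUM, block.length - off);
      crc.update(block, off, len);
      sums[i] = crc.getValue();   // stored alongside the data, re-verified on read
    }
    return sums;
  }

  public static void main(String[] args) {
    byte[] block = new byte[64 * 1024];   // stand-in for (part of) an HDFS block
    long[] onWrite = checksums(block);
    block[1234] ^= 0x1;                   // simulate a single-bit corruption
    long[] onRead = checksums(block);
    for (int i = 0; i < onWrite.length; i++)
      if (onWrite[i] != onRead[i])        // detection; with replication, the
        System.out.println("corrupt chunk " + i);  // good copy enables correction
  }
}
```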
80 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
81 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
82 | \subsection{Data Compression}
83 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
84 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
85 |
86 |
87 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
88 | \frame {\frametitle{Compression}
89 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
90 | \begin{itemize}
91 | \item \textbf{Why use compression}
92 | \begin{itemize}
93 | \item Reduce storage requirements
94 | \item Speed up data transfers (across the network or from disks)
95 | \end{itemize}
96 |
97 | \vspace{20pt}
98 |
99 | \item \textbf{Compression and Input Splits}
100 | \begin{itemize}
101 | \item IMPORTANT: use compression that supports
102 | {\color{red}splitting} (e.g. bzip2)
103 | \end{itemize}
104 |
105 | \vspace{20pt}
106 |
107 | \item \textbf{Splittable files, Example 1}
108 | \begin{itemize}
109 | \item Consider an uncompressed file of 1GB
110 | \item HDFS will split it into 16 blocks, 64MB each, to be
111 | processed by separate Mappers
112 | \end{itemize}
113 | \end{itemize}
114 | }
115 |
116 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
117 | \frame {\frametitle{Compression}
118 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
119 | \begin{itemize}
120 | \item \textbf{Unsplittable files, Example 2 (gzip)}
121 | \begin{itemize}
122 | \item Consider a compressed file of 1GB
123 | \item HDFS will split it into 16 blocks of 64MB each
124 | \item Creating an \texttt{InputSplit} for each block will not
125 | work, since it is not possible to read at an arbitrary point
126 | \end{itemize}
127 |
128 | \vspace{20pt}
129 |
130 | \item \textbf{What's the problem?}
131 | \begin{itemize}
132 | \item This forces MapReduce to treat the file as a
133 | {\color{red}single split}
134 | \item Then, a single Mapper is launched by the framework
135 | \item For this Mapper, only 1/16th of the data is local, the rest comes from
136 | the network
137 | \end{itemize}
138 |
139 | \vspace{20pt}
140 |
141 | \item \textbf{Which compression format to use?}
142 | \begin{itemize}
143 | \item Use bzip2
144 | \item Otherwise, use \texttt{SequenceFiles}
145 | \item See Chapter 4 of \cite{hadoop_book}
146 | \end{itemize}
147 |
148 | \end{itemize}
149 | }
150 |
151 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
152 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
153 | \subsection{Serialization}
154 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
155 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
156 |
157 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
158 | \frame {\frametitle{Serialization}
159 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
160 | \begin{itemize}
161 | \item \textbf{Transforms structured objects into a byte stream}
162 | \begin{itemize}
163 | \item For transmission over the network: {\color{red}Hadoop uses RPC}
164 | \item For persistent storage on disks
165 | \end{itemize}
166 |
167 | \vspace{20pt}
168 |
169 | \item \textbf{Hadoop uses its own serialization format,
170 | \texttt{Writable}}
171 | \begin{itemize}
172 | \item Comparison of types is crucial (Shuffle and Sort phase):
173 | Hadoop provides a custom \texttt{RawComparator}, which avoids
174 | deserialization
175 | \item Custom \texttt{Writable} for having full control on the
176 | binary representation of data
177 | \item Also ``external'' frameworks are allowed: enter \textbf{Avro}
178 | \end{itemize}
179 |
180 | \vspace{20pt}
181 |
182 | \item \textbf{Fixed-length or variable-length encoding?}
183 | \begin{itemize}
184 | \item Fixed-length: when the distribution of values is uniform
185 | \item Variable-length: when the distribution of values is not uniform
186 | \end{itemize}
187 |
188 | \end{itemize}
189 | }
190 |
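A hypothetical custom Writable along the lines sketched in the Serialization frame above: write() and readFields() give full control over the binary layout, and implementing WritableComparable makes the type usable as a MapReduce key in the Shuffle and Sort phase. The PageVisit class and its fields are invented for illustration:

```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class PageVisit implements WritableComparable<PageVisit> {
  private long pageId;    // fixed-length encoding: value distribution assumed uniform
  private int visits;

  public void set(long pageId, int visits) {
    this.pageId = pageId;
    this.visits = visits;
  }

  @Override public void write(DataOutput out) throws IOException {
    out.writeLong(pageId);   // serialize fields in a fixed, documented order
    out.writeInt(visits);
  }

  @Override public void readFields(DataInput in) throws IOException {
    pageId = in.readLong();  // deserialize in exactly the same order
    visits = in.readInt();
  }

  @Override public int compareTo(PageVisit o) {
    return Long.compare(pageId, o.pageId);   // defines the sort order of keys
  }

  @Override public int hashCode() {
    return Long.hashCode(pageId);   // used by the default HashPartitioner
  }
}
```

For the fastest possible sorting one would additionally register a RawComparator that compares the first eight serialized bytes directly, avoiding deserialization, as the frame above suggests.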
191 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
192 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
193 | \subsection{Sequence Files}
194 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
195 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
196 |
197 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
198 | \frame {\frametitle{Sequence Files}
199 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
200 | \begin{itemize}
201 | \item \textbf{Specialized data structure to hold custom input data}
202 | \begin{itemize}
203 | \item Using blobs of binaries is not efficient
204 | \end{itemize}
205 |
206 | \vspace{10pt}
207 |
208 | \item \textbf{\texttt{SequenceFiles}}
209 | \begin{itemize}
210 | \item Provide a persistent data structure for binary key-value
211 | pairs
212 | \item Also work well as containers for smaller files, so that the
213 | framework is happier (remember: few large files are better than
214 | lots of small files)
215 | \item They come with the \texttt{sync()} method to introduce sync
216 | points that help manage \texttt{InputSplits} for MapReduce
217 | \end{itemize}
218 |
219 | \begin{center}
220 | \framebox{\includegraphics[scale=0.2]{./Figures/sequencefiles}}
221 | \end{center}
222 |
223 |
224 | \end{itemize}
225 | }
226 |
227 |
--------------------------------------------------------------------------------
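A small sketch of writing and re-reading binary key-value pairs with the classic SequenceFile.createWriter(...) / SequenceFile.Reader(...) API covered in Chapter 4 of [hadoop_book]; the file name pairs.seq is made up for the example:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("pairs.seq");   // hypothetical output file

    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, path, IntWritable.class, Text.class);
    try {
      IntWritable key = new IntWritable();
      Text value = new Text();
      for (int i = 0; i < 100; i++) {
        key.set(i);
        value.set("record-" + i);
        writer.append(key, value);   // binary key-value pairs; the writer also
      }                              // embeds sync markers in the stream periodically
    } finally {
      writer.close();
    }

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    try {
      IntWritable key = new IntWritable();
      Text value = new Text();
      while (reader.next(key, value))          // sync points let a reader (and thus
        System.out.println(key + "\t" + value);  // an InputSplit) resynchronize at
    } finally {                                  // an arbitrary offset in the file
      reader.close();
    }
  }
}
```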
--------------------------------------------------------------------------------
/disa/principles.tex:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Scale out, not up!
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{For data-intensive workloads, a large number of commodity servers is preferred over a small number of high-end servers}
\begin{itemize}
\item The cost of super-computers does not grow linearly with their capacity
\item But datacenter efficiency is a difficult problem to solve \cite{barroso09, hamilton09}
\end{itemize}

\vspace{20pt}

\item \textbf{Some numbers ($\sim$ 2012):}
\begin{itemize}
\item Data processed by Google every day: 100+ PB
\item Data processed by Facebook every day: 10+ PB
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Implications of Scaling Out}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Processing data is quick, I/O is very slow}
\begin{itemize}
\item 1 HDD = 75 MB/sec
\item 1000 HDDs = 75 GB/sec
\end{itemize}

\vspace{20pt}

\item \textbf{Sharing vs. Shared nothing}:
\begin{itemize}
\item Sharing: manage a common/global state
\item Shared nothing: {\color{red}independent} entities, no common state
\end{itemize}

\vspace{20pt}

\item \textbf{Sharing is difficult}:
\begin{itemize}
\item Synchronization, deadlocks
\item Finite bandwidth to access data from a SAN
\item Temporal dependencies are complicated (restarts)
\end{itemize}
\end{itemize}



}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Failures are the norm, not the exception
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item LANL data [DSN 2006]
\begin{itemize}
\item Data for $\sim$5000 machines, over 9 years
\item Hardware: 60\%, Software: 20\%, Network: 5\%
\end{itemize}

\vspace{20pt}

\item DRAM error analysis [Sigmetrics 2009]
\begin{itemize}
\item Data for 2.5 years
\item 8\% of DIMMs affected by errors
\end{itemize}

\vspace{20pt}

\item Disk drive failure analysis [FAST 2007]
\begin{itemize}
\item Utilization and temperature are major causes of failures
\end{itemize}

\vspace{20pt}

\item Amazon Web Service(s) failures [Several!]
\begin{itemize}
\item Cascading effect
\end{itemize}

\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Implications of Failures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Failures are part of everyday life}
\begin{itemize}
\item Mostly due to the scale and the shared environment
\end{itemize}

\vspace{20pt}

\item \textbf{Sources of Failures}
\begin{itemize}
\item Hardware / Software
\item Electrical, Cooling, ...
\item Unavailability of a resource due to overload
\end{itemize}

\vspace{20pt}

\item \textbf{Failure Types}
\begin{itemize}
\item Permanent
\item Transient
\end{itemize}
\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Move Processing to the Data
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item \textbf{Drastic departure from the high-performance computing model}
\begin{itemize}
\item HPC: distinction between processing nodes and storage nodes
\item HPC: CPU-intensive tasks
\end{itemize}

\vspace{20pt}

\item \textbf{Data-intensive workloads}
\begin{itemize}
\item Generally not processor demanding
\item The network becomes the bottleneck
\item MapReduce assumes processing and storage nodes to be collocated
\item[$\to$] {\color{red}\textbf{Data Locality Principle}}
\end{itemize}

\vspace{20pt}

\item \textbf{Distributed filesystems are necessary}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Process Data Sequentially and Avoid Random Access
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item \textbf{Data-intensive workloads}
\begin{itemize}
\item Relevant datasets are too large to fit in memory
\item Such data resides on disks
\end{itemize}

\vspace{20pt}

\item \textbf{Disk performance is a bottleneck}
\begin{itemize}
\item \textbf{Seek times} for random disk access are \textbf{the problem}
\begin{itemize}
\item Example: a 1 TB database with $10^{10}$ 100-byte records. Updating 1\% of the records with random accesses requires about 1 month; reading and rewriting the whole database sequentially would take about 1 day\footnote{From a post by Ted Dunning on the Hadoop mailing list}
\end{itemize}
\item Organize computation for sequential reads
\end{itemize}

\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Implications of Data Access Patterns}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{MapReduce is designed for:}
\begin{itemize}
\item {\color{red}\textbf{Batch processing}}
\item involving (mostly) {\color{red}\textbf{full scans}} of the data
\end{itemize}

\vspace{20pt}

\item \textbf{Typically, data is collected ``elsewhere'' and copied to the distributed filesystem}
\begin{itemize}
\item E.g.: Apache Flume, Apache Sqoop, $\cdots$
\end{itemize}

\vspace{20pt}

\item \textbf{Data-intensive applications}
\begin{itemize}
\item Read and process the whole Web (e.g. PageRank)
\item Read and process the whole Social Graph (e.g. LinkPrediction, a.k.a. ``friend suggest'')
\item Log analysis (e.g.
Network traces, Smart-meter data, $\cdots$)
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Hide System-level Details
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item \textbf{Separate the \textit{what} from the \textit{how}}
\begin{itemize}
\item MapReduce abstracts away the ``distributed'' part of the system
\item Such details are handled by the framework
\end{itemize}

\vspace{20pt}

\item {\color{red}\textbf{BUT: }}\textbf{In-depth knowledge of the framework is key}
\begin{itemize}
\item Custom data readers/writers
\item Custom {\color{red}data partitioning}
\item Memory utilization
\end{itemize}

\vspace{20pt}

\item \textbf{Auxiliary components}
\begin{itemize}
\item Hadoop Pig
\item Hadoop Hive
\item Cascading/Scalding
\item ... and many many more!
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{
\begin{beamerboxesrounded}[shadow=true]{}
\begin{center}
Seamless Scalability
\end{center}
\end{beamerboxesrounded}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{itemize}
\item \textbf{We can define scalability along two dimensions}
\begin{itemize}
\item In terms of data: given twice the amount of data, the same algorithm should take no more than twice as long to run
\item In terms of resources: given a cluster twice the size, the same algorithm should take no more than half as long to run
\end{itemize}

\vspace{20pt}

\item \textbf{Embarrassingly parallel problems}
\begin{itemize}
\item Simple definition: independent ({\color{red}shared nothing}) computations on fragments of the dataset
\item How to decide whether a problem is embarrassingly parallel or not?
\end{itemize}

\vspace{20pt}

\item \textbf{MapReduce is a first attempt, not the final answer}
\end{itemize}
}
--------------------------------------------------------------------------------
/hadoop/hdfs.tex:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Motivations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Collocate data and computation!}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{As dataset sizes increase, more computing capacity is required for processing}

\vspace{20pt}

\item \textbf{As compute capacity grows, the link between the compute nodes and the storage nodes becomes a bottleneck}
\begin{itemize}
\item One could think of special-purpose interconnects for high-performance networking
\item This is often a costly solution, as cost does not increase linearly with performance
\end{itemize}

\vspace{20pt}

\item \textbf{{\color{red}Key idea}: abandon the separation between compute and storage nodes}
\begin{itemize}
\item This is exactly what happens in current implementations of the MapReduce framework
\item A distributed filesystem is not mandatory, but highly desirable
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{The Hadoop Distributed Filesystem}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Large dataset(s) outgrowing the storage capacity of a single physical machine}
\begin{itemize}
\item Need to partition the data across a number of separate machines
\item Network-based system, with all its complications
\item Tolerate failures of machines
\end{itemize}

\vspace{20pt}

\item \textbf{Distributed filesystems are not new!}
\begin{itemize}
\item HDFS builds upon previous results, tailored to the specific requirements of MapReduce
\item {\color{red}Write once, read many workloads}
\item Does not handle concurrent writes, but allows replication
\item Optimized for throughput, not latency
\end{itemize}

\vspace{20pt}

\item \textbf{Hadoop Distributed Filesystem \cite{shvachko10, hadoop_book}}
\begin{itemize}
\item Very large files
\item Streaming data access
\item Commodity hardware
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Blocks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{HDFS Blocks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{(Big) files are broken into block-sized chunks}
\begin{itemize}
\item Blocks are big!
[64, 128] MB
\item Avoids problems related to metadata management
\item \texttt{NOTE}: A file that is smaller than a single block {\color{red}does not} occupy a full block's worth of underlying storage
\end{itemize}

\vspace{20pt}

\item \textbf{Blocks are stored on independent machines}
\begin{itemize}
\item Replicated across the local disks of nodes in the cluster
\item Reliability and parallel access
\item Replication is handled by the storage nodes themselves (similar to \textbf{chain replication})
\end{itemize}

\vspace{20pt}

\item \textbf{Why is a block so large?}
\begin{itemize}
\item Make transfer times larger than seek latency
\item E.g.: assume the seek time is 10ms and the transfer rate is 100 MB/s; if you want the seek time to be 1\% of the transfer time, then the block size should be 100MB
\end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Architecture}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{NameNodes and DataNodes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{NameNode}}
\begin{itemize}
\item Keeps metadata {\color{red}in RAM}
\item Each block's metadata occupies roughly 150 bytes of memory
\item Without the \texttt{NameNode}, the filesystem cannot be used
\begin{itemize}
\item Persistence of metadata: synchronous and atomic writes to NFS
\end{itemize}
\item Maintains the overall {\color{red}health} of the filesystem
\end{itemize}

\vspace{20pt}

\item \textbf{\texttt{Secondary NameNode}}
\begin{itemize}
\item Merges the namespace image with the edit log
\item A useful trick to recover from a failure of the \texttt{NameNode} is to use the NFS copy of the metadata and switch the secondary to primary
\end{itemize}

\vspace{20pt}

\item \textbf{\texttt{DataNode}}
\begin{itemize}
\item They store data and talk to clients
\item They report periodically to the \texttt{NameNode} the list of blocks they hold
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Architecture Illustration}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[h]
\centering
\includegraphics[scale=0.36]{./Figures/hdfs}
\caption{Architecture sketch of HDFS operations.}
\label{fig:hdfs}
\end{figure}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Anatomy of a File Read}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The \texttt{NameNode} is only used to get block locations}
\begin{itemize}
\item Unresponsive \texttt{DataNodes} are discarded by clients
\item Batch reading of blocks is allowed
\end{itemize}

\vspace{20pt}

\item \textbf{``External'' clients}
\begin{itemize}
\item For each block, the \texttt{NameNode} returns
{\color{red}a set} of \texttt{DataNodes} holding a copy thereof
\item \texttt{DataNodes} are sorted according to their proximity to the client
\end{itemize}

\vspace{20pt}

\item \textbf{``MapReduce'' clients}
\begin{itemize}
\item \texttt{TaskTrackers} and \texttt{DataNodes} are {\color{red}collocated}
\item For each block, the \texttt{NameNode} usually\footnote{Exceptions exist due to stragglers.} returns the local \texttt{DataNode}
\end{itemize}

\vspace{20pt}

\item[$\to$] A client-side code sketch closes this section

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Anatomy of a File Write}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Details on replication}
\begin{itemize}
\item Clients ask the \texttt{NameNode} for a list of suitable \texttt{DataNodes}
\item This list forms a \texttt{pipeline}: the first \texttt{DataNode} stores a copy of a block, then forwards it to the second, and so on
\end{itemize}

\vspace{40pt}

\item \textbf{Replica Placement}
\begin{itemize}
\item {\color{red}Tradeoff} between reliability and bandwidth
\item Default placement:
\begin{itemize}
\item First copy on the ``same'' node as the client, second replica is {\color{red}off-rack}, third replica is on the same rack as the second but on a different node
\item Since Hadoop 0.21, replica placement can be customized
\end{itemize}
\end{itemize}

\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Chain Replication and Distance Metrics}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{columns}[c]
\column{5cm}
\framebox{\includegraphics[width=4cm]{./Figures/chain_replication}}
\column{5cm}
\framebox{\includegraphics[width=4cm]{./Figures/hadoop_distance}}
\end{columns}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{HDFS Coherency Model}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{``Read your writes'' is not guaranteed}
\begin{itemize}
\item The namespace is updated
\item Block contents may not be visible after a write is finished
\item Application design (other than MapReduce) should use \texttt{sync()} to force synchronization
\item \texttt{sync()} involves some overhead: tradeoff between robustness/consistency and throughput
\end{itemize}

\vspace{40pt}

\item \textbf{Multiple writers (for the {\color{red}same} block) are not supported}
\begin{itemize}
\item Instead, different blocks can be written in parallel (using MapReduce)
\end{itemize}

\end{itemize}
}
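%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{HDFS Client API: Reading a File (Sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A minimal client-side sketch of the read path; the file path is
illustrative, and imports from \texttt{org.apache.hadoop.\{conf,fs,io\}}
are assumed:
\begin{verbatim}
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);  // entry point to HDFS

// open() asks the NameNode for block locations; the returned
// stream then reads the bytes from the closest DataNodes
FSDataInputStream in = fs.open(new Path("/user/data/sample.txt"));
IOUtils.copyBytes(in, System.out, 4096, false);
in.close();
\end{verbatim}
\end{frame}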
--------------------------------------------------------------------------------
/hadoop/deployments.tex:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Setting up a Hadoop Cluster}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Cluster deployment}
\begin{itemize}
\item Private cluster
\item Cloud-based cluster
\item AWS Elastic MapReduce
\end{itemize}

\vspace{20pt}

\item \textbf{Outlook:}
\begin{itemize}
\item Cluster specification
\begin{itemize}
\item Hardware
\item Network Topology
\end{itemize}
\item Hadoop Configuration
\begin{itemize}
\item Memory considerations
\end{itemize}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Specification}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Cluster Specification}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Commodity Hardware}
\begin{itemize}
\item Commodity $\neq$ Low-end
\begin{itemize}
\item False economy due to failure rate and maintenance costs
\end{itemize}
\item Commodity $\neq$ High-end
\begin{itemize}
\item High-end machines perform better, which would imply a smaller cluster
\item A single machine failure would compromise a large fraction of the cluster
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{A 2012 specification}:
\begin{itemize}
\item Dual socket, two hexa-core CPUs
\item 128 GB {\color{red}ECC} RAM
\item 8 $\times$ 1 TB disks\footnote{\color{red}Why not use RAID instead of JBOD?}
\item \{1,10\} Gigabit Ethernet
\end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Cluster Specification}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Example:}
\begin{itemize}
\item Assume your data grows by 1 TB per week
\item Assume you have three-way replication in HDFS
\item[$\to$] You need an additional 3 TB of raw storage per week
\item Allow for some overhead (temporary files, logs)
\item[$\to$] {\color{red}This is a new machine per week}
\end{itemize}

\vspace{20pt}

\item \textbf{How to dimension a cluster?}
\begin{itemize}
\item Obviously, you won't buy a machine per week!
\item Instead, project the back-of-the-envelope calculation above over the expected lifetime of your system, e.g., two years
\item[$\to$] You would need a 100-machine cluster
\end{itemize}

\vspace{20pt}

\item \textbf{Where should you put the various components?}
\begin{itemize}
\item Small cluster: the NameNode and the JobTracker can be {\color{red}collocated}
\item Large cluster: requires more RAM at the NameNode
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Cluster Specification}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Should we use 64-bit or 32-bit machines?}
\begin{itemize}
\item The NameNode should run on a 64-bit machine: this avoids the 3GB Java heap size limit on 32-bit machines
\end{itemize}

\vspace{40pt}

\item \textbf{What's the role of Java?}
\begin{itemize}
\item Recent releases (Java 6) implement optimizations to eliminate the large pointer overhead
\item[$\to$] A cluster of 64-bit machines has no downside
\end{itemize}
\end{itemize}

}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Network Topology}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Network Topology}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\framebox{\includegraphics[scale=0.3]{./Figures/cluster_net_topology}}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Network Topology}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Two-level network topology}
\begin{itemize}
\item Switch redundancy is not shown in the figure
\end{itemize}

\vspace{20pt}

\item \textbf{Typical configuration}
\begin{itemize}
\item 30-40 servers per rack
\item 10 Gb/s top-of-rack (TOR) switch
\item Core switch or router at 10 Gb/s or better
\end{itemize}

\vspace{20pt}

\item \textbf{Features}
\begin{itemize}
\item Aggregate bandwidth between nodes on the same rack is much larger than for nodes on different racks
\item {\color{red}Rack awareness}
\begin{itemize}
\item Hadoop should know the cluster topology
\item Benefits both HDFS (data placement) and MapReduce (locality)
\end{itemize}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Hadoop Configuration}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Hadoop Configuration}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{There are a handful of files for controlling the operation of a Hadoop cluster}
\begin{itemize}
\item Hundreds of parameters!!
\item See the next slide for a summary table
\end{itemize}

\vspace{20pt}

\item \textbf{Managing the configuration across several machines}
\begin{itemize}
\item All machines of a Hadoop cluster must be in sync!
\item What happens if you dispatch an update and some machines are down?
\item What happens when you add (new) machines to your cluster?
\item What if you need to patch MapReduce?
\end{itemize}

\vspace{20pt}

\item \textbf{Common practice: use configuration management tools}
\begin{itemize}
\item Chef, Puppet, ...
\item Declarative language to specify configurations
\item They can also install software
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Hadoop Configuration}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{tiny}
\begin{table}[h]
\centering
\begin{tabular}{||c|c|l||}
\hline
\hline
{\textbf{Filename}} & {\textbf{Format}} & {\textbf{Description}} \\
\hline
\hline
hadoop-env.sh & Bash script & {Environment variables that are used in the scripts to run Hadoop.} \\
core-site.xml & Hadoop configuration XML & I/O settings that are common to HDFS and MapReduce.\\
hdfs-site.xml & Hadoop configuration XML & Namenode, the secondary namenode, and the datanodes. \\
mapred-site.xml & Hadoop configuration XML & Jobtracker, and the tasktrackers.\\
masters & Plain text & A list of machines that each run a secondary namenode.\\
slaves & Plain text & A list of machines that each run a datanode and a tasktracker.
\\
\hline
\hline
\end{tabular}
\caption{Hadoop Configuration Files}
\label{tab:conf}
\end{table}
\end{tiny}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Hadoop Configuration: memory utilization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Hadoop uses a lot of memory}
\begin{itemize}
\item Default values, for a typical cluster configuration
\begin{itemize}
\item DataNode: 1 GB
\item TaskTracker: 1 GB
\item Child JVM map task: 2 $\times$ 200MB
\item Child JVM reduce task: 2 $\times$ 200MB
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{All the moving parts of Hadoop (HDFS and MapReduce) can be individually configured}
\begin{itemize}
\item This is true for the cluster configuration, but also for {\color{red}job specific} configurations
\end{itemize}

\vspace{20pt}

\item \textbf{Hadoop is fast when using RAM}
\begin{itemize}
\item Generally, MapReduce Jobs {\color{red}are not} CPU-bound
\item Avoid I/O on disk as much as you can
\item Minimize network traffic
\begin{itemize}
\item Customize the partitioner
\item Use compression ($\to$ decompression is in RAM)
\end{itemize}
\end{itemize}

\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Cloud Deployments}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Elephants in the cloud!}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Many organizations run Hadoop in private clusters}
\begin{itemize}
\item Pros and cons
\end{itemize}

\vspace{40pt}

\item \textbf{Cloud-based Hadoop installations (Amazon biased)}
\begin{itemize}
\item Use Cloudera + \{Whirr, boto, ...\}
\item Use Elastic MapReduce
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Hadoop on EC2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Launch instances of a cluster on demand, paying by the hour}
\begin{itemize}
\item You pay for CPU time; in general, bandwidth used within a datacenter is free
\end{itemize}

\vspace{20pt}

\item \textbf{Apache Whirr project}
\begin{itemize}
\item Launch, terminate, modify a running cluster
\item Requires AWS credentials
\end{itemize}

\vspace{20pt}

\item \textbf{Example}
\begin{itemize}
\item Launch a cluster \texttt{test-hadoop-cluster}, with one master node (\texttt{JobTracker} and \texttt{NameNode}) and 5 worker nodes (\texttt{DataNodes} and \texttt{TaskTrackers})
\item[$\to$] \texttt{hadoop-ec2 launch-cluster test-hadoop-cluster 5}
\item See Chapter 9 \cite{hadoop_book}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{AWS Elastic MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Hadoop as a service}
\begin{itemize}
\item Amazon handles everything, which becomes transparent
\item How this is done remains a mystery
\end{itemize}

\vspace{40pt}

\item \textbf{Focus on What, not How}
\begin{itemize}
\item All you need to do is package a MapReduce Job in a JAR and upload it using a Web Interface
\item Other job types are available: Python (streaming), Pig, Hive, ...
\item {\color{red}Test your jobs locally!!!}
\end{itemize}
\end{itemize}

}
--------------------------------------------------------------------------------
/disa/graph_algorithms.tex:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Preliminaries}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Motivations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Examples of graph problems}
\begin{itemize}
\item Clustering
\item Matching problems
\item Element analysis: node and edge centralities
\end{itemize}

\vspace{20pt}

\item \textbf{The problem: big graphs}

\vspace{20pt}

\item \textbf{Why MapReduce?}
\begin{itemize}
\item Algorithms for the above problems on a single machine are not scalable
\item Recently, Google designed a new system, Pregel, for large-scale ({\color{red}incremental}) graph processing
\item Even more recently, \cite{Lattanzi2011} indicates a fundamentally new design pattern to analyze graphs in MapReduce
\item New trend: graph databases, graph processing systems\footnote{If you're interested, we'll discuss this off-line.}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Graph Representations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Basic data structures}
\begin{itemize}
\item Adjacency matrix
\item Adjacency list
\end{itemize}

\vspace{20pt}

\item \textbf{Are graphs sparse or dense?}
\begin{itemize}
\item Determines which data structure to use
\begin{itemize}
\item Adjacency matrix: operations on incoming links are easy (column scan)
\item Adjacency list: operations on outgoing links are easy
\item The shuffle and sort phase can help, by grouping edges by their destination reducer
\end{itemize}
\item \cite{Leskovec2005} dispelled the notion of sparseness of real-world graphs
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel
Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Single-source shortest path}
\begin{itemize}
\item Dijkstra's algorithm uses a {\color{red}global priority queue}
\begin{itemize}
\item Maintains a globally sorted list of nodes by current distance
\end{itemize}
\item How to solve this problem in parallel?
\begin{itemize}
\item ``Brute-force'' approach: breadth-first search
\end{itemize}
\end{itemize}

\vspace{40pt}

\item \textbf{Parallel BFS: intuition}
\begin{itemize}
\item Flooding
\item {\color{red}Iterative algorithm} in MapReduce
\item Shoehorn message-passing style algorithms
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.4]{./Figures/pbfs}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Assumptions}
\begin{itemize}
\item Connected, directed graph
\item Data structure: adjacency list
\item The distance to each node is stored alongside the adjacency list of that node
\end{itemize}

\vspace{20pt}

\item \textbf{The pseudo-code}
\begin{itemize}
\item We use $n$ to denote the node id (an integer)
\item We use $N$ to denote the node adjacency list and current distance
\item The algorithm works by mapping over all nodes
\item Mappers emit a key-value pair for each neighbor on the node's adjacency list
\begin{itemize}
\item The key: node id of the neighbor
\item The value: the current distance to the node plus one
\item If we can reach node $n$ with distance $d$, then we must be able to reach all the nodes connected to $n$ with distance $d+1$
\end{itemize}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The pseudo-code (continued)}
\begin{itemize}
\item After shuffle and sort, reducers receive keys corresponding to the destination node ids and distances corresponding to all paths leading to that node
\item The reducer selects the shortest of these distances and updates the distance in the node data structure
\end{itemize}

\vspace{20pt}

\item \textbf{Passing the graph along}
\begin{itemize}
\item The mapper: emits the node adjacency list, with the node id as the key
\item The reducer: must distinguish between the node data structure and the distance values
\item[$\to$] A full pseudo-code sketch follows on the next slide
\end{itemize}
\end{itemize}
}
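%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search: Pseudo-Code}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A simplified sketch in the style of our other listings (unit edge
weights; distances initialized to $\infty$, except the source, set to 0):

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}
\begin{tiny}
\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{nid $n$, node $N$}
\State $d \gets N.\textit{Distance}$
\State $\textsc{Emit}(\textrm{nid }n, \textrm{node }N)$ \Comment{Pass along the graph structure}
\ForAll{$\textrm{nid } m \in N.\textit{AdjacencyList}$}
\State $\textsc{Emit}(\textrm{nid }m, d + 1)$ \Comment{Distances to reachable nodes}
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{nid $m$, $[ d_1, d_2, \ldots ]$}
\State $d_{min} \gets \infty$
\State $M \gets \emptyset$
\ForAll{$d \in [ d_1, d_2, \ldots ]$}
\If{\textsc{IsNode}($d$)}
\State $M \gets d$ \Comment{Recover the graph structure}
\ElsIf{$d < d_{min}$}
\State $d_{min} \gets d$ \Comment{Keep the shortest distance seen}
\EndIf
\EndFor
\If{$d_{min} < M.\textit{Distance}$}
\State $M.\textit{Distance} \gets d_{min}$
\EndIf
\State $\textsc{Emit}(\textrm{nid }m, \textrm{node }M)$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{tiny}
\end{algorithm}
}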
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{MapReduce iterations}
\begin{itemize}
\item The first time we run the algorithm, we ``discover'' all nodes connected to the source
\item In the second iteration, we discover all nodes connected to those
\item[$\to$] Each iteration expands the ``search frontier'' by one hop
\item {\color{red}How many iterations are needed before convergence?}
\end{itemize}

\vspace{40pt}

\item \textbf{This approach is suitable for small-world graphs}
\begin{itemize}
\item The diameter of the network is small
\item See \cite{Lattanzi2011} for advanced topics on the subject
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Parallel Breadth-First Search}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Checking the termination of the algorithm}
\begin{itemize}
\item Requires a ``driver'' program which submits a job, checks the termination condition, and iterates if necessary
\item In practice:
\begin{itemize}
\item Hadoop counters
\item Side-data to be passed to the job configuration
\end{itemize}
\end{itemize}

\vspace{40pt}

\item \textbf{Extensions}
\begin{itemize}
\item Storing the actual shortest-path
\item Weighted edges (as opposed to unit distance)
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{The story so far}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The graph structure is stored in adjacency lists}
\begin{itemize}
\item This data structure can be augmented with additional information
\end{itemize}

\vspace{20pt}

\item \textbf{The MapReduce framework}
\begin{itemize}
\item Maps over the node data structures, involving only the node's internal state and its {\color{red}local} graph structure
\item Map results are ``passed'' along outgoing edges
\item The graph itself is passed from the mapper to the reducer
\begin{itemize}
\item This is a very costly operation for large graphs!
\end{itemize}
\item Reducers aggregate over ``same destination'' nodes
\end{itemize}

\vspace{20pt}

\item \textbf{Graph algorithms are generally iterative}
\begin{itemize}
\item They require a driver program to check for termination
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{PageRank}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{What is PageRank}
\begin{itemize}
\item It's a measure of the relevance of a Web page, based on the structure of the hyperlink graph
\item Based on the concept of a random Web surfer
\end{itemize}

\vspace{20pt}

\item \textbf{Formally we have: }
$$P(n) = \alpha \Big( \frac{1}{|G|}\Big) + (1-\alpha) \sum_{m \in L(n)}\frac{P(m)}{C(m)}$$
\begin{itemize}
\item $|G|$ is the number of nodes in the graph
\item $\alpha$ is a random jump factor
\item $L(n)$ is the set of pages that link to page $n$
\item $C(m)$ is the out-degree of node $m$
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in Details}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{PageRank is defined recursively, hence we need an iterative algorithm}
\begin{itemize}
\item A node receives ``contributions'' from all pages that link to it
\end{itemize}

\vspace{20pt}

\item \textbf{Consider the set of nodes $L(n)$}
\begin{itemize}
\item A random surfer at $m$ arrives at $n$ with probability $1/C(m)$
\item Since the PageRank value of $m$ is the probability that the random surfer is at $m$, the probability of arriving at $n$ from $m$ is $P(m)/C(m)$
\end{itemize}

\vspace{20pt}

\item \textbf{To compute the PageRank of $n$ we need to:}
\begin{itemize}
\item Sum the contributions from all pages that link to $n$
\item Take into account the random jump, which is uniform over all nodes in the graph
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.4]{./Figures/pr}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.4]{./Figures/pr_toy}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.4]{./Figures/pr_sketch}
\end{center}
}

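%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce: Pseudo-Code}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A simplified sketch (dangling nodes and the random jump are left to a
separate job; a prose walkthrough follows on the next slide):

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}
\begin{tiny}
\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{nid $n$, node $N$}
\State $p \gets N.\textit{PageRank} / |N.\textit{AdjacencyList}|$
\State $\textsc{Emit}(\textrm{nid }n, \textrm{node }N)$ \Comment{Pass along the graph structure}
\ForAll{$\textrm{nid } m \in N.\textit{AdjacencyList}$}
\State $\textsc{Emit}(\textrm{nid }m, p)$ \Comment{Distribute the PageRank mass}
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{nid $m$, $[ p_1, p_2, \ldots ]$}
\State $M \gets \emptyset$
\State $s \gets 0$
\ForAll{$p \in [ p_1, p_2, \ldots ]$}
\If{\textsc{IsNode}($p$)}
\State $M \gets p$ \Comment{Recover the graph structure}
\Else
\State $s \gets s + p$ \Comment{Sum the incoming PageRank mass}
\EndIf
\EndFor
\State $M.\textit{PageRank} \gets s$
\State $\textsc{Emit}(\textrm{nid }m, \textrm{node }M)$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{tiny}
\end{algorithm}
}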
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Sketch of the MapReduce algorithm}
\begin{itemize}
\item The algorithm maps over the nodes
\item For each node, it computes the PageRank mass that needs to be distributed to the neighbors
\item Each fraction of the PageRank mass is emitted as the value, keyed by the node ids of the neighbors
\item In the shuffle and sort phase, values are grouped by node id
\begin{itemize}
\item Also, we pass the graph structure from mappers to reducers (for subsequent iterations to take place over the updated graph)
\end{itemize}
\item The reducer updates the PageRank value of every single node
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{PageRank in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Implementation details}
\begin{itemize}
\item Loss of PageRank mass for sink nodes
\item Auxiliary state information
\item One iteration of the algorithm
\begin{itemize}
\item Two MapReduce jobs: one to distribute the PageRank mass, the other for dangling nodes and random jumps
\end{itemize}
\item Checking for convergence
\begin{itemize}
\item Requires a driver program
\item When the updates of PageRank are ``stable'', the algorithm stops
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{Further reading on {\color{red}convergence} and {\color{red}attacks}}
\begin{itemize}
\item Convergence: \cite{Page1999, Bianchini2005}
\end{itemize}
\end{itemize}
}
--------------------------------------------------------------------------------
/disa/programming_model.tex:
--------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Functional Programming Roots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Key feature: higher-order functions}
\begin{itemize}
\item Functions that accept other functions as arguments
\item \textbf{Map} and \textbf{Fold}
\end{itemize}
\end{itemize}

\begin{figure}[h]
\centering
\includegraphics[scale=0.4]{./Figures/functional}
\caption{Illustration of \emph{map} and \emph{fold}.}
\label{fig:functional}
\end{figure}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Functional Programming Roots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{map phase:}
\begin{itemize}
\item Given a list, \emph{map} takes as an argument a function $f$ (that takes a single argument) and applies it to all elements in the list
\end{itemize}

\vspace{20pt}

\item \textbf{fold phase:}
\begin{itemize}
\item Given a list, \emph{fold} takes as arguments a function $g$ (that takes two arguments) and an initial value (an accumulator)
\item $g$ is first applied to the initial value and the first item in the list
\item The result is stored in an
intermediate variable, which is used as an input, together with the next item, to a second application of $g$
\item The process is repeated until all items in the list have been consumed
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Functional Programming Roots}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{We can view map as a transformation over a dataset}
\begin{itemize}
\item This transformation is specified by the function $f$
\item Each functional application happens in {\color{red} \textbf{isolation}}
\item The application of $f$ to each element of a dataset can be parallelized in a straightforward manner
\end{itemize}

\vspace{20pt}

\item \textbf{We can view fold as an aggregation operation}
\begin{itemize}
\item The aggregation is defined by the function $g$
\item Data locality: elements in the list must be ``brought together''
\item If we can {\color{red} \textbf{group}} elements of the list, the fold phase can also proceed in parallel
\end{itemize}

\vspace{20pt}

\item \textbf{Associative and commutative operations}
\begin{itemize}
\item Allow performance gains through local aggregation and reordering
\end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Functional Programming and MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Equivalence of MapReduce and Functional Programming:}
\begin{itemize}
\item The map of MapReduce corresponds to the map operation
\item The reduce of MapReduce corresponds to the fold operation
\end{itemize}

\vspace{20pt}

\item \textbf{The framework coordinates the map and reduce phases:}
\begin{itemize}
\item Grouping intermediate results happens in parallel
\end{itemize}

\vspace{20pt}

\item \textbf{In practice:}
\begin{itemize}
\item A user-specified computation is applied (in parallel) to all input records of a dataset
\item Intermediate results are aggregated by another user-specified computation
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{What can we do with MapReduce?}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{MapReduce ``implements'' a subset of functional programming}
\begin{itemize}
\item The programming model appears quite limited and strict
\item A concrete map/fold example follows on the next slide
\end{itemize}

\vspace{20pt}

\item \textbf{There are several important problems that can be adapted to MapReduce}
\begin{itemize}
\item We will focus on illustrative cases
\item We will see in detail ``design patterns''
\begin{itemize}
\item How to transform a problem and its input
\item How to save memory and bandwidth in the system
\end{itemize}
\end{itemize}

\end{itemize}

}
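%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Map and Fold: a Concrete Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A small illustration of the two higher-order functions (here in Java~8
streams syntax, purely as an example; any functional language works):
\begin{verbatim}
List<Integer> xs = Arrays.asList(1, 2, 3, 4);

// map: apply f (here, squaring) to each element in isolation
// fold/reduce: combine elements with g (here, +), starting
// from the initial accumulator value 0
int sumOfSquares = xs.stream()
                     .map(x -> x * x)
                     .reduce(0, (a, b) -> a + b);  // = 30
\end{verbatim}
Since $+$ is associative and commutative, the fold can proceed by
local partial sums; this is exactly the property that combiners
exploit in MapReduce.
\end{frame}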
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Data Structures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Key-value pairs are the basic data structure in MapReduce}
\begin{itemize}
\item Keys and values can be integers, floats, strings, or raw bytes
\item They can also be \textbf{arbitrary data structures}
\end{itemize}

\vspace{20pt}

\item \textbf{The design of MapReduce algorithms involves:}
\begin{itemize}
\item Imposing the key-value structure on arbitrary datasets\footnote{There's more to it: here we only look at the input to the map function.}
\begin{itemize}
\item E.g.: for a collection of Web pages, input keys may be URLs and values may be the HTML content
\end{itemize}
\item In some algorithms input keys are not used, in others they uniquely identify a record
\item Keys can be combined in complex ways to design various algorithms
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{A Generic MapReduce Algorithm}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The programmer defines a mapper and a reducer as follows}\footnote{We use the convention $[ \cdots ]$ to denote a list.}\footnote{Subscripts indicate different data types.}:
\begin{itemize}
\item map: $(k_1,v_1) \to [(k_2,v_2)]$
\item reduce: $(k_2,[v_2]) \to [(k_3,v_3)]$
\end{itemize}

\vspace{20pt}

\item \textbf{In words}:
\begin{itemize}
\item A dataset is stored on an underlying \textbf{distributed} filesystem, split in a number of \textbf{blocks} across machines
\item The mapper is applied to every input key-value pair to generate intermediate key-value pairs
\item The reducer is applied to all values associated with the same intermediate key to generate output key-value pairs
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Where the magic happens}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Implicit between the map and reduce phases is a {\color{red}parallel ``\textbf{group by}''} operation on intermediate keys}
\begin{itemize}
\item Intermediate data arrive at each reducer in order, sorted by the key
\item No ordering is guaranteed across reducers
\end{itemize}

\vspace{20pt}

\item \textbf{Output keys from reducers are written back to the distributed filesystem}
\begin{itemize}
\item The output may consist of $r$ distinct files, where $r$ is the number of reducers
\item Such output may be the input to a subsequent MapReduce phase\footnote{Think of \textbf{iterative algorithms}.}
\end{itemize}

\vspace{20pt}

\item \textbf{Intermediate keys are transient}:
\begin{itemize}
\item They are not stored on the distributed filesystem
\item They are ``spilled'' to the local disk of each machine in the cluster
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{``Hello World'' in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $t \in$ line $l$}
\State $\textsc{Emit}(\textrm{term }t, \textrm{count }1)$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{term $t$, counts $[ c_1, c_2, \ldots ]$}
\State $sum \gets 0$
\ForAll{$ \textrm{count }c \in \textrm{counts }[ c_1, c_2, \ldots ]$}
\State $sum \gets sum + c$
\EndFor
\State $\textsc{Emit}(\textrm{term }t, \textrm{count }sum)$
\EndProcedure
\EndFunction
\end{algorithmic}

\end{algorithm}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{figure}[h]
\centering
\includegraphics[scale=0.23]{./Figures/simple_MR}
\label{fig:simple_MR}
\end{figure}
}

\frame {\frametitle{``Hello World'' in MapReduce}
\begin{itemize}

\item \textbf{Input:}
\begin{itemize}
\item Key-value pairs: (offset, line) of a file stored on the distributed filesystem
\item $a$: unique identifier of a line (its offset)
\item $l$: the text of the line itself
\end{itemize}

\item \textbf{Mapper:}
\begin{itemize}
\item Takes an input key-value pair and tokenizes the line
\item Emits intermediate key-value pairs: the word is the key and the integer is the value
\end{itemize}

\item \textbf{The framework:}
\begin{itemize}
\item Guarantees all values associated with the same key (the word) are brought to the same reducer
\end{itemize}

\item \textbf{The reducer:}
\begin{itemize}
\item Receives all values associated to some keys
\item Sums the values and writes output key-value pairs: the key is the word and the value is the number of occurrences
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Combiners are a general mechanism to reduce the amount of intermediate data}
\begin{itemize}
\item They could be thought of as ``mini-reducers'' (see the driver sketch on the next slide)
\end{itemize}

\vspace{20pt}

\item \textbf{Back to our running example: word count}
\begin{itemize}
\item Combiners aggregate term counts across documents processed by each map task
\item If combiners take advantage of all opportunities for local aggregation, we have at most $m \times V$ intermediate key-value pairs
\begin{itemize}
\item $m$: number of mappers
\item $V$: number of unique terms in the collection
\end{itemize}
\item Note: due to the Zipfian nature of term distributions, not all mappers will see all terms
\end{itemize}

\end{itemize}
}
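%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Enabling a Combiner: Driver Sketch}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A minimal job-driver sketch; the \texttt{TokenizerMapper} and
\texttt{IntSumReducer} class names follow the classic WordCount
example and are assumptions here:
\begin{verbatim}
Job job = new Job(new Configuration(), "word count");
job.setJarByClass(WordCount.class);
job.setMapperClass(TokenizerMapper.class);
// Sum is commutative and associative, so the reducer
// can be reused, unchanged, as the combiner
job.setCombinerClass(IntSumReducer.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
\end{verbatim}
Remember: the framework may run the combiner zero, one, or several
times per map task.
\end{frame}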
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{A word of caution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The use of combiners must be thought through carefully}
\begin{itemize}
\item In Hadoop, they are optional: the correctness of the algorithm cannot depend on the computation (or even the execution) of the combiners
\end{itemize}

\vspace{20pt}

\item \textbf{Combiners I/O types}
\begin{itemize}
\item Input: $(k_2, [v_2])$ [Same input as for Reducers]
\item Output: $[(k_2, v_2)]$ [Same output as for Mappers]
\end{itemize}

\vspace{20pt}

\item \textbf{Commutative and Associative computations}
\begin{itemize}
\item Reducer and Combiner code may be interchangeable (e.g. Word Count)
\item This is not true in the general case
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.23]{./Figures/simple_MR_combiners}
\end{center}
}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Algorithmic Correctness: an Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Problem statement}
\begin{itemize}
\item We have a large dataset where input keys are strings and input values are integers
\item We wish to compute the mean of all integers associated with the same key
\begin{itemize}
\item In practice: the dataset can be a log from a website, where the keys are user IDs and the values are some measure of activity
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{Next, a baseline approach}
\begin{itemize}
\item We use an \textbf{identity mapper}, which groups and sorts appropriately the input key-value pairs
\item Reducers keep track of the running sum and the number of integers encountered
\item The mean is emitted as the output of the reducer, with the input string as the key
\end{itemize}

\vspace{20pt}

\item \textbf{Inefficiency problems in the shuffle phase}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Example: Computing the mean}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}
\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{string $t$, integer $r$}
\State $\textsc{Emit}(\textrm{string }t, \textrm{integer }r)$
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{string $t$, integers $[ r_1, r_2, \ldots ]$}
\State $sum \gets 0$
\State $cnt \gets 0$
\ForAll{$ \textrm{integer }r \in \textrm{integers }[ r_1, r_2, \ldots ]$}
\State $sum \gets sum + r$
\State $cnt \gets cnt + 1$
\EndFor
\State $r_{avg} \gets sum/cnt$
\State $\textsc{Emit}(\textrm{string }t, \textrm{integer } r_{avg})$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Algorithmic Correctness}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item
\textbf{Note: the mean is not a distributive operation}
  \begin{itemize}
  \item \texttt{Mean}(1,2,3,4,5) $= 3$, whereas
    \texttt{Mean}(\texttt{Mean}(1,2), \texttt{Mean}(3,4,5)) $=
    \texttt{Mean}(1.5, 4) = 2.75$
  \item Hence: a combiner cannot output partial means and hope that
    the reducer will compute the correct final mean
  \end{itemize}

\vspace{20pt}

\item \textbf{Rule of thumb:}
  \begin{itemize}
  \item Combiners are optimizations; the algorithm should still work
    when they are ``removed''
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Example: Computing the mean with combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}
\begin{tiny}
\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{string $t$, integer $r$}
\State $\textsc{Emit}(\textrm{string }t, \textrm{pair }(r, 1))$
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Combiner}{}
\Procedure{Combine}{string $t$, pairs $[ (s_1, c_1), (s_2, c_2) \ldots ]$}
\State $sum \gets 0$
\State $cnt \gets 0$
\ForAll{$ \textrm{pair }(s, c) \in \textrm{pairs }[ (s_1, c_1), (s_2, c_2) \ldots ]$}
\State $sum \gets sum + s$
\State $cnt \gets cnt + c$
\EndFor
\State $\textsc{Emit}(\textrm{string }t, \textrm{pair }(sum, cnt))$
\EndProcedure
\EndFunction
\end{algorithmic}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{string $t$, pairs $[ (s_1, c_1), (s_2, c_2) \ldots ]$}
\State $sum \gets 0$
\State $cnt \gets 0$
\ForAll{$ \textrm{pair }(s, c) \in \textrm{pairs }[ (s_1, c_1), (s_2, c_2) \ldots ]$}
\State $sum \gets sum + s$
\State $cnt \gets cnt + c$
\EndFor
\State $r_{avg} \gets sum/cnt$
\State $\textsc{Emit}(\textrm{string }t, \textrm{integer } r_{avg})$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{tiny}
\end{algorithm}

}

-------------------------------------------------------------------------------- /relal/relational.tex: --------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Introduction}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Disclaimer}
  \begin{itemize}
  \item This is not a full course on Relational Algebra
  \item Nor is this a course on SQL
  \end{itemize}

\vspace{20pt}

\item \textbf{Introduction to Relational Algebra, RDBMS and SQL}
  \begin{itemize}
  \item Follow the video lectures of the Stanford class on RDBMS
  \item[] \url{https://www.coursera.org/course/db}
  \item[$\to$] Note that you have to sign up for an account
  \end{itemize}

\vspace{20pt}

\item \textbf{Overview of this part}
  \begin{itemize}
  \item Brief
introduction to a simplified relational algebra
  \item Useful to understand Pig, Hive and HBase
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Relational Algebra Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{There are a number of operations on data that fit
    the relational algebra model well}
  \begin{itemize}
  \item In traditional RDBMS, queries involve the retrieval of
    {\color{red}small amounts of data}
  \item In this course, and in particular in this class, we should
    keep in mind the particular workload underlying MapReduce
  \item[$\to$] Full scans of large amounts of data
  \item[$\to$] Queries are not selective\footnote{This is true in general. However, most ETL jobs involve selection and projection to do data preparation.}, they process all the data
  \end{itemize}

\vspace{20pt}

\item \textbf{A review of some terminology}
  \begin{itemize}
  \item A {\color{red}\textit{relation}} is a table
  \item {\color{red}\textit{Attributes}} are the column headers of
    the table
  \item The set of attributes of a relation is called a
    {\color{red}\textit{schema}}
  \item[] Example: $R(A_1,A_2,...,A_n)$ indicates a relation called
    $R$ whose attributes are $A_1,A_2,...,A_n$
  \end{itemize}
\end{itemize}
}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Let's start with an example}
  \begin{itemize}
  \item Below, we have part of a relation called \textit{Links}
    describing the structure of the Web
  \item There are two \textit{attributes}: \textit{From} and
    \textit{To}
  \item A row, or {\color{red}\textit{tuple}}, of the relation is a
    pair of URLs, indicating the existence of a link between them
  \item[$\to$] The number of tuples in a real dataset is on the
    order of billions ($10^9$)
  \end{itemize}
\end{itemize}

\begin{center}
  \begin{tabular}[h]{|c|c|}
    \hline
    From & To \\
    \hline
    \hline
    \texttt{url1} & \texttt{url2} \\
    \texttt{url1} & \texttt{url3} \\
    \texttt{url2} & \texttt{url3} \\
    \texttt{url2} & \texttt{url4} \\
    $\cdots$ & $\cdots$\\
    \hline
  \end{tabular}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Relations (however big) can be stored in a distributed
    filesystem}
  \begin{itemize}
  \item If they don't fit on a single machine, they're broken into
    pieces (think HDFS)
  \end{itemize}

\vspace{20pt}

\item \textbf{Next, we review and describe a set of relational
    algebra operators}
  \begin{itemize}
  \item Intuitive explanation of what they do
  \item ``Pseudo-code'' of their implementation in/by MapReduce
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Selection: $\sigma_C(R)$}
  \begin{itemize}
  \item Apply condition $C$ to each tuple of relation $R$
  \item Output a relation containing only the tuples that
    satisfy $C$
  \end{itemize}

\vspace{10pt}

\item \textbf{Projection: $\pi_S(R)$}
  \begin{itemize}
  \item Given a \textit{subset} $S$ of the attributes of relation $R$
  \item Output a relation whose tuples keep only the components
    for the attributes in $S$
  \end{itemize}

\vspace{10pt}

\item \textbf{Union, Intersection and Difference}
  \begin{itemize}
  \item Well-known operators on sets
  \item Apply to the sets of tuples of two relations that have the
    {\color{red}same schema}
  \item Variations on the theme: work on \textit{bags}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Natural join $R \Join S$}
  \begin{itemize}
  \item Given two relations, \textit{compare each pair of tuples},
    one from each relation
  \item If the tuples agree on all the attributes common to both
    schemas $\to$ produce an output tuple that has a component for
    each attribute
  \item Otherwise, produce nothing
  \item The {\color{red}\textit{join condition}} can be on a subset of attributes
  \end{itemize}

\vspace{20pt}

\item \textbf{Let's work with an example}
  \begin{itemize}
  \item Recall the \textit{Links} relation from the previous slides
  \item \texttt{Query} (or data processing job): \texttt{find the paths of length two
      in the Web}
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Join Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Informally, to satisfy the query we must:}
  \begin{itemize}
  \item find the triples of URLs of the form $(u,v,w)$ such that
    there is a link from $u$ to $v$ and a link from $v$ to $w$
  \end{itemize}

\vspace{20pt}

\item \textbf{Using the join operator}
  \begin{itemize}
  \item Imagine we have two relations (with different schemas), and
    let's try to apply the natural join operator
  \item There are two copies of \textit{Links}: $L_1(U_1,U_2)$ and
    $L_2(U_2,U_3)$
  \item Let's compute $L_1 \Join L_2$
    \begin{itemize}
    \item For each tuple $t_1$ of $L_1$ and each tuple $t_2$ of
      $L_2$, see if their $U_2$ components are the same
    \item If so, produce an output tuple with the schema $(U_1,U_2,U_3)$
    \end{itemize}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Join Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{What we have seen is called (to be precise) a
    {\color{red}self-join}}
  \begin{itemize}
  \item {\color{red}Question}: How
would you implement a self-join in your favorite programming language?
  \item {\color{red}Question}: What is the time complexity of your
    algorithm?
  \item {\color{red}Question}: What is the space complexity of your
    algorithm?
  \end{itemize}

\vspace{20pt}

\item \textbf{To continue the example}
  \begin{itemize}
  \item Say you are not interested in the entire two-hop path but
    just the start and end nodes
  \item Then you do a projection, and the notation would be:
    $\pi_{U_1,U_3}(L_1 \Join L_2)$
  \end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Grouping and Aggregation: $\gamma_X(R)$}
  \begin{itemize}
  \item Given a relation $R$, partition its tuples according to
    their values in one set of attributes $G$
    \begin{itemize}
    \item The set $G$ is called the {\color{red}grouping attributes}
    \end{itemize}
  \item Then, for each group, aggregate the values in certain other
    attributes
    \begin{itemize}
    \item Aggregation functions: \texttt{SUM}, \texttt{COUNT}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX}, ...
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{In the notation, $X$ is a list of elements that can be:}
  \begin{itemize}
  \item A grouping attribute
  \item An expression $\theta(A)$, where $\theta$ is one of the
    (five) aggregation functions and $A$ is an attribute
    {\color{red}NOT} among the grouping attributes
  \end{itemize}

\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Operators}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Grouping and Aggregation: $\gamma_X(R)$}
  \begin{itemize}
  \item The result of this operation is a relation with one tuple
    for each group
  \item That tuple has a component for each of the grouping
    attributes, with the value common to the tuples of that group
  \item That tuple has another component for each aggregation, with
    the aggregate value for that group
  \end{itemize}

\vspace{20pt}

\item \textbf{Let's work with an example}
  \begin{itemize}
  \item Imagine that a social-networking site has a relation
  \item[] \texttt{Friends(User, Friend)}
  \item The tuples are pairs $(a,b)$ such that $b$ is a friend of
    $a$
  \item \texttt{Query: compute the number of friends each member has}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Grouping and Aggregation Example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{How to satisfy the query}
  \begin{itemize}
  \item[] $\gamma_{User,\, \mathtt{COUNT}(Friend)}(Friends)$
  \item This operation groups all the tuples by the value in their
    first component
  \item[$\to$] There is one group for each user
  \item Then, for each group, it counts the number of friends
  \end{itemize}

\vspace{20pt}

\item \textbf{Some details}
  \begin{itemize}
  \item The \texttt{COUNT}
operation applied to an attribute does
    not consider the values of that attribute
  \item In fact, it counts the number of tuples in the group
  \item In SQL, there is a ``count distinct'' operator that counts
    the number of different values
  \end{itemize}
\end{itemize}
}





%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Operators and MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing Selection}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{In practice, selections do not need a full-blown
    MapReduce implementation}
  \begin{itemize}
  \item They can be implemented in the {\color{red}map phase alone}
  \item Actually, they could also be implemented in the reduce phase
  \end{itemize}

\vspace{20pt}

\item \textbf{A MapReduce implementation of $\sigma_C(R)$}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For each tuple $t$ in $R$, check if $t$ satisfies $C$
    \item If so, emit a key/value pair $(t,t)$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item Identity reducer
    \item {\color{red}Question}: single or multiple reducers?
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{NOTE: the output is not exactly a relation}
  \begin{itemize}
  \item {\color{red}WHY?}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing Projections}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Similar process to selection}
  \begin{itemize}
  \item But projection may cause the same tuple to appear several times
  \end{itemize}

\vspace{20pt}

\item \textbf{A MapReduce implementation of $\pi_S(R)$}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For each tuple $t$ in $R$, construct a tuple $t'$ by
      eliminating those components whose attributes are not in $S$
    \item Emit a key/value pair $(t',t')$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item For each key $t'$ produced by any of the Map tasks, fetch
      $t', [t', \cdots, t']$
    \item Emit a key/value pair $(t',t')$
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{NOTE: the reduce operation is {\color{red}duplicate elimination}}
  \begin{itemize}
  \item This operation is associative and commutative, so it is
    possible to optimize MapReduce by using a \texttt{Combiner} in
    each mapper
  \end{itemize}
\end{itemize}

}

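%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Computing Selection: a Java sketch}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A map-only rendering of $\sigma_C(R)$, per the remark that selections
can be implemented in the map phase alone; the tab-separated tuple
layout and the predicate $C$ are assumptions:
{\tiny
\begin{verbatim}
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SelectionMapper
    extends Mapper<LongWritable, Text, Text, NullWritable> {
  @Override
  protected void map(LongWritable offset, Text tuple, Context ctx)
      throws IOException, InterruptedException {
    // Condition C (hypothetical): keep tuples whose first
    // component equals "url1"
    if (tuple.toString().split("\t")[0].equals("url1")) {
      ctx.write(tuple, NullWritable.get());  // emit the tuple
    }
  }
}
// Driver side: job.setNumReduceTasks(0) makes the job map-only,
// so map outputs are written straight to HDFS.
\end{verbatim}
}
\end{frame}
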
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing Unions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Suppose relations $R$ and $S$ have the same schema}
  \begin{itemize}
  \item Map tasks will be assigned chunks from either $R$ or $S$
  \item Mappers don't do much, they just pass tuples through to the
    reducers
  \item Reducers do duplicate elimination
  \end{itemize}

\vspace{20pt}

\item \textbf{A MapReduce implementation of union}
  \begin{itemize}
  \item[\texttt{Map}:]\footnote{Hadoop MapReduce supports reading multiple inputs.}
    \begin{itemize}
    \item For each tuple $t$ in $R$ or $S$, emit a key/value pair $(t,t)$
    \end{itemize}

  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item For each key $t$ there will be either one or two values
    \item Emit $(t,t)$ in either case
    \end{itemize}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing Intersections}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Very similar to computing unions}
  \begin{itemize}
  \item Suppose relations $R$ and $S$ have the same schema
  \item The map function is the same (an identity mapper) as for union
  \item The reduce function must produce a tuple only if both
    relations have that tuple
  \end{itemize}

\vspace{20pt}

\item \textbf{A MapReduce implementation of intersection}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For each tuple $t$ in $R$ or $S$, emit a key/value pair
      $(t,t)$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item If key $t$ has value list $[t,t]$ then emit the key/value
      pair $(t,t)$
    \item Otherwise, emit the key/value pair $(t, \mathtt{NULL})$
    \end{itemize}
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing Difference}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Assume we have two relations $R$ and $S$ with the same schema}
  \begin{itemize}
  \item The only way a tuple $t$ can appear in the output is if it
    is in $R$ but not in $S$
  \item The map function passes tuples from $R$ and $S$ to the reducer
  \item NOTE: it must inform the reducer whether the tuple came from $R$ or $S$
  \end{itemize}

\vspace{20pt}

\item \textbf{A MapReduce implementation of difference}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For a tuple $t$ in $R$ emit a key/value pair
      $(t,\mathtt{'R'})$ and for a tuple $t$ in $S$, emit a
      key/value pair $(t,\mathtt{'S'})$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item For each key $t$, do the following:
    \item If it is associated with $[\mathtt{'R'}]$ only, then emit $(t,t)$
    \item If it is associated with $[\mathtt{'R'},\mathtt{'S'}]$,
      $[\mathtt{'S'},\mathtt{'R'}]$, or $[\mathtt{'S'}]$, emit the
      key/value pair $(t, \mathtt{NULL})$
    \end{itemize}
  \end{itemize}
\end{itemize}
}

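%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Computing Difference: a Java sketch}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A possible reducer for $R-S$, assuming mappers tag each tuple with the
name of its relation; unlike the pseudo-code above, this sketch simply
emits nothing (rather than a \texttt{NULL} marker) when the tuple
appears in $S$:
{\tiny
\begin{verbatim}
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class DifferenceReducer
    extends Reducer<Text, Text, Text, NullWritable> {
  @Override
  protected void reduce(Text tuple, Iterable<Text> tags, Context ctx)
      throws IOException, InterruptedException {
    boolean inS = false;
    for (Text tag : tags) {            // tags is ["R"], ["S"], or both
      if (tag.toString().equals("S")) inS = true;
    }
    if (!inS) {                        // tuple appears only in R
      ctx.write(tuple, NullWritable.get());
    }
  }
}
\end{verbatim}
}
\end{frame}
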
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing the Natural Join}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{This topic is subject to continuous refinements}
  \begin{itemize}
  \item There are many \texttt{JOIN} operators and many different
    implementations
  \item We've seen some of them in the laboratory sessions
  \end{itemize}

\vspace{20pt}

\item \textbf{Let's look at two relations $R(A,B)$ and $S(B,C)$}
  \begin{itemize}
  \item We must find tuples that agree on their $B$ components
  \item We shall use the $B$-value of tuples from either relation as
    the key
  \item The value will be the other component and the name of the
    relation
  \item That way the reducer knows which relation each tuple
    comes from
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing the Natural Join}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A MapReduce implementation of Natural Join}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For each tuple $(a,b)$ of $R$ emit the key/value pair $(b,
      (\mathtt{'R'}, a))$
    \item For each tuple $(b,c)$ of $S$ emit the key/value pair $(b,
      (\mathtt{'S'}, c))$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item Each key $b$ will be associated with a list of pairs that
      are either $(\mathtt{'R'}, a)$ or $(\mathtt{'S'}, c)$
    \item Emit key/value pairs of the form $(b, [(a_1,b,c_1),(a_2,b,c_2),\cdots,(a_n,b,c_n)])$
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{NOTES}
  \begin{itemize}
  \item {\color{red}Question}: what if the MapReduce framework
    did not implement the distributed (and sorted) group by?
  \item In general, if $n$ tuples in relation $R$ and $m$ tuples
    in relation $S$ share a common $B$-value, we end up
    with $nm$ tuples in the result
  \item If all tuples of both relations have the same $B$-value,
    then we're computing the \textbf{Cartesian product}
  \end{itemize}

\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Grouping and Aggregation in MapReduce}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}

\item \textbf{Let $R(A,B,C)$ be a relation to which we apply
    $\gamma_{A,\theta(B)}(R)$}
  \begin{itemize}
  \item The map operation prepares the grouping
  \item The grouping is done by the framework
  \item The reducer computes the aggregation
  \item Simplifying assumptions: one grouping attribute and
    one aggregation function
  \end{itemize}

\vspace{20pt}

\item \textbf{MapReduce implementation of $\gamma_{A,\theta(B)}(R)$}\footnote{Note here that we are also projecting.}
  \begin{itemize}
  \item[\texttt{Map}:]
    \begin{itemize}
    \item For each tuple $(a,b,c)$ emit the key/value pair $(a,b)$
    \end{itemize}
  \item[\texttt{Reduce}:]
    \begin{itemize}
    \item Each key $a$ represents a group
    \item Apply $\theta$ to the list $[b_1,b_2,\cdots,b_n]$
    \item Emit the key/value pair $(a,x)$ where $x=\theta([b_1,b_2,\cdots,b_n])$
    \end{itemize}
  \end{itemize}

\end{itemize}
}
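%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Computing the Natural Join: a Java sketch}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
For completeness, a reduce-side join of $R(A,B)$ and $S(B,C)$ along the
lines above (a sketch; the tab-separated layout is an assumption, and
\texttt{MultipleInputs} can assign one mapper class per relation).
Buffering both sides in memory is exactly what makes skewed $B$-values
expensive:
{\tiny
\begin{verbatim}
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

public class JoinExample {
  public static class RMapper extends Mapper<LongWritable, Text, Text, Text> {
    protected void map(LongWritable off, Text line, Context ctx)
        throws IOException, InterruptedException {
      String[] t = line.toString().split("\t");          // t = (a, b)
      ctx.write(new Text(t[1]), new Text("R\t" + t[0]));  // key: b
    }
  }
  public static class SMapper extends Mapper<LongWritable, Text, Text, Text> {
    protected void map(LongWritable off, Text line, Context ctx)
        throws IOException, InterruptedException {
      String[] t = line.toString().split("\t");          // t = (b, c)
      ctx.write(new Text(t[0]), new Text("S\t" + t[1]));  // key: b
    }
  }
  public static class JoinReducer extends Reducer<Text, Text, Text, Text> {
    protected void reduce(Text b, Iterable<Text> vals, Context ctx)
        throws IOException, InterruptedException {
      List<String> as = new ArrayList<String>();  // buffered R-side
      List<String> cs = new ArrayList<String>();  // buffered S-side
      for (Text v : vals) {                       // copy: Hadoop reuses objects
        String[] p = v.toString().split("\t");
        if (p[0].equals("R")) as.add(p[1]); else cs.add(p[1]);
      }
      for (String a : as)            // n x m output tuples per B-value
        for (String c : cs)
          ctx.write(b, new Text(a + "\t" + c));
    }
  }
}
\end{verbatim}
}
\end{frame}
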
-------------------------------------------------------------------------------- /hadoop/mapreduce.tex: --------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Overview}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Disclaimer}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{MapReduce APIs}
  \begin{itemize}
  \item Fast evolving
  \item Sometimes confusing
  \end{itemize}

\vspace{40pt}

\item \textbf{Do {\color{red}NOT} rely on this slide deck as a reference}
  \begin{itemize}
  \item Use the appropriate API docs
  \item Use Eclipse
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Anatomy of a MapReduce Job Run}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \framebox{\includegraphics[scale=0.36]{./Figures/mapreduce}}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Job Submission}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{JobClient} class}
  \begin{itemize}
  \item The \texttt{runJob()} method creates a new instance of a
    \textbf{JobClient}
  \item Then it calls \texttt{submitJob()} on this instance
  \end{itemize}

\vspace{20pt}

\item \textbf{Simple verifications on the Job}
  \begin{itemize}
  \item Is there an output directory?
  \item Are there any input splits?
  \item Can I copy the JAR of the job to HDFS?
  \end{itemize}

\vspace{20pt}

\item \textbf{NOTE: the JAR of the job is replicated 10 times}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Job Initialization}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The \texttt{JobTracker} is responsible for:}
  \begin{itemize}
  \item Creating an object for the job
  \item Encapsulating its tasks
  \item {\color{red}Bookkeeping} of the tasks' status and progress
  \end{itemize}

\vspace{20pt}

\item \textbf{This is where the scheduling happens}
  \begin{itemize}
  \item The \texttt{JobTracker} performs scheduling by maintaining a
    queue
  \item Queuing disciplines are pluggable
  \end{itemize}

\vspace{20pt}

\item \textbf{Computing mappers and reducers}
  \begin{itemize}
  \item The \texttt{JobTracker} retrieves the input splits (computed by
    the \texttt{JobClient})
  \item Determines the number of Mappers based on the number of
    input splits
  \item Reads the configuration file to set the number of Reducers
  \end{itemize}

\end{itemize}
}

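%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Job Submission in code (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The submission path just described, as a minimal driver using the old
\texttt{org.apache.hadoop.mapred} API; the built-in identity mapper
and reducer keep the example self-contained, and the input/output
paths come from the command line:
{\tiny
\begin{verbatim}
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class IdentityJob {
  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf(IdentityJob.class);
    conf.setJobName("identity");
    conf.setInputFormat(KeyValueTextInputFormat.class); // Text keys/values
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    // runJob() creates a JobClient, calls submitJob(), then polls
    // for progress until the job completes
    JobClient.runJob(conf);
  }
}
\end{verbatim}
}
\end{frame}
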
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Scheduling}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\begin{colorblock}{blue}{lightblue}{ }
  \begin{center}
    \textbf{Scheduling}
  \end{center}
\end{colorblock}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Task Assignment}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item\textbf{Heartbeat-based mechanism}
  \begin{itemize}
  \item \texttt{TaskTrackers} periodically send heartbeats to the
    \texttt{JobTracker}
  \item A heartbeat signals that the \texttt{TaskTracker} is alive
  \item The heartbeat also contains information on the availability of
    the \texttt{TaskTracker} to execute a task
  \item The \texttt{JobTracker} piggybacks a task on the heartbeat
    response if the \texttt{TaskTracker} is available
  \end{itemize}

\vspace{20pt}

\item \textbf{Selecting a task}
  \begin{itemize}
  \item The \texttt{JobTracker} first needs to select a job
    (\textit{i.e.} Job scheduling)
  \item \texttt{TaskTrackers} have a fixed number of slots for map
    and reduce tasks
  \item The \texttt{JobTracker} gives priority to map tasks ({\color{red}WHY?})
  \end{itemize}

\vspace{20pt}

\item \textbf{Data locality}
  \begin{itemize}
  \item The \texttt{JobTracker} is topology aware
    \begin{itemize}
    \item Useful for map tasks
    \item Unused for reduce tasks ({\color{red}WHY?})
    \end{itemize}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Task Execution}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Task Assignment is done, now \texttt{TaskTrackers} can
    execute}
  \begin{itemize}
  \item Copy the JAR from HDFS
  \item Create a local working directory
  \item Create an instance of \texttt{TaskRunner}
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{TaskRunner} launches a {\color{red}child} JVM}
  \begin{itemize}
  \item This prevents bugs from stalling the \texttt{TaskTracker}
  \item A new child JVM is created per \texttt{InputSplit}
    \begin{itemize}
    \item Can be overridden by specifying the JVM Reuse option, which is
      very useful for {\color{red}custom, in-memory, combiners}
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{Streaming and Pipes}
  \begin{itemize}
  \item User-defined map and reduce methods need not be in Java
  \item Streaming and Pipes allow C++ or Python mappers and reducers
  \item NOTE: this feature is heavily used in industry, with some tricky downsides
  \end{itemize}
\end{itemize}
}

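%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{JVM Reuse (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A driver-side sketch of the JVM Reuse option mentioned above (classic
Hadoop 1.x API; check the docs of your release):
{\tiny
\begin{verbatim}
import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf();
// -1 means: reuse the child JVM for an unlimited number of tasks
// of the same job; equivalent to setting the classic property
// "mapred.job.reuse.jvm.num.tasks" to -1
conf.setNumTasksToExecutePerJvm(-1);
\end{verbatim}
}
\end{frame}
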
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Scheduling in detail}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{FIFO Scheduler (default in vanilla Hadoop)}
  \begin{itemize}
  \item First-come-first-served
    \begin{itemize}
    \item Long jobs monopolize the cluster
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{Fair Scheduler (default in Cloudera)}
  \begin{itemize}
  \item Every user gets a fair share of the cluster capacity over time
  \item Jobs are placed into pools, one for each user
    \begin{itemize}
    \item Users that submit more jobs have no more resources than others
    \item Can guarantee a minimum capacity per pool
    \end{itemize}
  \end{itemize}

\vspace{20pt}

\item \textbf{Capacity Scheduler (heavily used at Yahoo)}
  \begin{itemize}
  \item Hierarchical queues (mimic an organization)
  \item FIFO scheduling in each queue
  \item Supports priorities
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Failures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\begin{colorblock}{blue}{lightblue}{ }
  \begin{center}
    \textbf{Failures}
  \end{center}
\end{colorblock}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Handling Failures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{beamerboxesrounded}[shadow=true]{}
  In the real world, code is buggy, processes crash and machines fail
\end{beamerboxesrounded}

\begin{itemize}
\item \textbf{Task Failure}
  \begin{itemize}
  \item Case 1: a map or reduce task throws a runtime exception
    \begin{itemize}
    \item The child JVM reports back to the parent
      \texttt{TaskTracker}
    \item The \texttt{TaskTracker} logs the error and marks the
      TaskAttempt as failed
    \item The \texttt{TaskTracker} frees up a slot to run another task
    \end{itemize}
  \item Case 2: Hanging tasks
    \begin{itemize}
    \item The \texttt{TaskTracker} notices no progress updates (timeout
      = 10 minutes)
    \item The \texttt{TaskTracker} kills the child JVM\footnote{With
        streaming, you need to take care of the orphaned process.}
    \end{itemize}
  \end{itemize}
\item The \texttt{JobTracker} is notified of a failed task
  \begin{itemize}
  \item It avoids rescheduling the task on the same
    \texttt{TaskTracker}
  \item If a task fails 4 times, it is not
    re-scheduled\footnote{An exception is made for speculative execution.}
  \item {\color{red}Default behavior}: if any task fails 4 times,
    the job fails
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Handling Failures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{TaskTracker} Failure}
  \begin{itemize}
  \item Types: crash, running very slowly
  \item Heartbeats will not be sent to the \texttt{JobTracker}
  \item The \texttt{JobTracker} waits for a timeout (10 minutes), then
    it removes the \texttt{TaskTracker} from its scheduling pool
  \item The \texttt{JobTracker} needs to reschedule even
    \textit{completed} tasks ({\color{red}WHY?})
  \item The \texttt{JobTracker} needs to reschedule tasks in progress
  \item The \texttt{JobTracker} may even blacklist a
    \texttt{TaskTracker} if too many tasks failed
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{JobTracker} Failure}
  \begin{itemize}
  \item Currently, Hadoop has no mechanism for this kind of failure
  \item In future (and commercial) releases:
    \begin{itemize}
    \item Multiple \texttt{JobTrackers}
    \item Use ZooKeeper as a coordination mechanism
    \item[$\to$] {\color{red}High Availability}
    \end{itemize}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Internals}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\begin{colorblock}{blue}{lightblue}{ }
  \begin{center}
    \textbf{Internals}
  \end{center}
\end{colorblock}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The MapReduce framework guarantees the input to every reducer
    to be sorted by key}
  \begin{itemize}
  \item The process by which the system sorts and transfers map
    outputs to reducers is known as the {\color{red}shuffle}
  \end{itemize}

\vspace{20pt}

\item \textbf{The shuffle is the most important part of the framework, where the
    ``magic'' happens}
  \begin{itemize}
  \item A good understanding allows optimizing both the framework and
    the execution time of MapReduce jobs
  \end{itemize}

\vspace{20pt}

\item \textbf{Subject to continuous refinements}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: Map Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \includegraphics[scale=0.4]{./Figures/map_task}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: the Map Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The output of a map task is not simply written to disk}
  \begin{itemize}
  \item In-memory buffering
  \item Pre-sorting
  \end{itemize}

\vspace{20pt}

\item \textbf{Circular memory buffer}
  \begin{itemize}
  \item 100 MB by default
  \item Threshold-based mechanism to {\color{red}spill} buffer content to disk
  \item Map output is written to the buffer {\color{red}while} spilling to disk
  \item If the buffer fills up while spilling, the map task is \textbf{blocked}
  \end{itemize}

\vspace{20pt}

\item \textbf{Disk spills}
  \begin{itemize}
  \item Written in round-robin fashion to a local directory
  \item Output data is partitioned according to the reducer it will be sent to
  \item Within each partition, data is sorted ({\color{red}in-memory})
  \item Optionally, if there is a combiner, it is executed just after the sort phase ({\color{red}WHY?})
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: the Map Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{More on spills and the memory buffer}
  \begin{itemize}
  \item Each time the buffer is full, a {\color{red}new} spill is created
  \item Once the map task finishes, there are many spills
  \item Such spills are merged into a single partitioned and sorted output file
  \end{itemize}

\vspace{40pt}

\item \textbf{The output file partitions are made available to reducers over HTTP}
  \begin{itemize}
  \item There are 40 (default) threads dedicated to serving the file partitions to reducers
  \end{itemize}
\end{itemize}
}

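%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Tuning the map-side buffers (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The behavior above is governed by a few classic Hadoop 1.x properties;
a driver-side sketch, with the defaults noted in comments (property
names may differ in later releases):
{\tiny
\begin{verbatim}
import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf();
conf.setInt("io.sort.mb", 200);                // buffer size; default 100 MB
conf.setFloat("io.sort.spill.percent", 0.80f); // spill threshold; default 0.80
conf.setInt("io.sort.factor", 10);             // streams merged at once; default 10
\end{verbatim}
}
\end{frame}
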

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Details on local spill files}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \includegraphics[scale=0.4]{./Figures/spill_partition}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: Reduce Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \includegraphics[scale=0.4]{./Figures/reduce_task}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: the Reduce Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The map output file is located on the local disk of the
    TaskTracker}

\item \textbf{Another TaskTracker (in charge of a reduce task)
    requires input from many other TaskTrackers (that have finished their
    map tasks)}
  \begin{itemize}
  \item How do reducers know which \texttt{TaskTrackers} to fetch map output
    from?
    \begin{itemize}
    \item When a map task finishes, it notifies the parent
      TaskTracker
    \item The TaskTracker notifies (with the heartbeat mechanism)
      the JobTracker
    \item A thread in the reducer {\color{red}periodically polls}
      the \texttt{JobTracker}
    \item \texttt{TaskTrackers} do not delete local map outputs as soon as a
      reduce task has fetched them ({\color{red}WHY?})
    \end{itemize}
  \end{itemize}

\item \textbf{Copy phase: a pull approach}
  \begin{itemize}
  \item There is a small number (5) of copy threads that can fetch
    map outputs in parallel
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Shuffle and Sort: the Reduce Side}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The map outputs are copied to the {\color{red}memory} of the
    TaskTracker running the reducer (if they fit)}
  \begin{itemize}
  \item Otherwise, they are copied to disk
  \end{itemize}

\vspace{20pt}

\item \textbf{Input consolidation}
  \begin{itemize}
  \item A background thread merges all partial inputs into larger,
    {\color{red}sorted} files
  \item Note that if compression was used (for map outputs, to save
    bandwidth), decompression will take place in memory
  \end{itemize}

\vspace{20pt}

\item \textbf{Sorting the input}
  \begin{itemize}
  \item When all map outputs have been copied, a merge phase starts
  \item All map outputs are merged, maintaining their sort ordering,
    in rounds
  \end{itemize}
\end{itemize}
}

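%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Tuning the copy phase (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
The reduce-side counterparts of the map-side knobs, again with the
classic Hadoop 1.x property names (a sketch; verify against your
release's docs):
{\tiny
\begin{verbatim}
import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf();
conf.setInt("mapred.reduce.parallel.copies", 10); // copy threads; default 5
conf.setInt("io.sort.factor", 10);                // width of each merge round
\end{verbatim}
}
\end{frame}
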
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Types and Formats}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}
\begin{colorblock}{blue}{lightblue}{ }
  \begin{center}
    \textbf{Types and Formats}
  \end{center}
\end{colorblock}
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{MapReduce Types}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Recall: Input / output to mappers and reducers}
  \begin{itemize}
  \item map: $(k1, v1)$ $\to$ $[(k2, v2)]$
  \item reduce: $(k2, [v2])$ $\to$ $[(k3, v3)]$
  \end{itemize}

\vspace{20pt}

{\color{red}
\item \textbf{In Hadoop, a mapper is created as follows:}
  \begin{itemize}
  \item \texttt{void map(K1 key, V1 value, Context context)}
  \end{itemize}
}

\vspace{20pt}

\item \textbf{Types:}
  \begin{itemize}
  \item $K$ types implement \texttt{WritableComparable}
  \item $V$ types implement \texttt{Writable}
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{What is a \texttt{Writable}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Hadoop defines its own classes for strings (\texttt{Text}),
    integers (\texttt{IntWritable}), etc.}

\vspace{20pt}

\item \textbf{All keys are instances of \texttt{WritableComparable}}
  \begin{itemize}
  \item {\color{red}Why comparable?}
  \end{itemize}

\vspace{20pt}

\item \textbf{All values are instances of \texttt{Writable}}

\end{itemize}
}

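%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A custom \texttt{Writable} (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
As an illustration (not part of the original deck), a custom
\texttt{Writable} for the $(sum, cnt)$ pairs of the earlier
mean-with-combiners example:
{\tiny
\begin{verbatim}
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

public class SumCountWritable implements Writable {
  private long sum;
  private long count;

  public SumCountWritable() { }                 // required no-arg constructor
  public SumCountWritable(long s, long c) { sum = s; count = c; }

  @Override
  public void write(DataOutput out) throws IOException {
    out.writeLong(sum);                         // serialize fields in order
    out.writeLong(count);
  }
  @Override
  public void readFields(DataInput in) throws IOException {
    sum = in.readLong();                        // deserialize in the same order
    count = in.readLong();
  }
  public double mean() { return (double) sum / count; }
}
\end{verbatim}
}
\end{frame}
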
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Getting Data to the Mapper}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \includegraphics[scale=0.4]{./Figures/data2map}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Reading Data}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Datasets are specified by \texttt{InputFormats}}
  \begin{itemize}
  \item \texttt{InputFormats} define the input data (e.g. a file, a
    directory)
  \item An \texttt{InputFormat} is a factory for \texttt{RecordReader}
    objects that extract key-value records from the input source
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{InputFormats} identify the partitions of the data
    that form an \texttt{InputSplit}}
  \begin{itemize}
  \item An \texttt{InputSplit} is a (\textbf{reference to a}) chunk of
    the input processed by a {\color{red}single} map
    \begin{itemize}
    \item The largest split is processed first
    \end{itemize}
  \item Each split is divided into records, and the map processes each
    record (a key-value pair) in turn
  \item Splits and records are {\color{red}logical}, they are not
    physically bound to a file
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{\texttt{InputFormat}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{TextInputFormat}}
  \begin{itemize}
  \item Treats each \texttt{newline}-terminated line of a file as a value
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{KeyValueTextInputFormat}}
  \begin{itemize}
  \item Maps \texttt{newline}-terminated text lines of ``key'' SEPARATOR ``value''
  \end{itemize}

\vspace{20pt}

\item\textbf{\texttt{SequenceFileInputFormat}}
  \begin{itemize}
  \item Binary file of key-value pairs with some additional metadata
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{SequenceFileAsTextInputFormat}}
  \begin{itemize}
  \item Same as before, but maps \texttt{(k.toString(), v.toString())}
  \end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{\texttt{InputSplit}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{FileInputFormat} divides large files into
    chunks}
  \begin{itemize}
  \item Exact size controlled by \texttt{mapred.min.split.size}
  \end{itemize}

\vspace{20pt}

\item \textbf{Record readers receive the file, offset, and length of the chunk}
  \begin{itemize}
  \item Example
  \end{itemize}
  \begin{footnotesize}
    \begin{columns}[c]
      \column{6cm}
      On the top of the Crumpetty Tree$\to$\\
      The Quangle Wangle sat,$\to$\\
      But his face you could not see,$\to$\\
      On account of his Beaver Hat.$\to$\\

      \column{6cm}

      (0, On the top of the Crumpetty Tree)\\
      (33, The Quangle Wangle sat,)\\
      (57, But his face you could not see,)\\
      (89, On account of his Beaver Hat.)\\
    \end{columns}
  \end{footnotesize}

\vspace{20pt}

\item \textbf{Custom \texttt{InputFormat} implementations may
    override the split size}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{The relationship between an \texttt{InputSplit} and an HDFS block}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
  \includegraphics[scale=0.4]{./Figures/split_block}
\end{center}
}

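%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{Choosing an \texttt{InputFormat} (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
A driver-side sketch with the new API (\texttt{TextInputFormat} is
also the default); the input path is a placeholder, and the split-size
property is the one named on the previous slide:
{\tiny
\begin{verbatim}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Inside main(String[] args) throws Exception:
Job job = new Job(new Configuration());
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path("/data/links")); // placeholder
// Ask for splits of at least one 64 MB block
job.getConfiguration().setLong("mapred.min.split.size",
                               64L * 1024 * 1024);
\end{verbatim}
}
\end{frame}
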
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Record Readers}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Each \texttt{InputFormat} provides its own \texttt{RecordReader} implementation}

\vspace{20pt}

\item \textbf{\texttt{LineRecordReader}}
  \begin{itemize}
  \item Reads a line from a text file
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{KeyValueRecordReader}}
  \begin{itemize}
  \item Used by \texttt{KeyValueTextInputFormat}
  \end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Sending Data to Reducers}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{\color{red}
\begin{itemize}
\item \textbf{The map function receives a \texttt{Context} object}
  \begin{itemize}
  \item \texttt{Context.write()} receives key-value elements
  \end{itemize}

\vspace{20pt}

\item \textbf{Any (\texttt{WritableComparable}, \texttt{Writable}) pair can be
    used}

\vspace{20pt}

\item \textbf{By default, the mapper output type is assumed to be the same
    as the reducer output type}

\end{itemize}
}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{WritableComparator}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Compares \texttt{WritableComparable} data}
  \begin{itemize}
  \item Will call the \texttt{WritableComparable.compare()} method
  \item Can provide a fast path for serialized data
  \end{itemize}

\vspace{40pt}

\item \textbf{Configured through:
    \texttt{JobConf.setOutputValueGroupingComparator()}}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Partitioner}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{\texttt{int getPartition(key, value, numPartitions)}}
  \begin{itemize}
  \item Outputs the partition number for a given key
  \item One partition == all values sent to a single reduce task
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{HashPartitioner} used by default}
  \begin{itemize}
  \item Uses \texttt{key.hashCode()} to return the partition number
  \end{itemize}

\vspace{20pt}

\item \textbf{\texttt{JobConf} used to set the \texttt{Partitioner} implementation}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{The Reducer}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
{\color{red}\item \textbf{\texttt{void reduce(K2 key, Iterable values, Context context)}}}

\vspace{20pt}

\item \textbf{Keys and values sent to one partition all go to the
    same reduce task}

\vspace{20pt}

\item \textbf{Calls are sorted by key}
  \begin{itemize}
  \item ``Early'' keys are reduced and output before ``late'' keys
  \end{itemize}
\end{itemize}
}

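%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]
\frametitle{A custom \texttt{Partitioner} (sketch)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
An illustrative \texttt{Partitioner} (new API) that sends all URLs of
the same host to the same reduce task; \texttt{HashPartitioner}
remains the default:
{\tiny
\begin{verbatim}
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class HostPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text url, IntWritable value, int numPartitions) {
    String host = url.toString().replaceFirst("https?://", "");
    int slash = host.indexOf('/');
    if (slash >= 0) host = host.substring(0, slash);
    // Mask the sign bit so the modulus is non-negative
    return (host.hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}
// Driver side: job.setPartitionerClass(HostPartitioner.class);
\end{verbatim}
}
\end{frame}
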
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Writing the Output}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Analogous to \texttt{InputFormat}}

\vspace{20pt}

\item \textbf{\texttt{TextOutputFormat} writes ``key value <\texttt{newline}>'' strings to the output file}

\vspace{20pt}

\item \textbf{\texttt{SequenceFileOutputFormat} uses a binary format
    to pack key-value pairs}

\vspace{20pt}

\item \textbf{\texttt{NullOutputFormat} discards output}

\end{itemize}

}



-------------------------------------------------------------------------------- /disa/design_patterns.tex: --------------------------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Algorithm Design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Developing algorithms involves:}
  \begin{itemize}
  \item Preparing the input data
  \item Implementing the mapper and the reducer
  \item Optionally, designing the combiner and the partitioner
  \end{itemize}

\vspace{20pt}

\item \textbf{How to recast existing algorithms in MapReduce?}
  \begin{itemize}
  \item It is not always obvious how to express algorithms
  \item Data structures play an important role
  \item Optimization is hard
  \item[$\to$] The designer needs to ``bend'' the framework
  \end{itemize}

\vspace{20pt}

\item \textbf{Learn by examples}
  \begin{itemize}
  \item ``Design patterns''
  \item The ``shuffle'' is perhaps the trickiest aspect
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Algorithm Design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Aspects that are {\color{red}not} under the control of the
    designer}
  \begin{itemize}
  \item \textit{Where} a mapper or reducer will run
  \item \textit{When} a mapper or reducer begins or finishes
  \item \textit{Which} input key-value pairs are processed by a
    specific mapper
  \item \textit{Which} intermediate key-value pairs are processed by a
    specific reducer
  \end{itemize}

\vspace{20pt}

\item \textbf{Aspects that can be controlled}
  \begin{itemize}
  \item Construct {\color{red}data structures as keys and values}
  \item Execute user-specified initialization and termination code
    for mappers and reducers
  \item Preserve state across multiple input and intermediate keys
    in mappers and reducers
  \item {\color{red}Control the sort order} of intermediate keys, and therefore
    the order in which a reducer will encounter particular keys
  \item {\color{red}Control the partitioning of the key space}, and therefore the
    set of keys that will be encountered by a particular reducer
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Algorithm Design}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{MapReduce algorithms can be complex}
  \begin{itemize}
  \item Many algorithms cannot be easily expressed as a single
    MapReduce job
  \item Decompose complex algorithms into a sequence of jobs
    \begin{itemize}
    \item Requires orchestrating data so that the output of one job
      becomes the input to the next
    \end{itemize}
  \item Iterative
algorithms require an {\color{red}external driver}
    to check for convergence
  \end{itemize}

\vspace{20pt}

\item \textbf{Basic design patterns\footnote{You will see them in action during the laboratory sessions.}}
  \begin{itemize}
  \item Local Aggregation
  \item Pairs and Stripes
  \item Order inversion
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Local Aggregation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Local Aggregation}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{In the context of data-intensive distributed processing, the
    most important aspect of synchronization is the {\color{red}exchange of
      intermediate results}}
  \begin{itemize}
  \item This involves copying intermediate results from the
    processes that produced them to those that consume them
  \item In general, this involves \textbf{data transfers over the network}
  \item In Hadoop, disk I/O is also involved, as intermediate
    results are written to disk
  \end{itemize}

\vspace{20pt}

\item \textbf{Network and disk latencies are expensive}
  \begin{itemize}
  \item Reducing the amount of intermediate data translates into
    algorithmic efficiency
  \end{itemize}

\vspace{20pt}

\item \textbf{Combiners and preserving state across inputs}
  \begin{itemize}
  \item Reduce the number and size of key-value pairs to be shuffled
  \end{itemize}

\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Mapper Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{In-Mapper Combiners, a possible improvement over vanilla Combiners}
  \begin{itemize}
  \item Hadoop does not\footnote{Actually, combiners are not called if the number of map output records is less than a small threshold, {\it i.e.}, 4} guarantee that combiners will be executed
  \end{itemize}

\vspace{20pt}

\item \textbf{Use an associative array to accumulate intermediate
    results}
  \begin{itemize}
  \item The array is used to tally up term counts within a single ``document''
  \item The \texttt{Emit} method is called only after all \texttt{InputRecords} have been processed
  \end{itemize}

\vspace{20pt}

\item \textbf{Example (see next slide)}
  \begin{itemize}
  \item The code emits a key-value pair for each {\color{red}unique}
    term in the document
  \end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\State $H \gets$ new AssociativeArray
\ForAll{term $t \in$ line $l$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Mapper Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{In-Mapper Combiners, a possible improvement over vanilla Combiners}
\begin{itemize}
\item Hadoop does not\footnote{Actually, combiners are not invoked if the number of map output records is below a small threshold ({\it i.e.}, 4).} guarantee that combiners will be executed
\end{itemize}

\vspace{20pt}

\item \textbf{Use an associative array to accumulate intermediate results}
\begin{itemize}
\item The array is used to tally up term counts within a single ``document''
\item The \texttt{Emit} method is called only after all the terms of the current input record have been processed
\end{itemize}

\vspace{20pt}

\item \textbf{Example (see next slide)}
\begin{itemize}
\item The code emits a key-value pair for each {\color{red}unique} term in the document
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\State $H \gets$ new AssociativeArray
\ForAll{term $t \in$ line $l$}
\State $H\{t\} \gets H\{t\} + 1$
\EndFor
\ForAll{term $t \in$ $H$}
\State $\textsc{Emit}(\textrm{term }t, \textrm{count }H\{t\})$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Taking the idea one step further}
\begin{itemize}
\item Exploit implementation details in Hadoop
\item A Java mapper object is created for each map task
\item JVM reuse must be enabled to preserve state across tasks
\end{itemize}

\vspace{40pt}

\item \textbf{Preserve state within and across calls to the \texttt{Map} method}
\begin{itemize}
\item \texttt{Initialize} method, used to create a persistent data structure shared across calls to \texttt{Map}
\item \texttt{Close} method, used to emit intermediate key-value pairs only when all the \texttt{Map} calls of the task are done
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Initialize}{}
\State $H \gets$ new AssociativeArray
\EndProcedure
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $t \in$ line $l$}
\State $H\{t\} \gets H\{t\} + 1$
\EndFor
\EndProcedure
\Procedure{Close}{}
\ForAll{term $t \in$ $H$}
\State $\textsc{Emit}(\textrm{term }t, \textrm{count }H\{t\})$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}

}
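%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A ``block and flush'' variant, sketched}
\begin{itemize}
\item The associative array cannot grow without bound; a possible refinement (illustrative, with an arbitrary size threshold) flushes it whenever it gets too large, a caveat motivated in the precautions ahead
\end{itemize}
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Initialize}{}
\State $H \gets$ new AssociativeArray
\EndProcedure
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $t \in$ line $l$}
\State $H\{t\} \gets H\{t\} + 1$
\EndFor
\If{$\textsc{Size}(H) \geq threshold$}
\State $\textsc{Flush}()$
\EndIf
\EndProcedure
\Procedure{Flush}{}
\ForAll{term $t \in$ $H$}
\State $\textsc{Emit}(\textrm{term }t, \textrm{count }H\{t\})$
\EndFor
\State $H \gets$ new AssociativeArray
\EndProcedure
\Procedure{Close}{}
\State $\textsc{Flush}()$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}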
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Summing up: a first ``design pattern'', \textit{in-memory combining}}
\begin{itemize}
\item Provides control over when local aggregation occurs
\item Designer can determine how exactly aggregation is done
\end{itemize}

\vspace{40pt}

\item \textbf{Efficiency vs. Combiners}
\begin{itemize}
\item There is no additional overhead due to the materialization of key-value pairs
\begin{itemize}
\item Unnecessary object creation and destruction (garbage collection)
\item Serialization and deserialization costs when memory is limited
\end{itemize}
\item With combiners, mappers still need to emit all key-value pairs; combiners ``only'' reduce network traffic
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{In-Memory Combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Precautions}
\begin{itemize}
\item In-memory combining breaks the functional programming paradigm due to {\bf state preservation}
\item Preserving state across multiple inputs implies that algorithm behavior might depend on execution order
\begin{itemize}
\item Works well with commutative / associative operations
\item Otherwise, order-dependent bugs are difficult to find
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{Memory capacity is limited}
\begin{itemize}
\item In-memory combining strictly depends on having sufficient memory to store intermediate results
\item A possible {\color{red}solution}: ``block'' and ``flush'', as in the sketch a few slides back
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Further Remarks}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The extent to which efficiency can be increased with local aggregation depends on the size of the intermediate key space}
\begin{itemize}
\item Opportunities for aggregation arise when multiple values are associated with the same keys
\end{itemize}

\vspace{40pt}

\item \textbf{Local aggregation is also effective in dealing with reduce stragglers}
\begin{itemize}
\item It reduces the number of values associated with frequently occurring keys
\end{itemize}
\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing the average, with in-mapper combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item Partial sums and counts are held in memory (across inputs)
\item Intermediate values are emitted only after the entire input split is processed
\item The output value is a pair; the matching reducer is sketched next
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Initialize}{}
\State $S \gets$ new AssociativeArray
\State $C \gets$ new AssociativeArray
\EndProcedure
\Procedure{Map}{term $t$, integer $r$}
\State $S\{t\} \gets S\{t\} + r$
\State $C\{t\} \gets C\{t\} + 1$
\EndProcedure
\Procedure{Close}{}
\ForAll{term $t \in$ $S$}
\State $\textsc{Emit}(\textrm{term }t, \textrm{pair }(S\{t\},C\{t\}))$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}

}
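%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing the average, with in-mapper combiners}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item For completeness, a sketch of the matching reducer: it sums the partial sums and counts, then divides
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Reduce}{term $t$, pairs $[(s_1,c_1),(s_2,c_2), \cdots ]$}
\State $s \gets 0$
\State $c \gets 0$
\ForAll{pair $(s_i,c_i) \in \textrm{pairs }[(s_1,c_1),(s_2,c_2), \cdots ]$}
\State $s \gets s + s_i$
\State $c \gets c + c_i$
\EndFor
\State $\textsc{Emit}(\textrm{term }t, \textrm{average }s/c)$
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}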
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Pairs and Stripes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Pairs and Stripes}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A common approach in MapReduce: build {\color{red}complex} keys}
\begin{itemize}
\item Use the framework to group data together
\end{itemize}

\vspace{20pt}

\item \textbf{Two basic techniques:}
\begin{itemize}
\item \textit{Pairs}: similar to the example on the average
\item \textit{Stripes}: uses in-memory data structures in the mapper
\end{itemize}

\vspace{20pt}

\item \textbf{Next, we focus on a particular problem that benefits from these two methods}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Problem statement}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The problem: building word co-occurrence matrices for large corpora} (a toy example follows shortly)
\begin{itemize}
\item The co-occurrence matrix of a corpus is a square $n \times n$ matrix, $M$
\item $n$ is the number of unique words (\textit{i.e.}, the vocabulary size)
\item A cell $m_{ij}$ contains the number of times the word $w_i$ co-occurs with word $w_j$ \textit{within a specific context}
\item Context: a sentence, a paragraph, a document, or a window of $m$ words
\item NOTE: the matrix may be symmetric in some cases
\end{itemize}

\vspace{20pt}

\item \textbf{Motivation}
\begin{itemize}
\item This problem is a basic building block for more complex operations
\item {\color{red}Estimating the distribution of discrete joint events from a large number of observations}
\item Similar problem in other domains:
\begin{itemize}
\item Customers who buy \textit{this} tend to also buy \textit{that}
\end{itemize}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Observations}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Space requirements}
\begin{itemize}
\item Clearly, the space requirement is $O(n^2)$, where $n$ is the size of the vocabulary
\item For real-world (English) corpora $n$ can be hundreds of thousands of words, or even billions of words in some specific cases
\end{itemize}

\vspace{20pt}

\item \textbf{So what's the problem?}
\begin{itemize}
\item If the matrix fits in the memory of a single machine, then any naive implementation will do
\item Instead, if the matrix is bigger than the available memory, then {\color{red}paging} kicks in, and any naive implementation breaks
\end{itemize}

\vspace{20pt}

\item \textbf{Compression}
\begin{itemize}
\item Such techniques can help in solving the problem on a single machine
\item However, there are scalability problems
\end{itemize}
\end{itemize}
}
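%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{A toy example}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A worked micro-example} (illustrative; assume the context is a window of one word on each side)
\begin{itemize}
\item Corpus: the single line ``a b c b''
\item Neighbors: a $\to$ (b); b $\to$ (a, c); c $\to$ (b, b); b $\to$ (c)
\item Co-occurrence counts: $m_{ab} = m_{ba} = 1$, $m_{bc} = m_{cb} = 2$
\end{itemize}

\vspace{20pt}

\item \textbf{The same information, two encodings}
\begin{itemize}
\item As \textit{pairs}: $((a,b),1)$, $((b,a),1)$, $((b,c),2)$, $((c,b),2)$
\item As \textit{stripes}: $a \to \{b:1\}$, $b \to \{a:1, c:2\}$, $c \to \{b:2\}$
\end{itemize}
\end{itemize}
}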
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Pairs approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.36]{./Figures/pairs}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Pairs approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Input to the problem}
\begin{itemize}
\item Key-value pairs in the form of an \texttt{offset} and a \texttt{line}
\end{itemize}

\vspace{20pt}

\item \textbf{The mapper:} (a refinement with in-mapper combining is sketched after the base algorithm)
\begin{itemize}
\item Processes each input document
\item Emits key-value pairs with:
\begin{itemize}
\item Each co-occurring word {\color{red}pair} as the key
\item The integer one (the count) as the value
\end{itemize}
\item This is done with two nested loops:
\begin{itemize}
\item The outer loop iterates over all words
\item The inner loop iterates over all neighbors
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{The reducer:}
\begin{itemize}
\item Receives {\color{red}pairs} related to co-occurring words
\begin{itemize}
\item Identical pairs reach the same reducer with the default partitioner; {\color{red}\textbf{modifying the partitioner}} becomes necessary only later, when pairs sharing the same left word must meet at one reducer
\end{itemize}
\item Computes an absolute count of the joint event
\item Emits the pair and the count as the final key-value output
\begin{itemize}
\item Basically, reducers emit the cells of the output matrix
\end{itemize}
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Pairs approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $w \in$ line $l$}
\ForAll{term $u \in \textsc{Neighbors}(w)$}
\State $\textsc{Emit } (\textrm{pair }(w,u), \textrm{count }1)$
\EndFor
\EndFor
\EndProcedure
\EndFunction

\Function{Reducer}{}
\Procedure{Reduce}{pair $p$, counts $[c_1,c_2, \cdots ]$}
\State $s \gets 0$
\ForAll{count $c \in \textrm{counts }[c_1,c_2, \cdots ]$}
\State $s \gets s + c$
\EndFor
\State $\textsc{Emit } (\textrm{pair }p, \textrm{count }s)$
\EndProcedure
\EndFunction

\end{algorithmic}
\end{algorithm}
}
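%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Pairs approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A possible refinement: pairs with in-mapper combining} (a sketch; as the upcoming comparison notes, the gain is limited because the same pair rarely repeats within a single input split)
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Initialize}{}
\State $H \gets$ new AssociativeArray
\EndProcedure
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $w \in$ line $l$}
\ForAll{term $u \in \textsc{Neighbors}(w)$}
\State $H\{(w,u)\} \gets H\{(w,u)\} + 1$
\EndFor
\EndFor
\EndProcedure
\Procedure{Close}{}
\ForAll{pair $p \in$ $H$}
\State $\textsc{Emit}(\textrm{pair }p, \textrm{count }H\{p\})$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}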
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Stripes approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{center}
\includegraphics[scale=0.36]{./Figures/stripes}
\end{center}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Stripes approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Input to the problem}
\begin{itemize}
\item Key-value pairs in the form of an \texttt{offset} and a \texttt{line}
\end{itemize}

\vspace{20pt}

\item \textbf{The mapper:}
\begin{itemize}
\item The same two nested loops as before
\item Co-occurrence information is first stored in an associative array
\item Emits key-value pairs with {\color{red}words} as keys and the corresponding arrays as values
\end{itemize}

\vspace{20pt}

\item \textbf{The reducer:}
\begin{itemize}
\item Receives all associative arrays related to the same word
\item Performs an element-wise sum of all associative arrays with the same key (the \textsc{Sum} helper is spelled out right after the algorithm)
\item Emits key-value output in the form (word, associative array)
\begin{itemize}
\item Basically, reducers emit \textbf{rows} of the co-occurrence matrix
\end{itemize}
\end{itemize}
\end{itemize}

}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Stripes approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $w \in$ line $l$}
\State $H \gets$ new AssociativeArray
\ForAll{term $u \in \textsc{Neighbors}(w)$}
\State $H\{u\} \gets H\{u\}+1$
\EndFor
\State $\textsc{Emit } (\textrm{term }w, \textrm{Stripe }H)$
\EndFor
\EndProcedure
\EndFunction

\Function{Reducer}{}
\Procedure{Reduce}{term $w$, Stripes $[H_1,H_2,H_3 \cdots ]$}
\State $H_f \gets$ new AssociativeArray
\ForAll{Stripe $H \in \textrm{Stripes }[H_1,H_2,H_3 \cdots ]$}
\State $\textsc{Sum}(H_f,H)$
\EndFor
\State $\textsc{Emit } (\textrm{term }w, \textrm{Stripe }H_f)$
\EndProcedure
\EndFunction

\end{algorithmic}
\end{algorithm}

}
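%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Word co-occurrence: the Stripes approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The \textsc{Sum} helper, spelled out} (a minimal sketch: it merges stripe $H$ into the accumulator $H_f$, entry by entry)
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Reducer}{}
\Procedure{Sum}{Stripe $H_f$, Stripe $H$}
\ForAll{term $u \in$ $H$}
\State $H_f\{u\} \gets H_f\{u\} + H\{u\}$
\EndFor
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}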
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Pairs and Stripes, a comparison}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The pairs approach}
\begin{itemize}
\item Generates a large number of key-value pairs
\begin{itemize}
\item In particular, intermediate ones, that fly over the network
\end{itemize}
\item The benefit from combiners is limited, as it is less likely for a mapper to process multiple occurrences of the same word pair
\item Does not suffer from memory paging problems
\end{itemize}

\vspace{20pt}

\item \textbf{The stripes approach}
\begin{itemize}
\item More compact
\item Generates fewer and shorter intermediate keys
\begin{itemize}
\item The framework has less sorting to do
\end{itemize}
\item The values are more complex and have serialization / deserialization overhead
\item Greatly benefits from combiners, as the key space is the vocabulary
\item Suffers from memory paging problems, if not properly engineered
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Order Inversion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{``Relative'' co-occurrence matrix construction}
\begin{itemize}
\item Similar problem as before, same matrix
\item Instead of absolute counts, we take into consideration the fact that some words appear more frequently than others
\begin{itemize}
\item Word $w_i$ may co-occur frequently with word $w_j$ simply because one of the two is very common
\end{itemize}
\item We need to convert absolute counts to relative frequencies $f(w_j | w_i)$
\begin{itemize}
\item What proportion of the time does $w_j$ appear in the context of $w_i$?
\end{itemize}
\end{itemize}

\vspace{20pt}

\item \textbf{Formally, we compute:}
$$ f(w_j | w_i) = \frac{N(w_i,w_j)}{\sum_{w'} N(w_i, w')}$$
\begin{itemize}
\item $N(\cdot,\cdot)$ is the number of times a co-occurring word pair is observed
\item The denominator is called the marginal
\end{itemize}

\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The stripes approach} (a worked example follows)
\begin{itemize}
\item In the reducer, the counts of all words that co-occur with the conditioning variable ($w_i$) are available in the associative array
\item Hence, the sum of all those counts gives the marginal
\item Then we divide the joint counts by the marginal and we're done
\end{itemize}

\vspace{40pt}

\item \textbf{The pairs approach}
\begin{itemize}
\item The reducer receives the pair $(w_i,w_j)$ and the count
\item From this information alone \textbf{it is not possible} to compute $f(w_j | w_i)$
\item Fortunately, like the mapper, the reducer can also {\color{red}preserve state} across multiple keys
\begin{itemize}
\item We can buffer in memory all the words that co-occur with $w_i$ and their counts
\item This is basically building the associative array of the stripes method
\end{itemize}
\end{itemize}
\end{itemize}
}
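%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{A worked example} (with made-up counts)
\begin{itemize}
\item Suppose the only pairs with left word ``dog'' are $N(\textrm{dog}, \textrm{aardvark}) = 2$ and $N(\textrm{dog}, \textrm{zebra}) = 3$
\item The marginal is $\sum_{w'} N(\textrm{dog}, w') = 2 + 3 = 5$
\item Hence $f(\textrm{aardvark} | \textrm{dog}) = 2/5 = 0.4$ and $f(\textrm{zebra} | \textrm{dog}) = 3/5 = 0.6$
\item As expected for a conditional distribution, $f(\cdot | \textrm{dog})$ sums to one
\end{itemize}
\end{itemize}
}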
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies: a basic approach}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{We must define the sort order of the pair}
\begin{itemize}
\item In this way, the keys are first sorted by the left word, and then by the right word (in the pair)
\item Hence, we can detect when all pairs associated with the word we are conditioning on ($w_i$) have been seen
\item At this point, we can use the in-memory buffer, compute the relative frequencies and emit
\end{itemize}

\vspace{20pt}

\item \textbf{We must define an appropriate partitioner}
\begin{itemize}
\item The default partitioner is based on the hash value of the intermediate key, modulo the number of reducers
\item For a complex key, the {\bf raw byte representation} is used to compute the hash value
\begin{itemize}
\item Hence, there is no guarantee that the pair (dog, aardvark) and (dog, zebra) are sent to the same reducer
\end{itemize}
\item What we want is that all pairs with the same left word are sent to the same reducer
\end{itemize}

\vspace{20pt}

\item \textbf{Limitations of this approach}
\begin{itemize}
\item Essentially, we reproduce the stripes method in the reducer and we need to use a custom partitioner
\item This algorithm would work, but it presents the same memory-bottleneck problem as the stripes method
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies: order inversion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{The key is to properly sequence the data presented to reducers}
\begin{itemize}
\item If it were possible to compute the marginal in the reducer before processing the joint counts, the reducer could simply divide the joint counts received from mappers by the marginal
\item The notion of ``before'' and ``after'' can be captured in the {\color{red}ordering of key-value pairs}
\item The programmer can define the sort order of keys so that data needed earlier is presented to the reducer before data that is needed later
\end{itemize}
\end{itemize}
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies: order inversion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Recall that mappers emit pairs of co-occurring words as keys}

\vspace{20pt}

\item \textbf{The mapper:}
\begin{itemize}
\item Additionally emits a ``special'' key of the form $(w_i,*)$
\item The value associated with the special key is one, representing the contribution of the word pair to the marginal
\item Using combiners, these partial marginal counts will be aggregated before being sent to the reducers
\end{itemize}

\vspace{20pt}

\item \textbf{The reducer:} (a full sketch closes this section)
\begin{itemize}
\item We must make sure that the special key-value pairs are processed {\color{red}before} any other key-value pairs whose left word is $w_i$
\item We also need to modify the partitioner as before, \textit{i.e.}, so that it takes into account only the left word
\end{itemize}
\end{itemize}
}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies: order inversion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Memory requirements:}
\begin{itemize}
\item Minimal, because only the marginal (an integer) needs to be stored
\item No buffering of the individual co-occurring words
\item No scalability bottleneck
\end{itemize}

\vspace{20pt}

\item \textbf{Key ingredients for order inversion}
\begin{itemize}
\item Emit a special key-value pair to capture the marginal
\item Control the sort order of the intermediate key, so that the special key-value pair is processed first
\item Define a custom partitioner for routing intermediate key-value pairs
\item Preserve state across multiple keys in the reducer
\end{itemize}
\end{itemize}
}
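%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\frame {\frametitle{Computing relative frequencies: order inversion}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{itemize}
\item \textbf{Putting it all together: a possible sketch}\footnote{Illustrative pseudo-code, not prescriptive: it assumes the custom sort order places $(w,*)$ before any $(w,u)$, and the custom partitioner routes keys on the left word only.}
\end{itemize}

\begin{algorithm}[H]
\algrenewcommand\algorithmicfunction{\textbf{class}}
\algrenewcommand\algorithmicprocedure{\textbf{method}}

\begin{algorithmic}[1]
\Function{Mapper}{}
\Procedure{Map}{offset $a$, line $l$}
\ForAll{term $w \in$ line $l$}
\ForAll{term $u \in \textsc{Neighbors}(w)$}
\State $\textsc{Emit } (\textrm{pair }(w,u), \textrm{count }1)$
\State $\textsc{Emit } (\textrm{pair }(w,*), \textrm{count }1)$
\EndFor
\EndFor
\EndProcedure
\EndFunction

\Function{Reducer}{}
\Procedure{Initialize}{}
\State $m \gets 0$
\EndProcedure
\Procedure{Reduce}{pair $(w,u)$, counts $[c_1,c_2, \cdots ]$}
\State $s \gets 0$
\ForAll{count $c \in \textrm{counts }[c_1,c_2, \cdots ]$}
\State $s \gets s + c$
\EndFor
\If{$u = *$}
\State $m \gets s$
\Else
\State $\textsc{Emit } (\textrm{pair }(w,u), \textrm{frequency }s/m)$
\EndIf
\EndProcedure
\EndFunction
\end{algorithmic}
\end{algorithm}
}
--------------------------------------------------------------------------------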