├── .gitignore ├── .gitmodules ├── CS262%20Final%20Project ├── common.sty ├── images │ ├── downpour.png │ ├── locally.png │ ├── remotely.png │ ├── sgd_results.png │ └── speeds.png ├── main.aux ├── main.bcf ├── main.bib ├── main.log ├── main.out ├── main.pdf ├── main.run.xml ├── main.synctex.gz └── main.tex ├── Dev-Notebook-Kevin.md ├── Dev-Notebook-Mike.md ├── README.md ├── client_list.txt ├── lua-lua ├── README.md ├── cleanup.py ├── copy_files.py ├── data │ ├── demo-train.hdf5 │ ├── demo-val.hdf5 │ ├── demo.src.dict │ ├── demo.targ.dict │ ├── src-train.txt │ ├── src-val.txt │ ├── targ-train.txt │ └── targ-val.txt ├── demo_server.lua ├── gcloud_commands.txt ├── install_parallel.sh ├── locally.png ├── outputs │ ├── 104.154.239.139 │ │ ├── ada_4_rem.png │ │ ├── ada_4_rem.txt │ │ ├── ada_8_rem.png │ │ └── ada_8_rem.txt │ ├── 104.197.106.197 │ │ ├── ada_2_rem.png │ │ └── ada_2_rem.txt │ ├── 104.197.222.148 │ │ ├── ada_2.txt │ │ ├── ada_2_loc.png │ │ ├── ada_2_loc.txt │ │ └── reg_2.txt │ ├── 104.197.250.103 │ │ ├── reg_1.txt │ │ ├── reg_2.txt │ │ ├── reg_2_loc.png │ │ └── reg_2_loc.txt │ ├── 130.211.192.196 │ │ ├── reg_1_loc.png │ │ ├── reg_1_loc.txt │ │ └── reg_2.txt │ └── 130.211.204.149 │ │ ├── ada_1.txt │ │ ├── ada_1_loc.png │ │ ├── ada_1_loc.txt │ │ └── reg_2.txt ├── parallel │ └── init.lua ├── parse_outputs.py ├── remotely.png ├── server.lua ├── setup_image.sh └── startup.sh ├── python-python ├── README.md ├── client.py ├── data │ ├── images(16).npy │ └── output_labels(16).npy ├── dist_sgd_pb2.py ├── image_classes.txt ├── neural_net.py ├── nnet │ ├── __init__.py │ ├── __init__.pyc │ ├── neural_net.py │ └── neural_net.pyc ├── paxos.py ├── paxos_pb2.py ├── protobuf_utils │ ├── __init__.py │ ├── __init__.pyc │ ├── utils.py │ └── utils.pyc ├── protos │ ├── dist_sgd.proto │ ├── dist_sgd_pb2.py │ ├── paxos.proto │ └── paxos_pb2.py ├── run_codegen.sh ├── server.py └── start.sh └── slides ├── .Rhistory ├── common_slides.sty ├── img ├── 2d_func.jpg ├── dataset.png ├── 
deep_learning.png ├── dist_16.png ├── dist_train.png ├── downpour.png ├── gRPC.png ├── large_data.png ├── lin_v_nonlin.png └── sandblaster.png ├── main.pdf └── main.tex /.gitignore: -------------------------------------------------------------------------------- 1 | # Annoying files 2 | .DS_Store 3 | .ipynb_checkpoints 4 | Icon 5 | 6 | # large data files 7 | basic/output_labels(128).npy 8 | 9 | # Install files 10 | install/ 11 | 12 | # Model saves 13 | *.t7 14 | 15 | 16 | # Annoying text files 17 | slides/main.aux 18 | slides/main.log 19 | slides/main.nav 20 | slides/main.out 21 | slides/main.snm 22 | slides/main.synctex.gz 23 | slides/main.toc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lua-lua/End-To-End-Generative-Dialogue"] 2 | path = lua-lua/End-To-End-Generative-Dialogue 3 | url = https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue.git 4 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/common.sty: -------------------------------------------------------------------------------- 1 | \usepackage{amsmath} 2 | \usepackage{amssymb} 3 | \usepackage{url} 4 | \usepackage{mathpazo} 5 | \usepackage{palatino} 6 | \usepackage{fullpage,graphicx} 7 | \usepackage{tikz} 8 | \usepackage{tikz-qtree} 9 | \usepackage[font={it}]{caption} 10 | % \usepackage[right, mathlines]{lineno} 11 | 12 | \usepackage[procnames]{listings} 13 | \usepackage{color} 14 | 15 | \definecolor{keywords}{RGB}{255,0,90} 16 | \definecolor{comments}{RGB}{0,0,113} 17 | \definecolor{red}{RGB}{160,0,0} 18 | \definecolor{green}{RGB}{0,150,0} 19 | 20 | \lstset{language=Python, 21 | basicstyle=\ttfamily\small, 22 | keywordstyle=\color{keywords}, 23 | commentstyle=\color{comments}, 24 | stringstyle=\color{red}, 25 | showstringspaces=false, 26 | identifierstyle=\color{green}, 27 | 
procnamekeys={def,class}} 28 | 29 | % \linenumbers 30 | 31 | \usetikzlibrary{shapes.geometric} 32 | \usetikzlibrary{patterns} 33 | \usetikzlibrary{matrix} 34 | \usetikzlibrary{automata} 35 | \usepackage{booktabs} 36 | 37 | % \pagestyle{empty} 38 | \pagenumbering{arabic} 39 | \usepackage{subfig} 40 | \usepackage{comment} 41 | 42 | \newcommand{\boldA}{\boldsymbol{A}} 43 | \newcommand{\boldB}{\boldsymbol{B}} 44 | \newcommand{\boldC}{\boldsymbol{C}} 45 | \newcommand{\boldD}{\boldsymbol{D}} 46 | \newcommand{\boldE}{\boldsymbol{E}} 47 | \newcommand{\boldF}{\boldsymbol{F}} 48 | \newcommand{\boldG}{\boldsymbol{G}} 49 | \newcommand{\boldH}{\boldsymbol{H}} 50 | \newcommand{\boldI}{\boldsymbol{I}} 51 | \newcommand{\boldJ}{\boldsymbol{J}} 52 | \newcommand{\boldK}{\boldsymbol{K}} 53 | \newcommand{\boldL}{\boldsymbol{L}} 54 | \newcommand{\boldM}{\boldsymbol{M}} 55 | \newcommand{\boldN}{\boldsymbol{N}} 56 | \newcommand{\boldO}{\boldsymbol{O}} 57 | \newcommand{\boldP}{\boldsymbol{P}} 58 | \newcommand{\boldQ}{\boldsymbol{Q}} 59 | \newcommand{\boldR}{\boldsymbol{R}} 60 | \newcommand{\boldS}{\boldsymbol{S}} 61 | \newcommand{\boldT}{\boldsymbol{T}} 62 | \newcommand{\boldU}{\boldsymbol{U}} 63 | \newcommand{\boldV}{\boldsymbol{V}} 64 | \newcommand{\boldW}{\boldsymbol{W}} 65 | \newcommand{\boldX}{\boldsymbol{X}} 66 | \newcommand{\boldY}{\boldsymbol{Y}} 67 | \newcommand{\boldZ}{\boldsymbol{Z}} 68 | \newcommand{\bolda}{\boldsymbol{a}} 69 | \newcommand{\boldb}{\boldsymbol{b}} 70 | \newcommand{\boldc}{\boldsymbol{c}} 71 | \newcommand{\boldd}{\boldsymbol{d}} 72 | \newcommand{\bolde}{\boldsymbol{e}} 73 | \newcommand{\boldf}{\boldsymbol{f}} 74 | \newcommand{\boldg}{\boldsymbol{g}} 75 | \newcommand{\boldh}{\boldsymbol{h}} 76 | \newcommand{\boldi}{\boldsymbol{i}} 77 | \newcommand{\boldj}{\boldsymbol{j}} 78 | \newcommand{\boldk}{\boldsymbol{k}} 79 | \newcommand{\boldl}{\boldsymbol{l}} 80 | \newcommand{\boldm}{\boldsymbol{m}} 81 | \newcommand{\boldn}{\boldsymbol{n}} 82 | 
\newcommand{\boldo}{\boldsymbol{o}} 83 | \newcommand{\boldp}{\boldsymbol{p}} 84 | \newcommand{\boldq}{\boldsymbol{q}} 85 | \newcommand{\boldr}{\boldsymbol{r}} 86 | \newcommand{\bolds}{\boldsymbol{s}} 87 | \newcommand{\boldt}{\boldsymbol{t}} 88 | \newcommand{\boldu}{\boldsymbol{u}} 89 | \newcommand{\boldv}{\boldsymbol{v}} 90 | \newcommand{\boldw}{\boldsymbol{w}} 91 | \newcommand{\boldx}{\boldsymbol{x}} 92 | \newcommand{\boldy}{\boldsymbol{y}} 93 | \newcommand{\boldz}{\boldsymbol{z}} 94 | 95 | \newcommand{\mcA}{\mathcal{A}} 96 | \newcommand{\mcB}{\mathcal{B}} 97 | \newcommand{\mcC}{\mathcal{C}} 98 | \newcommand{\mcD}{\mathcal{D}} 99 | \newcommand{\mcE}{\mathcal{E}} 100 | \newcommand{\mcF}{\mathcal{F}} 101 | \newcommand{\mcG}{\mathcal{G}} 102 | \newcommand{\mcH}{\mathcal{H}} 103 | \newcommand{\mcI}{\mathcal{I}} 104 | \newcommand{\mcJ}{\mathcal{J}} 105 | \newcommand{\mcK}{\mathcal{K}} 106 | \newcommand{\mcL}{\mathcal{L}} 107 | \newcommand{\mcM}{\mathcal{M}} 108 | \newcommand{\mcN}{\mathcal{N}} 109 | \newcommand{\mcO}{\mathcal{O}} 110 | \newcommand{\mcP}{\mathcal{P}} 111 | \newcommand{\mcQ}{\mathcal{Q}} 112 | \newcommand{\mcR}{\mathcal{R}} 113 | \newcommand{\mcS}{\mathcal{S}} 114 | \newcommand{\mcT}{\mathcal{T}} 115 | \newcommand{\mcU}{\mathcal{U}} 116 | \newcommand{\mcV}{\mathcal{V}} 117 | \newcommand{\mcW}{\mathcal{W}} 118 | \newcommand{\mcX}{\mathcal{X}} 119 | \newcommand{\mcY}{\mathcal{Y}} 120 | \newcommand{\mcZ}{\mathcal{Z}} 121 | 122 | \newcommand{\reals}{\ensuremath{\mathbb{R}}} 123 | \newcommand{\integers}{\ensuremath{\mathbb{Z}}} 124 | \newcommand{\rationals}{\ensuremath{\mathbb{Q}}} 125 | \newcommand{\naturals}{\ensuremath{\mathbb{N}}} 126 | \newcommand{\trans}{\ensuremath{\mathsf{T}}} 127 | \newcommand{\ident}{\boldsymbol{I}} 128 | \newcommand{\bzero}{\boldsymbol{0}} 129 | 130 | \newcommand{\balpha}{\boldsymbol{\alpha}} 131 | \newcommand{\bbeta}{\boldsymbol{\beta}} 132 | \newcommand{\boldeta}{\boldsymbol{\eta}} 133 | \newcommand{\bkappa}{\boldsymbol{\kappa}} 
134 | \newcommand{\bgamma}{\boldsymbol{\gamma}} 135 | \newcommand{\bmu}{\boldsymbol{\mu}} 136 | \newcommand{\bphi}{\boldsymbol{\phi}} 137 | \newcommand{\bpi}{\boldsymbol{\pi}} 138 | \newcommand{\bpsi}{\boldsymbol{\psi}} 139 | \newcommand{\bsigma}{\boldsymbol{\sigma}} 140 | \newcommand{\btheta}{\boldsymbol{\theta}} 141 | \newcommand{\bxi}{\boldsymbol{\xi}} 142 | \newcommand{\bGamma}{\boldsymbol{\Gamma}} 143 | \newcommand{\bLambda}{\boldsymbol{\Lambda}} 144 | \newcommand{\bOmega}{\boldsymbol{\Omega}} 145 | \newcommand{\bPhi}{\boldsymbol{\Phi}} 146 | \newcommand{\bPi}{\boldsymbol{\Pi}} 147 | \newcommand{\bPsi}{\boldsymbol{\Psi}} 148 | \newcommand{\bSigma}{\boldsymbol{\Sigma}} 149 | \newcommand{\bTheta}{\boldsymbol{\Theta}} 150 | \newcommand{\bUpsilon}{\boldsymbol{\Upsilon}} 151 | \newcommand{\bXi}{\boldsymbol{\Xi}} 152 | \newcommand{\bepsilon}{\boldsymbol{\epsilon}} 153 | 154 | \def\argmin{\operatornamewithlimits{arg\,min}} 155 | \def\argmax{\operatornamewithlimits{arg\,max}} 156 | 157 | \newcommand{\given}{\,|\,} 158 | \newcommand{\distNorm}{\mathcal{N}} 159 | 160 | 161 | \usepackage{tabularx} 162 | \usepackage{algorithm} 163 | \usepackage{algpseudocode} 164 | 165 | \newcommand{\msc}[1]{\mathrm{\textsc{#1}}} 166 | \newcommand{\air}{\vspace{0.5cm}} 167 | 168 | \algtext*{EndWhile}% Remove "end while" text 169 | \algtext*{EndFor}% Remove "end for" text 170 | \algtext*{EndIf}% Remove "end if" text 171 | \algtext*{EndProcedure}% Remove "end procedure" text 172 | 173 | \newtheorem{theorem}{Theorem} 174 | \newtheorem{defn}{Definition} 175 | 176 | \newcommand{\Scribe}[1]{\def\ScribeStr{Scribe: #1}} 177 | \newcommand{\Scribes}[1]{\def\ScribeStr{Scribes: #1}} 178 | \newcommand{\Lecturer}[1]{\def\LecStr{Lecturer: #1}} 179 | \newcommand{\Lecturers}[1]{\def\LecStr{Lecturers: #1}} 180 | \newcommand{\LectureNumber}[1]{\def\LecNum{#1}} 181 | \newcommand{\LectureDate}[1]{\def\LecDate{#1}} 182 | \newcommand{\LectureTitle}[1]{\def\LecTitle{#1}} 183 | 184 | \newdimen\headerwidth 185 | 186 
| \newcommand{\MakeScribeTop}{ 187 | \noindent 188 | \begin{center} 189 | \framebox{ 190 | \vbox{ 191 | \headerwidth=\textwidth 192 | % \advance\headerwidth by -0.22in 193 | \hbox to \headerwidth {{\bf Artificial Intelligence \hfill (Harvard CS182, Fall 2015)} } 194 | \vspace{4mm} 195 | \hbox to \headerwidth {{\Large \hfill {\LecTitle} \hfill}} 196 | \vspace{2mm} 197 | \hbox to \headerwidth {\hfill \LecDate \hfill} 198 | \vspace{2mm} 199 | \hbox to \headerwidth {{\it \hfill \LecStr \hfill }} 200 | } 201 | } 202 | \end{center} 203 | \vspace*{4mm}} 204 | 205 | 206 | \newcommand*{\QED}{\hfill\ensuremath{\square}}% 207 | 208 | \newtheorem{exercise}[theorem]{Question} 209 | \let\checkmark\undefined 210 | 211 | \newcommand{\exinline}[1]{(\refstepcounter{theorem}Question~\thetheorem\label{#1})} 212 | 213 | \usepackage[utf8]{inputenc} 214 | 215 | % \DeclareUnicodeCharacter{2693}{\anchor} 216 | \usepackage{bbding} 217 | \usepackage{soul} 218 | 219 | \ifthenelse{\isundefined{\StudentVersion}}{ 220 | \newcommand{\censor}[1]{ 221 | {\small \textcolor{red}{\SunshineOpenCircled}} \textcolor{red}{#1} 222 | } 223 | \newcommand{\censorm}[1]{ 224 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \textcolor{red}{#1} 225 | } 226 | 227 | }{ 228 | \DeclareRobustCommand*\censor{% 229 | {\small \textcolor{red}{\SunshineOpenCircled}} 230 | \SOUL@setup% 231 | \def\SOUL@everytoken{\phantom{\the\SOUL@token}}% 232 | \def\SOUL@everyhyphen{% 233 | \discretionary{% 234 | \SOUL@setkern\SOUL@hyphkern% 235 | \phantom{\SOUL@sethyphenchar}% 236 | }{}{}% 237 | }% 238 | \def\SOUL@everyexhyphen##1{% 239 | \SOUL@setkern\SOUL@hyphkern% 240 | \hbox{\phantom{##1}}% 241 | \discretionary{}{}{% 242 | \SOUL@setkern\SOUL@charkern% 243 | }% 244 | }% 245 | \SOUL@% 246 | } 247 | \newcommand{\censorm}[1]{ 248 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \hspace*{5cm} 249 | } 250 | 251 | } 252 | 253 | 254 | 255 | 256 | \newcommand{\bolddelta}{\boldsymbol{\delta}} 257 | 
\newcommand{\indicator}{\mathbf{1}} 258 | 259 | 260 | \def\argmin{\operatornamewithlimits{arg\,min}} 261 | \def\argmax{\operatornamewithlimits{arg\,max}} 262 | \def\softmax{\operatornamewithlimits{softmax}} 263 | \def\relu{\operatornamewithlimits{ReLU}} 264 | 265 | \newcommand{\din}{{d_{\mathrm{in}}}} 266 | \newcommand{\dout}{{d_{\mathrm{out}}}} -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/downpour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/downpour.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/locally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/locally.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/remotely.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/remotely.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/sgd_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/sgd_results.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/speeds.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/speeds.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 4 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 5 | \global\let\oldcontentsline\contentsline 6 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 7 | \global\let\oldnewlabel\newlabel 8 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 9 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 10 | \AtEndDocument{\ifx\hyper@anchor\@undefined 11 | \let\contentsline\oldcontentsline 12 | \let\newlabel\oldnewlabel 13 | \fi} 14 | \fi} 15 | \global\let\hyper@last\relax 16 | \gdef\HyperFirstAtBeginDocument#1{#1} 17 | \providecommand\HyField@AuxAddToFields[1]{} 18 | \providecommand\HyField@AuxAddToCoFields[2]{} 19 | \abx@aux@sortscheme{ynt} 20 | \abx@aux@cite{bengio-emb} 21 | \abx@aux@cite{distbelief} 22 | \@writefile{toc}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 23 | \@writefile{lof}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 24 | \@writefile{lot}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 25 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}} 26 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {2}Background on Downpour SGD}{1}{section.2}} 27 | \abx@aux@cite{tensorflow} 28 | \abx@aux@cite{protobuf} 29 | \@writefile{lof}{\defcounter {refsection}{0}\relax 
}\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }}{2}{figure.caption.1}} 30 | \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} 31 | \newlabel{fig:downpour}{{1}{2}{A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }{figure.caption.1}{}} 32 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {3}Motivation}{2}{section.3}} 33 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {4}Challenges}{3}{section.4}} 34 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {5}Methods and Design}{3}{section.5}} 35 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }}{4}{figure.caption.2}} 36 | \newlabel{fig:local}{{2}{4}{On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }{figure.caption.2}{}} 37 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {6}Results and Discussion}{6}{section.6}} 38 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On left, training classification error rates per epoch. On right, training classification error rates over time.\relax }}{6}{figure.caption.3}} 39 | \newlabel{fig:local}{{3}{6}{On left, training classification error rates per epoch. 
On right, training classification error rates over time.\relax }{figure.caption.3}{}} 40 | \abx@aux@cite{adagrad} 41 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {7}Applying SGD in Lua/Torch}{7}{section.7}} 42 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces The results of running our rnn model for 7 epochs locally.\relax }}{7}{figure.caption.4}} 43 | \newlabel{fig:local}{{4}{7}{The results of running our rnn model for 7 epochs locally.\relax }{figure.caption.4}{}} 44 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces The result of running our rnn model for 10 epochs remotely.\relax }}{9}{figure.caption.5}} 45 | \newlabel{fig:remote}{{5}{9}{The result of running our rnn model for 10 epochs remotely.\relax }{figure.caption.5}{}} 46 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {8}Conclusion}{9}{section.8}} 47 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {9}Code}{9}{section.9}} 48 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{distbelief, 2 | title = {Large Scale Distributed Deep Networks}, 3 | author = {Jeffrey Dean and Greg S. Corrado and Rajat Monga and Kai Chen and Matthieu Devin and Quoc V. Le and Mark Z. Mao and Marc’Aurelio Ranzato and Andrew Senior and Paul Tucker and Ke Yang and Andrew Y. Ng}, 4 | year = 2012, 5 | booktitle = {NIPS} 6 | } 7 | @article{bengio-emb, 8 | author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, 9 | title = {A Neural Probabilistic Language Model}, 10 | journal = {J. Mach. Learn. 
Res.}, 11 | issue_date = {3/1/2003}, 12 | volume = {3}, 13 | month = mar, 14 | year = {2003}, 15 | issn = {1532-4435}, 16 | pages = {1137--1155}, 17 | numpages = {19}, 18 | url = {http://dl.acm.org/citation.cfm?id=944919.944966}, 19 | acmid = {944966}, 20 | publisher = {JMLR.org}, 21 | } 22 | 23 | @article{tensorflow, 24 | author = {Mart{\'{\i}}n Abadi and 25 | Ashish Agarwal and 26 | Paul Barham and 27 | Eugene Brevdo and 28 | Zhifeng Chen and 29 | Craig Citro and 30 | Gregory S. Corrado and 31 | Andy Davis and 32 | Jeffrey Dean and 33 | Matthieu Devin and 34 | Sanjay Ghemawat and 35 | Ian J. Goodfellow and 36 | Andrew Harp and 37 | Geoffrey Irving and 38 | Michael Isard and 39 | Yangqing Jia and 40 | Rafal J{\'{o}}zefowicz and 41 | Lukasz Kaiser and 42 | Manjunath Kudlur and 43 | Josh Levenberg and 44 | Dan Mane and 45 | Rajat Monga and 46 | Sherry Moore and 47 | Derek Gordon Murray and 48 | Chris Olah and 49 | Mike Schuster and 50 | Jonathon Shlens and 51 | Benoit Steiner and 52 | Ilya Sutskever and 53 | Kunal Talwar and 54 | Paul A. Tucker and 55 | Vincent Vanhoucke and 56 | Vijay Vasudevan and 57 | Fernanda B. 
Vi{\'{e}}gas and 58 | Oriol Vinyals and 59 | Pete Warden and 60 | Martin Wattenberg and 61 | Martin Wicke and 62 | Yuan Yu and 63 | Xiaoqiang Zheng}, 64 | title = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed 65 | Systems}, 66 | journal = {CoRR}, 67 | volume = {abs/1603.04467}, 68 | year = {2016}, 69 | url = {http://arxiv.org/abs/1603.04467}, 70 | timestamp = {Sun, 03 Apr 2016 11:52:22 +0200}, 71 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/AbadiABBCCCDDDG16}, 72 | bibsource = {dblp computer science bibliography, http://dblp.org} 73 | } 74 | @MISC{protobuf, 75 | title={Protocol Buffers}, 76 | author={Kenton Varda}, 77 | howpublished={\url{http://code.google.com/apis/protocolbuffers/}}, 78 | } 79 | @techreport{adagrad, 80 | Author = {Duchi, John and Hazan, Elad and Singer, Yoram}, 81 | Title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization}, 82 | Institution = {EECS Department, University of California, Berkeley}, 83 | Year = {2010}, 84 | Month = {Mar}, 85 | URL = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-24.html}, 86 | Number = {UCB/EECS-2010-24}, 87 | Abstract = {We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. 
We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods significantly outperform state-of-the-art, yet non-adaptive, subgradient algorithms.} 88 | } -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.out: -------------------------------------------------------------------------------- 1 | \BOOKMARK [1][-]{section.1}{Introduction}{}% 1 2 | \BOOKMARK [1][-]{section.2}{Background on Downpour SGD}{}% 2 3 | \BOOKMARK [1][-]{section.3}{Motivation}{}% 3 4 | \BOOKMARK [1][-]{section.4}{Challenges}{}% 4 5 | \BOOKMARK [1][-]{section.5}{Methods and Design}{}% 5 6 | \BOOKMARK [1][-]{section.6}{Results and Discussion}{}% 6 7 | \BOOKMARK [1][-]{section.7}{Applying SGD in Lua/Torch}{}% 7 8 | \BOOKMARK [1][-]{section.8}{Conclusion}{}% 8 9 | \BOOKMARK [1][-]{section.9}{Code}{}% 9 10 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.pdf -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 28 | 33 | 36 | 39 | 42 | ]> 43 | 44 | 45 | latex 46 | 47 | main.bcf 48 | 49 | 50 | main.bbl 51 | 52 | 53 | blx-dm.def 54 | blx-compat.def 55 | biblatex.def 56 | alphabetic.bbx 57 | standard.bbx 58 | alphabetic.cbx 59 | biblatex.cfg 60 | english.lbx 61 | 62 | 63 | 64 | biber 65 | 66 | biber 67 | main 68 | 69 | 70 | main.bcf 71 | 72 | 73 | 
main.bbl 74 | 75 | 76 | main.bbl 77 | 78 | 79 | main.bcf 80 | 81 | 82 | main.bib 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.synctex.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.synctex.gz -------------------------------------------------------------------------------- /Dev-Notebook-Kevin.md: -------------------------------------------------------------------------------- 1 | 2 | Played around with autograd in python. Looking for a reasonable toy dataset to test sgd on distributed system 3 | Looked into the convolutional network example for autograd https://github.com/HIPS/autograd/blob/master/examples/neural_net.py 4 | This ended up being perfect because it spits out a long vector of gradients that it uses 5 | Looking for a far heavier dataset. MSINT runs in a 1-2 minutes. 6 | Found Caltech 101, built some preprocessing code, modified some of the code for the neural network 7 | Needed to downsize the images substantially. 240 x 240 is around 12 GB of data. Shrunk it down to 128 x 128, making it 4 Gb of data. New gradients are around 0.5Gb. This makes network speeds pretty prohibitive though. 8 | Epochs take a couple minutes to run. Batches takes around 10-15 seconds each. Seems rather reasonable 9 | 10 | 11 | Looking into Azure for launching VMs 12 | Discovered CLI for Azure 13 | Set up 5 different accounts all using the Bizspark subscription. One email account also has a free subscription activated. 14 | Emails and passwords are listed below: 15 | 16 | (candokevin2@hotmail.com, cs262michaelkevin) 17 | (candokevin3@hotmail.com, cs262michaelkevin) 18 | 19 | 20 | Log into portal.azure.com to interact more with the system 21 | 22 | Received instructions from Mike on how to setup grpc. 
For replicability on later Linux VMs we launch, I've documented the steps 23 | I took below: 24 | 25 | Set up Protobufs 3.0.0 26 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip 27 | ./autogen.sh 28 | ./configure 29 | make 30 | make check 31 | make install 32 | 33 | Set up grpc 34 | git clone https://github.com/grpc/grpc.git 35 | sudo make grpc_python_plugin 36 | sudo vim /etc/paths, add the line /Users/candokevin/stash/grpc/bins/opt 37 | 38 | 39 | It might be a good idea to look into Docker containers, and Docker networks for launching and setting up VMs. 40 | 41 | This site suggests that Google Compute might actually be the best platform for this 42 | https://gigaom.com/2014/04/12/need-for-speed-testing-the-networking-performance-of-the-top-4-cloud-providers/ 43 | https://cloudplatform.googleblog.com/2014/04/enter-andromeda-zone-google-cloud-platforms-latest-networking-stack.html 44 | Get started, generate a project ID 45 | Network speed is critical considering how huge our gradients may be. 
46 | 47 | Persistent 10GB disk for saving the state of machine 48 | Allows you to save the state of a machine 49 | 50 | gcloud compute instances create example-instance --image test-image --zone us-central1-b 51 | gcloud compute ssh large-example-instance --zone 52 | gcloud compute copy-files /Users/candokevin/stash/distributed-sgd/scp extra-large-example-instance:~/scp/ --zone us-central1-b 53 | 54 | 55 | Generate some code that performs the following 56 | 57 | Initializes the parameters to some certain set of values 58 | Updates parameters given some gradient 59 | Sends parameters to different servers 60 | -------------------------------------------------------------------------------- /Dev-Notebook-Mike.md: -------------------------------------------------------------------------------- 1 | - need to install proto3 protocol buffers 2 | 3 | download link: 4 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip 5 | 6 | https://github.com/google/protobuf 7 | 8 | example: 9 | https://github.com/grpc/grpc/tree/release-0_13/examples/python/helloworld 10 | 11 | cd into directory 12 | brew update && brew remove gmp && brew install gmp && brew link gmp 13 | 14 | ./autogen.sh 15 | 16 | ./configure 17 | 18 | make 19 | 20 | make check 21 | 22 | make install 23 | 24 | example usage 25 | protoc -I=$SRC_DIR --python_out=$DST_DIR $SRC_DIR/addressbook.proto 26 | 27 | - installed grpc according to the following instructions listed here: https://github.com/grpc/grpc/tree/release-0_13/examples/python an outline of the command I ran are the following: 28 | 29 | sudo pip install grpcio 30 | 31 | git clone https://github.com/grpc/grpc 32 | 33 | - We can test to see if the helloworld example works: 34 | 35 | cd grpc/examples/python/helloworld 36 | 37 | - Run the server 38 | 39 | python2.7 greeter_server.py & 40 | 41 | - Run the client 42 | 43 | python2.7 greeter_client.py 44 | 45 | -You should see the output "Greeter client received: 
Hello, you!" 46 | 47 | Instead going to copy the necessary files into our directory and have a small running example 48 | 49 | in the folder Distributed-SGD/helloworld: 50 | 51 | have the files: 52 | 53 | greeter_client.py 54 | greeter_server.py 55 | 56 | 57 | sudo pip install grpcio --upgrade 58 | 59 | 60 | 61 | 62 | 63 | HOW I GOT IT TO WORK 64 | Used this link: 65 | https://github.com/grpc/homebrew-grpc 66 | 67 | 68 | curl -fsSL https://goo.gl/getgrpc | bash - 69 | 70 | virtualenv venv 71 | source venv/bin/activate 72 | 73 | curl -fsSL https://goo.gl/getgrpc | bash -s python 74 | 75 | cd venv 76 | 77 | git clone https://github.com/grpc/grpc.git 78 | 79 | cd grpc 80 | 81 | make grpc_python_plugin 82 | 83 | 84 | 85 | 86 | here we go: 87 | 88 | cd /usr/local/ 89 | mkdir manual 90 | cd manual 91 | 92 | curl -fsSL https://goo.gl/getgrpc | bash - 93 | 94 | virtualenv venv 95 | 96 | source venv/bin/activate 97 | 98 | curl -fsSL https://goo.gl/getgrpc | bash -s python 99 | 100 | pip install numpy 101 | pip install scipy 102 | sudo pip install pillow 103 | pip install sklearn 104 | pip install autograd 105 | 106 | cd venv 107 | 108 | git clone https://github.com/grpc/grpc.git 109 | cd grpc 110 | 111 | make grpc_python_plugin 112 | 113 | sudo vim /etc/paths 114 | 115 | and add the line: 116 | 117 | /usr/local/manual/venv/grpc/bins/opt 118 | 119 | 120 | 121 | BEFORE RUNNING ANYTHING 122 | 123 | source /usr/local/manual/venv/bin/activate 124 | 125 | 126 | Important links: 127 | https://github.com/grpc/homebrew-grpc 128 | https://docs.docker.com/engine/userguide/networking/ 129 | http://www.bpython-interpreter.org 130 | https://github.com/mila-udem/fuel 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD 2 | 3 | An implementation of distributed stochastic gradient descent for both local and 
remote clients. 4 | 5 | The [paper](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/CS262%2520Final%2520Project/main.pdf) describing this project. 6 | 7 | ## Usage 8 | 9 | The usage varies depending on the native language. 10 | 11 | Please see the respective directory for the language you are interested in: 12 | 13 | Usage in [python](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/python-python) 14 | 15 | Usage in [lua/torch](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/lua-lua) 16 | 17 | ## Table of Contents 18 | ``` 19 | . 20 | ├── lua-lua # Implementation of Distributed SGD in lua/torch 21 | ├── python-python # Implementation of Distributed SGD in python 22 | ├── slides # presentation slides about this project 23 | ├──.gitignore 24 | ├──.gitmodules 25 | ├── Dev-Notebook-Kevin.md # Development notes 26 | ├── Dev-Notebook-Mike.md 27 | ├── README.md 28 | └── client_list.txt # List of available server ip addresses 29 | ``` 30 | 31 | 32 | ## Primary contributors 33 | 34 | [Kevin Yang](https://github.com/kyang01) 35 | 36 | [Michael Farrell](https://github.com/michaelfarrell76) 37 | 38 | -------------------------------------------------------------------------------- /client_list.txt: -------------------------------------------------------------------------------- 1 | 130.211.204.149 2 | 104.197.250.103 3 | 130.211.192.196 4 | 104.197.222.148 5 | 104.197.106.197 6 | 104.197.167.23 7 | 104.154.239.139 8 | 130.211.206.66 9 | 104.197.137.32 10 | 104.197.174.106 11 | -------------------------------------------------------------------------------- /lua-lua/README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD: lua-lua 2 | An implementation of distributed stochastic gradient descent in lua/torch. Clients can be local and remote. 3 | 4 | ## Requirements 5 | 6 | This code is written in Lua, and an installation of [Torch](https://github.com/torch/torch7/) is assumed. 
Training requires a few packages which can easily be installed through [LuaRocks](https://github.com/keplerproject/luarocks) (which comes with a Torch installation). Datasets are formatted and loaded using [hdf5](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), which can be installed using this [guide](https://github.com/deepmind/torch-hdf5/blob/master/doc/usage.md). 7 | 8 | Once torch and torch-hdf5 are installed, use luarocks to install the other dependencies used in the example: 9 | 10 | ```bash 11 | $ luarocks install nn 12 | $ luarocks install rnn 13 | ``` 14 | If you want to train on an Nvidia GPU using CUDA, you'll need to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) as well as the `cutorch` and `cunn` packages: 15 | ```bash 16 | $ luarocks install cutorch 17 | $ luarocks install cunn 18 | ``` 19 | We need to ensure that our local version of parallel is installed. This can be done with a short bash script from the lua-lua folder: 20 | ```bash 21 | $ cd lua-lua 22 | $ bash install_parallel.sh 23 | ``` 24 | 25 | ## Directory Table of Contents 26 | ``` 27 | . 
28 | ├── data # Folder holding data used for demo 29 | ├── parallel # Folder containing the changes we added to the parallel class 30 | ├── End-To-End-Generative-Dialogue # Folder of our other repo containing the code used in demo 31 | ├── README.md # lua-lua usage 32 | ├── server.lua # Main server file 33 | ├── README.md 34 | ├── startup.sh # Startup script for remote gcloud servers 35 | ├── setup_image.sh # Script that copies startup.sh to remote server and calls startup.sh 36 | ├── install_parallel.sh # script that installs our version of parallel 37 | └── demo_server.lua # A demo class that implements the server 38 | ``` 39 | 40 | ## Description 41 | 42 | ## Demo-Usage 43 | Code is run from the lua-lua folder: 44 | ```bash 45 | $ cd lua-lua 46 | ``` 47 | 48 | #### Local 49 | 50 | To run a worker with 2 parallel clients on your own machine: 51 | ```bash 52 | $ th server.lua -n_proc 2 53 | ``` 54 | 55 | #### Remote - localhost 56 | 57 | In order to get the demo to connect through localhost rather than simply forking, we must first setup an .ssh key for this project. 58 | 59 | Note: This is basically doing the same thing as [local](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#local), except we now connect to the clients through localhost. This is a good tool to use to debug problems with clients running on remote servers. 60 | 61 | ##### Generate ssh key 62 | Replace USERNAME with your username on the computer you want to connect to: 63 | ```bash 64 | $ USERNAME=michaelfarrell 65 | $ ssh-keygen -t rsa -f ~/.ssh/dist-sgd-sshkey -C $USERNAME 66 | ``` 67 | Hit enter twice and a key should have been generated.
68 | 69 | ##### Add ssh-key to authorized keys 70 | 71 | In order to connect to clients through localhost, we must add the key to our list of authorized_keys: 72 | ```bash 73 | $ cat ~/.ssh/dist-sgd-sshkey.pub >> ~/.ssh/authorized_keys 74 | $ chmod og-wx ~/.ssh/authorized_keys 75 | ``` 76 | 77 | ##### Allow ssh connections 78 | 79 | In order to connect through localhost, you must allow your computer to allow incoming ssh connections. 80 | 81 | On a Mac, this can be done by going to: 82 | 83 | System Preferences > Sharing 84 | 85 | and checking the 'Remote Login' box 86 | 87 | 88 | ##### Connect via localhost 89 | 90 | You can now communicate over localhost using the command: 91 | 92 | ```bash 93 | $ EXTENSION=Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/ 94 | $ TORCH_PATH=/Users/michaelfarrell/torch/install/bin/th 95 | $ th server.lua -n_proc 4 -localhost -extension $EXTENSION -torch_path $TORCH_PATH 96 | ``` 97 | where $EXTENSION is the relative path to the lua-lua folder from the your directory and $TORCH_PATH is the absolute path to torch on your computer 98 | 99 | #### Remote - gcloud 100 | 101 | Instead of having the client programs running on your own computer, you can farm them out to any number of remote computers. Below is a description of how to setup remote clients using google cloud (gcloud offers 60 day free trials with $300 worth of credit). 102 | 103 | ##### Adding ssh key to gcloud servers 104 | 105 | We have to allow our gcloud servers to accept incoming ssh connections from our computer. 106 | 107 | If you have yet to do so, [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key) 108 | 109 | Once you have created the key print it out: 110 | 111 | ```bash 112 | $ cat ~/.ssh/dist-sgd-sshkey.pub 113 | ``` 114 | 115 | Next you must add the key to the set of public keys : 116 | - Login to your google compute account. 
117 | - Go to compute engine dashboard 118 | - Go to metdata tab 119 | - Go to ssh-key subtab 120 | - Click edit 121 | - Add the key you copied as a new line 122 | 123 | Restrict external access to the key: 124 | ```bash 125 | $ chmod 400 ~/.ssh/dist-sgd-sshkey 126 | ``` 127 | 128 | ##### Create a baseline startup image 129 | 130 | We only have to setup and install everything once, after which we can clone that client. 131 | 132 | ###### Create the image 133 | - Click on the 'VM Instances' tab 134 | - Create Instance 135 | - Give the instance a name i.e. 'demo-baseline' 136 | - Set the zone to us-central1-b 137 | - Choose 8vCPU highmem as machine type 138 | - Under boot disk click change 139 | - Choose Ubuntu 14.04 LTS 140 | - At the bottom change size to 30 GB and click 'select' 141 | - Allow HTTP traffic 142 | - Allow HTTPS traffic 143 | - Click 'Management, disk, networking, SSH keys' to dropdown more options 144 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted' 145 | - Click 'Create' an you should see your new instance listed in the table 146 | 147 | ###### Allow tcp connections 148 | - Wait for the VM instance to startup (indicated by a green check next to the instance) 149 | - Under the 'network' column, click 'default' 150 | - Go to 'Firewall rules' and Add a new rule 151 | - Set name to be 'all' 152 | - Set source filter to allow from any source 153 | - Under allowed protocols, put 'tcp:0-65535; udp:0-65535; icmp' 154 | - Create 155 | 156 | ###### Setup the disk 157 | - Return to the 'VM instances' tab 158 | - Grab the external IP address for the instance 159 | ```bash 160 | $ EXTERNAL_IP=104.154.48.250 161 | $ USERNAME=michaelfarrell 162 | ``` 163 | - Next you must modify the 'startup.sh' script to also include any additional installs that you may need on the server. This script is run from the home directory of the remote client. To run the demo, you do not need to modify this script. 
164 | - Next you must modify the 'setup_image.sh' script so that it correctly calls your startup.sh script on the remote server. If you did not change 'startup.sh' script, you should probably not be changing this script either. 165 | - Setup the image: 166 | ```bash 167 | $ source setup_image.sh 168 | ``` 169 | Note you can connect to the server: 170 | ```bash 171 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 172 | ``` 173 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard 174 | - Go to the 'VM Dashboard' 175 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted. 176 | 177 | ###### Create the image 178 | 179 | - Click on the 'Images' tab 180 | - 'Create Image' 181 | - Give it a name i.e. 'demo-image' 182 | - Under Source-Disk, choose the disk that you just created 183 | - Create 184 | 185 | ##### Generate an 'Instance Template' 186 | - Click on the 'Instance templates' tab 187 | - Create new 188 | - Name the template i.e. 'demo-template' 189 | - Under 'Boot Disk' click change 190 | - At the top click 'Your image' 191 | - Choose the image you just created i.e. 'demo-image' 192 | - Set size to 30 GB 193 | - Select 194 | - Allow HTTP traffic 195 | - Allow HTTPS traffic 196 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted' 197 | - Create 198 | 199 | ##### Generate an 'Instance Group' 200 | - Go to the "Instance groups" tab 201 | - Create instance group 202 | - Give the group a name, i.e. 'demo-group' 203 | - Give a description 204 | - Set zone to us-central1-b 205 | - Use instance template 206 | - Choose the template you just made i.e. 
'demo-template' 207 | - Set the number of instances 208 | - Create 209 | - Wait for the instances to launch 210 | - Once there is a green checkmark, click on the new instance 211 | 212 | ##### Adding remote clients 213 | You will want to add your list of client servers to the file 'client_list.txt' where each line in the file is one of the external ip addresses located in the Instance group you are currently using. You will need to copy this list of files to the computer that you are going to use as the main parameter server. Choose an IP from the freshly updated 'client_list.txt' and set the $SERVER_IP environment variable: 214 | ```bash 215 | $ SERVER_IP=130.211.160.115 216 | ``` 217 | Copy over 'client_list.txt' to the main server: 218 | ```bash 219 | $ scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey ../client_list.txt $USERNAME@$SERVER_IP:~/Distributed-SGD 220 | ``` 221 | 222 | ##### Connecting to gcloud servers 223 | 224 | You can connect to one of the servers by running: 225 | ```bash 226 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP 227 | ``` 228 | Note: the flag `-o "StrictHostKeyChecking no"` automatically adds the host to your list and does not prompt confirmation. 229 | 230 | If you get an error like this: 231 | ```bash 232 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 233 | @ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @ 234 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 235 | ``` 236 | then you'll want to 237 | ```bash 238 | $ vim ~/.ssh/known_hosts 239 | ``` 240 | and delete the last few lines that were added. They should look like some ip address and then something that starts with AAAA. You can delete lines in vim by typing 'dd' to delete the current line. This can happen when you restart the servers and they change ip addresses, among other things. 
241 | 242 | ##### Adding ssh keys again 243 | 244 | If the servers have been initialized, you will first want to connect to the computer above that you chose to be the main server 245 | ```bash 246 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP 247 | ``` 248 | 249 | Once connected, you need to again setup an ssh key from the computer that you are using as the client. 250 | 251 | 1) [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key) 252 | 253 | 2) [add key to gcloud server account](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#adding-ssh-key-to-gcloud-servers) 254 | 255 | ##### Running on remote servers: 256 | 257 | Once this is done, you can run the server with remote gcloud clients using the command: 258 | ```bash 259 | $ cd Distributed-SGD/lua-lua 260 | $ EXTENSION=Distributed-SGD/lua-lua/ 261 | $ TORCH_PATH=/home/michaelfarrell/torch/install/bin/th 262 | $ th server.lua -n_proc 4 -remote -extension $EXTENSION -torch_path $TORCH_PATH 263 | 264 | ``` 265 | 266 | ## For Personal Usage 267 | 268 | If you wish to extend this demo to work with your own SGD model you must simply create a new server class specific to your task, replacing the 'demo_server' class. Use the file 'demo_server.lua' as an example. The server only needs to have __init(opt) and run() functions defined in order to work. Once this class is properly defined (i.e. named 'new_server'), you can run the following to initiate your task: 269 | 270 | ```bash 271 | $ NEW_SERVER_NAME=new_server 272 | $ th server.lua -server_class $NEW_SERVER_NAME # Plus Additional arguments 273 | 274 | ``` 275 | 276 | When developing, all command line arguments should be added in the file server.lua. Look at the command arguments 277 | ```bash 278 | $ th server.lua --help 279 | ``` 280 | that already exist and use those names when developing your model. 
If you need an additional command line argument, add it in server.lua. Other than this, there should be no reason to edit the server.lua file. 281 | 282 | If you are having your clients run remotely, you may also need to modify 'startup.sh' and 'setup_image.sh' so that they setup the server environements according to the specifications that you need. 283 | 284 | 285 | ## TODO 286 | - Document data folder and include description in demo-usage about what the demo is 287 | - Add in documentation of how the data needs to be formatted in order to run the demo 288 | - Finish description 289 | - Finish Acknowledgements 290 | - Add in proto implementation 291 | - Add in git pull at startup 292 | - add way to catch if failure down and reset 293 | - maybe add paxos if kevin is successful 294 | - try adding protobufs 295 | - get results 296 | - Add in addtional catches for errors like add to path 297 | 298 | 299 | ## Acknowledgments 300 | This example is also apart of another one of our repos: https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue 301 | 302 | Our implementation utilizes code from the following: 303 | 304 | * [Yoon Kim's seq2seq-attn repo](https://github.com/harvardnlp/seq2seq-attn) 305 | * [Element rnn library](https://github.com/Element-Research/rnn) 306 | * [Facebook's neural attention model](https://github.com/facebook/NAMAS) 307 | -------------------------------------------------------------------------------- /lua-lua/cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Copy files from servers 5 | """ 6 | 7 | import sys 8 | import os 9 | import time 10 | 11 | 12 | def child(ip_addr): 13 | if not os.path.exists('outputs/' + ip_addr): 14 | os.makedirs('outputs/' + ip_addr) 15 | os.system('(echo " echo starting; pkill torch; pkill lua; cd Distributed-SGD/lua-lua/; git pull; cd End-To-End-Generative-Dialogue/; git pull origin master; exit") | 
ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s' % ip_addr) 16 | os._exit(0) 17 | 18 | 19 | def main(arguments): 20 | with open('../client_list.txt') as f: 21 | if not os.path.exists('outputs'): 22 | os.makedirs('outputs') 23 | pids = [] 24 | for line in f: 25 | # os.system('echo ' + line) 26 | newpid = os.fork() 27 | pids.append(newpid) 28 | if newpid == 0: 29 | if line[-1] == '\n': 30 | child(line[:-1]) 31 | else: 32 | child(line) 33 | 34 | 35 | if __name__ == '__main__': 36 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/copy_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Copy files from servers 5 | """ 6 | 7 | import sys 8 | import os 9 | import time 10 | 11 | 12 | def child(ip_addr): 13 | if not os.path.exists('outputs/' + ip_addr): 14 | os.makedirs('outputs/' + ip_addr) 15 | cmd = 'scp -r -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s:~/Distributed-SGD/lua-lua/*.txt ~/Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/outputs/%s/ &> /dev/null' % (ip_addr, ip_addr) 16 | 17 | os.system(cmd) 18 | os._exit(0) 19 | 20 | 21 | def main(arguments): 22 | with open('../client_list.txt') as f: 23 | if not os.path.exists('outputs'): 24 | os.makedirs('outputs') 25 | pids = [] 26 | for line in f: 27 | # os.system('echo ' + line) 28 | newpid = os.fork() 29 | pids.append(newpid) 30 | if newpid == 0: 31 | if line[-1] == '\n': 32 | child(line[:-1]) 33 | else: 34 | child(line) 35 | 36 | 37 | time.sleep(5) 38 | if __name__ == '__main__': 39 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/data/demo-train.hdf5: -------------------------------------------------------------------------------- 
------------------------------------------------------------------------
-- demo_server.lua
--
-- This is an example of a class that is used to implement a server in
-- server.lua. This class has an __init(opt) function that takes in
-- the global parameters, loads in the data and builds the model on
-- the parameter server. The class also has a run() function that
-- forks out the child clients and executes the function 'worker'
-- on each corresponding client.
--
-- If you wish to develop your own SGD model, create a new class that is
-- similar to this. The only required interface is __init(opt) and run().
--
-- NOTE(review): this file assumes the globals `parallel`, `cmd` and `arg`
-- are set up by server.lua / the parallel library before run() is called
-- — confirm against server.lua, which is outside this file.
------------------------------------------------------------------------
local demo_server = torch.class('demo_server')

------------
-- Worker code
------------

-- Function whose source is shipped to and executed on every forked client
-- (see demo_server:run). It loops forever: the first package received from
-- the parent carries the global options (used to load data and build the
-- model locally); every later package carries model parameters plus a batch
-- index, for which the worker computes and returns derivatives.
function worker()
    -- Used to check that required files exist on the client machine
    require "lfs"

    -- Used to update package.path with the location of our dependencies
    require 'package'

    -- Alert successfully started up
    parallel.print('Im a worker, my ID is: ', parallel.id, ' and my IP: ', parallel.ip)

    -- Global flag indicating this process is a child/client
    ischild = true

    -- Extension to lua-lua folder from home directory.
    -- Set to no extension as default; overwritten by the first package.
    ext = ""

    -- Number of packages received so far from the parameter server
    local n_pkg = 0
    while true do

        -- Allow the parent to terminate the child: a 'break' message from
        -- the parent's join('break') ends the loop.
        m = parallel.yield()
        if m == 'break' then break end

        -- Receive data from the parameter server
        local pkg = parallel.parent:receive()


        -- Make sure to clean everything up since big files are being passed
        io.write('.') io.flush()
        collectgarbage()


        if n_pkg == 0 then
            -- This is the first time receiving a package, it has the globals

            -- Receive and parse global parameters
            parallel.print('Recieved initialization parameters')
            cmd, arg, ext = pkg.cmd, pkg.arg, pkg.ext
            opt = cmd:parse(arg)

            -- Update path so that modules required below resolve on the client
            package.path = opt.add_to_path .. package.path

            -- Add in additional necessary parameters
            opt.print = parallel.print
            opt.parallel = true


            -- Library used to handle data types
            local data_loc = ext .. 'End-To-End-Generative-Dialogue/src/data'
            if not lfs.attributes(data_loc .. '.lua') then
                print('The file data.lua could not be found in ' .. data_loc .. '.lua')
                os.exit()
            end
            data = require(data_loc)

            -- Load in helper functions for this model (load_data, build,
            -- train_ind, ...) defined in End-To-End-Generative-Dialogue
            local model_funcs_loc = ext .. "End-To-End-Generative-Dialogue/src/model_functions.lua"
            if not lfs.attributes(model_funcs_loc) then
                print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
                os.exit()
            end
            funcs = loadfile(model_funcs_loc)
            funcs()

            -- Change the locations of the datafiles based on new extension
            opt.data_file = ext .. opt.data_file
            opt.val_data_file = ext .. opt.val_data_file

            -- Point the wordvec to the right place if it exists
            if opt.pre_word_vecs ~= "" then
                opt.pre_word_vecs = opt.extension .. opt.pre_word_vecs
            end

            -- Load in data to client
            train_data, valid_data, opt = load_data(opt)

            -- Build the model on the client
            model, criterion = build()

            -- Acknowledge successful initialization to the parent
            parallel.parent:send('Received parameters and loaded data successfully')
        else
            parallel.print('received params from batch with index: ', pkg.index)

            -- Load in the parameters sent from the parent
            for i = 1, #model.params do
                model.params[i]:copy(pkg.parameters[i])
            end

            -- Train the model on the batch at the given index
            local pkg_o = train_ind(pkg.index, model, criterion, train_data)

            -- Send the resulting derivatives back to the parameter server
            parallel.print('sending back derivative for batch with index: ', pkg.index)
            parallel.parent:send(pkg_o)
        end
        n_pkg = n_pkg + 1
    end
end


------------
-- Server class
------------

-- Initialization function for the server object. Here we load in the data, build our
-- model, and then add any remote client objects if necessary.
--
-- opt: parsed command-line options table from server.lua; fields used here
-- include remote, localhost, username and torch_path.
function demo_server:__init(opt)
    -- Save the command line options
    self.opt = opt

    -- Used to check files
    require "lfs"

    -- Library used to handle data types
    local data_loc = 'End-To-End-Generative-Dialogue/src/data'
    if not lfs.attributes(data_loc .. '.lua') then
        print('The file data.lua could not be found in ' .. data_loc .. '.lua')
        os.exit()
    end
    data = require(data_loc)

    -- Load in helper functions for this model defined in End-To-End-Generative-Dialogue
    local model_funcs_loc = "End-To-End-Generative-Dialogue/src/model_functions.lua"
    if not lfs.attributes(model_funcs_loc) then
        print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
        os.exit()
    end
    funcs = loadfile(model_funcs_loc)
    funcs()

    -- Load in the data
    self:load_data()

    -- Setup and build the model
    self:build()

    -- Add remote computers if necessary
    if self.opt.remote then
        parallel.print('Runnings clients remotely')

        -- Open the list of client ip addresses
        local fh,err = io.open("../client_list.txt")
        -- NOTE(review): on a missing client list we return with a
        -- half-initialized server; callers only see the printed warning.
        if err then print("../client_list.txt not found"); return; end

        -- Read the client list line by line
        while true do
            local line = fh:read()
            if line == nil then break end
            local addr = self.opt.username .. '@' .. line
            addr = string.gsub(addr, "\n", "") -- remove line breaks

            -- Add the remote server by ip address
            parallel.addremote( {ip=addr, cores=4, lua=self.opt.torch_path, protocol='ssh -ttq -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
            parallel.print('Adding address ', addr)
        end
    elseif opt.localhost then
        -- Remote clients launched through localhost (useful for debugging
        -- the remote path without real remote machines)
        parallel.print('Running clients through localhost')

        parallel.addremote({ip='localhost', cores=4, lua=self.opt.torch_path, protocol='ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
    end
end

-- Main function that runs the server. Here the child clients are forked off and
-- the code in the 'worker' function is sent to the clients to be run. Once
-- the connection is established, :send() and :receive() are used to pass
-- parameters between the client and the server.
function demo_server:run()
    parallel.print('Forking ', self.opt.n_proc, ' processes')
    parallel.sfork(self.opt.n_proc)
    parallel.print('Forked')

    -- Exec worker code in each process
    parallel.children:exec(worker)
    parallel.print('Finished telling workers to execute')

    -- Send the global parameters to the children (first package; see worker)
    parallel.children:join()
    parallel.print('Sending parameters to children')
    parallel.children:send({cmd = cmd, arg = arg, ext = self.opt.extension})

    -- Get the initialization acknowledgements from the children
    replies = parallel.children:receive()
    parallel.print('Replies from children', replies)

    -- Train the model (train is loaded from model_functions.lua in __init)
    train(self.model, self.criterion, self.train_data, self.valid_data)
    parallel.print('Finished training the model')

    -- Sync/terminate when all workers are done
    parallel.children:join('break')
    parallel.print('All processes terminated')
end

-- Function loads the training and validation data into self.train_data and
-- self.valid_data.
function demo_server:load_data()
    -- Simply calls the load_data function defined in "End-To-End-Generative-Dialogue/src/model_functions.lua"
    self.train_data, self.valid_data, self.opt = load_data(self.opt)
end

-- Function loads the nn model and criterion into self.model and self.criterion
function demo_server:build()
    -- Simply calls the build function defined in "End-To-End-Generative-Dialogue/src/model_functions.lua"
    self.model, self.criterion = build()
end

-- Return the server class
return demo_server
9 | if [ -e "install" ] 10 | then 11 | echo -e "\033[0;32minstall folder exists\033[0m" 12 | else 13 | echo -e "\033[0;34mMaking install repo ...\033[0m" 14 | mkdir install 15 | fi 16 | cd install 17 | 18 | # Ensure that parallel is downloaded and installed with local version 19 | if [ -e "lua---parallel" ] 20 | then 21 | echo -e "\033[0;32mparallel exists\033[0m" 22 | else 23 | echo -e "\033[0;34mCloining Parallel Repo ...\033[0m" 24 | git clone https://github.com/clementfarabet/lua---parallel.git &> /dev/null 25 | fi 26 | 27 | cd lua---parallel 28 | echo -e "\033[0;34mCopying local init.lua file for parallel...\033[0m" 29 | cp ../../lua-lua/parallel/init.lua . 30 | echo -e "\033[0;34mBuilding local version of parallel...\033[0m" 31 | luarocks remove parallel &> /dev/null 32 | luarocks make &> /dev/null 33 | echo -e "\033[0;32mInstall complete\033[0m" 34 | 35 | -------------------------------------------------------------------------------- /lua-lua/locally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/locally.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.154.239.139/ada_4_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_4_rem.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.154.239.139/ada_8_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_8_rem.png -------------------------------------------------------------------------------- 
/lua-lua/outputs/104.197.106.197/ada_2_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.106.197/ada_2_rem.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.222.148/ada_2_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.222.148/ada_2_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.222.148/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 
32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.250.103/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 
32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.250.103/reg_2_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.250.103/reg_2_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.192.196/reg_1_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.192.196/reg_1_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.192.196/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 
6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . 
received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.204.149/ada_1_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.204.149/ada_1_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.204.149/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 
42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/parse_outputs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """Copy files from servers 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | import sys 11 | import os 12 | import re 13 | import numpy as np 14 | import warnings 15 | warnings.filterwarnings("ignore", category=UserWarning) 16 | import matplotlib.pyplot as plt 17 | 18 | 19 | class Print: 20 | def red(self, prt): print("\033[91m{}\033[00m" .format(prt), end="") 21 | def green(self, prt): print("\033[92m{}\033[00m" .format(prt), end="") 22 | def yellow(self, prt): print("\033[93m{}\033[00m" .format(prt), end="") 23 | def lightpurple(self, prt): print("\033[94m{}\033[00m" .format(prt), end="") 24 | def purple(self, prt): print("\033[95m{}\033[00m" .format(prt), end="") 25 | def cyan(self, prt): print("\033[96m{}\033[00m" .format(prt), end="") 26 | def lightgray(self, prt): print("\033[97m{}\033[00m" .format(prt), end="") 27 | def black(self, prt): print("\033[98m{}\033[00m" .format(prt), end="") 28 | 29 | class Result: 30 | def __init__(self, floc): 31 | self.results = [] 32 | self.floc = floc 33 | self.loc_split = floc.split('/') 34 | self.fname = self.loc_split[-1] 35 | self.ip_addr = self.loc_split[-2] 36 | self.no_ext = self.fname.split('.')[0] 37 | self.ada_grad, self.n_proc, self.loc = self.no_ext.split('_') 38 | self.n_proc = int(self.n_proc) 39 | 40 | if 
self.ada_grad == 'ada': 41 | self.ada_grad = 'ada grad SGD' 42 | else: 43 | self.ada_grad = 'simple SGD' 44 | 45 | if self.loc == 'rem': 46 | self.loc = 'remotely' 47 | else: 48 | self.loc = 'locally' 49 | 50 | self.description = '%d processes, %s, running %s' % (self.n_proc, self.ada_grad, self.loc) 51 | 52 | def add_result(self, result): 53 | self.results.append(result) 54 | 55 | def get_data(self, max_epoch, min_t): 56 | return [result.time_ellapse for result in self.results if (max_epoch is None or result.epoch <= max_epoch) and (min_t is None or result.time_ellapse >= min_t)], [np.log(result.perplexity) for result in self.results if (max_epoch is None or result.epoch <= max_epoch) and (min_t is None or result.time_ellapse >= min_t)] 57 | 58 | def graph(self, close = True, out_name = None, max_epoch = None, min_t = None): 59 | times, log_perps = self.get_data(max_epoch, min_t) 60 | 61 | plt.ylabel('Log perplexity') 62 | plt.xlabel('Time (s)') 63 | 64 | plt.title(self.description) 65 | plt.plot(times, log_perps, label = self.description) 66 | 67 | if close: 68 | if out_name == None: 69 | out_name = "/".join(self.loc_split[:-1]) + '/' + self.no_ext + '.png' 70 | 71 | plt.savefig(out_name) 72 | plt.clf() 73 | plt.cla() 74 | plt.close() 75 | 76 | 77 | def display(self): 78 | Print().green('Results for file %s \n' % self.floc) 79 | 80 | Print().lightpurple('Number of processes: ') 81 | print(self.n_proc) 82 | 83 | Print().lightpurple('SGD type: ') 84 | print(self.ada_grad) 85 | 86 | Print().lightpurple('Running location: ') 87 | print(self.loc) 88 | 89 | Print().lightpurple('Server: ') 90 | print(self.ip_addr) 91 | 92 | if len(self.results) == 0: 93 | Print().red('No results\n') 94 | return 95 | 96 | Print().lightpurple('Number of batches: ') 97 | print(self.results[0].n_batch) 98 | 99 | epoch = -1 100 | for result in self.results: 101 | if result.epoch != epoch: 102 | epoch = result.epoch 103 | Print().yellow('Epoch: %d\n' % epoch) 104 | result.display() 105 | 106 
| class DataPoint: 107 | def __init__(self, line): 108 | # Store the line itself 109 | self.line = line 110 | 111 | # The epoch we're on 112 | self.epoch = int(self.clean_match('Epoch: (.*?), Batch:', line)) 113 | 114 | # Current batch, total number of batches, current batchsuze 115 | self.batch_str = self.clean_match('Batch: (.*?), Batch size:', line) 116 | batch_splt = str.split(self.batch_str, '/') 117 | self.batch, self.n_batch = [int(ind) for ind in batch_splt] 118 | self.batch_size = int(self.clean_match('Batch size: (.*?), LR:', line)) 119 | 120 | self.learning_rate = float(self.clean_match('LR: (.*?), PPL: ', line)) 121 | 122 | self.perplexity = float(self.clean_match('PPL: (.*?), |Param|:', line)) 123 | 124 | self.speed = self.clean_match('Training: (.*?) total/source/target', line) 125 | 126 | self.time_ellapse = int(str.split(line)[-1]) 127 | 128 | 129 | def clean_match(self, pattern, string): 130 | res = re.findall(pattern, string) 131 | return filter(lambda x: x != '', res)[0] 132 | def display(self): 133 | args = (self.batch, self.perplexity, self.time_ellapse) 134 | print('Batch: %d, perplexity: %.2f, time: %d\n' % args, end = "") 135 | 136 | class Results: 137 | def __init__(self): 138 | self.results = [] 139 | 140 | def add_result(self, result): 141 | self.results.append(result) 142 | 143 | def graph(self, location = None, max_epoch = None, min_t = None): 144 | for result in self.results: 145 | if location == None or result.loc == location: 146 | result.graph(close = False, max_epoch = max_epoch, min_t = min_t) 147 | if location == None: 148 | out_name = "All.png" 149 | else: 150 | out_name = location + ".png" 151 | plt.title(location) 152 | plt.legend(bbox_to_anchor=(1.05, 1)) 153 | 154 | plt.savefig(out_name) 155 | plt.clf() 156 | plt.cla() 157 | plt.close() 158 | 159 | 160 | 161 | def process_file(path_to_file): 162 | result = Result(path_to_file) 163 | with open(path_to_file) as f: 164 | for line in f: 165 | if 'total/source/target' in line: 
166 | # Parse the line into a DataPoint object 167 | data_point = DataPoint(line) 168 | 169 | # Add the datapoint to the result 170 | result.add_result(data_point) 171 | 172 | result.display() 173 | result.graph() 174 | return result 175 | 176 | 177 | 178 | def main(arguments): 179 | 180 | while True: 181 | print('Copying over files') 182 | # Updating files 183 | os.system('python copy_files.py') 184 | 185 | import time 186 | time.sleep(3) 187 | 188 | # hold the results 189 | results = Results() 190 | 191 | # Get all folders of ip addresses 192 | for ip_fold in os.walk('outputs'): 193 | 194 | # Find the .txt files 195 | for file in os.listdir(ip_fold[0]): 196 | if file.endswith(".txt") and len(file.split('_')) == 3: 197 | 198 | # Full path to the file 199 | full_path = ip_fold[0] + '/' + file 200 | 201 | result = process_file(full_path) 202 | 203 | results.add_result(result) 204 | 205 | results.graph(location = 'locally', max_epoch = 7) 206 | results.graph(location = 'remotely', min_t = 50, max_epoch = 10) 207 | time.sleep(20) 208 | 209 | if __name__ == '__main__': 210 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/remotely.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/remotely.png -------------------------------------------------------------------------------- /lua-lua/server.lua: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------ 2 | -- server.lua 3 | -- 4 | -- A general Distributed SGD Parameter server written in lua/torch 5 | -- 6 | -- The is a general parameter server file. It takes in the command line 7 | -- options that are necessary to launch the server. The server 8 | -- will be of the class: 'server_class'. 
The 'server_class' must 9 | -- be a class defined with two required functions: :__init() and :run() 10 | -- This file will load in the class, creating a new object via 11 | -- the __init() function and then call the run() function inside 12 | -- a protected loop 13 | -- 14 | -- The 'add_to_path' option is a string that will be appended onto the 15 | -- path before requiring the new 'server_class' 16 | -- 17 | -- Run 18 | -- th server.lua --help 19 | -- to see a full list of options for the parameter server 20 | ------------------------------------------------------------------------ 21 |  22 | -- Library used to run clients in parallel 23 | require 'parallel' 24 |  25 | -- Used to update the path variable 26 | require 'package' 27 |  28 | ------------ 29 | -- Options 30 | ------------ 31 |  32 | cmd = torch.CmdLine() 33 |  34 | cmd:text("") 35 | cmd:text("**General options**") 36 | cmd:text("") 37 |  38 | cmd:option('-server_class', 'demo_server', 'Class name to use') 39 | cmd:option('-add_to_path' , './End-To-End-Generative-Dialogue/src/?.lua;', 'A string that will be appended on to the front of the path') 40 |  41 | cmd:text("") 42 | cmd:text("**_____________________________**") 43 | cmd:text("Below are all options specific to models") 44 | cmd:text("**_____________________________**") 45 | cmd:text("") 46 |  47 | cmd:text("") 48 | cmd:text("**Data options**") 49 | cmd:text("") 50 | cmd:option('-data_file', 'data/demo-train.hdf5', 'Path to the training *.hdf5 file') 51 | cmd:option('-val_data_file','data/demo-val.hdf5', 'Path to validation *.hdf5 file') 52 | cmd:option('-save_file', 'demo-seq2seq_lstm', 'Save file name (model will be saved as savefile_epochX_PPL.t7 where X is the X-th epoch and PPL is the validation perplexity') 53 | cmd:option('-train_from', '', 'If training from a checkpoint then this is the path to the pretrained model.') 54 |  55 | cmd:text("") 56 | cmd:text("**Model options**") 57 | cmd:text("") 58 |  59 | cmd:option('-num_layers', 2, 'Number of
layers in the LSTM encoder/decoder') 60 | cmd:option('-hidden_size', 300, 'Size of LSTM hidden states') 61 | cmd:option('-word_vec_size', 300, 'Word embedding sizes') 62 | cmd:option('-layer_type', 'lstm', 'Recurrent layer type (rnn, gru, lstm, fast)') 63 | cmd:option('-model_type', 'red', 'Model structure (red, hred)') 64 | 65 | 66 | cmd:text("") 67 | cmd:text("**Optimization options**") 68 | cmd:text("") 69 | 70 | cmd:option('-num_epochs', 10, 'Number of training epochs') 71 | cmd:option('-start_epoch', 1, 'If loading from a checkpoint, the epoch from which to start') 72 | cmd:option('-param_init', 0.1, 'Parameters are initialized over uniform distribution with support (-param_init, param_init)') 73 | cmd:option('-learning_rate', .01, 'Starting learning rate') 74 | cmd:option('-ada_grad', true, 'When true, update parameters using adagrad algorithm') 75 | cmd:option('-max_grad_norm', 5, 'If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm') 76 | cmd:option('-dropout', 0.3, 'Dropout probability. Dropout is applied between vertical LSTM stacks.') 77 | cmd:option('-lr_decay', 0.5, 'Decay learning rate by this much if (i) perplexity does not decrease on the validation set or (ii) epoch has gone past the start_decay_at_limit') 78 | cmd:option('-start_decay_at', 9, 'Start decay after this epoch') 79 | cmd:option('-fix_word_vecs', 0, 'If = 1, fix lookup table word embeddings') 80 | cmd:option('-beam_k', 5, 'K value to use with beam search') 81 | cmd:option('-max_bleu', 4, 'The number of n-grams used in calculating the bleu score') 82 | cmd:option('-pre_word_vecs', '', 'If a valid path is specified, then this will load pretrained word embeddings (hdf5 file) on the encoder side. See README for specific formatting instructions.') 83 | 84 | cmd:text("") 85 | cmd:text("**Other options**") 86 | cmd:text("") 87 | 88 | -- GPU (not supported on servers) 89 | cmd:option('-gpuid', -1, 'Which gpu to use. 
-1 = use CPU') 90 | cmd:option('-gpuid2', -1, 'If this is >= 0, then the model will use two GPUs whereby the encoder is on the first GPU and the decoder is on the second GPU. This will allow you to train with bigger batches/models.') 91 | 92 | -- Bookkeeping 93 | cmd:option('-save_every', 1, 'Save every this many epochs') 94 | cmd:option('-print_every', 5, 'Print stats after this many batches') 95 | cmd:option('-seed', 3435, 'Seed for random initialization') 96 | 97 | 98 | -- Parallel options 99 | cmd:option('-n_proc', 4, 'The number of processes to farm out') 100 | cmd:option('-remote', false, 'When true, the farmed out processes are run on remote servers. This overrides localhost') 101 | cmd:option('-localhost', false, 'When true, the farmed out processes are run on localhost. ') 102 | 103 | cmd:option('-torch_path', '/Users/michaelfarrell/torch/install/bin/th', 'The path to the torch directory on the client computers') 104 | cmd:option('-extension', '', 'The location from the home directory to the lua-lua folder on the client computer') 105 | cmd:option('-username', 'michaelfarrell', 'The username for connecting used for connecting to remote clients') 106 | 107 | -- Parse arguments 108 | opt = cmd:parse(arg) 109 | torch.manualSeed(opt.seed) 110 | 111 | -- Indicate we are running things in parallel 112 | opt.parallel = true 113 | 114 | -- The print function 115 | opt.print = parallel.print 116 | 117 | -- Add on location to path of new class if not already in path 118 | package.path = opt.add_to_path .. 
package.path 119 | 120 | -- Main server function, initializes and runs 121 | function server_main() 122 | -- Load in the class type 123 | server = require(opt.server_class) 124 | 125 | -- Print from parent process 126 | parallel.print('Im the parent, my ID is: ', parallel.id, ' and my IP: ', parallel.ip) 127 | 128 | -- Create a new server 129 | param_server = server.new(opt) 130 | 131 | -- Run the server 132 | param_server:run() 133 | 134 | end 135 | 136 | -- Protected execution of parllalel script: 137 | ok, err = pcall(server_main) 138 | if not ok then print(err) parallel.close() end 139 | -------------------------------------------------------------------------------- /lua-lua/setup_image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # setup_image.sh 4 | # 5 | # This is a bash script that is used to setup an image on the google cloud server 6 | # it copies over the startup script, runs the script, disconnects and reconnects, 7 | # then reruns the startup script 8 | 9 | # Copy over the startup script 10 | scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey startup.sh $USERNAME@$EXTERNAL_IP:~/ 11 | 12 | # Run the startup script on the server 13 | echo "bash startup.sh" | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 14 | 15 | # Disconnect from the server, reconnect and finish running last things needed for initialization 16 | echo "bash startup.sh; " | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 17 | -------------------------------------------------------------------------------- /lua-lua/startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # gcloud_startup.sh 4 | # 5 | # This is a bash script that is used to setup a google cloud server. 
This script 6 | # will install the following on the server: 7 | # - git 8 | # - luarocks 9 | # - pip 10 | # - torch 11 | # - lua-parallel (local version) 12 | # - rnn (torch) 13 | # - hdf5 (torch) 14 | # - anaconda 15 | # - h5py 16 | # The script will also clone the Distributed-SGD repo onto the server 17 |  18 | # Ensure that git is installed 19 | if hash git &> /dev/null 20 | then 21 |     echo -e "\033[0;32mgit installed\033[0m" 22 | else 23 |     echo -e "\033[0;34mInstalling git ...\033[0m" 24 |     (echo "Y" | sudo apt-get install git) > /dev/null 25 | fi 26 |  27 | # Ensure that luarocks is installed 28 | if hash luarocks &> /dev/null 29 | then 30 |     echo -e "\033[0;32mluarocks installed\033[0m" 31 | else 32 |     echo -e "\033[0;34mInstalling luarocks ...\033[0m" 33 |     (echo "Y" | sudo apt-get install luarocks) &> /dev/null 34 | fi 35 |  36 | # Ensure that pip is installed 37 | if hash pip &> /dev/null 38 | then 39 |     echo -e "\033[0;32mpython-pip installed\033[0m" 40 | else 41 |     echo -e "\033[0;34mInstalling python-pip ...\033[0m" 42 |     (echo "Y" | sudo apt-get install python-pip) > /dev/null 43 | fi 44 |  45 | source ~/.profile 46 |  47 | # Ensure that torch is installed 48 | if hash th &> /dev/null 49 | then 50 |     echo -e "\033[0;32mtorch installed\033[0m" 51 | else 52 |     echo -e "\033[0;34mInstalling torch ...\033[0m" 53 |     git clone https://github.com/torch/distro.git ~/torch --recursive &> /dev/null 54 |     cd ~/torch 55 |     bash install-deps > /dev/null 2>&1 56 |     echo "yes" | ./install.sh > /dev/null 2>&1 57 |     cd ..
58 | source ~/.profile 59 | fi 60 | 61 | # Ensure that rnn is installed 62 | if (luarocks list | grep -q rnn) &> /dev/null 63 | then 64 | echo -e "\033[0;32mrnn installed\033[0m" 65 | else 66 | echo -e "\033[0;34mInstalling rnn ...\033[0m" 67 | luarocks install rnn &> /dev/null 68 | fi 69 | 70 | # Ensure that torch-hdf5 is installed 71 | if (luarocks list | grep -q hdf5) &> /dev/null 72 | then 73 | echo -e "\033[0;32mhdf5 installed\033[0m" 74 | else 75 | echo -e "\033[0;34mInstalling hdf5 ...\033[0m" 76 | echo "Y" | sudo apt-get install libhdf5-serial-dev hdf5-tools > /dev/null 77 | git clone https://github.com/deepmind/torch-hdf5.git &> /dev/null 78 | cd torch-hdf5 79 | luarocks make hdf5-0-0.rockspec &> /dev/null 80 | cd .. 81 | fi 82 | 83 | # Make sure that the Distributed SGD is downloaded and isntalled 84 | if [ -e "Distributed-SGD" ] 85 | then 86 | # Update the repos 87 | echo -e "\033[0;34mPulling Distributed-SGD repo changes ...\033[0m" 88 | cd Distributed-SGD 89 | git pull &> /dev/null 90 | cd lua-lua/End-To-End-Generative-Dialogue 91 | echo -e "\033[0;34mPulling End-To-End-Generative-Dialogue repo changes ...\033[0m" 92 | git pull origin master &> /dev/null 93 | 94 | cd ../../.. 
95 | else 96 |     # Clone repo and install parallel 97 |     echo -e "\033[0;34mCloning repo Distributed-SGD ...\033[0m" 98 |     git clone --recursive https://github.com/michaelfarrell76/Distributed-SGD.git &> /dev/null 99 |     cd Distributed-SGD/lua-lua 100 |     bash install_parallel.sh 101 |     cd ../../ 102 | fi 103 |  104 | # Ensure that anaconda is installed 105 | if [ -e "anaconda2" ] 106 | then 107 |     echo -e "\033[0;32manaconda installed\033[0m" 108 |     echo -e "\033[0;34mInstalling h5py ...\033[0m" 109 |  110 |     # Install hdf5 for python 111 |     echo "y" | conda install h5py &> /dev/null 112 | else 113 |     echo -e "\033[0;34mDownloading anaconda ...\033[0m" 114 |     wget http://repo.continuum.io/archive/Anaconda2-4.0.0-Linux-x86_64.sh &> /dev/null 115 |     echo -e "\033[0;34mInstalling anaconda ...\033[0m" 116 |     bash Anaconda2-4.0.0-Linux-x86_64.sh -b > /dev/null 117 |     rm Anaconda2-4.0.0-Linux-x86_64.sh 118 |     echo 'export PATH="/home/michaelfarrell/anaconda2/bin:$PATH"' >> .bashrc 119 |     echo -e "\033[0;33mIn order for python to be run, you must logout and log back in\033[0m" 120 | fi 121 |  122 |  -------------------------------------------------------------------------------- /python-python/README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD for Python 2 | An implementation of distributed stochastic gradient descent in python. Clients can be local and remote. For this task, you can download the data from http://www.vision.caltech.edu/Image_Datasets/Caltech101/. 3 |  4 | ## Requirements 5 |  6 | This code is written entirely in Python, and an installation of gRPC, Numpy, Scipy, and Autograd are necessary. These packages can be easily installed through PIP using the following commands. 7 |  8 | ```bash 9 | $ pip install numpy 10 | $ pip install scipy 11 | $ pip install autograd 12 | $ pip install grpcio 13 | ``` 14 |  15 | For launching the code remotely, we will be working with Google Cloud Compute.
In order to interact with GCloud instances, please install the GCloud sdk. This is located here: https://cloud.google.com/sdk/. 16 | 17 | ## Directory Table of Contents 18 | ``` 19 | . 20 | ├── 101_ObjectCategories # Folder holding the raw data from the 101_ObjectCategories 21 | |-- data # Folder holding the processed data 22 | ├── client.py # Python script used to initiate a client 23 | |── server.py # Python script to manually initiate a server 24 | ├── dist_sgd_pb2.py # Automatically compiled protobufs for the parameter server 25 | ├── README.md # Python usage 26 | ├── images(16).npy # Extremely small dataset included for reference 27 | ├── output_labels(16).npy # Classifications of each image for the extremely small dataset 28 | ├── nnet # Folder that includes a module for a convolution neural net 29 | ├── protobuf_utils # Folder that includes utilities for manipulating tensor protobuffers 30 | ├── run_codegen.sh # Shell command used to generates the protobuffers 31 | └── start.sh # Script that launches client.py on when running within gCloud 32 | ``` 33 | 34 | ## Description 35 | 36 | ## Local Usage Instructions 37 | To launch clients locally, in three different terminals, simply run: 38 | ```bash 39 | $ python client.py --id 1 40 | $ python client.py --id 2 41 | $ python client.py --id 3 42 | ``` 43 | 44 | #### Remote Usage Instructions 45 | 46 | ##### Create a baseline startup image 47 | 48 | We only have to setup and install everything once, after which we can clone that image repeatedly when we launch VMs. 49 | 50 | ###### Create the image 51 | - Click on the 'VM Instances' tab 52 | - Create Instance 53 | - Give the instance a name i.e. 
'train-conv-nn' 54 | - Set the zone to us-central1-b 55 | - Choose 2vCPU highmem as machine type 56 | - Under boot disk click change 57 | - Choose Ubuntu 14.04 LTS 58 | - At the bottom change size to 30 GB and click 'select' 59 | - Allow HTTP traffic 60 | - Allow HTTPS traffic 61 | - Click 'Management, disk, networking, SSH keys' to dropdown more options 62 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted' 63 | - Click 'Create' and you should see your new instance listed in the table 64 |  65 | ###### Setup the disk 66 | - Run the command gcloud init and log into your Google Cloud account 67 | - Run the command to SSH into your instance: 68 | ```bash 69 | $ gcloud compute ssh train-conv-nn --zone us-central1-b 70 | ``` 71 | - After logging in, we can clone the repository and install the necessary requirements. 72 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard 73 | - Go to the 'VM Dashboard' 74 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted. 75 |  76 | ###### Create the image 77 |  78 | - Click on the 'Images' tab 79 | - 'Create Image' 80 | - Give it a name i.e. 'train-conv-image' 81 | - Under Source-Disk, choose the disk that you just created 82 | - Create 83 |  84 | ##### Generate an 'Instance Template' 85 | - Click on the 'Instance templates' tab 86 | - Create new 87 | - Name the template i.e. 'train-conv-template' 88 | - Under 'Boot Disk' click change 89 | - At the top click 'Your image' 90 | - Choose the image you just created i.e.
'train-conv-image' 91 | - Set size to 30 GB 92 | - Select 93 | - Allow HTTP traffic 94 | - Allow HTTPS traffic 95 | - Under more->Management, include cd ~/distributed-sgd/python-python; sh start.sh 96 | in startup script 97 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted' 98 | - Create 99 | 100 | ##### Generate an 'Instance Group' 101 | - Go to the "Instance groups" tab 102 | - Create instance group 103 | - Give the group a name, i.e. 'train-conv-group' 104 | - Give a description 105 | - Set zone to us-central1-b 106 | - Use instance template 107 | - Choose the template you just made i.e. 'train-conv-template' 108 | - Set the number of instances 109 | - Create 110 | - Wait for the instances to launch 111 | - Once there is a green checkmark, click on the new instance 112 | 113 | All instances in the instance group are now running the python client.py command and will begin training. 114 | SSH into any of the instances to see their progress. 115 | 116 | ## Acknowledgments 117 | 118 | Our implementation adapts code for the convolutional neural net from the Autograd convolution neural net example: 119 | 120 | * [Autograd](https://github.com/HIPS/autograd) -------------------------------------------------------------------------------- /python-python/client.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------ 2 | # Implements a client that runs backpropogation on batches 3 | # provided by the server. If no server exists, then Paxos 4 | # is called to generate a server. 
5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | from grpc.beta import implementations 10 | import time 11 | import sys 12 | 13 | import dist_sgd_pb2 14 | import argparse 15 | import traceback 16 | 17 | import autograd.numpy as np 18 | import autograd.numpy.random as npr 19 | from autograd import grad 20 | 21 | from nnet.neural_net import * 22 | from protobuf_utils.utils import * 23 | from server_utils.utils import * 24 | 25 | from server import serve 26 | from paxos import run_paxos 27 | import subprocess 28 | 29 | 30 | # Loads in a really small version of the data that could fit in Github. 31 | # It will train extremely quickly as a result. 32 | images_fname = 'data/images(16).npy' 33 | labels_fname = 'data/output_labels(16).npy' 34 | 35 | _TIMEOUT_SECONDS = 20 36 | TENSOR_TIMEOUT_SECONDS = 60 37 | SERVER_PORT = 50051 38 | 39 | # Loops through all possible addressses that are part of the instance 40 | # group if this is launched on a remote server. Loops through all possible 41 | # addresses that are part of the local server as well. 
42 | # Determines whether or not a server exists by trying to connect with the 43 | # a predefined port on the server 44 | def find_server(local_id=None): 45 | TOT_ATTEMPTS = 1 46 | for i in range(TOT_ATTEMPTS): 47 | # Generates local address information 48 | local_address = gen_local_address(local_id) 49 | server_addresses = gen_server_addresses(local_id, local_address) 50 | server_addresses.remove(local_address) 51 | 52 | # Loops through all the servers and tries to makes the server stub 53 | for server_address in server_addresses: 54 | if local_id is not None: 55 | channel = implementations.insecure_channel('localhost', SERVER_PORT) 56 | else: 57 | channel = implementations.insecure_channel(server_address, SERVER_PORT) 58 | stub = dist_sgd_pb2.beta_create_ParamFeeder_stub(channel) 59 | try: 60 | # Attempts to ping the server to see if the port is open 61 | response = stub.ping(dist_sgd_pb2.empty(), _TIMEOUT_SECONDS) 62 | 63 | # If the PING succeeds, then it is the server 64 | return server_address 65 | 66 | except Exception as e: 67 | # Log any network or expiration errors we run into 68 | if ('ExpirationError' in str(e) or 'NetworkError' in str(e)): 69 | log_info(str(e)) 70 | continue 71 | else: 72 | # More severe error, should log and crash 73 | traceback.print_exc() 74 | sys.exit(1) 75 | time.sleep(1 * TOT_ATTEMPTS) 76 | return '' 77 | 78 | # After determining the correct server, generate the stub for it 79 | def connect_server_stub(server_addr, local_id): 80 | if local_id is not None: 81 | channel = implementations.insecure_channel('localhost', SERVER_PORT) 82 | else: 83 | channel = implementations.insecure_channel(server_addr, SERVER_PORT) 84 | stub = dist_sgd_pb2.beta_create_ParamFeeder_stub(channel) 85 | return stub 86 | 87 | 88 | # Main function of the client that loops forever. Receieves parameters and 89 | # batch information from the server. 
# Main function of the client that loops forever. Receives parameters and
# batch information from the server. Calculates gradients and sends them
# to the server.
def run(local_id=None):
    """Client main loop.

    Discovers the parameter server (via Paxos, falling back to pinging all
    known addresses), then repeatedly: fetches the current weight vector,
    computes the gradient for the batch the server assigned, and streams
    the gradient back. If this process is elected server it transforms
    into one instead. Returns when the server signals completion
    (data_indx == -2) or after a server failover; the caller re-invokes it.
    """
    # Load and process Caltech data
    train_images, train_labels, test_images, test_labels = load_caltech100(images_fname, labels_fname)
    image_input_d = train_images.shape[1]

    # Network parameters
    layer_sizes = [image_input_d, 800, 600, 400, 350, 250, 101]
    L2_reg = 1.0

    # Training parameters (param_scale/momentum/num_epochs mirror the
    # server-side configuration; only batch_size is used on the client)
    param_scale = 0.1
    momentum = 0.9
    batch_size = 256
    num_epochs = 50

    # Make neural net functions
    N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    loss_grad = grad(loss_fun)

    # Batch index slices into the training set; the server refers to
    # batches by index into this list
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    cur_dir = np.zeros(N_weights)

    # Previous batch for the purpose of timing
    prev_data_indx = -1

    # Number of consecutive expirations, used to detect server failure
    consec_expiration = 0

    # Most recent parameter vector received from the server. Initialized so
    # the failover path below cannot hit a NameError when a failure occurs
    # before the first successful parameter fetch.
    W = None

    # Determine the server address by running Paxos or pinging all addresses
    server_addr = ''
    while server_addr == '':
        server_addr = run_paxos(local_id)
        if server_addr == '':
            server_addr = find_server(local_id)
    log_info('Server address is ' + server_addr)

    # If this client is selected to be server, then transform into a server
    if server_addr == gen_local_address(local_id):
        log_info('Transforming into the server')
        try:
            serve(server_addr, None, prev_data_indx, local_id)
        except KeyboardInterrupt:
            log_info('interrupted')
            sys.exit(0)
        return

    # Generates the server stub and connects with it
    stub = connect_server_stub(server_addr, local_id)
    client_id = 0

    log_info('Data loaded and connected to server:')

    try:
        # Gets the next batch that it should run
        response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
        # data_indx == -2 is the server's "training finished" sentinel
        while response.data_indx != -2:
            client_id = response.client_id
            # data_indx == -1 means no batch is available yet; poll until
            # the server hands us our first batch
            while response.data_indx == -1:
                time.sleep(5)
                log_info('Waiting for server to send next batch')
                response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
            log_info('Processing parameters in batch %d!' % response.data_indx)

            # Reassemble the parameter vector, which arrives as a stream
            # of serialized sub-tensor chunks
            get_parameters_time = time.time()
            W_bytes = ''
            W_subtensors_iter = stub.SendParams(dist_sgd_pb2.ClientInfo(client_id=client_id), TENSOR_TIMEOUT_SECONDS)
            for W_subtensor_pb in W_subtensors_iter:
                W_bytes = W_bytes + W_subtensor_pb.tensor_content
            W = convert_bytes_to_array(W_bytes)
            log_info('Received parameters in {0:.2f}s'.format(time.time() - get_parameters_time))

            # Calculate the gradients
            grad_start = time.time()
            grad_W = loss_grad(W, train_images[batch_idxs[response.data_indx]], train_labels[batch_idxs[response.data_indx]])
            log_info('Done calculating gradients in {0:.2f}s'.format(time.time() - grad_start))

            # Serialize the gradients
            tensor_compress_start = time.time()
            tensor_bytes = convert_array_to_bytes(grad_W)
            tensor_iterator = convert_tensor_iter(tensor_bytes, response.data_indx)
            log_info('Done compressing gradients in {0:.2f}s'.format(time.time() - tensor_compress_start))

            # Send the gradients
            send_grad_start = time.time()
            stub.GetUpdates(tensor_iterator, _TIMEOUT_SECONDS)
            log_info('Done sending gradients through in {0:.2f}s'.format(time.time() - send_grad_start))

            # Get the next batch to process
            prev_data_indx = response.data_indx
            response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)

            # A full round trip succeeded, so the server is alive again
            consec_expiration = 0
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        if 'ExpirationError' in str(e) or 'NetworkError' in str(e):
            # Number of consecutive timeouts tolerated before we declare
            # the server dead
            SERVER_CONSEC_FAILURE = 2
            # Count the failures of the server
            consec_expiration += 1

            # If consecutive failures exceed a predefined value, then we look
            # for the server by pinging available instances or by restarting
            # Paxos
            if consec_expiration == SERVER_CONSEC_FAILURE:
                log_info('Failure to connect to server_stub. Starting Paxos')
                # Bug fix: clear the stale address first. Previously
                # server_addr still held the dead server's address, so the
                # discovery loop below was skipped entirely and the client
                # never re-ran Paxos or re-pinged the group.
                server_addr = ''
                while server_addr == '':
                    server_addr = run_paxos(local_id)
                    if server_addr == '':
                        server_addr = find_server(local_id)
                # Becomes the server if it is chosen to be the server,
                # seeding it with the last parameters we received
                # (None if we never completed a fetch)
                if server_addr == gen_local_address(local_id):
                    serve(server_addr, W, prev_data_indx, local_id)
                    return
                # Bug fix: connect_server_stub requires local_id as well;
                # the old call raised TypeError on every reconnect
                stub = connect_server_stub(server_addr, local_id)
        else:
            log_info(traceback.print_exc())
            sys.exit(0)

if __name__ == '__main__':
    log_info('Starting client')
    parser = argparse.ArgumentParser()
    parser.add_argument('--id')
    args = parser.parse_args()

    # Local id is only used if running the machine locally
    local_id = args.id
    if local_id is not None:
        local_id = int(local_id)
        assert(local_id > 0)
    while True:
        run(local_id)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/data/output_labels(16).npy -------------------------------------------------------------------------------- /python-python/dist_sgd_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: dist_sgd.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='dist_sgd.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0e\x64ist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 
\x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _SUBTENSOR = _descriptor.Descriptor( 30 | name='SubTensor', 31 | full_name='dist_sgd.SubTensor', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0, 38 | number=1, type=5, cpp_type=1, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1, 45 | number=2, type=5, cpp_type=1, label=1, 46 | has_default_value=False, default_value=0, 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2, 52 | number=3, type=12, cpp_type=9, label=1, 53 | has_default_value=False, default_value=_b(""), 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor( 58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3, 59 | number=4, type=5, cpp_type=1, label=1, 60 | has_default_value=False, default_value=0, 61 | message_type=None, enum_type=None, containing_type=None, 62 | is_extension=False, 
extension_scope=None, 63 | options=None), 64 | ], 65 | extensions=[ 66 | ], 67 | nested_types=[], 68 | enum_types=[ 69 | ], 70 | options=None, 71 | is_extendable=False, 72 | syntax='proto3', 73 | extension_ranges=[], 74 | oneofs=[ 75 | ], 76 | serialized_start=28, 77 | serialized_end=124, 78 | ) 79 | 80 | 81 | _CLIENTINFO = _descriptor.Descriptor( 82 | name='ClientInfo', 83 | full_name='dist_sgd.ClientInfo', 84 | filename=None, 85 | file=DESCRIPTOR, 86 | containing_type=None, 87 | fields=[ 88 | _descriptor.FieldDescriptor( 89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0, 90 | number=1, type=5, cpp_type=1, label=1, 91 | has_default_value=False, default_value=0, 92 | message_type=None, enum_type=None, containing_type=None, 93 | is_extension=False, extension_scope=None, 94 | options=None), 95 | ], 96 | extensions=[ 97 | ], 98 | nested_types=[], 99 | enum_types=[ 100 | ], 101 | options=None, 102 | is_extendable=False, 103 | syntax='proto3', 104 | extension_ranges=[], 105 | oneofs=[ 106 | ], 107 | serialized_start=126, 108 | serialized_end=157, 109 | ) 110 | 111 | 112 | _STATUSCODE = _descriptor.Descriptor( 113 | name='StatusCode', 114 | full_name='dist_sgd.StatusCode', 115 | filename=None, 116 | file=DESCRIPTOR, 117 | containing_type=None, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='status', full_name='dist_sgd.StatusCode.status', index=0, 121 | number=1, type=5, cpp_type=1, label=1, 122 | has_default_value=False, default_value=0, 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=159, 139 | serialized_end=187, 140 | ) 141 | 142 | 143 | _PREVBATCH = _descriptor.Descriptor( 144 | name='PrevBatch', 145 | 
full_name='dist_sgd.PrevBatch', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0, 152 | number=1, type=5, cpp_type=1, label=1, 153 | has_default_value=False, default_value=0, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | _descriptor.FieldDescriptor( 158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1, 159 | number=2, type=5, cpp_type=1, label=1, 160 | has_default_value=False, default_value=0, 161 | message_type=None, enum_type=None, containing_type=None, 162 | is_extension=False, extension_scope=None, 163 | options=None), 164 | ], 165 | extensions=[ 166 | ], 167 | nested_types=[], 168 | enum_types=[ 169 | ], 170 | options=None, 171 | is_extendable=False, 172 | syntax='proto3', 173 | extension_ranges=[], 174 | oneofs=[ 175 | ], 176 | serialized_start=189, 177 | serialized_end=243, 178 | ) 179 | 180 | 181 | _NEXTBATCH = _descriptor.Descriptor( 182 | name='NextBatch', 183 | full_name='dist_sgd.NextBatch', 184 | filename=None, 185 | file=DESCRIPTOR, 186 | containing_type=None, 187 | fields=[ 188 | _descriptor.FieldDescriptor( 189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0, 190 | number=1, type=5, cpp_type=1, label=1, 191 | has_default_value=False, default_value=0, 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | _descriptor.FieldDescriptor( 196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1, 197 | number=2, type=5, cpp_type=1, label=1, 198 | has_default_value=False, default_value=0, 199 | message_type=None, enum_type=None, containing_type=None, 200 | is_extension=False, extension_scope=None, 201 | options=None), 202 | ], 203 | extensions=[ 204 | ], 205 | 
nested_types=[], 206 | enum_types=[ 207 | ], 208 | options=None, 209 | is_extendable=False, 210 | syntax='proto3', 211 | extension_ranges=[], 212 | oneofs=[ 213 | ], 214 | serialized_start=245, 215 | serialized_end=294, 216 | ) 217 | 218 | 219 | _EMPTY = _descriptor.Descriptor( 220 | name='empty', 221 | full_name='dist_sgd.empty', 222 | filename=None, 223 | file=DESCRIPTOR, 224 | containing_type=None, 225 | fields=[ 226 | ], 227 | extensions=[ 228 | ], 229 | nested_types=[], 230 | enum_types=[ 231 | ], 232 | options=None, 233 | is_extendable=False, 234 | syntax='proto3', 235 | extension_ranges=[], 236 | oneofs=[ 237 | ], 238 | serialized_start=296, 239 | serialized_end=303, 240 | ) 241 | 242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR 243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO 244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE 245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH 246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH 247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 248 | 249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict( 250 | DESCRIPTOR = _SUBTENSOR, 251 | __module__ = 'dist_sgd_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor) 253 | )) 254 | _sym_db.RegisterMessage(SubTensor) 255 | 256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict( 257 | DESCRIPTOR = _CLIENTINFO, 258 | __module__ = 'dist_sgd_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo) 260 | )) 261 | _sym_db.RegisterMessage(ClientInfo) 262 | 263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict( 264 | DESCRIPTOR = _STATUSCODE, 265 | __module__ = 'dist_sgd_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode) 267 | )) 268 | _sym_db.RegisterMessage(StatusCode) 269 | 270 | PrevBatch = 
_reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict( 271 | DESCRIPTOR = _PREVBATCH, 272 | __module__ = 'dist_sgd_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch) 274 | )) 275 | _sym_db.RegisterMessage(PrevBatch) 276 | 277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict( 278 | DESCRIPTOR = _NEXTBATCH, 279 | __module__ = 'dist_sgd_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch) 281 | )) 282 | _sym_db.RegisterMessage(NextBatch) 283 | 284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 285 | DESCRIPTOR = _EMPTY, 286 | __module__ = 'dist_sgd_pb2' 287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 288 | )) 289 | _sym_db.RegisterMessage(empty) 290 | 291 | 292 | DESCRIPTOR.has_options = True 293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 294 | import abc 295 | import six 296 | from grpc.beta import implementations as beta_implementations 297 | from grpc.framework.common import cardinality 298 | from grpc.framework.interfaces.face import utilities as face_utilities 299 | 300 | class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)): 301 | """""" 302 | @abc.abstractmethod 303 | def SendParams(self, request, context): 304 | raise NotImplementedError() 305 | @abc.abstractmethod 306 | def SendNextBatch(self, request, context): 307 | raise NotImplementedError() 308 | @abc.abstractmethod 309 | def GetUpdates(self, request_iterator, context): 310 | raise NotImplementedError() 311 | @abc.abstractmethod 312 | def ping(self, request, context): 313 | raise NotImplementedError() 314 | 315 | class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)): 316 | """The interface to which stubs will conform.""" 317 | @abc.abstractmethod 318 | def SendParams(self, request, timeout): 319 | raise NotImplementedError() 320 | 
@abc.abstractmethod 321 | def SendNextBatch(self, request, timeout): 322 | raise NotImplementedError() 323 | SendNextBatch.future = None 324 | @abc.abstractmethod 325 | def GetUpdates(self, request_iterator, timeout): 326 | raise NotImplementedError() 327 | GetUpdates.future = None 328 | @abc.abstractmethod 329 | def ping(self, request, timeout): 330 | raise NotImplementedError() 331 | ping.future = None 332 | 333 | def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 334 | import dist_sgd_pb2 335 | import dist_sgd_pb2 336 | import dist_sgd_pb2 337 | import dist_sgd_pb2 338 | import dist_sgd_pb2 339 | import dist_sgd_pb2 340 | import dist_sgd_pb2 341 | import dist_sgd_pb2 342 | request_deserializers = { 343 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.FromString, 344 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.FromString, 345 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.FromString, 346 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString, 347 | } 348 | response_serializers = { 349 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.SerializeToString, 350 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.SerializeToString, 351 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.SerializeToString, 352 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString, 353 | } 354 | method_implementations = { 355 | ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates), 356 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch), 357 | ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams), 358 | ('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 359 | } 360 | server_options = 
beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 361 | return beta_implementations.server(method_implementations, options=server_options) 362 | 363 | def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 364 | import dist_sgd_pb2 365 | import dist_sgd_pb2 366 | import dist_sgd_pb2 367 | import dist_sgd_pb2 368 | import dist_sgd_pb2 369 | import dist_sgd_pb2 370 | import dist_sgd_pb2 371 | import dist_sgd_pb2 372 | request_serializers = { 373 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.SerializeToString, 374 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.SerializeToString, 375 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.SerializeToString, 376 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString, 377 | } 378 | response_deserializers = { 379 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.FromString, 380 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.FromString, 381 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.FromString, 382 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString, 383 | } 384 | cardinalities = { 385 | 'GetUpdates': cardinality.Cardinality.STREAM_UNARY, 386 | 'SendNextBatch': cardinality.Cardinality.UNARY_UNARY, 387 | 'SendParams': cardinality.Cardinality.UNARY_STREAM, 388 | 'ping': cardinality.Cardinality.UNARY_UNARY, 389 | } 390 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 391 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, 
options=stub_options) 392 | # @@protoc_insertion_point(module_scope) 393 | -------------------------------------------------------------------------------- /python-python/image_classes.txt: -------------------------------------------------------------------------------- 1 | 0,accordion 2 | 1,airplanes 3 | 2,anchor 4 | 3,ant 5 | 4,BACKGROUND_Google 6 | 5,barrel 7 | 6,bass 8 | 7,beaver 9 | 8,binocular 10 | 9,bonsai 11 | 10,brain 12 | 11,brontosaurus 13 | 12,buddha 14 | 13,butterfly 15 | 14,camera 16 | 15,cannon 17 | 16,car_side 18 | 17,ceiling_fan 19 | 18,cellphone 20 | 19,chair 21 | 20,chandelier 22 | 21,cougar_body 23 | 22,cougar_face 24 | 23,crab 25 | 24,crayfish 26 | 25,crocodile 27 | 26,crocodile_head 28 | 27,cup 29 | 28,dalmatian 30 | 29,dollar_bill 31 | 30,dolphin 32 | 31,dragonfly 33 | 32,electric_guitar 34 | 33,elephant 35 | 34,emu 36 | 35,euphonium 37 | 36,ewer 38 | 37,Faces 39 | 38,Faces_easy 40 | 39,ferry 41 | 40,flamingo 42 | 41,flamingo_head 43 | 42,garfield 44 | 43,gerenuk 45 | 44,gramophone 46 | 45,grand_piano 47 | 46,hawksbill 48 | 47,headphone 49 | 48,hedgehog 50 | 49,helicopter 51 | 50,ibis 52 | 51,inline_skate 53 | 52,joshua_tree 54 | 53,kangaroo 55 | 54,ketch 56 | 55,lamp 57 | 56,laptop 58 | 57,Leopards 59 | 58,llama 60 | 59,lobster 61 | 60,lotus 62 | 61,mandolin 63 | 62,mayfly 64 | 63,menorah 65 | 64,metronome 66 | 65,minaret 67 | 66,Motorbikes 68 | 67,nautilus 69 | 68,octopus 70 | 69,okapi 71 | 70,pagoda 72 | 71,panda 73 | 72,pigeon 74 | 73,pizza 75 | 74,platypus 76 | 75,pyramid 77 | 76,revolver 78 | 77,rhino 79 | 78,rooster 80 | 79,saxophone 81 | 80,schooner 82 | 81,scissors 83 | 82,scorpion 84 | 83,sea_horse 85 | 84,snoopy 86 | 85,soccer_ball 87 | 86,stapler 88 | 87,starfish 89 | 88,stegosaurus 90 | 89,stop_sign 91 | 90,strawberry 92 | 91,sunflower 93 | 92,tick 94 | 93,trilobite 95 | 94,umbrella 96 | 95,watch 97 | 96,water_lilly 98 | 97,wheelchair 99 | 98,wild_cat 100 | 99,windsor_chair 101 | 100,wrench 102 | 101,yin_yang 103 | 
-------------------------------------------------------------------------------- /python-python/neural_net.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from scipy.ndimage import imread 4 | from scipy.misc import imresize 5 | from sklearn.cross_validation import train_test_split 6 | from os import listdir 7 | from os.path import isfile, join 8 | import autograd.numpy as np 9 | import autograd.numpy.random as npr 10 | from autograd.scipy.misc import logsumexp 11 | from autograd import grad 12 | from autograd.util import quick_grad_check 13 | import gc 14 | import resource 15 | from sys import getsizeof 16 | import os 17 | import psutil 18 | 19 | # {0: 'accordion', 1: 'airplanes', 2: 'anchor', 3: 'ant', 4: 'BACKGROUND_Google', 5: 'barrel', 6: 'bass', 7: 'beaver', 8: 'binocular', 9: 'bonsai', 10: 'brain', 11: 'brontosaurus', 12: 'buddha', 13: 'butterfly', 14: 'camera', 15: 'cannon', 16: 'car_side', 17: 'ceiling_fan', 18: 'cellphone', 19: 'chair', 20: 'chandelier', 21: 'cougar_body', 22: 'cougar_face', 23: 'crab', 24: 'crayfish', 25: 'crocodile', 26: 'crocodile_head', 27: 'cup', 28: 'dalmatian', 29: 'dollar_bill', 30: 'dolphin', 31: 'dragonfly', 32: 'electric_guitar', 33: 'elephant', 34: 'emu', 35: 'euphonium', 36: 'ewer', 37: 'Faces', 38: 'Faces_easy', 39: 'ferry', 40: 'flamingo', 41: 'flamingo_head', 42: 'garfield', 43: 'gerenuk', 44: 'gramophone', 45: 'grand_piano', 46: 'hawksbill', 47: 'headphone', 48: 'hedgehog', 49: 'helicopter', 50: 'ibis', 51: 'inline_skate', 52: 'joshua_tree', 53: 'kangaroo', 54: 'ketch', 55: 'lamp', 56: 'laptop', 57: 'Leopards', 58: 'llama', 59: 'lobster', 60: 'lotus', 61: 'mandolin', 62: 'mayfly', 63: 'menorah', 64: 'metronome', 65: 'minaret', 66: 'Motorbikes', 67: 'nautilus', 68: 'octopus', 69: 'okapi', 70: 'pagoda', 71: 'panda', 72: 'pigeon', 73: 'pizza', 74: 'platypus', 75: 'pyramid', 76: 'revolver', 77: 'rhino', 
78: 'rooster', 79: 'saxophone', 80: 'schooner', 81: 'scissors', 82: 'scorpion', 83: 'sea_horse', 84: 'snoopy', 85: 'soccer_ball', 86: 'stapler', 87: 'starfish', 88: 'stegosaurus', 89: 'stop_sign', 90: 'strawberry', 91: 'sunflower', 92: 'tick', 93: 'trilobite', 94: 'umbrella', 95: 'watch', 96: 'water_lilly', 97: 'wheelchair', 98: 'wild_cat', 99: 'windsor_chair', 100: 'wrench', 101: 'yin_yang'} 20 | 21 | images_fname = 'images(128).npy' 22 | output_labels_fname = 'output_labels(128).npy' 23 | 24 | def make_nn_funs(layer_sizes, L2_reg): 25 | shapes = zip(layer_sizes[:-1], layer_sizes[1:]) 26 | N = sum((m+1)*n for m, n in shapes) 27 | 28 | def unpack_layers(W_vect): 29 | for m, n in shapes: 30 | yield W_vect[:m*n].reshape((m,n)), W_vect[m*n:m*n+n] 31 | W_vect = W_vect[(m+1)*n:] 32 | 33 | def predictions(W_vect, inputs): 34 | for W, b in unpack_layers(W_vect): 35 | outputs = np.dot(inputs, W) + b 36 | inputs = np.tanh(outputs) 37 | return outputs - logsumexp(outputs, axis=1, keepdims=True) 38 | 39 | def loss(W_vect, X, T): 40 | log_prior = -L2_reg * np.dot(W_vect, W_vect) 41 | log_lik = np.sum(predictions(W_vect, X) * T) 42 | return - log_prior - log_lik 43 | 44 | def frac_err(W_vect, X, T): 45 | return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1)) 46 | 47 | return N, predictions, loss, frac_err 48 | 49 | def convert_bw_to_rgb(im): 50 | im.resize((im.shape[0], im.shape[1], 1)) 51 | return np.repeat(im.astype(np.uint8), 3, 2) 52 | 53 | def standarizeImage(im): 54 | if len(im.shape) < 3: 55 | im = convert_bw_to_rgb(im) 56 | im = np.array(im, 'float32') 57 | if im.shape[0] != 64: 58 | im = imresize(im, (64, 64, 3)) 59 | if np.amax(im) > 1.1: 60 | im = im / 255.0 61 | assert((np.amax(im) > 0.01) & (np.amax(im) <= 1)) 62 | assert((np.amin(im) >= 0.00)) 63 | return im 64 | 65 | def gen_data(): 66 | category_paths = [f for f in listdir('101_ObjectCategories/')] 67 | image_paths = [f for f in listdir('101_ObjectCategories/menorah/') if 
isfile(join('101_ObjectCategories/menorah/', f))] 68 | 69 | images = [] 70 | output_labels = [] 71 | # Include all categories with mappings to the integer representing the category 72 | categories_dict = {} 73 | 74 | category = 0 75 | for category_path in category_paths: 76 | image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')] 77 | for image_path in image_paths: 78 | im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path)) 79 | if im.shape == (64, 64, 3): 80 | images.append(im) 81 | output_labels.append(category) 82 | categories_dict[category] = category_path 83 | category = category + 1 84 | 85 | images = np.array(images) 86 | partial_flatten = lambda x : np.reshape(x, (x.shape[0], np.prod(x.shape[1:]))) 87 | images = partial_flatten(images) 88 | 89 | np.save('images.npy', images) 90 | np.save('output_labels.npy', output_labels) 91 | 92 | def make_batches(N_data, batch_size): 93 | return [slice(i, min(i+batch_size, N_data)) 94 | for i in range(0, N_data, batch_size)] 95 | 96 | def load_caltech100(): 97 | # gen_data() 98 | one_hot = lambda x, K: np.array(x[:,None] == np.arange(K)[None, :], dtype=int) 99 | images = np.load(images_fname) 100 | output_labels = np.load(output_labels_fname) 101 | train_images, valid_images, train_labels, valid_labels = train_test_split(images, output_labels, test_size=0.20, random_state=1729) 102 | train_labels = one_hot(train_labels, 101) 103 | valid_labels = one_hot(valid_labels, 101) 104 | # import bpdb; bpdb.set_trace() 105 | return train_images, train_labels, valid_images, valid_labels 106 | 107 | if __name__ == '__main__': 108 | 109 | print(resource.getrusage(resource.RUSAGE_SELF)) 110 | process = psutil.Process(os.getpid()) 111 | print (process.memory_info().rss) 112 | 113 | # Load and process Caltech data 114 | train_images, train_labels, test_images, test_labels = load_caltech100() 115 | image_input_d = train_images.shape[1] 116 | 117 | # Network parameters 118 | 
layer_sizes = [image_input_d, 1500, 650, 101] 119 | L2_reg = 1.0 120 | 121 | # Training parameters 122 | param_scale = 0.1 123 | learning_rate = 1e-3 124 | momentum = 0.9 125 | batch_size = 256 126 | num_epochs = 50 127 | 128 | # Make neural net functions 129 | N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg) 130 | loss_grad = grad(loss_fun) 131 | 132 | # Initialize weights 133 | rs = npr.RandomState() 134 | W = rs.randn(N_weights) * param_scale 135 | 136 | # Check the gradients numerically, just to be safe 137 | # quick_grad_check(loss_fun, W, (train_images, train_labels)) 138 | 139 | print(" Epoch | Train err | Test err ") 140 | 141 | def print_perf(epoch, W): 142 | test_perf = frac_err(W, test_images, test_labels) 143 | train_perf = frac_err(W, train_images, train_labels) 144 | print("{0:15}|{1:15}|{2:15}".format(epoch, train_perf, test_perf)) 145 | 146 | # Train with sgd 147 | batch_idxs = make_batches(train_images.shape[0], batch_size) 148 | import bpdb; bpdb.set_trace() 149 | cur_dir = np.zeros(N_weights) 150 | 151 | for epoch in range(num_epochs): 152 | print_perf(epoch, W) 153 | for idxs in batch_idxs: 154 | grad_W = loss_grad(W, train_images[idxs], train_labels[idxs]) 155 | print('----------------------------') 156 | print(getsizeof(grad_W)) 157 | #print(process.memory_info().rss) 158 | #print(resource.getrusage(resource.RUSAGE_SELF)) 159 | gc.collect() 160 | #print(process.memory_info().rss) 161 | cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W 162 | W -= learning_rate * cur_dir -------------------------------------------------------------------------------- /python-python/nnet/__init__.py: -------------------------------------------------------------------------------- 1 | # Default python file required for initializing the module for 2 | # neural net class. More documentation included in the next file. 
# Set up a basic neural net, adapted from Ryan Adams's Autograd example:
# https://github.com/twitter/torch-autograd/blob/master/examples/train-mnist-cnn.lua
#
# We apply this model to the Caltech 101 dataset rather than the MNIST dataset
# to increase the difficulty of the task.

def make_nn_funs(layer_sizes, L2_reg):
    """Build a fully-connected net as functions over a flat weight vector.

    layer_sizes: list of layer widths, e.g. [input_d, h1, h2, n_classes].
    L2_reg: L2 penalty coefficient applied to the whole weight vector.

    Returns (N, predictions, loss, frac_err) where N is the total number
    of parameters and the three functions all take the flat weight vector
    as their first argument.
    """
    # Materialize the pairs: zip() is a one-shot iterator in Python 3, and
    # `shapes` is consumed once computing N and again in unpack_layers.
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    N = sum((m + 1) * n for m, n in shapes)

    def unpack_layers(W_vect):
        # Slice successive (W, b) pairs off the front of the flat vector.
        for m, n in shapes:
            yield W_vect[:m * n].reshape((m, n)), W_vect[m * n:m * n + n]
            W_vect = W_vect[(m + 1) * n:]

    def predictions(W_vect, inputs):
        # tanh hidden layers; final layer is log-softmax (normalized log-probs).
        for W, b in unpack_layers(W_vect):
            outputs = np.dot(inputs, W) + b
            inputs = np.tanh(outputs)
        return outputs - logsumexp(outputs, axis=1, keepdims=True)

    def loss(W_vect, X, T):
        # Negative (log prior + log likelihood); T is one-hot targets.
        log_prior = -L2_reg * np.dot(W_vect.T, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return - log_prior - log_lik

    def frac_err(W_vect, X, T):
        # Fraction of rows whose argmax prediction differs from the target.
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return N, predictions, loss, frac_err

def convert_bw_to_rgb(im):
    """Expand a 2-D grayscale image to 3 identical RGB channels."""
    im.resize((im.shape[0], im.shape[1], 1))
    return np.repeat(im.astype(np.uint8), 3, 2)

def standarizeImage(im):
    """Normalize an image to a 64x64x3 float32 array with values in (0, 1]."""
    if len(im.shape) < 3:
        im = convert_bw_to_rgb(im)
    im = np.array(im, 'float32')
    if im.shape[0] != 64:
        im = imresize(im, (64, 64, 3))
    # Heuristic: pixel values above 1.1 mean the image is still 0-255 scaled.
    if np.amax(im) > 1.1:
        im = im / 255.0
    assert((np.amax(im) > 0.01) & (np.amax(im) <= 1))
    assert((np.amin(im) >= 0.00))
    return im

def gen_data():
    """Read Caltech-101 images from disk, flatten them, and save .npy files."""
    category_paths = [f for f in listdir('101_ObjectCategories/')]

    images = []
    output_labels = []
    # Map integer category ids back to category directory names.
    categories_dict = {}

    category = 0
    for category_path in category_paths:
        image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')]
        for image_path in image_paths:
            im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path))
            if im.shape == (64, 64, 3):
                images.append(im)
                output_labels.append(category)
                categories_dict[category] = category_path
        category = category + 1

    images = np.array(images)
    # Flatten each image to one row: (n_images, 64*64*3).
    partial_flatten = lambda x: np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))
    images = partial_flatten(images)

    np.save('images(64).npy', images)
    np.save('output_labels(64).npy', output_labels)

def make_batches(N_data, batch_size):
    """Return a list of slices covering range(N_data) in batch_size steps."""
    return [slice(i, min(i + batch_size, N_data))
            for i in range(0, N_data, batch_size)]

def load_caltech100(images_fname, labels_fname):
    """Load saved image/label arrays and return an 80/20 train/valid split.

    Labels are returned one-hot encoded over 101 classes.
    """
    # if images(64).npy or output_labels(64).npy missing then
    #     print('Generating data because it does not exist. Note that this may take a while')
    #     gen_data()
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    images = np.load(images_fname)
    # (Previously this file loaded labels_fname twice; once is enough.)
    output_labels = np.load(labels_fname)
    # Fixed seed so every worker sees the same split.
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, output_labels, test_size=0.20, random_state=1729)
    train_labels = one_hot(train_labels, 101)
    valid_labels = one_hot(valid_labels, 101)
    return train_images, train_labels, valid_images, valid_labels
5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | from grpc.beta import implementations 10 | import time 11 | import sys 12 | from threading import Thread 13 | 14 | import paxos_pb2 15 | import argparse 16 | import traceback 17 | 18 | import autograd.numpy as np 19 | import autograd.numpy.random as npr 20 | from autograd import grad 21 | import random 22 | 23 | from protobuf_utils.utils import * 24 | from server_utils.utils import * 25 | 26 | import subprocess 27 | 28 | _TIMEOUT_SECONDS = 4 29 | PAXOS_PORT_STR = 50052 30 | 31 | # Actual implementation of the PaxosServer that is used to communicate between the clients. 32 | # Paxos is called to determine the future main server from amongst many different clients. 33 | class PaxosServer(paxos_pb2.BetaPaxosServerServicer): 34 | def __init__(self, hostname): 35 | # Initial consensus value is none, this will be the server 36 | self.new_server = '' 37 | self.consensus_value = None 38 | self.consensus_reached = False 39 | 40 | # Values for paxos 41 | self.n = random.random() 42 | self.prop_n = 0 43 | self.v = '' 44 | self.n_v = 0 45 | 46 | # Exponential backoff to prevent spamming other servers 47 | # Randomness is introduced to help Paxos converge quicker 48 | self.backoff = (1 * random.gauss(1, 0.25)) 49 | if self.backoff < 0: 50 | self.backoff = 1 51 | 52 | # Saves the server's address as well 53 | self.address = hostname 54 | 55 | # Runs the prepare phase of the Paxos algorithm 56 | def prepare(self, request, context): 57 | # Update the highest seen proposal 58 | if request.n > self.prop_n: 59 | self.prop_n = request.n 60 | # Returns an acknowledgement containing highest accepted proposal 61 | return paxos_pb2.ack(n=self.n, v=self.v, n_v=self.n_v) 62 | 63 | # Accepts the proposal if it is higher than 64 | def accept(self, request, context): 65 | if request.n >= self.prop_n: 66 | self.n_v = request.n 67 | 
self.v = request.v 68 | self.n = request.n 69 | return paxos_pb2.acquiescence(accept_bool=True) 70 | else: 71 | return paxos_pb2.acquiescence(accept_bool=False) 72 | 73 | # Notifies the server that consensus has been reached 74 | def accepted(self, request, context): 75 | self.consensus_reached = True 76 | self.new_server = request.v 77 | return paxos_pb2.empty() 78 | 79 | # Ping function to allow confirmation between PaxosServer that they 80 | # are still running 81 | def ping(self, request, context): 82 | return paxos_pb2.empty() 83 | 84 | # Runs the PaxosServer. Checks periodically to see if a consensus has 85 | # been reached. 86 | def run_server(server, paxos_server): 87 | server.start() 88 | while True: 89 | time.sleep(0.1) 90 | try: 91 | if paxos_server.consensus_reached: 92 | if paxos_server.new_server != '': 93 | log_info('Consensus reached, server shutting down') 94 | # Wait briefly for the consensus message to propogate out 95 | time.sleep(5) 96 | server.stop(0) 97 | break 98 | time.sleep(1) 99 | except KeyboardInterrupt: 100 | server.stop(0) 101 | 102 | # Actually instantiates the Paxos Server according to a defined port 103 | def create_server(hostname, local_id): 104 | # Allow argument that allows this parameter to be changsed 105 | paxos_server = PaxosServer(hostname) 106 | server = paxos_pb2.beta_create_PaxosServer_server(paxos_server) 107 | if local_id is None: 108 | server.add_insecure_port(hostname + ':' + str(PAXOS_PORT_STR)) 109 | else: 110 | server.add_insecure_port(hostname) 111 | return paxos_server, server 112 | 113 | # Attempts to send proposals to all the other servers 114 | def send_proposals(server_stubs, self_paxos_server): 115 | # Increments the proposal number from the previous one that it sends out 116 | self_paxos_server.n = self_paxos_server.n * (1 + random.random()) 117 | self_paxos_server.v = self_paxos_server.address 118 | n_proposal = self_paxos_server.n 119 | value = self_paxos_server.address 120 | log_info('Making a proposal 
from {0} for n = {1} '.format(self_paxos_server.address, n_proposal)) 121 | 122 | # Track the failures of the proposals 123 | n_so_far = 0 124 | failed = False 125 | responded = 0 126 | 127 | for server_stub in server_stubs: 128 | # Makes the connection to the server 129 | try: 130 | # gRPC call to other Paxos Servers to see if they acceept the proposal 131 | response = server_stub.prepare(paxos_pb2.proposal(n=n_proposal), _TIMEOUT_SECONDS) 132 | 133 | # Sees a higher n value then it's current value and immediately stops the process 134 | if response.n >= n_proposal: 135 | failed = True 136 | log_info('Proposal ' + str(n_proposal) + ' failed') 137 | break 138 | else: 139 | # If the response is positive, then it notes the positive response 140 | if response.n_v > n_so_far: 141 | n_so_far = response.n 142 | value = response.v 143 | responded += 1 144 | except Exception as e: 145 | if ('ExpirationError' in str(e)): 146 | log_info('Failure to connect to server_stub') 147 | continue 148 | else: 149 | # More severe error, should log and crash 150 | traceback.print_exc() 151 | sys.exit(1) 152 | 153 | # No proposals have been sent so far, suggests its own IP 154 | if value is None: 155 | value = self_paxos_server.address 156 | 157 | # If it does not have a majority of responses, Paxos fails 158 | if responded < len(server_stubs) / 2.0: 159 | failed = True 160 | 161 | return(failed, n_proposal, value) 162 | 163 | # Requests that the other Paxos Server accepts the proposal 164 | def request_accept(server_stubs, self_paxos_server, n_proposal, value): 165 | accepted = 0 166 | for stub in server_stubs: 167 | try: 168 | response = stub.accept(paxos_pb2.request_acceptance(n=n_proposal, v=value), _TIMEOUT_SECONDS) 169 | except Exception as e: 170 | traceback.print_exc() 171 | return False 172 | if response.accept_bool: 173 | accepted += 1 174 | 175 | # If the majority accept the proposal, then it passes 176 | if accepted > len(server_stubs) / 2.0: 177 | log_info('Proposal 
accepted') 178 | return True 179 | else: 180 | log_info('Proposal {0} rejected with value {1}'.format(n_proposal, value)) 181 | return False 182 | 183 | # Checks to ensure that all the stubs are currently available by pinging them 184 | # If more than half of them are available, it begins Paxos. Otherwise, it waits. 185 | def check_stubs_up(stubs): 186 | responses = 0 187 | for stub in stubs: 188 | try: 189 | response = stub.ping(paxos_pb2.empty(), _TIMEOUT_SECONDS) 190 | responses += 1 191 | except Exception as e: 192 | if ('ExpirationError' in str(e)): 193 | log_info('Failure to connect to server_stub during startup') 194 | continue 195 | else: 196 | # More severe error, should log and crash 197 | traceback.print_exc() 198 | sys.exit(1) 199 | if responses < len(stubs) / 2: 200 | return False 201 | else: 202 | return True 203 | 204 | # Make sure that all machines are aware that the Paxos algorithm is finishing 205 | # Not all machines are aware that the server has failed at the same time. Could 206 | # be in the middle of calculating gradients or waiting to be timed out. 
207 | def gen_server_stubs(self_paxos_server, local_id): 208 | TOT_ATTEMPTS = 3 209 | for i in range(TOT_ATTEMPTS): 210 | server_addresses = gen_server_addresses(local_id, self_paxos_server.address) 211 | print(server_addresses) 212 | server_addresses.remove(self_paxos_server.address) 213 | stubs = [] 214 | for server_address in server_addresses: 215 | if not self_paxos_server.consensus_reached: 216 | if local_id is not None: 217 | server_port = int(server_address[-5:]) 218 | channel = implementations.insecure_channel('localhost', server_port) 219 | else: 220 | channel = implementations.insecure_channel(server_address, PAXOS_PORT_STR) 221 | 222 | stub = paxos_pb2.beta_create_PaxosServer_stub(channel) 223 | stubs.append(stub) 224 | all_stubs_responsive = check_stubs_up(stubs) 225 | if all_stubs_responsive: 226 | return stubs 227 | time.sleep(1 * TOT_ATTEMPTS) 228 | return None 229 | 230 | # Sends to all servers that consensus was reached and a server was chosen. 231 | def broadcast_consensus(server_stubs, self_paxos_server, value): 232 | for stub in server_stubs: 233 | response = stub.accepted(paxos_pb2.consensus(n=self_paxos_server.n, v=value), 2 * _TIMEOUT_SECONDS) 234 | 235 | # Begins the Paxos protocol 236 | def start_paxos(server_stubs, self_paxos_server): 237 | proposal_failed, n_proposal, value = send_proposals(server_stubs, self_paxos_server) 238 | if not proposal_failed and not self_paxos_server.consensus_reached: 239 | # Have everyone accept the proposal 240 | accepted = request_accept(server_stubs, self_paxos_server, n_proposal, value) 241 | if accepted and not self_paxos_server.consensus_reached: 242 | # If accepted, let everyone know that the server has been chosen 243 | broadcast_consensus(server_stubs, self_paxos_server, value) 244 | self_paxos_server.new_server = value 245 | self_paxos_server.consensus_reached = True 246 | return True 247 | 248 | # If proposal failed, backoff to try again later 249 | self_paxos_server.backoff = 
self_paxos_server.backoff * (1 + 10 * random.random()) 250 | return False 251 | 252 | # Client loops and runs the paxos algorithm every few seconds 253 | def paxos_loop(self_paxos_server, local_id): 254 | time_slept = 0 255 | send_proposal_time = self_paxos_server.backoff 256 | 257 | while not self_paxos_server.consensus_reached: 258 | time.sleep(0.1) 259 | time_slept += 0.1 260 | 261 | # Send a proposal at allocated time 262 | if time_slept > send_proposal_time and not self_paxos_server.consensus_reached: 263 | time.sleep(random.random()) 264 | server_stubs = gen_server_stubs(self_paxos_server, local_id) 265 | if server_stubs is None: 266 | self_paxos_server.new_server = '' 267 | break 268 | start_paxos(server_stubs, self_paxos_server) 269 | send_proposal_time = (random.gauss(1, 0.25) * self_paxos_server.backoff) 270 | time_slept = 0 271 | 272 | # If proposal fails, revert to checking for a server 273 | if send_proposal_time > 60: 274 | self_paxos_server.consensus_reached = True 275 | self_paxos_server.consensus_value = '' 276 | break 277 | 278 | # This is the final function that exterior functions like client.py will call 279 | def run_paxos(local_id=None): 280 | # Generates the host name 281 | hostname = gen_local_address(local_id) 282 | log_info(hostname + ' called to run Paxos for determining the server') 283 | 284 | # Generates the server 285 | paxos_server, server = create_server(hostname, local_id) 286 | try: 287 | # Launch the server on a separate thread 288 | Thread(target=run_server, args=(server,paxos_server,)).start() 289 | start_paxos = time.time() 290 | 291 | # Begin to run Paxos 292 | paxos_loop(paxos_server, local_id) 293 | if paxos_server.new_server != '': 294 | log_info('Done, new server is: {0} finished paxos in {1:2}s'.format(paxos_server.new_server, time.time()-start_paxos)) 295 | else: 296 | # New server is empty only when a suitable server was not found after a predefined amount of time 297 | log_info('Failure to connect to other allocated 
instances. Stopping paxos.') 298 | except KeyboardInterrupt: 299 | sys.exit(0) 300 | finally: 301 | paxos_server.consensus_reached = True 302 | server.stop(0) 303 | return paxos_server.new_server 304 | 305 | if __name__ == '__main__': 306 | parser = argparse.ArgumentParser() 307 | parser.add_argument('--id') 308 | args = parser.parse_args() 309 | local_id = args.id 310 | if local_id is not None: 311 | local_id = int(local_id) 312 | assert(local_id > 0) 313 | log_info(run_paxos(local_id)) 314 | -------------------------------------------------------------------------------- /python-python/paxos_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: paxos.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='paxos.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0bpaxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 
\x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _ACK = _descriptor.Descriptor( 30 | name='ack', 31 | full_name='dist_sgd.ack', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='n', full_name='dist_sgd.ack.n', index=0, 38 | number=1, type=2, cpp_type=6, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='v', full_name='dist_sgd.ack.v', index=1, 45 | number=2, type=9, cpp_type=9, label=1, 46 | has_default_value=False, default_value=_b("").decode('utf-8'), 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2, 52 | number=3, type=2, cpp_type=6, label=1, 53 | has_default_value=False, default_value=0, 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | ], 58 | extensions=[ 59 | ], 60 | nested_types=[], 61 | enum_types=[ 62 | ], 63 | options=None, 64 | is_extendable=False, 65 | syntax='proto3', 66 | extension_ranges=[], 67 | oneofs=[ 68 | ], 69 | serialized_start=25, 70 | serialized_end=65, 71 | ) 72 | 73 | 74 | _PROPOSAL = _descriptor.Descriptor( 75 | name='proposal', 76 | full_name='dist_sgd.proposal', 77 | 
filename=None, 78 | file=DESCRIPTOR, 79 | containing_type=None, 80 | fields=[ 81 | _descriptor.FieldDescriptor( 82 | name='n', full_name='dist_sgd.proposal.n', index=0, 83 | number=1, type=2, cpp_type=6, label=1, 84 | has_default_value=False, default_value=0, 85 | message_type=None, enum_type=None, containing_type=None, 86 | is_extension=False, extension_scope=None, 87 | options=None), 88 | ], 89 | extensions=[ 90 | ], 91 | nested_types=[], 92 | enum_types=[ 93 | ], 94 | options=None, 95 | is_extendable=False, 96 | syntax='proto3', 97 | extension_ranges=[], 98 | oneofs=[ 99 | ], 100 | serialized_start=67, 101 | serialized_end=88, 102 | ) 103 | 104 | 105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor( 106 | name='request_acceptance', 107 | full_name='dist_sgd.request_acceptance', 108 | filename=None, 109 | file=DESCRIPTOR, 110 | containing_type=None, 111 | fields=[ 112 | _descriptor.FieldDescriptor( 113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0, 114 | number=1, type=2, cpp_type=6, label=1, 115 | has_default_value=False, default_value=0, 116 | message_type=None, enum_type=None, containing_type=None, 117 | is_extension=False, extension_scope=None, 118 | options=None), 119 | _descriptor.FieldDescriptor( 120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1, 121 | number=2, type=9, cpp_type=9, label=1, 122 | has_default_value=False, default_value=_b("").decode('utf-8'), 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=90, 139 | serialized_end=132, 140 | ) 141 | 142 | 143 | _ACQUIESCENCE = _descriptor.Descriptor( 144 | name='acquiescence', 145 | full_name='dist_sgd.acquiescence', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | 
containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0, 152 | number=1, type=8, cpp_type=7, label=1, 153 | has_default_value=False, default_value=False, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | ], 158 | extensions=[ 159 | ], 160 | nested_types=[], 161 | enum_types=[ 162 | ], 163 | options=None, 164 | is_extendable=False, 165 | syntax='proto3', 166 | extension_ranges=[], 167 | oneofs=[ 168 | ], 169 | serialized_start=134, 170 | serialized_end=169, 171 | ) 172 | 173 | 174 | _CONSENSUS = _descriptor.Descriptor( 175 | name='consensus', 176 | full_name='dist_sgd.consensus', 177 | filename=None, 178 | file=DESCRIPTOR, 179 | containing_type=None, 180 | fields=[ 181 | _descriptor.FieldDescriptor( 182 | name='n', full_name='dist_sgd.consensus.n', index=0, 183 | number=1, type=2, cpp_type=6, label=1, 184 | has_default_value=False, default_value=0, 185 | message_type=None, enum_type=None, containing_type=None, 186 | is_extension=False, extension_scope=None, 187 | options=None), 188 | _descriptor.FieldDescriptor( 189 | name='v', full_name='dist_sgd.consensus.v', index=1, 190 | number=2, type=9, cpp_type=9, label=1, 191 | has_default_value=False, default_value=_b("").decode('utf-8'), 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | ], 196 | extensions=[ 197 | ], 198 | nested_types=[], 199 | enum_types=[ 200 | ], 201 | options=None, 202 | is_extendable=False, 203 | syntax='proto3', 204 | extension_ranges=[], 205 | oneofs=[ 206 | ], 207 | serialized_start=171, 208 | serialized_end=204, 209 | ) 210 | 211 | 212 | _EMPTY = _descriptor.Descriptor( 213 | name='empty', 214 | full_name='dist_sgd.empty', 215 | filename=None, 216 | file=DESCRIPTOR, 217 | containing_type=None, 218 | fields=[ 219 | ], 220 | 
extensions=[ 221 | ], 222 | nested_types=[], 223 | enum_types=[ 224 | ], 225 | options=None, 226 | is_extendable=False, 227 | syntax='proto3', 228 | extension_ranges=[], 229 | oneofs=[ 230 | ], 231 | serialized_start=206, 232 | serialized_end=213, 233 | ) 234 | 235 | DESCRIPTOR.message_types_by_name['ack'] = _ACK 236 | DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL 237 | DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE 238 | DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE 239 | DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS 240 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 241 | 242 | ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict( 243 | DESCRIPTOR = _ACK, 244 | __module__ = 'paxos_pb2' 245 | # @@protoc_insertion_point(class_scope:dist_sgd.ack) 246 | )) 247 | _sym_db.RegisterMessage(ack) 248 | 249 | proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict( 250 | DESCRIPTOR = _PROPOSAL, 251 | __module__ = 'paxos_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.proposal) 253 | )) 254 | _sym_db.RegisterMessage(proposal) 255 | 256 | request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict( 257 | DESCRIPTOR = _REQUEST_ACCEPTANCE, 258 | __module__ = 'paxos_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance) 260 | )) 261 | _sym_db.RegisterMessage(request_acceptance) 262 | 263 | acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict( 264 | DESCRIPTOR = _ACQUIESCENCE, 265 | __module__ = 'paxos_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence) 267 | )) 268 | _sym_db.RegisterMessage(acquiescence) 269 | 270 | consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict( 271 | DESCRIPTOR = _CONSENSUS, 272 | __module__ = 'paxos_pb2' 273 | # 
@@protoc_insertion_point(class_scope:dist_sgd.consensus) 274 | )) 275 | _sym_db.RegisterMessage(consensus) 276 | 277 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 278 | DESCRIPTOR = _EMPTY, 279 | __module__ = 'paxos_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 281 | )) 282 | _sym_db.RegisterMessage(empty) 283 | 284 | 285 | DESCRIPTOR.has_options = True 286 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 287 | import abc 288 | import six 289 | from grpc.beta import implementations as beta_implementations 290 | from grpc.framework.common import cardinality 291 | from grpc.framework.interfaces.face import utilities as face_utilities 292 | 293 | class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)): 294 | """""" 295 | @abc.abstractmethod 296 | def prepare(self, request, context): 297 | raise NotImplementedError() 298 | @abc.abstractmethod 299 | def accept(self, request, context): 300 | raise NotImplementedError() 301 | @abc.abstractmethod 302 | def accepted(self, request, context): 303 | raise NotImplementedError() 304 | @abc.abstractmethod 305 | def ping(self, request, context): 306 | raise NotImplementedError() 307 | 308 | class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)): 309 | """The interface to which stubs will conform.""" 310 | @abc.abstractmethod 311 | def prepare(self, request, timeout): 312 | raise NotImplementedError() 313 | prepare.future = None 314 | @abc.abstractmethod 315 | def accept(self, request, timeout): 316 | raise NotImplementedError() 317 | accept.future = None 318 | @abc.abstractmethod 319 | def accepted(self, request, timeout): 320 | raise NotImplementedError() 321 | accepted.future = None 322 | @abc.abstractmethod 323 | def ping(self, request, timeout): 324 | raise NotImplementedError() 325 | ping.future = None 326 | 327 | def beta_create_PaxosServer_server(servicer, 
pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 328 | import paxos_pb2 329 | import paxos_pb2 330 | import paxos_pb2 331 | import paxos_pb2 332 | import paxos_pb2 333 | import paxos_pb2 334 | import paxos_pb2 335 | import paxos_pb2 336 | request_deserializers = { 337 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.FromString, 338 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.FromString, 339 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString, 340 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.FromString, 341 | } 342 | response_serializers = { 343 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.SerializeToString, 344 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.SerializeToString, 345 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString, 346 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.SerializeToString, 347 | } 348 | method_implementations = { 349 | ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept), 350 | ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted), 351 | ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 352 | ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare), 353 | } 354 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 355 | return beta_implementations.server(method_implementations, options=server_options) 356 | 357 | def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 358 | import paxos_pb2 359 | import paxos_pb2 360 | import paxos_pb2 361 | import paxos_pb2 362 | import paxos_pb2 363 | import paxos_pb2 364 | import paxos_pb2 
365 | import paxos_pb2 366 | request_serializers = { 367 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.SerializeToString, 368 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.SerializeToString, 369 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString, 370 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.SerializeToString, 371 | } 372 | response_deserializers = { 373 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.FromString, 374 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.FromString, 375 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString, 376 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.FromString, 377 | } 378 | cardinalities = { 379 | 'accept': cardinality.Cardinality.UNARY_UNARY, 380 | 'accepted': cardinality.Cardinality.UNARY_UNARY, 381 | 'ping': cardinality.Cardinality.UNARY_UNARY, 382 | 'prepare': cardinality.Cardinality.UNARY_UNARY, 383 | } 384 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 385 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options) 386 | # @@protoc_insertion_point(module_scope) 387 | -------------------------------------------------------------------------------- /python-python/protobuf_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Default python file required for initializing the module for 2 | # protobuffer utilities. More documentation included in the next file. 
def convert_array_to_bytes(params):
    """Serialize a numpy parameter vector to raw bytes.

    float64 input is downcast to float32 first so that the wire format is
    always 4-byte floats (convert_bytes_to_array assumes float32).
    """
    if (params.dtype == np.float64):
        params = params.astype(np.float32)
    # tobytes() is the non-deprecated spelling of tostring().
    param_bytes = params.tobytes()
    return param_bytes

def convert_bytes_to_array(param_bytes):
    """Inverse of convert_array_to_bytes: bytes -> 1-D float32 array."""
    # frombuffer() replaces the deprecated fromstring(); it returns a
    # read-only view, so copy() to keep the result writable as before.
    params = np.frombuffer(param_bytes, dtype=np.float32).copy()
    return params

def convert_tensor_iter(tensor_bytes, data_indx):
    """Yield dist_sgd_pb2.SubTensor messages chunking tensor_bytes.

    Each chunk carries the total byte length, a 1-based chunk counter, the
    chunk payload, and the batch index (used server-side for staleness).
    """
    # NOTE(review): 524228 is probably a typo for 524288 (512 KiB); kept
    # as-is since both ends only rely on concatenation order — confirm.
    CHUNK_SIZE = 524228
    tensor_bytes_len = len(tensor_bytes)
    tensor_chunk_count = 0
    while len(tensor_bytes):
        tensor_chunk_count += 1
        tensor_content = tensor_bytes[:CHUNK_SIZE]
        tensor_bytes = tensor_bytes[CHUNK_SIZE:]
        yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx = data_indx)
-------------------------------------------------------------------------------- 1 | //Protocol buffers for project 2 | 3 | syntax = "proto3"; 4 | 5 | package dist_sgd; 6 | 7 | option java_multiple_files = true; 8 | option java_package = "io.dist_sgd"; 9 | option java_outer_classname = "DistSGD"; 10 | //option objc_class_prefix = "DSG"; 11 | 12 | // Main server for passing infromation around 13 | service ParamFeeder { 14 | // Sends the parameters back and forth between server and client 15 | rpc SendParams (ClientInfo) returns (stream SubTensor) {} 16 | 17 | // Sends information about the next batch 18 | rpc SendNextBatch (PrevBatch) returns (NextBatch) {} 19 | 20 | // Gets gardient updates from client servers 21 | rpc GetUpdates (stream SubTensor) returns (StatusCode) {} 22 | 23 | // This call simply makes sure that all machines have begun to run Paxos. 24 | rpc ping (empty) returns (empty) {} 25 | 26 | } 27 | 28 | message SubTensor { 29 | // Length of the tensor getting passed 30 | int32 tensor_len = 1; 31 | 32 | // Current chunk of the tensor 33 | int32 tensor_chunk = 2; 34 | 35 | // Serialized tensor getting passed 36 | bytes tensor_content = 3; 37 | 38 | // Batch for gradient update, used to determine whether or not 39 | // the gradient is stale and should be thrown out 40 | int32 data_indx = 4; 41 | } 42 | 43 | // Later on we can extend client info to include information about processing speed, etc. 
44 | message ClientInfo { 45 | int32 client_id = 1; 46 | } 47 | 48 | // Includes information about sucesss and failure 49 | message StatusCode { 50 | int32 status = 1; 51 | } 52 | 53 | message PrevBatch { 54 | int32 client_id = 1; 55 | 56 | int32 prev_data_indx = 2; 57 | } 58 | 59 | message NextBatch { 60 | int32 client_id = 1; 61 | 62 | int32 data_indx = 2; 63 | } 64 | 65 | message empty {} -------------------------------------------------------------------------------- /python-python/protos/dist_sgd_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: protos/dist_sgd.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='protos/dist_sgd.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x15protos/dist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 
\x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _SUBTENSOR = _descriptor.Descriptor( 30 | name='SubTensor', 31 | full_name='dist_sgd.SubTensor', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0, 38 | number=1, type=5, cpp_type=1, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1, 45 | number=2, type=5, cpp_type=1, label=1, 46 | has_default_value=False, default_value=0, 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2, 52 | number=3, type=12, cpp_type=9, label=1, 53 | has_default_value=False, default_value=_b(""), 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor( 58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3, 59 | number=4, type=5, cpp_type=1, label=1, 60 | has_default_value=False, default_value=0, 61 | message_type=None, enum_type=None, containing_type=None, 62 | is_extension=False, 
extension_scope=None, 63 | options=None), 64 | ], 65 | extensions=[ 66 | ], 67 | nested_types=[], 68 | enum_types=[ 69 | ], 70 | options=None, 71 | is_extendable=False, 72 | syntax='proto3', 73 | extension_ranges=[], 74 | oneofs=[ 75 | ], 76 | serialized_start=35, 77 | serialized_end=131, 78 | ) 79 | 80 | 81 | _CLIENTINFO = _descriptor.Descriptor( 82 | name='ClientInfo', 83 | full_name='dist_sgd.ClientInfo', 84 | filename=None, 85 | file=DESCRIPTOR, 86 | containing_type=None, 87 | fields=[ 88 | _descriptor.FieldDescriptor( 89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0, 90 | number=1, type=5, cpp_type=1, label=1, 91 | has_default_value=False, default_value=0, 92 | message_type=None, enum_type=None, containing_type=None, 93 | is_extension=False, extension_scope=None, 94 | options=None), 95 | ], 96 | extensions=[ 97 | ], 98 | nested_types=[], 99 | enum_types=[ 100 | ], 101 | options=None, 102 | is_extendable=False, 103 | syntax='proto3', 104 | extension_ranges=[], 105 | oneofs=[ 106 | ], 107 | serialized_start=133, 108 | serialized_end=164, 109 | ) 110 | 111 | 112 | _STATUSCODE = _descriptor.Descriptor( 113 | name='StatusCode', 114 | full_name='dist_sgd.StatusCode', 115 | filename=None, 116 | file=DESCRIPTOR, 117 | containing_type=None, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='status', full_name='dist_sgd.StatusCode.status', index=0, 121 | number=1, type=5, cpp_type=1, label=1, 122 | has_default_value=False, default_value=0, 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=166, 139 | serialized_end=194, 140 | ) 141 | 142 | 143 | _PREVBATCH = _descriptor.Descriptor( 144 | name='PrevBatch', 145 | 
full_name='dist_sgd.PrevBatch', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0, 152 | number=1, type=5, cpp_type=1, label=1, 153 | has_default_value=False, default_value=0, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | _descriptor.FieldDescriptor( 158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1, 159 | number=2, type=5, cpp_type=1, label=1, 160 | has_default_value=False, default_value=0, 161 | message_type=None, enum_type=None, containing_type=None, 162 | is_extension=False, extension_scope=None, 163 | options=None), 164 | ], 165 | extensions=[ 166 | ], 167 | nested_types=[], 168 | enum_types=[ 169 | ], 170 | options=None, 171 | is_extendable=False, 172 | syntax='proto3', 173 | extension_ranges=[], 174 | oneofs=[ 175 | ], 176 | serialized_start=196, 177 | serialized_end=250, 178 | ) 179 | 180 | 181 | _NEXTBATCH = _descriptor.Descriptor( 182 | name='NextBatch', 183 | full_name='dist_sgd.NextBatch', 184 | filename=None, 185 | file=DESCRIPTOR, 186 | containing_type=None, 187 | fields=[ 188 | _descriptor.FieldDescriptor( 189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0, 190 | number=1, type=5, cpp_type=1, label=1, 191 | has_default_value=False, default_value=0, 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | _descriptor.FieldDescriptor( 196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1, 197 | number=2, type=5, cpp_type=1, label=1, 198 | has_default_value=False, default_value=0, 199 | message_type=None, enum_type=None, containing_type=None, 200 | is_extension=False, extension_scope=None, 201 | options=None), 202 | ], 203 | extensions=[ 204 | ], 205 | 
nested_types=[], 206 | enum_types=[ 207 | ], 208 | options=None, 209 | is_extendable=False, 210 | syntax='proto3', 211 | extension_ranges=[], 212 | oneofs=[ 213 | ], 214 | serialized_start=252, 215 | serialized_end=301, 216 | ) 217 | 218 | 219 | _EMPTY = _descriptor.Descriptor( 220 | name='empty', 221 | full_name='dist_sgd.empty', 222 | filename=None, 223 | file=DESCRIPTOR, 224 | containing_type=None, 225 | fields=[ 226 | ], 227 | extensions=[ 228 | ], 229 | nested_types=[], 230 | enum_types=[ 231 | ], 232 | options=None, 233 | is_extendable=False, 234 | syntax='proto3', 235 | extension_ranges=[], 236 | oneofs=[ 237 | ], 238 | serialized_start=303, 239 | serialized_end=310, 240 | ) 241 | 242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR 243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO 244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE 245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH 246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH 247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 248 | 249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict( 250 | DESCRIPTOR = _SUBTENSOR, 251 | __module__ = 'protos.dist_sgd_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor) 253 | )) 254 | _sym_db.RegisterMessage(SubTensor) 255 | 256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict( 257 | DESCRIPTOR = _CLIENTINFO, 258 | __module__ = 'protos.dist_sgd_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo) 260 | )) 261 | _sym_db.RegisterMessage(ClientInfo) 262 | 263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict( 264 | DESCRIPTOR = _STATUSCODE, 265 | __module__ = 'protos.dist_sgd_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode) 267 | )) 268 | _sym_db.RegisterMessage(StatusCode) 269 | 270 | PrevBatch = 
_reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict( 271 | DESCRIPTOR = _PREVBATCH, 272 | __module__ = 'protos.dist_sgd_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch) 274 | )) 275 | _sym_db.RegisterMessage(PrevBatch) 276 | 277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict( 278 | DESCRIPTOR = _NEXTBATCH, 279 | __module__ = 'protos.dist_sgd_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch) 281 | )) 282 | _sym_db.RegisterMessage(NextBatch) 283 | 284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 285 | DESCRIPTOR = _EMPTY, 286 | __module__ = 'protos.dist_sgd_pb2' 287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 288 | )) 289 | _sym_db.RegisterMessage(empty) 290 | 291 | 292 | DESCRIPTOR.has_options = True 293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 294 | import abc 295 | import six 296 | from grpc.beta import implementations as beta_implementations 297 | from grpc.framework.common import cardinality 298 | from grpc.framework.interfaces.face import utilities as face_utilities 299 | 300 | class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)): 301 | """""" 302 | @abc.abstractmethod 303 | def SendParams(self, request, context): 304 | raise NotImplementedError() 305 | @abc.abstractmethod 306 | def SendNextBatch(self, request, context): 307 | raise NotImplementedError() 308 | @abc.abstractmethod 309 | def GetUpdates(self, request_iterator, context): 310 | raise NotImplementedError() 311 | @abc.abstractmethod 312 | def ping(self, request, context): 313 | raise NotImplementedError() 314 | 315 | class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)): 316 | """The interface to which stubs will conform.""" 317 | @abc.abstractmethod 318 | def SendParams(self, request, timeout): 319 | raise 
NotImplementedError() 320 | @abc.abstractmethod 321 | def SendNextBatch(self, request, timeout): 322 | raise NotImplementedError() 323 | SendNextBatch.future = None 324 | @abc.abstractmethod 325 | def GetUpdates(self, request_iterator, timeout): 326 | raise NotImplementedError() 327 | GetUpdates.future = None 328 | @abc.abstractmethod 329 | def ping(self, request, timeout): 330 | raise NotImplementedError() 331 | ping.future = None 332 | 333 | def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 334 | import protos.dist_sgd_pb2 335 | import protos.dist_sgd_pb2 336 | import protos.dist_sgd_pb2 337 | import protos.dist_sgd_pb2 338 | import protos.dist_sgd_pb2 339 | import protos.dist_sgd_pb2 340 | import protos.dist_sgd_pb2 341 | import protos.dist_sgd_pb2 342 | request_deserializers = { 343 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.FromString, 344 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.FromString, 345 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.FromString, 346 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString, 347 | } 348 | response_serializers = { 349 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.SerializeToString, 350 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.SerializeToString, 351 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.SerializeToString, 352 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString, 353 | } 354 | method_implementations = { 355 | ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates), 356 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch), 357 | ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams), 358 | 
('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 359 | } 360 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 361 | return beta_implementations.server(method_implementations, options=server_options) 362 | 363 | def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 364 | import protos.dist_sgd_pb2 365 | import protos.dist_sgd_pb2 366 | import protos.dist_sgd_pb2 367 | import protos.dist_sgd_pb2 368 | import protos.dist_sgd_pb2 369 | import protos.dist_sgd_pb2 370 | import protos.dist_sgd_pb2 371 | import protos.dist_sgd_pb2 372 | request_serializers = { 373 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.SerializeToString, 374 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.SerializeToString, 375 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.SerializeToString, 376 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString, 377 | } 378 | response_deserializers = { 379 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.FromString, 380 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.FromString, 381 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.FromString, 382 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString, 383 | } 384 | cardinalities = { 385 | 'GetUpdates': cardinality.Cardinality.STREAM_UNARY, 386 | 'SendNextBatch': cardinality.Cardinality.UNARY_UNARY, 387 | 'SendParams': cardinality.Cardinality.UNARY_STREAM, 388 | 'ping': cardinality.Cardinality.UNARY_UNARY, 389 | } 390 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, 
request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 391 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, options=stub_options) 392 | # @@protoc_insertion_point(module_scope) 393 | -------------------------------------------------------------------------------- /python-python/protos/paxos.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package dist_sgd; 4 | 5 | option java_multiple_files = true; 6 | option java_package = "io.dist_sgd"; 7 | option java_outer_classname = "DistSGD"; 8 | 9 | // After getting a majority of proposals without getting rejected, the 10 | // machine chooses an IP from the most recent acknowledgements or one 11 | // that it generates itself and calls accept on all servers. 12 | 13 | // The servers return whether or not they accept. If majority of servers accept, 14 | // then it calls consensus on all servers and sends them the decided upon IP address 15 | // that is server 16 | 17 | // TECHNICALLY, each machine should call consensus 18 | 19 | // Main server for running the Paxos Algorithm. Everyone hosts this server on 20 | // their localhost. Used for sending and receiving messages for coordinating Paxos. 21 | service PaxosServer { 22 | 23 | // The machine sends each server a proposal. The server then 24 | // sends an acknowledgement accepting or rejecting the proposal. 25 | rpc prepare(proposal) returns (ack) {} 26 | 27 | // Requests that people accept the proposal 28 | rpc accept(request_acceptance) returns (acquiescence) {} 29 | 30 | // Notified that consensus has been achieved about a server 31 | // Technically each server should broadcast that it accepted the consensus 32 | rpc accepted (consensus) returns (empty) {} 33 | 34 | // This call simply makes sure that all machines have begun to run Paxos. 
35 | rpc ping (empty) returns (empty) {} 36 | } 37 | 38 | message ack { 39 | float n = 1; 40 | string v = 2; 41 | float n_v = 3; 42 | } 43 | 44 | message proposal { 45 | float n = 1; 46 | } 47 | 48 | message request_acceptance{ 49 | float n = 1; 50 | string v = 2; 51 | } 52 | 53 | message acquiescence { 54 | bool accept_bool = 1; 55 | } 56 | 57 | message consensus { 58 | float n = 1; 59 | string v = 2; 60 | } 61 | 62 | message empty { 63 | } -------------------------------------------------------------------------------- /python-python/protos/paxos_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: protos/paxos.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='protos/paxos.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x12protos/paxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 
\x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _ACK = _descriptor.Descriptor( 30 | name='ack', 31 | full_name='dist_sgd.ack', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='n', full_name='dist_sgd.ack.n', index=0, 38 | number=1, type=2, cpp_type=6, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='v', full_name='dist_sgd.ack.v', index=1, 45 | number=2, type=9, cpp_type=9, label=1, 46 | has_default_value=False, default_value=_b("").decode('utf-8'), 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2, 52 | number=3, type=2, cpp_type=6, label=1, 53 | has_default_value=False, default_value=0, 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | ], 58 | extensions=[ 59 | ], 60 | nested_types=[], 61 | enum_types=[ 62 | ], 63 | options=None, 64 | is_extendable=False, 65 | syntax='proto3', 66 | extension_ranges=[], 67 | oneofs=[ 68 | ], 69 | serialized_start=32, 70 | serialized_end=72, 71 | ) 72 | 73 | 74 | _PROPOSAL = _descriptor.Descriptor( 75 | name='proposal', 76 | full_name='dist_sgd.proposal', 77 | 
filename=None, 78 | file=DESCRIPTOR, 79 | containing_type=None, 80 | fields=[ 81 | _descriptor.FieldDescriptor( 82 | name='n', full_name='dist_sgd.proposal.n', index=0, 83 | number=1, type=2, cpp_type=6, label=1, 84 | has_default_value=False, default_value=0, 85 | message_type=None, enum_type=None, containing_type=None, 86 | is_extension=False, extension_scope=None, 87 | options=None), 88 | ], 89 | extensions=[ 90 | ], 91 | nested_types=[], 92 | enum_types=[ 93 | ], 94 | options=None, 95 | is_extendable=False, 96 | syntax='proto3', 97 | extension_ranges=[], 98 | oneofs=[ 99 | ], 100 | serialized_start=74, 101 | serialized_end=95, 102 | ) 103 | 104 | 105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor( 106 | name='request_acceptance', 107 | full_name='dist_sgd.request_acceptance', 108 | filename=None, 109 | file=DESCRIPTOR, 110 | containing_type=None, 111 | fields=[ 112 | _descriptor.FieldDescriptor( 113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0, 114 | number=1, type=2, cpp_type=6, label=1, 115 | has_default_value=False, default_value=0, 116 | message_type=None, enum_type=None, containing_type=None, 117 | is_extension=False, extension_scope=None, 118 | options=None), 119 | _descriptor.FieldDescriptor( 120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1, 121 | number=2, type=9, cpp_type=9, label=1, 122 | has_default_value=False, default_value=_b("").decode('utf-8'), 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=97, 139 | serialized_end=139, 140 | ) 141 | 142 | 143 | _ACQUIESCENCE = _descriptor.Descriptor( 144 | name='acquiescence', 145 | full_name='dist_sgd.acquiescence', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | 
containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0, 152 | number=1, type=8, cpp_type=7, label=1, 153 | has_default_value=False, default_value=False, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | ], 158 | extensions=[ 159 | ], 160 | nested_types=[], 161 | enum_types=[ 162 | ], 163 | options=None, 164 | is_extendable=False, 165 | syntax='proto3', 166 | extension_ranges=[], 167 | oneofs=[ 168 | ], 169 | serialized_start=141, 170 | serialized_end=176, 171 | ) 172 | 173 | 174 | _CONSENSUS = _descriptor.Descriptor( 175 | name='consensus', 176 | full_name='dist_sgd.consensus', 177 | filename=None, 178 | file=DESCRIPTOR, 179 | containing_type=None, 180 | fields=[ 181 | _descriptor.FieldDescriptor( 182 | name='n', full_name='dist_sgd.consensus.n', index=0, 183 | number=1, type=2, cpp_type=6, label=1, 184 | has_default_value=False, default_value=0, 185 | message_type=None, enum_type=None, containing_type=None, 186 | is_extension=False, extension_scope=None, 187 | options=None), 188 | _descriptor.FieldDescriptor( 189 | name='v', full_name='dist_sgd.consensus.v', index=1, 190 | number=2, type=9, cpp_type=9, label=1, 191 | has_default_value=False, default_value=_b("").decode('utf-8'), 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | ], 196 | extensions=[ 197 | ], 198 | nested_types=[], 199 | enum_types=[ 200 | ], 201 | options=None, 202 | is_extendable=False, 203 | syntax='proto3', 204 | extension_ranges=[], 205 | oneofs=[ 206 | ], 207 | serialized_start=178, 208 | serialized_end=211, 209 | ) 210 | 211 | 212 | _EMPTY = _descriptor.Descriptor( 213 | name='empty', 214 | full_name='dist_sgd.empty', 215 | filename=None, 216 | file=DESCRIPTOR, 217 | containing_type=None, 218 | fields=[ 219 | ], 220 | 
extensions=[ 221 | ], 222 | nested_types=[], 223 | enum_types=[ 224 | ], 225 | options=None, 226 | is_extendable=False, 227 | syntax='proto3', 228 | extension_ranges=[], 229 | oneofs=[ 230 | ], 231 | serialized_start=213, 232 | serialized_end=220, 233 | ) 234 | 235 | DESCRIPTOR.message_types_by_name['ack'] = _ACK 236 | DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL 237 | DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE 238 | DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE 239 | DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS 240 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 241 | 242 | ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict( 243 | DESCRIPTOR = _ACK, 244 | __module__ = 'protos.paxos_pb2' 245 | # @@protoc_insertion_point(class_scope:dist_sgd.ack) 246 | )) 247 | _sym_db.RegisterMessage(ack) 248 | 249 | proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict( 250 | DESCRIPTOR = _PROPOSAL, 251 | __module__ = 'protos.paxos_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.proposal) 253 | )) 254 | _sym_db.RegisterMessage(proposal) 255 | 256 | request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict( 257 | DESCRIPTOR = _REQUEST_ACCEPTANCE, 258 | __module__ = 'protos.paxos_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance) 260 | )) 261 | _sym_db.RegisterMessage(request_acceptance) 262 | 263 | acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict( 264 | DESCRIPTOR = _ACQUIESCENCE, 265 | __module__ = 'protos.paxos_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence) 267 | )) 268 | _sym_db.RegisterMessage(acquiescence) 269 | 270 | consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict( 271 | DESCRIPTOR = _CONSENSUS, 272 | __module__ = 
'protos.paxos_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.consensus) 274 | )) 275 | _sym_db.RegisterMessage(consensus) 276 | 277 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 278 | DESCRIPTOR = _EMPTY, 279 | __module__ = 'protos.paxos_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 281 | )) 282 | _sym_db.RegisterMessage(empty) 283 | 284 | 285 | DESCRIPTOR.has_options = True 286 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 287 | import abc 288 | import six 289 | from grpc.beta import implementations as beta_implementations 290 | from grpc.framework.common import cardinality 291 | from grpc.framework.interfaces.face import utilities as face_utilities 292 | 293 | class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)): 294 | """""" 295 | @abc.abstractmethod 296 | def prepare(self, request, context): 297 | raise NotImplementedError() 298 | @abc.abstractmethod 299 | def accept(self, request, context): 300 | raise NotImplementedError() 301 | @abc.abstractmethod 302 | def accepted(self, request, context): 303 | raise NotImplementedError() 304 | @abc.abstractmethod 305 | def ping(self, request, context): 306 | raise NotImplementedError() 307 | 308 | class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)): 309 | """The interface to which stubs will conform.""" 310 | @abc.abstractmethod 311 | def prepare(self, request, timeout): 312 | raise NotImplementedError() 313 | prepare.future = None 314 | @abc.abstractmethod 315 | def accept(self, request, timeout): 316 | raise NotImplementedError() 317 | accept.future = None 318 | @abc.abstractmethod 319 | def accepted(self, request, timeout): 320 | raise NotImplementedError() 321 | accepted.future = None 322 | @abc.abstractmethod 323 | def ping(self, request, timeout): 324 | raise NotImplementedError() 325 | ping.future = None 326 | 327 | def 
beta_create_PaxosServer_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 328 | import protos.paxos_pb2 329 | import protos.paxos_pb2 330 | import protos.paxos_pb2 331 | import protos.paxos_pb2 332 | import protos.paxos_pb2 333 | import protos.paxos_pb2 334 | import protos.paxos_pb2 335 | import protos.paxos_pb2 336 | request_deserializers = { 337 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.FromString, 338 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.FromString, 339 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString, 340 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.FromString, 341 | } 342 | response_serializers = { 343 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.SerializeToString, 344 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.SerializeToString, 345 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString, 346 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.SerializeToString, 347 | } 348 | method_implementations = { 349 | ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept), 350 | ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted), 351 | ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 352 | ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare), 353 | } 354 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 355 | return beta_implementations.server(method_implementations, options=server_options) 356 | 357 | def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 358 | 
import protos.paxos_pb2 359 | import protos.paxos_pb2 360 | import protos.paxos_pb2 361 | import protos.paxos_pb2 362 | import protos.paxos_pb2 363 | import protos.paxos_pb2 364 | import protos.paxos_pb2 365 | import protos.paxos_pb2 366 | request_serializers = { 367 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.SerializeToString, 368 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.SerializeToString, 369 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString, 370 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.SerializeToString, 371 | } 372 | response_deserializers = { 373 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.FromString, 374 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.FromString, 375 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString, 376 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.FromString, 377 | } 378 | cardinalities = { 379 | 'accept': cardinality.Cardinality.UNARY_UNARY, 380 | 'accepted': cardinality.Cardinality.UNARY_UNARY, 381 | 'ping': cardinality.Cardinality.UNARY_UNARY, 382 | 'prepare': cardinality.Cardinality.UNARY_UNARY, 383 | } 384 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 385 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options) 386 | # @@protoc_insertion_point(module_scope) 387 | -------------------------------------------------------------------------------- /python-python/run_codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015, Google Inc. 3 | # All rights reserved. 
4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are 7 | # met: 8 | # 9 | # * Redistributions of source code must retain the above copyright 10 | # notice, this list of conditions and the following disclaimer. 11 | # * Redistributions in binary form must reproduce the above 12 | # copyright notice, this list of conditions and the following disclaimer 13 | # in the documentation and/or other materials provided with the 14 | # distribution. 15 | # * Neither the name of Google Inc. nor the names of its 16 | # contributors may be used to endorse or promote products derived from 17 | # this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | # Runs the protoc with gRPC plugin to generate protocol messages and gRPC stubs. 32 | protoc -I . --python_out=. --grpc_out=. --plugin=protoc-gen-grpc=`which grpc_python_plugin` ./protos/dist_sgd.proto 33 | protoc -I . --python_out=. --grpc_out=. 
--plugin=protoc-gen-grpc=`which grpc_python_plugin` ./protos/paxos.proto 34 | -------------------------------------------------------------------------------- /python-python/server.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------ 2 | # Implements a parameter server. The server takes parameter updates in and 3 | # sends back the most up to date parameters. This server also keeps track of 4 | # the current training/test error. 5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import print_function 9 | import time 10 | 11 | import dist_sgd_pb2 12 | from sets import Set 13 | 14 | import autograd.numpy as np 15 | import autograd.numpy.random as npr 16 | from autograd import grad 17 | 18 | from nnet.neural_net import * 19 | from protobuf_utils.utils import * 20 | from server_utils.utils import * 21 | 22 | import traceback 23 | 24 | _ONE_DAY_IN_SECONDS = 60 * 60 * 24 25 | 26 | _REQUIRED_CHILDREN = 1 27 | 28 | # Data files that we are training from. This is the small demo set. 
29 | images_fname = 'data/images(16).npy' 30 | labels_fname = 'data/output_labels(16).npy' 31 | 32 | class ParamFeeder(dist_sgd_pb2.BetaParamFeederServicer): 33 | def __init__(self, W = None, prevBatch=None): 34 | # Keeps track of all child IDs that it has seen so far 35 | self.child_ids = Set([]) 36 | 37 | # Load and process Caltech data 38 | self.train_images, self.train_labels, self.test_images, self.test_labels = load_caltech100(images_fname, labels_fname) 39 | self.image_input_d = self.train_images.shape[1] 40 | 41 | # Network parameters 42 | self.layer_sizes = [self.image_input_d, 800, 600, 400, 350, 250, 101] 43 | 44 | # Training parameters 45 | self.param_scale = 0.1 46 | self.learning_rate = 1e-5 47 | self.momentum = 0.9 48 | self.batch_size = 256 49 | self.num_epochs = 50 50 | self.L2_reg = 1.0 51 | 52 | # Make neural net functions 53 | self.N_weights, self.pred_fun, self.loss_fun, self.frac_err = make_nn_funs(self.layer_sizes, self.L2_reg) 54 | self.loss_grad = grad(self.loss_fun) 55 | 56 | # Initialize weights 57 | if W is None: 58 | rs = npr.RandomState() 59 | self.W = rs.randn(self.N_weights) * self.param_scale 60 | else: 61 | # Passed in weights 62 | self.W = W 63 | self.param_len = self.W.shape[0] 64 | log_info("# of parameters:") 65 | log_info(self.param_len) 66 | 67 | # Train with sgd 68 | self.batch_idxs = make_batches(self.train_images.shape[0], self.batch_size) 69 | 70 | # Set the current batch to zero unless it has been passed in 71 | self.epoch = 0 72 | if prevBatch is None: 73 | self.batch_num = 0 74 | else: 75 | self.batch_num = prevBatch 76 | self.n_batches = len(self.batch_idxs) 77 | 78 | # Initialize information about the clients 79 | self.n_childs = 0 80 | self.max_client_id = 0 81 | 82 | # Intializes starting information about training 83 | self.prev_test_perf = 1 84 | 85 | # The batches that are currently being processed 86 | self.batches_processing = {} 87 | 88 | # The batches that were failed to process, model training machine may 
have failed 89 | # Send these batches to a new machine 90 | self.batches_unprocessed = [] 91 | 92 | log_info('Data loaded on server, waiting for clients....') 93 | log_info('Number of child processes: 0') 94 | 95 | # Logs the current performance of the model. Called once per epoch. 96 | def log_info_perf(self, epoch): 97 | test_perf = self.frac_err(self.W, self.test_images, self.test_labels) 98 | train_perf = self.frac_err(self.W, self.train_images, self.train_labels) 99 | if test_perf > self.prev_test_perf: 100 | self.learning_rate = 0.1 * self.learning_rate 101 | self.prev_test_perf = test_perf 102 | log_info("Epoch {0}, TrainErr {1:5}, TestErr {2:5}, LR {3:2}".format(self.epoch, train_perf, test_perf, self.learning_rate)) 103 | 104 | # Streams updates from the client. 105 | def GetUpdates(self, request_iterator, context): 106 | tensor_bytes = '' 107 | for subtensor in request_iterator: 108 | tensor_bytes = tensor_bytes + subtensor.tensor_content 109 | 110 | # Serialize the tensor 111 | grad_W = convert_bytes_to_array(tensor_bytes) 112 | 113 | # Gradient descent 114 | self.W -= 0.5 * self.learning_rate * grad_W 115 | 116 | return dist_sgd_pb2.StatusCode(status=1) 117 | 118 | # Sends the next batch that the client should process 119 | def SendNextBatch(self, request, context): 120 | # Figure out what the maximum client_id is. If client_id does not exist, 121 | # assigns the client a new id. 
122 | if request.client_id == 0: 123 | self.max_client_id += 1 124 | request.client_id = self.max_client_id 125 | else: 126 | self.max_client_id = max(request.client_id, self.max_client_id) 127 | 128 | # Does not start until a sufficient number of child processes exists 129 | self.child_ids.add(request.client_id) 130 | if len(self.child_ids) != self.n_childs: 131 | self.n_childs = len(self.child_ids) 132 | log_info('Number of child processes: ' + str(len(self.child_ids))) 133 | if len(self.child_ids) < _REQUIRED_CHILDREN: 134 | return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = -1) 135 | 136 | # Logs information about previous batch timing 137 | if request.prev_data_indx != -1: 138 | log_info('Time taken to process batch {0} was {1:.2f} by client {2}'.format(request.prev_data_indx, (time.time() - self.batches_processing[request.prev_data_indx]), request.client_id)) 139 | del self.batches_processing[request.prev_data_indx] 140 | 141 | # log_info epoch information if we've hit the end of an epoch 142 | if self.batch_num == self.n_batches: 143 | self.batch_num, self.epoch = 0, self.epoch + 1 144 | self.log_info_perf(self.epoch) 145 | 146 | # Takes any previously failed batches first, otherwise takes next batch 147 | if self.batches_unprocessed != []: 148 | cur_batchnum = self.batches_unprocessed.pop(0) 149 | else: 150 | cur_batchnum, self.batch_num = self.batch_num, self.batch_num + 1 151 | 152 | # Save the time that the next batch was sent out on the server 153 | self.batches_processing[cur_batchnum] = time.time() 154 | 155 | return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = cur_batchnum) 156 | 157 | # This sends the parameters from the server to the client by converting the tensor into a 158 | # protobuffer and streaming it 159 | def SendParams(self, request, context): 160 | CHUNK_SIZE = 524228 161 | tensor_bytes = convert_array_to_bytes(self.W) 162 | tensor_bytes_len = len(tensor_bytes) 163 | tensor_chunk_count = 0 164 | 
try: 165 | while len(tensor_bytes): 166 | tensor_chunk_count += 1 167 | tensor_content = tensor_bytes[:CHUNK_SIZE] 168 | tensor_bytes = tensor_bytes[CHUNK_SIZE:] 169 | yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx= -1) 170 | except Exception, e: 171 | traceback.print_exc() 172 | 173 | # Function to ping the server to see if it is available 174 | def ping(self, request, context): 175 | return dist_sgd_pb2.empty() 176 | 177 | # Main function that is called to instantiate the server and have 178 | # it connect and send or receive parameters from clients. 179 | def serve(hostname, W = None, prev_batch = None, local_id = None): 180 | # Set up the server on port 50051 181 | hostname = '[::]:50051' 182 | BATCH_TRAIN_TIMEOUT = 60 183 | 184 | # Instantiate the server and add the port 185 | param_feeder = ParamFeeder(W, prev_batch) 186 | server = dist_sgd_pb2.beta_create_ParamFeeder_server(param_feeder) 187 | server.add_insecure_port(hostname) 188 | 189 | # Begin the server 190 | server.start() 191 | try: 192 | while True: 193 | time.sleep(BATCH_TRAIN_TIMEOUT) 194 | 195 | except KeyboardInterrupt: 196 | server.stop(0) 197 | raise KeyboardInterrupt 198 | 199 | if __name__ == '__main__': 200 | serve('[::]:50051') -------------------------------------------------------------------------------- /python-python/start.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #!
/home/candokevin/anaconda2/bin/python 3 | cd /home/candokevin/stash/distributed-sgd/python-python 4 | git pull 5 | rm /home/candokevin/log.txt 6 | while true; do 7 | python client.py >> /home/candokevin/log.txt 8 | done 9 | -------------------------------------------------------------------------------- /slides/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/.Rhistory -------------------------------------------------------------------------------- /slides/common_slides.sty: -------------------------------------------------------------------------------- 1 | \setbeamertemplate{navigation symbols}{} 2 | \let\tempone\itemize 3 | \let\temptwo\enditemize 4 | \renewenvironment{itemize}{\tempone\addtolength{\itemsep}{0.5\baselineskip}}{\temptwo} 5 | % \usepackage{beamerthemeshadow} 6 | \usepackage{ulem} 7 | % \usepackage{movie15} 8 | \usepackage{mathpazo} 9 | % \usepackage{palatino} 10 | 11 | \usepackage{tikz} 12 | \usepackage{hyperref} 13 | \usepackage{natbib} 14 | \usepackage{pgffor} 15 | \usepackage{booktabs} 16 | \usepackage{amssymb} 17 | \usepackage{tikz,etoolbox} 18 | \usepackage{subcaption} 19 | \usepackage{url} 20 | \usepackage{pgf} 21 | \usepackage{latexsym} 22 | \usepackage{amsfonts} 23 | \usepackage{amssymb} 24 | \usepackage{amsthm} 25 | \usepackage{algorithm} 26 | \usepackage{amsmath} 27 | \usepackage{tabularx} 28 | \usepackage{mathtools} 29 | \usepackage{algorithm} 30 | \usepackage{algpseudocode} 31 | 32 | \usetikzlibrary{arrows,positioning,automata,positioning,spy,matrix,scopes,chains} 33 | 34 | \setbeamersize{text margin left=6mm} 35 | \setbeamersize{text margin right=6mm} 36 | \renewcommand{\insertnavigation}[1]{} 37 | \setbeamertemplate{headline}{} 38 | \setbeamertemplate{footline}{} 39 | % \usefonttheme{professionalfonts} 40 | % make itemize things larger 41 | %\setbeamerfont*{itemize/enumerate 
body}{size=\Large} 42 | %\setbeamerfont*{itemize/enumerate subbody}{size=\large} 43 | \setbeamercovered{transparent} 44 | \mode 45 | %\mode 46 | \linespread{1.25} 47 | 48 | \usepackage{color} 49 | \usepackage{multirow} 50 | \usepackage{rotating} 51 | \usepackage[all,dvips]{xy} 52 | \usepackage{colortbl} 53 | \usepackage{graphicx} 54 | \usepackage{verbatim} 55 | \usepackage{framed} 56 | \usepackage{natbib} 57 | \usepackage[labelformat=empty]{caption} 58 | \newcommand{\air}{\vspace{0.25cm}} 59 | % \newcommand{\mair}{\vspace{-0.25cm}} 60 | 61 | \setbeamertemplate{navigation symbols}{}%remove navigation symbols 62 | \renewcommand{\rmdefault}{crm} 63 | \newcommand{\lnbrack}{{\normalfont [}} 64 | \newcommand{\rnbrack}{{\normalfont ]}\thinspace} 65 | \newcommand{\lbbrack}{\textcolor{red}{\textbf{[}}} 66 | \newcommand{\rbbrack}{\textcolor{red}{\textbf{]}}\thinspace} 67 | \definecolor{vermillion}{RGB}{213,94,0} 68 | 69 | \definecolor{orange}{RGB}{230,159,0} 70 | \definecolor{skyblue}{RGB}{86,180,233} 71 | \definecolor{bluegreen}{RGB}{0,158,115} 72 | \definecolor{myyellow}{RGB}{240,228,66} % i dunno if this is the same as standard yellow 73 | \definecolor{myblue}{RGB}{0,114,178} 74 | \definecolor{vermillion}{RGB}{213,94,0} 75 | \definecolor{redpurple}{RGB}{204,121,167} 76 | \definecolor{lightgrey}{RGB}{234,234,234} 77 | 78 | \AtBeginSection[] 79 | { 80 | \begin{frame} 81 | \frametitle{Contents} 82 | \tableofcontents[currentsection] 83 | \end{frame} 84 | } 85 | % \AtBeginSection[]{ 86 | % \begin{frame} 87 | % \vfill 88 | % \centering 89 | % \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} 90 | % \usebeamerfont{title}\insertsectionhead\par% 91 | % \end{beamercolorbox} 92 | % \vfill 93 | % \end{frame} 94 | % } 95 | 96 | \newcommand{\boldA}{\mathbf{A}} 97 | \newcommand{\boldB}{\mathbf{B}} 98 | \newcommand{\boldC}{\mathbf{C}} 99 | \newcommand{\boldD}{\mathbf{D}} 100 | \newcommand{\boldE}{\mathbf{E}} 101 | \newcommand{\boldF}{\mathbf{F}} 102 | 
\newcommand{\boldG}{\mathbf{G}} 103 | \newcommand{\boldH}{\mathbf{H}} 104 | \newcommand{\boldI}{\mathbf{I}} 105 | \newcommand{\boldJ}{\mathbf{J}} 106 | \newcommand{\boldK}{\mathbf{K}} 107 | \newcommand{\boldL}{\mathbf{L}} 108 | \newcommand{\boldM}{\mathbf{M}} 109 | \newcommand{\boldN}{\mathbf{N}} 110 | \newcommand{\boldO}{\mathbf{O}} 111 | \newcommand{\boldP}{\mathbf{P}} 112 | \newcommand{\boldQ}{\mathbf{Q}} 113 | \newcommand{\boldR}{\mathbf{R}} 114 | \newcommand{\boldS}{\mathbf{S}} 115 | \newcommand{\boldT}{\mathbf{T}} 116 | \newcommand{\boldU}{\mathbf{U}} 117 | \newcommand{\boldV}{\mathbf{V}} 118 | \newcommand{\boldW}{\mathbf{W}} 119 | \newcommand{\boldX}{\mathbf{X}} 120 | \newcommand{\boldY}{\mathbf{Y}} 121 | \newcommand{\boldZ}{\mathbf{Z}} 122 | \newcommand{\bolda}{\mathbf{a}} 123 | \newcommand{\boldb}{\mathbf{b}} 124 | \newcommand{\boldc}{\mathbf{c}} 125 | \newcommand{\boldd}{\mathbf{d}} 126 | \newcommand{\bolde}{\mathbf{e}} 127 | \newcommand{\boldf}{\mathbf{f}} 128 | \newcommand{\boldg}{\mathbf{g}} 129 | \newcommand{\boldh}{\mathbf{h}} 130 | \newcommand{\boldi}{\mathbf{i}} 131 | \newcommand{\boldj}{\mathbf{j}} 132 | \newcommand{\boldk}{\mathbf{k}} 133 | \newcommand{\boldl}{\mathbf{l}} 134 | \newcommand{\boldm}{\mathbf{m}} 135 | \newcommand{\boldn}{\mathbf{n}} 136 | \newcommand{\boldo}{\mathbf{o}} 137 | \newcommand{\boldp}{\mathbf{p}} 138 | \newcommand{\boldq}{\mathbf{q}} 139 | \newcommand{\boldr}{\mathbf{r}} 140 | \newcommand{\bolds}{\mathbf{s}} 141 | \newcommand{\boldt}{\mathbf{t}} 142 | \newcommand{\boldu}{\mathbf{u}} 143 | \newcommand{\boldv}{\mathbf{v}} 144 | \newcommand{\boldw}{\mathbf{w}} 145 | \newcommand{\boldx}{\mathbf{x}} 146 | \newcommand{\boldy}{\mathbf{y}} 147 | \newcommand{\boldz}{\mathbf{z}} 148 | 149 | \newcommand{\bolddelta}{\boldsymbol{\delta}} 150 | \newcommand{\indicator}{\mathbf{1}} 151 | \newcommand{\mcA}{\mathcal{A}} 152 | \newcommand{\mcB}{\mathcal{B}} 153 | \newcommand{\mcC}{\mathcal{C}} 154 | \newcommand{\mcD}{\mathcal{D}} 155 | 
\newcommand{\mcE}{\mathcal{E}} 156 | \newcommand{\mcF}{\mathcal{F}} 157 | \newcommand{\mcG}{\mathcal{G}} 158 | \newcommand{\mcH}{\mathcal{H}} 159 | \newcommand{\mcI}{\mathcal{I}} 160 | \newcommand{\mcJ}{\mathcal{J}} 161 | \newcommand{\mcK}{\mathcal{K}} 162 | \newcommand{\mcL}{\mathcal{L}} 163 | \newcommand{\mcM}{\mathcal{M}} 164 | \newcommand{\mcN}{\mathcal{N}} 165 | \newcommand{\mcO}{\mathcal{O}} 166 | \newcommand{\mcP}{\mathcal{P}} 167 | \newcommand{\mcQ}{\mathcal{Q}} 168 | \newcommand{\mcR}{\mathcal{R}} 169 | \newcommand{\mcS}{\mathcal{S}} 170 | \newcommand{\mcT}{\mathcal{T}} 171 | \newcommand{\mcU}{\mathcal{U}} 172 | \newcommand{\mcV}{\mathcal{V}} 173 | \newcommand{\mcW}{\mathcal{W}} 174 | \newcommand{\mcX}{\mathcal{X}} 175 | \newcommand{\mcY}{\mathcal{Y}} 176 | \newcommand{\mcZ}{\mathcal{Z}} 177 | 178 | \newcommand{\reals}{\ensuremath{\mathbb{R}}} 179 | \newcommand{\integers}{\ensuremath{\mathbb{Z}}} 180 | \newcommand{\rationals}{\ensuremath{\mathbb{Q}}} 181 | \newcommand{\naturals}{\ensuremath{\mathbb{N}}} 182 | \newcommand{\trans}{\ensuremath{\mathsf{T}}} 183 | \newcommand{\ident}{\mathbf{I}} 184 | \newcommand{\bzero}{\mathbf{0}} 185 | 186 | \newcommand{\balpha}{\boldsymbol{\alpha}} 187 | \newcommand{\bbeta}{\boldsymbol{\beta}} 188 | \newcommand{\boldeta}{\boldsymbol{\eta}} 189 | \newcommand{\bkappa}{\boldsymbol{\kappa}} 190 | \newcommand{\bgamma}{\boldsymbol{\gamma}} 191 | \newcommand{\bmu}{\boldsymbol{\mu}} 192 | \newcommand{\bphi}{\boldsymbol{\phi}} 193 | \newcommand{\bpi}{\boldsymbol{\pi}} 194 | \newcommand{\bpsi}{\boldsymbol{\psi}} 195 | \newcommand{\bsigma}{\boldsymbol{\sigma}} 196 | \newcommand{\btheta}{\boldsymbol{\theta}} 197 | \newcommand{\bxi}{\boldsymbol{\xi}} 198 | \newcommand{\bGamma}{\boldsymbol{\Gamma}} 199 | \newcommand{\bLambda}{\boldsymbol{\Lambda}} 200 | \newcommand{\bOmega}{\boldsymbol{\Omega}} 201 | \newcommand{\bPhi}{\boldsymbol{\Phi}} 202 | \newcommand{\bPi}{\boldsymbol{\Pi}} 203 | \newcommand{\bPsi}{\boldsymbol{\Psi}} 204 | 
\newcommand{\bSigma}{\boldsymbol{\Sigma}} 205 | \newcommand{\bTheta}{\boldsymbol{\Theta}} 206 | \newcommand{\bUpsilon}{\boldsymbol{\Upsilon}} 207 | \newcommand{\bXi}{\boldsymbol{\Xi}} 208 | \newcommand{\bepsilon}{\boldsymbol{\epsilon}} 209 | 210 | \def\argmin{\operatornamewithlimits{arg\,min}} 211 | \def\argmax{\operatornamewithlimits{arg\,max}} 212 | \def\softmax{\operatornamewithlimits{softmax}} 213 | \def\relu{\operatornamewithlimits{ReLU}} 214 | 215 | \newcommand{\given}{\,|\,} 216 | \newcommand{\distNorm}{\mathcal{N}} 217 | 218 | 219 | 220 | \newcommand{\din}{{d_{\mathrm{in}}}} 221 | \newcommand{\dhid}{{d_{\mathrm{hid}}}} 222 | \newcommand{\dwin}{{d_{\mathrm{win}}}} 223 | \newcommand{\dout}{{d_{\mathrm{out}}}} 224 | \newcommand{\demb}{{d_{\mathrm{emb}}}} 225 | 226 | \algtext*{EndWhile}% Remove "end while" text 227 | \algtext*{EndFor}% Remove "end while" text 228 | \algtext*{EndIf}% Remove "end if" text 229 | \algtext*{EndProcedure}% Remove "end while" text 230 | -------------------------------------------------------------------------------- /slides/img/2d_func.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/2d_func.jpg -------------------------------------------------------------------------------- /slides/img/dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dataset.png -------------------------------------------------------------------------------- /slides/img/deep_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/deep_learning.png 
-------------------------------------------------------------------------------- /slides/img/dist_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_16.png -------------------------------------------------------------------------------- /slides/img/dist_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_train.png -------------------------------------------------------------------------------- /slides/img/downpour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/downpour.png -------------------------------------------------------------------------------- /slides/img/gRPC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/gRPC.png -------------------------------------------------------------------------------- /slides/img/large_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/large_data.png -------------------------------------------------------------------------------- /slides/img/lin_v_nonlin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/lin_v_nonlin.png 
-------------------------------------------------------------------------------- /slides/img/sandblaster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/sandblaster.png -------------------------------------------------------------------------------- /slides/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/main.pdf -------------------------------------------------------------------------------- /slides/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{./common_slides} 3 | \usepackage[absolute,overlay]{textpos} 4 | \usepackage{graphicx} 5 | 6 | 7 | \title{ Distributed Stochastic Gradient Descent } 8 | 9 | \author{Kevin Yang and Michael Farrell} 10 | \begin{document} 11 | 12 | \begin{frame} 13 | \titlepage 14 | \end{frame} 15 | 16 | \begin{frame}{Motivation - Deep Learning} 17 | 18 | \begin{columns}[T] % align columns 19 | \begin{column}{.48\textwidth} 20 | \begin{itemize} 21 | \item Deep-Learning 22 | \begin{itemize} 23 | \item Objective: Learn a complicated, non-linear function that minimizes some loss function 24 | \end{itemize} 25 | \item Why do we need deep models? 26 | \begin{itemize} 27 | \item The class of linear functions is inadequate for many problems.
28 | \end{itemize} 29 | \end{itemize} 30 | \end{column}% 31 | \hfill% 32 | \begin{column}{.48\textwidth} 33 | \begin{figure} 34 | \includegraphics[scale = .35]{./img/deep_learning} 35 | \caption{\scalebox{.3}{http://www.rsipvision.com/exploring-deep-learning/}} 36 | \end{figure} 37 | \begin{figure} 38 | \includegraphics[scale = .17]{./img/lin_v_nonlin} 39 | \caption{\scalebox{.3}{http://sebastianraschka.com/Articles/2014{\_}naive{\_}bayes{\_}1.html}} 40 | \end{figure} 41 | \end{column}% 42 | \end{columns} 43 | \end{frame} 44 | 45 | \begin{frame}{Motivation - Deep Learning} 46 | \begin{itemize} 47 | \item How do we learn these deep models? 48 | \begin{itemize} 49 | \item Choose a random example 50 | \item Run the neural network on the example 51 | \item Adjust the parameters of the network such that our loss function is minimized more than it was before 52 | \item Repeat 53 | \end{itemize} 54 | \pause 55 | \item Difficulties? 56 | \begin{itemize} 57 | \item Local Minima 58 | \item Non-convexity 59 | \item Neural Networks can have millions or even billions of parameters 60 | \end{itemize} 61 | \end{itemize} 62 | \begin{textblock*}{5cm}(8cm,.5cm) % {block width} (coords) 63 | \includegraphics[scale = .3]{./img/2d_func} 64 | \end{textblock*} 65 | \end{frame} 66 | 67 | \begin{frame}{Motivation - SGD} 68 | \begin{itemize} 69 | \item How do we maximize our reward function? 
70 | \begin{itemize} 71 | \item One common technique is Stochastic Gradient Descent 72 | \item $\mathbf w$ is the vector of parameters for the model 73 | \item $\eta$ is the learning rate 74 | \item $\mathbf f(\mathbf w)$ is the loss function evaluated with the current parameters $\mathbf w$ 75 | \item 76 | \begin{algorithmic} 77 | \State $\mathbf w \gets \mathbf 0$ 78 | \While {$\mathbf f(\mathbf w)$ is not minimized} 79 | \For {$i = 1, n$} 80 | \State $\mathbf w \gets \mathbf w - \eta\nabla f(\mathbf w)$ 81 | \EndFor 82 | \EndWhile 83 | 84 | \end{algorithmic} 85 | \item As the number of training examples, $n$, and the number of parameters, $|\mathbf w|$, increases, this algorithm quickly becomes very slow... 86 | \end{itemize} 87 | \end{itemize} 88 | \end{frame} 89 | 90 | \begin{frame}{Motivation - Distributed SGD} 91 | \begin{itemize} 92 | \item Since some of these models take days/weeks/months to run, we would hope that we could use a distributed computing cluster in order to parallelize this process. 93 | \pause 94 | \item Learn from Google! 95 | \begin{itemize} 96 | \item DistBelief- 2012 97 | \begin{itemize} 98 | \item Downpour SGD 99 | \item Sandblaster L-BFGS 100 | \end{itemize} 101 | \item TensorFlow- 2015 102 | \begin{itemize} 103 | \item gRPC 104 | \end{itemize} 105 | \end{itemize} 106 | \end{itemize} 107 | 108 | \end{frame} 109 | 110 | \begin{frame}{DistBelief - Downpour SGD} 111 | \begin{itemize} 112 | \item ``An asynchronous stochastic gradient descent procedure supporting a large number of model replicas." \footnote{Diagram taken from Dean et al. 
\it{Large Scale Distributed Deep Networks} 113 | } 114 | \end{itemize} 115 | $$\includegraphics[scale = .5]{./img/downpour}$$ 116 | \end{frame} 117 | 118 | \begin{frame}{DistBelief - Sandblaster L-BFGS} 119 | \begin{itemize} 120 | \item ``A framework that supports a variety of distributed batch optimization procedures, including a distributed implementation of L-BFGS" \footnote{Diagram taken from Dean et al. \it{Large Scale Distributed Deep Networks}} 121 | \end{itemize} 122 | $$\includegraphics[scale = .5]{./img/sandblaster}$$ 123 | \end{frame} 124 | 125 | \begin{frame}{TensorFlow-GRPC} 126 | \begin{itemize} 127 | \item Second Generation ML Model focused on distributing models to CPUs and GPUs 128 | \item Uses the high performance RPC framework (GRPC \footnote{Diagram taken from http://www.grpc.io/}) in order to communicate between separate processes 129 | \begin{itemize} 130 | \item Uses Protocol Buffers -v3 131 | \item C-based 132 | \item Client-server stubs in 10+ languages and counting 133 | \end{itemize} 134 | \end{itemize} 135 | $$\includegraphics[scale = .2]{./img/gRPC}$$ 136 | \end{frame} 137 | 138 | \begin{frame}{DistBelief/TensorFlow Summary} 139 | \begin{itemize} 140 | \item TensorFlow is basically the second version of DistBelief that is approximately twice as fast and much more user-friendly. 141 | \item Results from DistBelief \footnote{Diagram taken from Dean et al. \it{Large Scale Distributed Deep Networks}}: 142 | \end{itemize} 143 | $$\includegraphics[scale = .18]{./img/dist_train}\includegraphics[scale = .18]{./img/dist_16}$$ 144 | \end{frame} 145 | 146 | \begin{frame}{Our Project} 147 | \begin{itemize} 148 | \item We frequently run into scenarios where we have a model that trains incredibly slowly on our local machines. As a consequence, we hope to benefit from additional cloud computing resources and build our own Distributed SGD system based on DistBelief and TensorFlow systems.
149 | \begin{itemize} 150 | \item The Distributed SGD system will have the user give a function that returns the outputs of a model, a function that returns the gradients of a model, and the number of machines to train the model on. 151 | \item Use GRPC with Protocol Buffers to communicate between processes, similar to TensorFlow. 152 | \item Implement Downpour-SGD which seems to be the most effective model with limited resources. 153 | \end{itemize} 154 | \end{itemize} 155 | \end{frame} 156 | 157 | 158 | \begin{frame}{Our Example} 159 | \begin{itemize} 160 | \item To test our system, we're working with the Caltech 101 Computational Vision dataset \footnote{L. Fei-Fei, R. Fergus and P. Perona. \it{Learning generative visual models 161 | from few training examples: an incremental Bayesian approach tested on 162 | 101 object categories.}}. In this dataset, there are about 20,000 pictures of objects in 101 categories. All of these images are around 300 x 200 pixels in size. 163 | \item We've implemented a convolutional neural net that tries to classify what object is represented in the image. 164 | 165 | $$\includegraphics[scale = .30]{./img/dataset.png}$$ 166 | \end{itemize} 167 | \end{frame} 168 | 169 | \begin{frame}{Computational Resources} 170 | \begin{itemize} 171 | \item We are using Google Cloud Compute Engine to set up VMs and run the code. To run classification on our image dataset, we're using small instances with 6GB of RAM with 2 cores. This has a rate of 7.8 cents per hour. 172 | \item On a machine of this size, running 10 epochs of gradient descent takes 56 minutes. 173 | \item To streamline things, we've preconfigured images of a parameter server and model training server that are already set up with relevant code, tools, and libraries. 174 | \item As a result, setting up and launching the compute instances necessary for model training takes only a couple lines.
175 | \end{itemize} 176 | \end{frame} 177 | 178 | \begin{frame}{Implementing Downpour-SGD} 179 | \begin{itemize} 180 | \item The Downpour-SGD requires the passing of parameters and parameter updates between processes. In our example, we have 74,770,901 parameters and the size of our parameters is 0.5GB. 181 | \item Bottleneck here is the network. Parameters can be $>>$0.5Gb. 182 | \item We can leverage the fact that some of these models are extremely sparse 183 | \begin{itemize} 184 | \item only send parameters updated 185 | \item only update parameters every $n_x$ times 186 | \end{itemize} 187 | \item Explore protocol buffer streams 188 | \end{itemize} 189 | $$\includegraphics[scale = .27]{./img/large_data}$$ 190 | \end{frame} 191 | 192 | \begin{frame}{Main Distributed System Challenges} 193 | \begin{itemize} 194 | \item Network Issues 195 | \begin{itemize} 196 | \item We have to deal with network latency and try to reduce transportation cost as much as possible in order for our models to train properly. 197 | \item We would like to experiment with a couple different RPCs to optimize the speed of our system. 198 | \end{itemize} 199 | \item Fault tolerance 200 | \begin{itemize} 201 | \item We need to make our system as resilient as possible against failures. Because all of these machines are doing a lot of computation while running gradient descent and manipulating parameters, these systems are bound to fail with relatively high frequency. 202 | \item Having methods in place to detect and remedy the failure of parameter servers and model replicas will be critical. 203 | \end{itemize} 204 | 205 | \end{itemize} 206 | \end{frame} 207 | 208 | \end{document} 209 | --------------------------------------------------------------------------------