├── .gitignore ├── .gitmodules ├── CS262%20Final%20Project ├── common.sty ├── images │ ├── downpour.png │ ├── locally.png │ ├── remotely.png │ ├── sgd_results.png │ └── speeds.png ├── main.aux ├── main.bcf ├── main.bib ├── main.log ├── main.out ├── main.pdf ├── main.run.xml ├── main.synctex.gz └── main.tex ├── Dev-Notebook-Kevin.md ├── Dev-Notebook-Mike.md ├── README.md ├── client_list.txt ├── lua-lua ├── README.md ├── cleanup.py ├── copy_files.py ├── data │ ├── demo-train.hdf5 │ ├── demo-val.hdf5 │ ├── demo.src.dict │ ├── demo.targ.dict │ ├── src-train.txt │ ├── src-val.txt │ ├── targ-train.txt │ └── targ-val.txt ├── demo_server.lua ├── gcloud_commands.txt ├── install_parallel.sh ├── locally.png ├── outputs │ ├── 104.154.239.139 │ │ ├── ada_4_rem.png │ │ ├── ada_4_rem.txt │ │ ├── ada_8_rem.png │ │ └── ada_8_rem.txt │ ├── 104.197.106.197 │ │ ├── ada_2_rem.png │ │ └── ada_2_rem.txt │ ├── 104.197.222.148 │ │ ├── ada_2.txt │ │ ├── ada_2_loc.png │ │ ├── ada_2_loc.txt │ │ └── reg_2.txt │ ├── 104.197.250.103 │ │ ├── reg_1.txt │ │ ├── reg_2.txt │ │ ├── reg_2_loc.png │ │ └── reg_2_loc.txt │ ├── 130.211.192.196 │ │ ├── reg_1_loc.png │ │ ├── reg_1_loc.txt │ │ └── reg_2.txt │ └── 130.211.204.149 │ │ ├── ada_1.txt │ │ ├── ada_1_loc.png │ │ ├── ada_1_loc.txt │ │ └── reg_2.txt ├── parallel │ └── init.lua ├── parse_outputs.py ├── remotely.png ├── server.lua ├── setup_image.sh └── startup.sh ├── python-python ├── README.md ├── client.py ├── data │ ├── images(16).npy │ └── output_labels(16).npy ├── dist_sgd_pb2.py ├── image_classes.txt ├── neural_net.py ├── nnet │ ├── __init__.py │ ├── __init__.pyc │ ├── neural_net.py │ └── neural_net.pyc ├── paxos.py ├── paxos_pb2.py ├── protobuf_utils │ ├── __init__.py │ ├── __init__.pyc │ ├── utils.py │ └── utils.pyc ├── protos │ ├── dist_sgd.proto │ ├── dist_sgd_pb2.py │ ├── paxos.proto │ └── paxos_pb2.py ├── run_codegen.sh ├── server.py └── start.sh └── slides ├── .Rhistory ├── common_slides.sty ├── img ├── 2d_func.jpg ├── dataset.png ├── 
deep_learning.png ├── dist_16.png ├── dist_train.png ├── downpour.png ├── gRPC.png ├── large_data.png ├── lin_v_nonlin.png └── sandblaster.png ├── main.pdf └── main.tex /.gitignore: -------------------------------------------------------------------------------- 1 | # Annoying files 2 | .DS_Store 3 | .ipynb_checkpoints 4 | Icon 5 | 6 | # large data files 7 | basic/output_labels(128).npy 8 | 9 | # Install files 10 | install/ 11 | 12 | # Model saves 13 | *.t7 14 | 15 | 16 | # Annoying text files 17 | slides/main.aux 18 | slides/main.log 19 | slides/main.nav 20 | slides/main.out 21 | slides/main.snm 22 | slides/main.synctex.gz 23 | slides/main.toc -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lua-lua/End-To-End-Generative-Dialogue"] 2 | path = lua-lua/End-To-End-Generative-Dialogue 3 | url = https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue.git 4 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/common.sty: -------------------------------------------------------------------------------- 1 | \usepackage{amsmath} 2 | \usepackage{amssymb} 3 | \usepackage{url} 4 | \usepackage{mathpazo} 5 | \usepackage{palatino} 6 | \usepackage{fullpage,graphicx} 7 | \usepackage{tikz} 8 | \usepackage{tikz-qtree} 9 | \usepackage[font={it}]{caption} 10 | % \usepackage[right, mathlines]{lineno} 11 | 12 | \usepackage[procnames]{listings} 13 | \usepackage{color} 14 | 15 | \definecolor{keywords}{RGB}{255,0,90} 16 | \definecolor{comments}{RGB}{0,0,113} 17 | \definecolor{red}{RGB}{160,0,0} 18 | \definecolor{green}{RGB}{0,150,0} 19 | 20 | \lstset{language=Python, 21 | basicstyle=\ttfamily\small, 22 | keywordstyle=\color{keywords}, 23 | commentstyle=\color{comments}, 24 | stringstyle=\color{red}, 25 | showstringspaces=false, 26 | identifierstyle=\color{green}, 27 | 
procnamekeys={def,class}} 28 | 29 | % \linenumbers 30 | 31 | \usetikzlibrary{shapes.geometric} 32 | \usetikzlibrary{patterns} 33 | \usetikzlibrary{matrix} 34 | \usetikzlibrary{automata} 35 | \usepackage{booktabs} 36 | 37 | % \pagestyle{empty} 38 | \pagenumbering{arabic} 39 | \usepackage{subfig} 40 | \usepackage{comment} 41 | 42 | \newcommand{\boldA}{\boldsymbol{A}} 43 | \newcommand{\boldB}{\boldsymbol{B}} 44 | \newcommand{\boldC}{\boldsymbol{C}} 45 | \newcommand{\boldD}{\boldsymbol{D}} 46 | \newcommand{\boldE}{\boldsymbol{E}} 47 | \newcommand{\boldF}{\boldsymbol{F}} 48 | \newcommand{\boldG}{\boldsymbol{G}} 49 | \newcommand{\boldH}{\boldsymbol{H}} 50 | \newcommand{\boldI}{\boldsymbol{I}} 51 | \newcommand{\boldJ}{\boldsymbol{J}} 52 | \newcommand{\boldK}{\boldsymbol{K}} 53 | \newcommand{\boldL}{\boldsymbol{L}} 54 | \newcommand{\boldM}{\boldsymbol{M}} 55 | \newcommand{\boldN}{\boldsymbol{N}} 56 | \newcommand{\boldO}{\boldsymbol{O}} 57 | \newcommand{\boldP}{\boldsymbol{P}} 58 | \newcommand{\boldQ}{\boldsymbol{Q}} 59 | \newcommand{\boldR}{\boldsymbol{R}} 60 | \newcommand{\boldS}{\boldsymbol{S}} 61 | \newcommand{\boldT}{\boldsymbol{T}} 62 | \newcommand{\boldU}{\boldsymbol{U}} 63 | \newcommand{\boldV}{\boldsymbol{V}} 64 | \newcommand{\boldW}{\boldsymbol{W}} 65 | \newcommand{\boldX}{\boldsymbol{X}} 66 | \newcommand{\boldY}{\boldsymbol{Y}} 67 | \newcommand{\boldZ}{\boldsymbol{Z}} 68 | \newcommand{\bolda}{\boldsymbol{a}} 69 | \newcommand{\boldb}{\boldsymbol{b}} 70 | \newcommand{\boldc}{\boldsymbol{c}} 71 | \newcommand{\boldd}{\boldsymbol{d}} 72 | \newcommand{\bolde}{\boldsymbol{e}} 73 | \newcommand{\boldf}{\boldsymbol{f}} 74 | \newcommand{\boldg}{\boldsymbol{g}} 75 | \newcommand{\boldh}{\boldsymbol{h}} 76 | \newcommand{\boldi}{\boldsymbol{i}} 77 | \newcommand{\boldj}{\boldsymbol{j}} 78 | \newcommand{\boldk}{\boldsymbol{k}} 79 | \newcommand{\boldl}{\boldsymbol{l}} 80 | \newcommand{\boldm}{\boldsymbol{m}} 81 | \newcommand{\boldn}{\boldsymbol{n}} 82 | 
\newcommand{\boldo}{\boldsymbol{o}} 83 | \newcommand{\boldp}{\boldsymbol{p}} 84 | \newcommand{\boldq}{\boldsymbol{q}} 85 | \newcommand{\boldr}{\boldsymbol{r}} 86 | \newcommand{\bolds}{\boldsymbol{s}} 87 | \newcommand{\boldt}{\boldsymbol{t}} 88 | \newcommand{\boldu}{\boldsymbol{u}} 89 | \newcommand{\boldv}{\boldsymbol{v}} 90 | \newcommand{\boldw}{\boldsymbol{w}} 91 | \newcommand{\boldx}{\boldsymbol{x}} 92 | \newcommand{\boldy}{\boldsymbol{y}} 93 | \newcommand{\boldz}{\boldsymbol{z}} 94 | 95 | \newcommand{\mcA}{\mathcal{A}} 96 | \newcommand{\mcB}{\mathcal{B}} 97 | \newcommand{\mcC}{\mathcal{C}} 98 | \newcommand{\mcD}{\mathcal{D}} 99 | \newcommand{\mcE}{\mathcal{E}} 100 | \newcommand{\mcF}{\mathcal{F}} 101 | \newcommand{\mcG}{\mathcal{G}} 102 | \newcommand{\mcH}{\mathcal{H}} 103 | \newcommand{\mcI}{\mathcal{I}} 104 | \newcommand{\mcJ}{\mathcal{J}} 105 | \newcommand{\mcK}{\mathcal{K}} 106 | \newcommand{\mcL}{\mathcal{L}} 107 | \newcommand{\mcM}{\mathcal{M}} 108 | \newcommand{\mcN}{\mathcal{N}} 109 | \newcommand{\mcO}{\mathcal{O}} 110 | \newcommand{\mcP}{\mathcal{P}} 111 | \newcommand{\mcQ}{\mathcal{Q}} 112 | \newcommand{\mcR}{\mathcal{R}} 113 | \newcommand{\mcS}{\mathcal{S}} 114 | \newcommand{\mcT}{\mathcal{T}} 115 | \newcommand{\mcU}{\mathcal{U}} 116 | \newcommand{\mcV}{\mathcal{V}} 117 | \newcommand{\mcW}{\mathcal{W}} 118 | \newcommand{\mcX}{\mathcal{X}} 119 | \newcommand{\mcY}{\mathcal{Y}} 120 | \newcommand{\mcZ}{\mathcal{Z}} 121 | 122 | \newcommand{\reals}{\ensuremath{\mathbb{R}}} 123 | \newcommand{\integers}{\ensuremath{\mathbb{Z}}} 124 | \newcommand{\rationals}{\ensuremath{\mathbb{Q}}} 125 | \newcommand{\naturals}{\ensuremath{\mathbb{N}}} 126 | \newcommand{\trans}{\ensuremath{\mathsf{T}}} 127 | \newcommand{\ident}{\boldsymbol{I}} 128 | \newcommand{\bzero}{\boldsymbol{0}} 129 | 130 | \newcommand{\balpha}{\boldsymbol{\alpha}} 131 | \newcommand{\bbeta}{\boldsymbol{\beta}} 132 | \newcommand{\boldeta}{\boldsymbol{\eta}} 133 | \newcommand{\bkappa}{\boldsymbol{\kappa}} 
134 | \newcommand{\bgamma}{\boldsymbol{\gamma}} 135 | \newcommand{\bmu}{\boldsymbol{\mu}} 136 | \newcommand{\bphi}{\boldsymbol{\phi}} 137 | \newcommand{\bpi}{\boldsymbol{\pi}} 138 | \newcommand{\bpsi}{\boldsymbol{\psi}} 139 | \newcommand{\bsigma}{\boldsymbol{\sigma}} 140 | \newcommand{\btheta}{\boldsymbol{\theta}} 141 | \newcommand{\bxi}{\boldsymbol{\xi}} 142 | \newcommand{\bGamma}{\boldsymbol{\Gamma}} 143 | \newcommand{\bLambda}{\boldsymbol{\Lambda}} 144 | \newcommand{\bOmega}{\boldsymbol{\Omega}} 145 | \newcommand{\bPhi}{\boldsymbol{\Phi}} 146 | \newcommand{\bPi}{\boldsymbol{\Pi}} 147 | \newcommand{\bPsi}{\boldsymbol{\Psi}} 148 | \newcommand{\bSigma}{\boldsymbol{\Sigma}} 149 | \newcommand{\bTheta}{\boldsymbol{\Theta}} 150 | \newcommand{\bUpsilon}{\boldsymbol{\Upsilon}} 151 | \newcommand{\bXi}{\boldsymbol{\Xi}} 152 | \newcommand{\bepsilon}{\boldsymbol{\epsilon}} 153 | 154 | \def\argmin{\operatornamewithlimits{arg\,min}} 155 | \def\argmax{\operatornamewithlimits{arg\,max}} 156 | 157 | \newcommand{\given}{\,|\,} 158 | \newcommand{\distNorm}{\mathcal{N}} 159 | 160 | 161 | \usepackage{tabularx} 162 | \usepackage{algorithm} 163 | \usepackage{algpseudocode} 164 | 165 | \newcommand{\msc}[1]{\mathrm{\textsc{#1}}} 166 | \newcommand{\air}{\vspace{0.5cm}} 167 | 168 | \algtext*{EndWhile}% Remove "end while" text 169 | \algtext*{EndFor}% Remove "end for" text 170 | \algtext*{EndIf}% Remove "end if" text 171 | \algtext*{EndProcedure}% Remove "end procedure" text 172 | 173 | \newtheorem{theorem}{Theorem} 174 | \newtheorem{defn}{Definition} 175 | 176 | \newcommand{\Scribe}[1]{\def\ScribeStr{Scribe: #1}} 177 | \newcommand{\Scribes}[1]{\def\ScribeStr{Scribes: #1}} 178 | \newcommand{\Lecturer}[1]{\def\LecStr{Lecturer: #1}} 179 | \newcommand{\Lecturers}[1]{\def\LecStr{Lecturers: #1}} 180 | \newcommand{\LectureNumber}[1]{\def\LecNum{#1}} 181 | \newcommand{\LectureDate}[1]{\def\LecDate{#1}} 182 | \newcommand{\LectureTitle}[1]{\def\LecTitle{#1}} 183 | 184 | \newdimen\headerwidth 185 | 186 
| \newcommand{\MakeScribeTop}{ 187 | \noindent 188 | \begin{center} 189 | \framebox{ 190 | \vbox{ 191 | \headerwidth=\textwidth 192 | % \advance\headerwidth by -0.22in 193 | \hbox to \headerwidth {{\bf Artificial Intelligence \hfill (Harvard CS182, Fall 2015)} } 194 | \vspace{4mm} 195 | \hbox to \headerwidth {{\Large \hfill {\LecTitle} \hfill}} 196 | \vspace{2mm} 197 | \hbox to \headerwidth {\hfill \LecDate \hfill} 198 | \vspace{2mm} 199 | \hbox to \headerwidth {{\it \hfill \LecStr \hfill }} 200 | } 201 | } 202 | \end{center} 203 | \vspace*{4mm}} 204 | 205 | 206 | \newcommand*{\QED}{\hfill\ensuremath{\square}}% 207 | 208 | \newtheorem{exercise}[theorem]{Question} 209 | \let\checkmark\undefined 210 | 211 | \newcommand{\exinline}[1]{(\refstepcounter{theorem}Question~\thetheorem\label{#1})} 212 | 213 | \usepackage[utf8]{inputenc} 214 | 215 | % \DeclareUnicodeCharacter{2693}{\anchor} 216 | \usepackage{bbding} 217 | \usepackage{soul} 218 | 219 | \ifthenelse{\isundefined{\StudentVersion}}{ 220 | \newcommand{\censor}[1]{ 221 | {\small \textcolor{red}{\SunshineOpenCircled}} \textcolor{red}{#1} 222 | } 223 | \newcommand{\censorm}[1]{ 224 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \textcolor{red}{#1} 225 | } 226 | 227 | }{ 228 | \DeclareRobustCommand*\censor{% 229 | {\small \textcolor{red}{\SunshineOpenCircled}} 230 | \SOUL@setup% 231 | \def\SOUL@everytoken{\phantom{\the\SOUL@token}}% 232 | \def\SOUL@everyhyphen{% 233 | \discretionary{% 234 | \SOUL@setkern\SOUL@hyphkern% 235 | \phantom{\SOUL@sethyphenchar}% 236 | }{}{}% 237 | }% 238 | \def\SOUL@everyexhyphen##1{% 239 | \SOUL@setkern\SOUL@hyphkern% 240 | \hbox{\phantom{##1}}% 241 | \discretionary{}{}{% 242 | \SOUL@setkern\SOUL@charkern% 243 | }% 244 | }% 245 | \SOUL@% 246 | } 247 | \newcommand{\censorm}[1]{ 248 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \hspace*{5cm} 249 | } 250 | 251 | } 252 | 253 | 254 | 255 | 256 | \newcommand{\bolddelta}{\boldsymbol{\delta}} 257 | 
\newcommand{\indicator}{\mathbf{1}} 258 | 259 | 260 | \def\argmin{\operatornamewithlimits{arg\,min}} 261 | \def\argmax{\operatornamewithlimits{arg\,max}} 262 | \def\softmax{\operatornamewithlimits{softmax}} 263 | \def\relu{\operatornamewithlimits{ReLU}} 264 | 265 | \newcommand{\din}{{d_{\mathrm{in}}}} 266 | \newcommand{\dout}{{d_{\mathrm{out}}}} -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/downpour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/downpour.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/locally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/locally.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/remotely.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/remotely.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/sgd_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/sgd_results.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/images/speeds.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/speeds.png -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.aux: -------------------------------------------------------------------------------- 1 | \relax 2 | \providecommand\hyper@newdestlabel[2]{} 3 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument} 4 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined 5 | \global\let\oldcontentsline\contentsline 6 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}} 7 | \global\let\oldnewlabel\newlabel 8 | \gdef\newlabel#1#2{\newlabelxx{#1}#2} 9 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}} 10 | \AtEndDocument{\ifx\hyper@anchor\@undefined 11 | \let\contentsline\oldcontentsline 12 | \let\newlabel\oldnewlabel 13 | \fi} 14 | \fi} 15 | \global\let\hyper@last\relax 16 | \gdef\HyperFirstAtBeginDocument#1{#1} 17 | \providecommand\HyField@AuxAddToFields[1]{} 18 | \providecommand\HyField@AuxAddToCoFields[2]{} 19 | \abx@aux@sortscheme{ynt} 20 | \abx@aux@cite{bengio-emb} 21 | \abx@aux@cite{distbelief} 22 | \@writefile{toc}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 23 | \@writefile{lof}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 24 | \@writefile{lot}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax } 25 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}} 26 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {2}Background on Downpour SGD}{1}{section.2}} 27 | \abx@aux@cite{tensorflow} 28 | \abx@aux@cite{protobuf} 29 | \@writefile{lof}{\defcounter {refsection}{0}\relax 
}\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }}{2}{figure.caption.1}} 30 | \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}} 31 | \newlabel{fig:downpour}{{1}{2}{A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }{figure.caption.1}{}} 32 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {3}Motivation}{2}{section.3}} 33 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {4}Challenges}{3}{section.4}} 34 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {5}Methods and Design}{3}{section.5}} 35 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }}{4}{figure.caption.2}} 36 | \newlabel{fig:local}{{2}{4}{On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }{figure.caption.2}{}} 37 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {6}Results and Discussion}{6}{section.6}} 38 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On left, training classification error rates per epoch. On right, training classification error rates over time.\relax }}{6}{figure.caption.3}} 39 | \newlabel{fig:local}{{3}{6}{On left, training classification error rates per epoch. 
On right, training classification error rates over time.\relax }{figure.caption.3}{}} 40 | \abx@aux@cite{adagrad} 41 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {7}Applying SGD in Lua/Torch}{7}{section.7}} 42 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces The results of running our rnn model for 7 epochs locally.\relax }}{7}{figure.caption.4}} 43 | \newlabel{fig:local}{{4}{7}{The results of running our rnn model for 7 epochs locally.\relax }{figure.caption.4}{}} 44 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces The result of running our rnn model for 10 epochs remotely.\relax }}{9}{figure.caption.5}} 45 | \newlabel{fig:remote}{{5}{9}{The result of running our rnn model for 10 epochs remotely.\relax }{figure.caption.5}{}} 46 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {8}Conclusion}{9}{section.8}} 47 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {9}Code}{9}{section.9}} 48 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{distbelief, 2 | title = {Large Scale Distributed Deep Networks}, 3 | author = {Jeffrey Dean and Greg S. Corrado and Rajat Monga and Kai Chen and Matthieu Devin and Quoc V. Le and Mark Z. Mao and Marc’Aurelio Ranzato and Andrew Senior and Paul Tucker and Ke Yang and Andrew Y. Ng}, 4 | year = 2012, 5 | booktitle = {NIPS} 6 | } 7 | @article{bengio-emb, 8 | author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian}, 9 | title = {A Neural Probabilistic Language Model}, 10 | journal = {J. Mach. Learn. 
Res.}, 11 | issue_date = {3/1/2003}, 12 | volume = {3}, 13 | month = mar, 14 | year = {2003}, 15 | issn = {1532-4435}, 16 | pages = {1137--1155}, 17 | numpages = {19}, 18 | url = {http://dl.acm.org/citation.cfm?id=944919.944966}, 19 | acmid = {944966}, 20 | publisher = {JMLR.org}, 21 | } 22 | 23 | @article{tensorflow, 24 | author = {Mart{\'{\i}}n Abadi and 25 | Ashish Agarwal and 26 | Paul Barham and 27 | Eugene Brevdo and 28 | Zhifeng Chen and 29 | Craig Citro and 30 | Gregory S. Corrado and 31 | Andy Davis and 32 | Jeffrey Dean and 33 | Matthieu Devin and 34 | Sanjay Ghemawat and 35 | Ian J. Goodfellow and 36 | Andrew Harp and 37 | Geoffrey Irving and 38 | Michael Isard and 39 | Yangqing Jia and 40 | Rafal J{\'{o}}zefowicz and 41 | Lukasz Kaiser and 42 | Manjunath Kudlur and 43 | Josh Levenberg and 44 | Dan Mane and 45 | Rajat Monga and 46 | Sherry Moore and 47 | Derek Gordon Murray and 48 | Chris Olah and 49 | Mike Schuster and 50 | Jonathon Shlens and 51 | Benoit Steiner and 52 | Ilya Sutskever and 53 | Kunal Talwar and 54 | Paul A. Tucker and 55 | Vincent Vanhoucke and 56 | Vijay Vasudevan and 57 | Fernanda B. 
Vi{\'{e}}gas and 58 | Oriol Vinyals and 59 | Pete Warden and 60 | Martin Wattenberg and 61 | Martin Wicke and 62 | Yuan Yu and 63 | Xiaoqiang Zheng}, 64 | title = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed 65 | Systems}, 66 | journal = {CoRR}, 67 | volume = {abs/1603.04467}, 68 | year = {2016}, 69 | url = {http://arxiv.org/abs/1603.04467}, 70 | timestamp = {Sun, 03 Apr 2016 11:52:22 +0200}, 71 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/AbadiABBCCCDDDG16}, 72 | bibsource = {dblp computer science bibliography, http://dblp.org} 73 | } 74 | @MISC{protobuf, 75 | title={Protocol Buffers}, 76 | author={Kenton Varda}, 77 | howpublished={\url{http://code.google.com/apis/protocolbuffers/}}, 78 | } 79 | @techreport{adagrad, 80 | Author = {Duchi, John and Hazan, Elad and Singer, Yoram}, 81 | Title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization}, 82 | Institution = {EECS Department, University of California, Berkeley}, 83 | Year = {2010}, 84 | Month = {Mar}, 85 | URL = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-24.html}, 86 | Number = {UCB/EECS-2010-24}, 87 | Abstract = {We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. 
We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods significantly outperform state-of-the-art, yet non-adaptive, subgradient algorithms.} 88 | } -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.out: -------------------------------------------------------------------------------- 1 | \BOOKMARK [1][-]{section.1}{Introduction}{}% 1 2 | \BOOKMARK [1][-]{section.2}{Background on Downpour SGD}{}% 2 3 | \BOOKMARK [1][-]{section.3}{Motivation}{}% 3 4 | \BOOKMARK [1][-]{section.4}{Challenges}{}% 4 5 | \BOOKMARK [1][-]{section.5}{Methods and Design}{}% 5 6 | \BOOKMARK [1][-]{section.6}{Results and Discussion}{}% 6 7 | \BOOKMARK [1][-]{section.7}{Applying SGD in Lua/Torch}{}% 7 8 | \BOOKMARK [1][-]{section.8}{Conclusion}{}% 8 9 | \BOOKMARK [1][-]{section.9}{Code}{}% 9 10 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.pdf -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.run.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 23 | 28 | 33 | 36 | 39 | 42 | ]> 43 | 44 | 45 | latex 46 | 47 | main.bcf 48 | 49 | 50 | main.bbl 51 | 52 | 53 | blx-dm.def 54 | blx-compat.def 55 | biblatex.def 56 | alphabetic.bbx 57 | standard.bbx 58 | alphabetic.cbx 59 | biblatex.cfg 60 | english.lbx 61 | 62 | 63 | 64 | biber 65 | 66 | biber 67 | main 68 | 69 | 70 | main.bcf 71 | 72 | 73 | 
main.bbl 74 | 75 | 76 | main.bbl 77 | 78 | 79 | main.bcf 80 | 81 | 82 | main.bib 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /CS262%20Final%20Project/main.synctex.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.synctex.gz -------------------------------------------------------------------------------- /Dev-Notebook-Kevin.md: -------------------------------------------------------------------------------- 1 | 2 | Played around with autograd in python. Looking for a reasonable toy dataset to test sgd on distributed system 3 | Looked into the convolutional network example for autograd https://github.com/HIPS/autograd/blob/master/examples/neural_net.py 4 | This ended up being perfect because it spits out a long vector of gradients that it uses 5 | Looking for a far heavier dataset. MSINT runs in a 1-2 minutes. 6 | Found Caltech 101, built some preprocessing code, modified some of the code for the neural network 7 | Needed to downsize the images substantially. 240 x 240 is around 12 GB of data. Shrunk it down to 128 x 128, making it 4 Gb of data. New gradients are around 0.5Gb. This makes network speeds pretty prohibitive though. 8 | Epochs take a couple minutes to run. Batches takes around 10-15 seconds each. Seems rather reasonable 9 | 10 | 11 | Looking into Azure for launching VMs 12 | Discovered CLI for Azure 13 | Set up 5 different accounts all using the Bizspark subscription. One email account also has a free subscription activated. 14 | Emails and passwords are listed below: 15 | 16 | (candokevin2@hotmail.com, cs262michaelkevin) 17 | (candokevin3@hotmail.com, cs262michaelkevin) 18 | 19 | 20 | Log into portal.azure.com to interact more with the system 21 | 22 | Received instructions from Mike on how to setup grpc. 
For replicability on later Linux VMs we launch, I've documented the steps 23 | I took below: 24 | 25 | Set up Protobufs 3.0.0 26 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip 27 | ./autogen.sh 28 | ./configure 29 | make 30 | make check 31 | make install 32 | 33 | Set up grpc 34 | git clone https://github.com/grpc/grpc.git 35 | sudo make grpc_python_plugin 36 | sudo vim /etc/paths, add the line /Users/candokevin/stash/grpc/bins/opt 37 | 38 | 39 | It might be a good idea to look into Docker containers, and Docker networks for launching and setting up VMs. 40 | 41 | This site suggests that Google Compute might actually be the best platform for this 42 | https://gigaom.com/2014/04/12/need-for-speed-testing-the-networking-performance-of-the-top-4-cloud-providers/ 43 | https://cloudplatform.googleblog.com/2014/04/enter-andromeda-zone-google-cloud-platforms-latest-networking-stack.html 44 | Get started, generate a project ID 45 | Network speed is critical considering how huge our gradients may be. 
46 | 47 | Persistent 10GB disk for saving the state of machine 48 | Allows you to save the state of a machine 49 | 50 | gcloud compute instances create example-instance --image test-image --zone us-central1-b 51 | gcloud compute ssh large-example-instance --zone 52 | gcloud compute copy-files /Users/candokevin/stash/distributed-sgd/scp extra-large-example-instance:~/scp/ --zone us-central1-b 53 | 54 | 55 | Generate some code that performs the following 56 | 57 | Initializes the parameters to some certain set of values 58 | Updates parameters given some gradient 59 | Sends parameters to different servers 60 | -------------------------------------------------------------------------------- /Dev-Notebook-Mike.md: -------------------------------------------------------------------------------- 1 | - need to install proto3 protocol buffers 2 | 3 | download link: 4 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip 5 | 6 | https://github.com/google/protobuf 7 | 8 | example: 9 | https://github.com/grpc/grpc/tree/release-0_13/examples/python/helloworld 10 | 11 | cd into directory 12 | brew update && brew remove gmp && brew install gmp && brew link gmp 13 | 14 | ./autogen.sh 15 | 16 | ./configure 17 | 18 | make 19 | 20 | make check 21 | 22 | make install 23 | 24 | example usage 25 | protoc -I=$SRC_DIR --python_out=$DST_DIR $SRC_DIR/addressbook.proto 26 | 27 | - installed grpc according to the following instructions listed here: https://github.com/grpc/grpc/tree/release-0_13/examples/python an outline of the command I ran are the following: 28 | 29 | sudo pip install grpcio 30 | 31 | git clone https://github.com/grpc/grpc 32 | 33 | - We can test to see if the helloworld example works: 34 | 35 | cd grpc/examples/python/helloworld 36 | 37 | - Run the server 38 | 39 | python2.7 greeter_server.py & 40 | 41 | - Run the client 42 | 43 | python2.7 greeter_client.py 44 | 45 | -You should see the output "Greeter client received: 
Hello, you!" 46 | 47 | Instead going to copy the necessary files into our directory and have a small running example 48 | 49 | in the folder Distributed-SGD/helloworld: 50 | 51 | have the files: 52 | 53 | greeter_client.py 54 | greeter_server.py 55 | 56 | 57 | sudo pip install grpcio --upgrade 58 | 59 | 60 | 61 | 62 | 63 | HOW I GOT IT TO WORK 64 | Used this link: 65 | https://github.com/grpc/homebrew-grpc 66 | 67 | 68 | curl -fsSL https://goo.gl/getgrpc | bash - 69 | 70 | virtualenv venv 71 | source venv/bin/activate 72 | 73 | curl -fsSL https://goo.gl/getgrpc | bash -s python 74 | 75 | cd venv 76 | 77 | git clone https://github.com/grpc/grpc.git 78 | 79 | cd grpc 80 | 81 | make grpc_python_plugin 82 | 83 | 84 | 85 | 86 | here we go: 87 | 88 | cd /usr/local/ 89 | mkdir manual 90 | cd manual 91 | 92 | curl -fsSL https://goo.gl/getgrpc | bash - 93 | 94 | virtualenv venv 95 | 96 | source venv/bin/activate 97 | 98 | curl -fsSL https://goo.gl/getgrpc | bash -s python 99 | 100 | pip install numpy 101 | pip install scipy 102 | sudo pip install pillow 103 | pip install sklearn 104 | pip install autograd 105 | 106 | cd venv 107 | 108 | git clone https://github.com/grpc/grpc.git 109 | cd grpc 110 | 111 | make grpc_python_plugin 112 | 113 | sudo vim /etc/paths 114 | 115 | and add the line: 116 | 117 | /usr/local/manual/venv/grpc/bins/opt 118 | 119 | 120 | 121 | BEFORE RUNNING ANYTHING 122 | 123 | source /usr/local/manual/venv/bin/activate 124 | 125 | 126 | Important links: 127 | https://github.com/grpc/homebrew-grpc 128 | https://docs.docker.com/engine/userguide/networking/ 129 | http://www.bpython-interpreter.org 130 | https://github.com/mila-udem/fuel 131 | 132 | 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD 2 | 3 | An implementation of distributed stochastic gradient descent for both local and 
remote clients. 4 | 5 | The [paper](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/CS262%2520Final%2520Project/main.pdf) describing this project. 6 | 7 | ## Usage 8 | 9 | The usage varies depending on the native language. 10 | 11 | Please see the respective directory for the language you are interested in: 12 | 13 | Usage in [python](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/python-python) 14 | 15 | Usage in [lua/torch](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/lua-lua) 16 | 17 | ## Table of Contents 18 | ``` 19 | . 20 | ├── lua-lua # Implementation of Distributed SGD in lua/torch 21 | ├── python-python # Implementation of Distributed SGD in python 22 | ├── slides # presentation slides about this project 23 | ├──.gitignore 24 | ├──.gitmodules 25 | ├── Dev-Notebook-Kevin.md # Development notes 26 | ├── Dev-Notebook-Mike.md 27 | ├── README.md 28 | └── client_list.txt # List of available server ip addresses 29 | ``` 30 | 31 | 32 | ## Primary contributors 33 | 34 | [Kevin Yang](https://github.com/kyang01) 35 | 36 | [Michael Farrell](https://github.com/michaelfarrell76) 37 | 38 | -------------------------------------------------------------------------------- /client_list.txt: -------------------------------------------------------------------------------- 1 | 130.211.204.149 2 | 104.197.250.103 3 | 130.211.192.196 4 | 104.197.222.148 5 | 104.197.106.197 6 | 104.197.167.23 7 | 104.154.239.139 8 | 130.211.206.66 9 | 104.197.137.32 10 | 104.197.174.106 11 | -------------------------------------------------------------------------------- /lua-lua/README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD: lua-lua 2 | An implementation of distributed stochastic gradient descent in lua/torch. Clients can be local and remote. 3 | 4 | ## Requirements 5 | 6 | This code is written in Lua, and an installation of [Torch](https://github.com/torch/torch7/) is assumed. 
Training requires a few packages which can easily be installed through [LuaRocks](https://github.com/keplerproject/luarocks) (which comes with a Torch installation). Datasets are formatted and loaded using [hdf5](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), which can be installed using this [guide](https://github.com/deepmind/torch-hdf5/blob/master/doc/usage.md). 7 | 8 | Once torch and torch-hdf5 are installed, use luarocks to install the other dependencies used in the example: 9 | 10 | ```bash 11 | $ luarocks install nn 12 | $ luarocks install rnn 13 | ``` 14 | If you want to train on an Nvidia GPU using CUDA, you'll need to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) as well as the `cutorch` and `cunn` packages: 15 | ```bash 16 | $ luarocks install cutorch 17 | $ luarocks install cunn 18 | ``` 19 | We need to ensure that our local version of parallel is installed. This can be done with a short bash script from the lua-lua folder: 20 | ```bash 21 | $ cd lua-lua 22 | $ bash install_parallel.sh 23 | ``` 24 | 25 | ## Directory Table of Contents 26 | ``` 27 | . 
28 | ├── data # Folder holding data used for demo 29 | ├── parallel # Folder containing the changes we added to the parallel class 30 | ├── End-To-End-Generative-Dialogue # Folder of our other repo containing the code used in demo 31 | ├── README.md # lua-lua usage 32 | ├── server.lua # Main server file 33 | ├── README.md 34 | ├── startup.sh # Startup script for remote gcloud servers 35 | ├── setup_image.sh # Script that copies startup.sh to remote server and calls startup.sh 36 | ├── install_parallel.sh # script that installs our version of parallel 37 | └── demo_server.lua # A demo class that implements the server 38 | ``` 39 | 40 | ## Description 41 | 42 | ## Demo-Usage 43 | Code is run from the lua-lua folder: 44 | ```bash 45 | $ cd lua-lua 46 | ``` 47 | 48 | #### Local 49 | 50 | To run a worker with 2 parallel clients on your own machine: 51 | ```bash 52 | $ th server.lua -n_proc 2 53 | ``` 54 | 55 | #### Remote - localhost 56 | 57 | In order to get the demo to connect through localhost rather than simply forking, we must first setup an .ssh key for this project. 58 | 59 | Note: This is basically doing the same thing as [local](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#local), except we now connect to the clients through localhost. This is a good tool to use to debug problems with clients running on remote servers. 60 | 61 | ##### Generate ssh key 62 | Replace USERNAME with your username on the computer you want to connect to: 63 | ```bash 64 | $ USERNAME=michaelfarrell 65 | $ ssh-keygen -t rsa -f ~/.ssh/dist-sgd-sshkey -C $USERNAME 66 | ``` 67 | Hit enter twice and a key should have been generated.
68 | 69 | ##### Add ssh-key to authorized keys 70 | 71 | In order to connect to clients through localhost, we must add the key to our list of authorized_keys: 72 | ```bash 73 | $ cat ~/.ssh/dist-sgd-sshkey.pub >> ~/.ssh/authorized_keys 74 | $ chmod og-wx ~/.ssh/authorized_keys 75 | ``` 76 | 77 | ##### Allow ssh connections 78 | 79 | In order to connect through localhost, you must allow your computer to allow incoming ssh connections. 80 | 81 | On a Mac, this can be done by going to: 82 | 83 | System Preferences > Sharing 84 | 85 | and checking the 'Remote Login' box 86 | 87 | 88 | ##### Connect via localhost 89 | 90 | You can now communicate over localhost using the command: 91 | 92 | ```bash 93 | $ EXTENSION=Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/ 94 | $ TORCH_PATH=/Users/michaelfarrell/torch/install/bin/th 95 | $ th server.lua -n_proc 4 -localhost -extension $EXTENSION -torch_path $TORCH_PATH 96 | ``` 97 | where $EXTENSION is the relative path to the lua-lua folder from the your directory and $TORCH_PATH is the absolute path to torch on your computer 98 | 99 | #### Remote - gcloud 100 | 101 | Instead of having the client programs running on your own computer, you can farm them out to any number of remote computers. Below is a description of how to setup remote clients using google cloud (gcloud offers 60 day free trials with $300 worth of credit). 102 | 103 | ##### Adding ssh key to gcloud servers 104 | 105 | We have to allow our gcloud servers to accept incoming ssh connections from our computer. 106 | 107 | If you have yet to do so, [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key) 108 | 109 | Once you have created the key print it out: 110 | 111 | ```bash 112 | $ cat ~/.ssh/dist-sgd-sshkey.pub 113 | ``` 114 | 115 | Next you must add the key to the set of public keys : 116 | - Login to your google compute account. 
117 | - Go to compute engine dashboard 118 | - Go to metdata tab 119 | - Go to ssh-key subtab 120 | - Click edit 121 | - Add the key you copied as a new line 122 | 123 | Restrict external access to the key: 124 | ```bash 125 | $ chmod 400 ~/.ssh/dist-sgd-sshkey 126 | ``` 127 | 128 | ##### Create a baseline startup image 129 | 130 | We only have to setup and install everything once, after which we can clone that client. 131 | 132 | ###### Create the image 133 | - Click on the 'VM Instances' tab 134 | - Create Instance 135 | - Give the instance a name i.e. 'demo-baseline' 136 | - Set the zone to us-central1-b 137 | - Choose 8vCPU highmem as machine type 138 | - Under boot disk click change 139 | - Choose Ubuntu 14.04 LTS 140 | - At the bottom change size to 30 GB and click 'select' 141 | - Allow HTTP traffic 142 | - Allow HTTPS traffic 143 | - Click 'Management, disk, networking, SSH keys' to dropdown more options 144 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted' 145 | - Click 'Create' an you should see your new instance listed in the table 146 | 147 | ###### Allow tcp connections 148 | - Wait for the VM instance to startup (indicated by a green check next to the instance) 149 | - Under the 'network' column, click 'default' 150 | - Go to 'Firewall rules' and Add a new rule 151 | - Set name to be 'all' 152 | - Set source filter to allow from any source 153 | - Under allowed protocols, put 'tcp:0-65535; udp:0-65535; icmp' 154 | - Create 155 | 156 | ###### Setup the disk 157 | - Return to the 'VM instances' tab 158 | - Grab the external IP address for the instance 159 | ```bash 160 | $ EXTERNAL_IP=104.154.48.250 161 | $ USERNAME=michaelfarrell 162 | ``` 163 | - Next you must modify the 'startup.sh' script to also include any additional installs that you may need on the server. This script is run from the home directory of the remote client. To run the demo, you do not need to modify this script. 
164 | - Next you must modify the 'setup_image.sh' script so that it correctly calls your startup.sh script on the remote server. If you did not change 'startup.sh' script, you should probably not be changing this script either. 165 | - Setup the image: 166 | ```bash 167 | $ source setup_image.sh 168 | ``` 169 | Note you can connect to the server: 170 | ```bash 171 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 172 | ``` 173 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard 174 | - Go to the 'VM Dashboard' 175 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted. 176 | 177 | ###### Create the image 178 | 179 | - Click on the 'Images' tab 180 | - 'Create Image' 181 | - Give it a name i.e. 'demo-image' 182 | - Under Source-Disk, choose the disk that you just created 183 | - Create 184 | 185 | ##### Generate an 'Instance Template' 186 | - Click on the 'Instance templates' tab 187 | - Create new 188 | - Name the template i.e. 'demo-template' 189 | - Under 'Boot Disk' click change 190 | - At the top click 'Your image' 191 | - Choose the image you just created i.e. 'demo-image' 192 | - Set size to 30 GB 193 | - Select 194 | - Allow HTTP traffic 195 | - Allow HTTPS traffic 196 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted' 197 | - Create 198 | 199 | ##### Generate an 'Instance Group' 200 | - Go to the "Instance groups" tab 201 | - Create instance group 202 | - Give the group a name, i.e. 'demo-group' 203 | - Give a description 204 | - Set zone to us-central1-b 205 | - Use instance template 206 | - Choose the template you just made i.e. 
'demo-template' 207 | - Set the number of instances 208 | - Create 209 | - Wait for the instances to launch 210 | - Once there is a green checkmark, click on the new instance 211 | 212 | ##### Adding remote clients 213 | You will want to add your list of client servers to the file 'client_list.txt' where each line in the file is one of the external ip addresses located in the Instance group you are currently using. You will need to copy this list of files to the computer that you are going to use as the main parameter server. Choose an IP from the freshly updated 'client_list.txt' and set the $SERVER_IP environment variable: 214 | ```bash 215 | $ SERVER_IP=130.211.160.115 216 | ``` 217 | Copy over 'client_list.txt' to the main server: 218 | ```bash 219 | $ scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey ../client_list.txt $USERNAME@$SERVER_IP:~/Distributed-SGD 220 | ``` 221 | 222 | ##### Connecting to gcloud servers 223 | 224 | You can connect to one of the servers by running: 225 | ```bash 226 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP 227 | ``` 228 | Note: the flag `-o "StrictHostKeyChecking no"` automatically adds the host to your list and does not prompt confirmation. 229 | 230 | If you get an error like this: 231 | ```bash 232 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 233 | @ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @ 234 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 235 | ``` 236 | then you'll want to 237 | ```bash 238 | $ vim ~/.ssh/known_hosts 239 | ``` 240 | and delete the last few lines that were added. They should look like some ip address and then something that starts with AAAA. You can delete lines in vim by typing 'dd' to delete the current line. This can happen when you restart the servers and they change ip addresses, among other things. 
241 | 242 | ##### Adding ssh keys again 243 | 244 | If the servers have been initialized, you will first want to connect to the computer above that you chose to be the main server 245 | ```bash 246 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP 247 | ``` 248 | 249 | Once connected, you need to again setup an ssh key from the computer that you are using as the client. 250 | 251 | 1) [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key) 252 | 253 | 2) [add key to gcloud server account](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#adding-ssh-key-to-gcloud-servers) 254 | 255 | ##### Running on remote servers: 256 | 257 | Once this is done, you can run the server with remote gcloud clients using the command: 258 | ```bash 259 | $ cd Distributed-SGD/lua-lua 260 | $ EXTENSION=Distributed-SGD/lua-lua/ 261 | $ TORCH_PATH=/home/michaelfarrell/torch/install/bin/th 262 | $ th server.lua -n_proc 4 -remote -extension $EXTENSION -torch_path $TORCH_PATH 263 | 264 | ``` 265 | 266 | ## For Personal Usage 267 | 268 | If you wish to extend this demo to work with your own SGD model you must simply create a new server class specific to your task, replacing the 'demo_server' class. Use the file 'demo_server.lua' as an example. The server only needs to have __init(opt) and run() functions defined in order to work. Once this class is properly defined (i.e. named 'new_server'), you can run the following to initiate your task: 269 | 270 | ```bash 271 | $ NEW_SERVER_NAME=new_server 272 | $ th server.lua -server_class $NEW_SERVER_NAME # Plus Additional arguments 273 | 274 | ``` 275 | 276 | When developing, all command line arguments should be added in the file server.lua. Look at the command arguments 277 | ```bash 278 | $ th server.lua --help 279 | ``` 280 | that already exist and use those names when developing your model. 
If you need an additional command line argument, add it in server.lua. Other than this, there should be no reason to edit the server.lua file. 281 | 282 | If you are having your clients run remotely, you may also need to modify 'startup.sh' and 'setup_image.sh' so that they setup the server environements according to the specifications that you need. 283 | 284 | 285 | ## TODO 286 | - Document data folder and include description in demo-usage about what the demo is 287 | - Add in documentation of how the data needs to be formatted in order to run the demo 288 | - Finish description 289 | - Finish Acknowledgements 290 | - Add in proto implementation 291 | - Add in git pull at startup 292 | - add way to catch if failure down and reset 293 | - maybe add paxos if kevin is successful 294 | - try adding protobufs 295 | - get results 296 | - Add in addtional catches for errors like add to path 297 | 298 | 299 | ## Acknowledgments 300 | This example is also apart of another one of our repos: https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue 301 | 302 | Our implementation utilizes code from the following: 303 | 304 | * [Yoon Kim's seq2seq-attn repo](https://github.com/harvardnlp/seq2seq-attn) 305 | * [Element rnn library](https://github.com/Element-Research/rnn) 306 | * [Facebook's neural attention model](https://github.com/facebook/NAMAS) 307 | -------------------------------------------------------------------------------- /lua-lua/cleanup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Copy files from servers 5 | """ 6 | 7 | import sys 8 | import os 9 | import time 10 | 11 | 12 | def child(ip_addr): 13 | if not os.path.exists('outputs/' + ip_addr): 14 | os.makedirs('outputs/' + ip_addr) 15 | os.system('(echo " echo starting; pkill torch; pkill lua; cd Distributed-SGD/lua-lua/; git pull; cd End-To-End-Generative-Dialogue/; git pull origin master; exit") | 
ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s' % ip_addr) 16 | os._exit(0) 17 | 18 | 19 | def main(arguments): 20 | with open('../client_list.txt') as f: 21 | if not os.path.exists('outputs'): 22 | os.makedirs('outputs') 23 | pids = [] 24 | for line in f: 25 | # os.system('echo ' + line) 26 | newpid = os.fork() 27 | pids.append(newpid) 28 | if newpid == 0: 29 | if line[-1] == '\n': 30 | child(line[:-1]) 31 | else: 32 | child(line) 33 | 34 | 35 | if __name__ == '__main__': 36 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/copy_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """Copy files from servers 5 | """ 6 | 7 | import sys 8 | import os 9 | import time 10 | 11 | 12 | def child(ip_addr): 13 | if not os.path.exists('outputs/' + ip_addr): 14 | os.makedirs('outputs/' + ip_addr) 15 | cmd = 'scp -r -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s:~/Distributed-SGD/lua-lua/*.txt ~/Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/outputs/%s/ &> /dev/null' % (ip_addr, ip_addr) 16 | 17 | os.system(cmd) 18 | os._exit(0) 19 | 20 | 21 | def main(arguments): 22 | with open('../client_list.txt') as f: 23 | if not os.path.exists('outputs'): 24 | os.makedirs('outputs') 25 | pids = [] 26 | for line in f: 27 | # os.system('echo ' + line) 28 | newpid = os.fork() 29 | pids.append(newpid) 30 | if newpid == 0: 31 | if line[-1] == '\n': 32 | child(line[:-1]) 33 | else: 34 | child(line) 35 | 36 | 37 | time.sleep(5) 38 | if __name__ == '__main__': 39 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/data/demo-train.hdf5: -------------------------------------------------------------------------------- 
------------------------------------------------------------------------
-- demo_server.lua
--
-- This is an example of a class that is used to implement a server in
-- server.lua. This class has an __init(opt) function that takes in
-- the global parameters, loads in the data and builds the model on
-- the parameter server. The class also has a run() function that
-- forks out the child clients and executes the function 'worker'
-- on each corresponding client.
--
-- If you wish to develop your own SGD model, create a new class that is
-- similar to this. The only required interface is __init(opt) and run().
--
-- NOTE(review): this file assumes the globals `parallel`, `cmd` and `arg`
-- are set up by server.lua / the parallel library before run() is called
-- — confirm against server.lua, which is outside this file.
------------------------------------------------------------------------
local demo_server = torch.class('demo_server')

------------
-- Worker code
------------

-- Function whose source is shipped to and executed on every forked client
-- (see demo_server:run). It loops forever: the first package received from
-- the parent carries the global options (used to load data and build the
-- model locally); every later package carries model parameters plus a batch
-- index, for which the worker computes and returns derivatives.
function worker()
    -- Used to check that required files exist on the client machine
    require "lfs"

    -- Used to update package.path with the location of our dependencies
    require 'package'

    -- Alert successfully started up
    parallel.print('Im a worker, my ID is: ', parallel.id, ' and my IP: ', parallel.ip)

    -- Global flag indicating this process is a child/client
    ischild = true

    -- Extension to lua-lua folder from home directory.
    -- Set to no extension as default; overwritten by the first package.
    ext = ""

    -- Number of packages received so far from the parameter server
    local n_pkg = 0
    while true do

        -- Allow the parent to terminate the child: a 'break' message from
        -- the parent's join('break') ends the loop.
        m = parallel.yield()
        if m == 'break' then break end

        -- Receive data from the parameter server
        local pkg = parallel.parent:receive()


        -- Make sure to clean everything up since big files are being passed
        io.write('.') io.flush()
        collectgarbage()


        if n_pkg == 0 then
            -- This is the first time receiving a package, it has the globals

            -- Receive and parse global parameters
            parallel.print('Recieved initialization parameters')
            cmd, arg, ext = pkg.cmd, pkg.arg, pkg.ext
            opt = cmd:parse(arg)

            -- Update path so that modules required below resolve on the client
            package.path = opt.add_to_path .. package.path

            -- Add in additional necessary parameters
            opt.print = parallel.print
            opt.parallel = true


            -- Library used to handle data types
            local data_loc = ext .. 'End-To-End-Generative-Dialogue/src/data'
            if not lfs.attributes(data_loc .. '.lua') then
                print('The file data.lua could not be found in ' .. data_loc .. '.lua')
                os.exit()
            end
            data = require(data_loc)

            -- Load in helper functions for this model (load_data, build,
            -- train_ind, ...) defined in End-To-End-Generative-Dialogue
            local model_funcs_loc = ext .. "End-To-End-Generative-Dialogue/src/model_functions.lua"
            if not lfs.attributes(model_funcs_loc) then
                print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
                os.exit()
            end
            funcs = loadfile(model_funcs_loc)
            funcs()

            -- Change the locations of the datafiles based on new extension
            opt.data_file = ext .. opt.data_file
            opt.val_data_file = ext .. opt.val_data_file

            -- Point the wordvec to the right place if it exists
            if opt.pre_word_vecs ~= "" then
                opt.pre_word_vecs = opt.extension .. opt.pre_word_vecs
            end

            -- Load in data to client
            train_data, valid_data, opt = load_data(opt)

            -- Build the model on the client
            model, criterion = build()

            -- Acknowledge successful initialization to the parent
            parallel.parent:send('Received parameters and loaded data successfully')
        else
            parallel.print('received params from batch with index: ', pkg.index)

            -- Load in the parameters sent from the parent
            for i = 1, #model.params do
                model.params[i]:copy(pkg.parameters[i])
            end

            -- Train the model on the batch at the given index
            local pkg_o = train_ind(pkg.index, model, criterion, train_data)

            -- Send the resulting derivatives back to the parameter server
            parallel.print('sending back derivative for batch with index: ', pkg.index)
            parallel.parent:send(pkg_o)
        end
        n_pkg = n_pkg + 1
    end
end


------------
-- Server class
------------

-- Initialization function for the server object. Here we load in the data, build our
-- model, and then add any remote client objects if necessary.
--
-- opt: parsed command-line options table from server.lua; fields used here
-- include remote, localhost, username and torch_path.
function demo_server:__init(opt)
    -- Save the command line options
    self.opt = opt

    -- Used to check files
    require "lfs"

    -- Library used to handle data types
    local data_loc = 'End-To-End-Generative-Dialogue/src/data'
    if not lfs.attributes(data_loc .. '.lua') then
        print('The file data.lua could not be found in ' .. data_loc .. '.lua')
        os.exit()
    end
    data = require(data_loc)

    -- Load in helper functions for this model defined in End-To-End-Generative-Dialogue
    local model_funcs_loc = "End-To-End-Generative-Dialogue/src/model_functions.lua"
    if not lfs.attributes(model_funcs_loc) then
        print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
        os.exit()
    end
    funcs = loadfile(model_funcs_loc)
    funcs()

    -- Load in the data
    self:load_data()

    -- Setup and build the model
    self:build()

    -- Add remote computers if necessary
    if self.opt.remote then
        parallel.print('Runnings clients remotely')

        -- Open the list of client ip addresses
        local fh,err = io.open("../client_list.txt")
        -- NOTE(review): on a missing client list we return with a
        -- half-initialized server; callers only see the printed warning.
        if err then print("../client_list.txt not found"); return; end

        -- Read the client list line by line
        while true do
            local line = fh:read()
            if line == nil then break end
            local addr = self.opt.username .. '@' .. line
            addr = string.gsub(addr, "\n", "") -- remove line breaks

            -- Add the remote server by ip address
            parallel.addremote( {ip=addr, cores=4, lua=self.opt.torch_path, protocol='ssh -ttq -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
            parallel.print('Adding address ', addr)
        end
    elseif opt.localhost then
        -- Remote clients launched through localhost (useful for debugging
        -- the remote path without real remote machines)
        parallel.print('Running clients through localhost')

        parallel.addremote({ip='localhost', cores=4, lua=self.opt.torch_path, protocol='ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
    end
end

-- Main function that runs the server. Here the child clients are forked off and
-- the code in the 'worker' function is sent to the clients to be run. Once
-- the connection is established, :send() and :receive() are used to pass
-- parameters between the client and the server.
function demo_server:run()
    parallel.print('Forking ', self.opt.n_proc, ' processes')
    parallel.sfork(self.opt.n_proc)
    parallel.print('Forked')

    -- Exec worker code in each process
    parallel.children:exec(worker)
    parallel.print('Finished telling workers to execute')

    -- Send the global parameters to the children (first package; see worker)
    parallel.children:join()
    parallel.print('Sending parameters to children')
    parallel.children:send({cmd = cmd, arg = arg, ext = self.opt.extension})

    -- Get the initialization acknowledgements from the children
    replies = parallel.children:receive()
    parallel.print('Replies from children', replies)

    -- Train the model (train is loaded from model_functions.lua in __init)
    train(self.model, self.criterion, self.train_data, self.valid_data)
    parallel.print('Finished training the model')

    -- Sync/terminate when all workers are done
    parallel.children:join('break')
    parallel.print('All processes terminated')
end

-- Function loads the training and validation data into self.train_data and
-- self.valid_data.
function demo_server:load_data()
    -- Simply calls the load_data function defined in "End-To-End-Generative-Dialogue/src/model_functions.lua"
    self.train_data, self.valid_data, self.opt = load_data(self.opt)
end

-- Function loads the nn model and criterion into self.model and self.criterion
function demo_server:build()
    -- Simply calls the build function defined in "End-To-End-Generative-Dialogue/src/model_functions.lua"
    self.model, self.criterion = build()
end

-- Return the server class
return demo_server
9 | if [ -e "install" ] 10 | then 11 | echo -e "\033[0;32minstall folder exists\033[0m" 12 | else 13 | echo -e "\033[0;34mMaking install repo ...\033[0m" 14 | mkdir install 15 | fi 16 | cd install 17 | 18 | # Ensure that parallel is downloaded and installed with local version 19 | if [ -e "lua---parallel" ] 20 | then 21 | echo -e "\033[0;32mparallel exists\033[0m" 22 | else 23 | echo -e "\033[0;34mCloining Parallel Repo ...\033[0m" 24 | git clone https://github.com/clementfarabet/lua---parallel.git &> /dev/null 25 | fi 26 | 27 | cd lua---parallel 28 | echo -e "\033[0;34mCopying local init.lua file for parallel...\033[0m" 29 | cp ../../lua-lua/parallel/init.lua . 30 | echo -e "\033[0;34mBuilding local version of parallel...\033[0m" 31 | luarocks remove parallel &> /dev/null 32 | luarocks make &> /dev/null 33 | echo -e "\033[0;32mInstall complete\033[0m" 34 | 35 | -------------------------------------------------------------------------------- /lua-lua/locally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/locally.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.154.239.139/ada_4_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_4_rem.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.154.239.139/ada_8_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_8_rem.png -------------------------------------------------------------------------------- 
/lua-lua/outputs/104.197.106.197/ada_2_rem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.106.197/ada_2_rem.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.222.148/ada_2_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.222.148/ada_2_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.222.148/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 
32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.250.103/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 
32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/104.197.250.103/reg_2_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.250.103/reg_2_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.192.196/reg_1_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.192.196/reg_1_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.192.196/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 
6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . 
received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.204.149/ada_1_loc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.204.149/ada_1_loc.png -------------------------------------------------------------------------------- /lua-lua/outputs/130.211.204.149/reg_2.txt: -------------------------------------------------------------------------------- 1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144 2 | Loading data... 3 | Source vocab size: 28721, Target vocab size: 42787 4 | Source max sent len: 52, Target max sent len: 52 5 | Done loading data! 6 | 7 | Building model with specs: 8 | Layer type: lstm 9 | Model type: red 10 | Embedding size: 300 11 | Hidden layer size: 300 12 | Number of layers: 2 13 | Number of parameters: 37219687 14 | 15 | Forking 2 processes 16 | Forked 17 | Finished telling workers to execute 18 | Sending parameters to children 19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144 20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144 21 | . Recieved initialization parameters 22 | Recieved initialization parameters 23 | Replies from children { 24 | 1 : "Received parameters and loaded data successfully" 25 | 2 : "Received parameters and loaded data successfully" 26 | } 27 | Beginning training... 28 | Loading data... 29 | Source vocab size: 28721, Target vocab size: 42787 30 | Source max sent len: 52, Target max sent len: 52 31 | Done loading data! 32 | 33 | Building model with specs: 34 | Layer type: lstm 35 | Model type: red 36 | Embedding size: 300 37 | Hidden layer size: 300 38 | Number of layers: 2 39 | Number of parameters: 37219687 40 | 41 | . Loading data... 
42 | Source vocab size: 28721, Target vocab size: 42787 43 | Source max sent len: 52, Target max sent len: 52 44 | Done loading data! 45 | 46 | Building model with specs: 47 | Layer type: lstm 48 | Model type: red 49 | Embedding size: 300 50 | Hidden layer size: 300 51 | Number of layers: 2 52 | Number of parameters: 37219687 53 | 54 | . received params from batch with index: 21 55 | sending back derivative for batch with index: 21 56 | -------------------------------------------------------------------------------- /lua-lua/parse_outputs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | """Copy files from servers 6 | """ 7 | 8 | from __future__ import print_function 9 | 10 | import sys 11 | import os 12 | import re 13 | import numpy as np 14 | import warnings 15 | warnings.filterwarnings("ignore", category=UserWarning) 16 | import matplotlib.pyplot as plt 17 | 18 | 19 | class Print: 20 | def red(self, prt): print("\033[91m{}\033[00m" .format(prt), end="") 21 | def green(self, prt): print("\033[92m{}\033[00m" .format(prt), end="") 22 | def yellow(self, prt): print("\033[93m{}\033[00m" .format(prt), end="") 23 | def lightpurple(self, prt): print("\033[94m{}\033[00m" .format(prt), end="") 24 | def purple(self, prt): print("\033[95m{}\033[00m" .format(prt), end="") 25 | def cyan(self, prt): print("\033[96m{}\033[00m" .format(prt), end="") 26 | def lightgray(self, prt): print("\033[97m{}\033[00m" .format(prt), end="") 27 | def black(self, prt): print("\033[98m{}\033[00m" .format(prt), end="") 28 | 29 | class Result: 30 | def __init__(self, floc): 31 | self.results = [] 32 | self.floc = floc 33 | self.loc_split = floc.split('/') 34 | self.fname = self.loc_split[-1] 35 | self.ip_addr = self.loc_split[-2] 36 | self.no_ext = self.fname.split('.')[0] 37 | self.ada_grad, self.n_proc, self.loc = self.no_ext.split('_') 38 | self.n_proc = int(self.n_proc) 39 | 40 | if 
self.ada_grad == 'ada': 41 | self.ada_grad = 'ada grad SGD' 42 | else: 43 | self.ada_grad = 'simple SGD' 44 | 45 | if self.loc == 'rem': 46 | self.loc = 'remotely' 47 | else: 48 | self.loc = 'locally' 49 | 50 | self.description = '%d processes, %s, running %s' % (self.n_proc, self.ada_grad, self.loc) 51 | 52 | def add_result(self, result): 53 | self.results.append(result) 54 | 55 | def get_data(self, max_epoch, min_t): 56 | return [result.time_ellapse for result in self.results if (max_epoch is None or result.epoch <= max_epoch) and (min_t is None or result.time_ellapse >= min_t)], [np.log(result.perplexity) for result in self.results if (max_epoch is None or result.epoch <= max_epoch) and (min_t is None or result.time_ellapse >= min_t)] 57 | 58 | def graph(self, close = True, out_name = None, max_epoch = None, min_t = None): 59 | times, log_perps = self.get_data(max_epoch, min_t) 60 | 61 | plt.ylabel('Log perplexity') 62 | plt.xlabel('Time (s)') 63 | 64 | plt.title(self.description) 65 | plt.plot(times, log_perps, label = self.description) 66 | 67 | if close: 68 | if out_name == None: 69 | out_name = "/".join(self.loc_split[:-1]) + '/' + self.no_ext + '.png' 70 | 71 | plt.savefig(out_name) 72 | plt.clf() 73 | plt.cla() 74 | plt.close() 75 | 76 | 77 | def display(self): 78 | Print().green('Results for file %s \n' % self.floc) 79 | 80 | Print().lightpurple('Number of processes: ') 81 | print(self.n_proc) 82 | 83 | Print().lightpurple('SGD type: ') 84 | print(self.ada_grad) 85 | 86 | Print().lightpurple('Running location: ') 87 | print(self.loc) 88 | 89 | Print().lightpurple('Server: ') 90 | print(self.ip_addr) 91 | 92 | if len(self.results) == 0: 93 | Print().red('No results\n') 94 | return 95 | 96 | Print().lightpurple('Number of batches: ') 97 | print(self.results[0].n_batch) 98 | 99 | epoch = -1 100 | for result in self.results: 101 | if result.epoch != epoch: 102 | epoch = result.epoch 103 | Print().yellow('Epoch: %d\n' % epoch) 104 | result.display() 105 | 106 
| class DataPoint: 107 | def __init__(self, line): 108 | # Store the line itself 109 | self.line = line 110 | 111 | # The epoch we're on 112 | self.epoch = int(self.clean_match('Epoch: (.*?), Batch:', line)) 113 | 114 | # Current batch, total number of batches, current batchsuze 115 | self.batch_str = self.clean_match('Batch: (.*?), Batch size:', line) 116 | batch_splt = str.split(self.batch_str, '/') 117 | self.batch, self.n_batch = [int(ind) for ind in batch_splt] 118 | self.batch_size = int(self.clean_match('Batch size: (.*?), LR:', line)) 119 | 120 | self.learning_rate = float(self.clean_match('LR: (.*?), PPL: ', line)) 121 | 122 | self.perplexity = float(self.clean_match('PPL: (.*?), |Param|:', line)) 123 | 124 | self.speed = self.clean_match('Training: (.*?) total/source/target', line) 125 | 126 | self.time_ellapse = int(str.split(line)[-1]) 127 | 128 | 129 | def clean_match(self, pattern, string): 130 | res = re.findall(pattern, string) 131 | return filter(lambda x: x != '', res)[0] 132 | def display(self): 133 | args = (self.batch, self.perplexity, self.time_ellapse) 134 | print('Batch: %d, perplexity: %.2f, time: %d\n' % args, end = "") 135 | 136 | class Results: 137 | def __init__(self): 138 | self.results = [] 139 | 140 | def add_result(self, result): 141 | self.results.append(result) 142 | 143 | def graph(self, location = None, max_epoch = None, min_t = None): 144 | for result in self.results: 145 | if location == None or result.loc == location: 146 | result.graph(close = False, max_epoch = max_epoch, min_t = min_t) 147 | if location == None: 148 | out_name = "All.png" 149 | else: 150 | out_name = location + ".png" 151 | plt.title(location) 152 | plt.legend(bbox_to_anchor=(1.05, 1)) 153 | 154 | plt.savefig(out_name) 155 | plt.clf() 156 | plt.cla() 157 | plt.close() 158 | 159 | 160 | 161 | def process_file(path_to_file): 162 | result = Result(path_to_file) 163 | with open(path_to_file) as f: 164 | for line in f: 165 | if 'total/source/target' in line: 
166 | # Parse the line into a DataPoint object 167 | data_point = DataPoint(line) 168 | 169 | # Add the datapoint to the result 170 | result.add_result(data_point) 171 | 172 | result.display() 173 | result.graph() 174 | return result 175 | 176 | 177 | 178 | def main(arguments): 179 | 180 | while True: 181 | print('Copying over files') 182 | # Updating files 183 | os.system('python copy_files.py') 184 | 185 | import time 186 | time.sleep(3) 187 | 188 | # hold the results 189 | results = Results() 190 | 191 | # Get all folders of ip addresses 192 | for ip_fold in os.walk('outputs'): 193 | 194 | # Find the .txt files 195 | for file in os.listdir(ip_fold[0]): 196 | if file.endswith(".txt") and len(file.split('_')) == 3: 197 | 198 | # Full path to the file 199 | full_path = ip_fold[0] + '/' + file 200 | 201 | result = process_file(full_path) 202 | 203 | results.add_result(result) 204 | 205 | results.graph(location = 'locally', max_epoch = 7) 206 | results.graph(location = 'remotely', min_t = 50, max_epoch = 10) 207 | time.sleep(20) 208 | 209 | if __name__ == '__main__': 210 | sys.exit(main(sys.argv[1:])) -------------------------------------------------------------------------------- /lua-lua/remotely.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/remotely.png -------------------------------------------------------------------------------- /lua-lua/server.lua: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------ 2 | -- server.lua 3 | -- 4 | -- A general Distributed SGD Parameter server written in lua/torch 5 | -- 6 | -- The is a general parameter server file. It takes in the command line 7 | -- options that are necessary to launch the server. The server 8 | -- will be of the class: 'server_class'. 
The 'server_class' must 9 | -- be a class defined with two required functions: :__init() and :run() 10 | -- This file will load in the class, creating a new object via 11 | -- the __init() function and then call the run() function inside 12 | -- a protected loop 13 | -- 14 | -- The 'add_to_path' option is a string that will be appended onto the 15 | -- path before requiring the new 'server_class' 16 | -- 17 | -- Run 18 | -- th server.lua --help 19 | -- to see a full list of options for the parameter server 20 | ------------------------------------------------------------------------ 21 |  22 | -- Library used to run clients in parallel 23 | require 'parallel' 24 |  25 | -- Used to update the path variable 26 | require 'package' 27 |  28 | ------------ 29 | -- Options 30 | ------------ 31 |  32 | cmd = torch.CmdLine() 33 |  34 | cmd:text("") 35 | cmd:text("**General options**") 36 | cmd:text("") 37 |  38 | cmd:option('-server_class', 'demo_server', 'Class name to use') 39 | cmd:option('-add_to_path' , './End-To-End-Generative-Dialogue/src/?.lua;', 'A string that will be appended on to the front of the path') 40 |  41 | cmd:text("") 42 | cmd:text("**_____________________________**") 43 | cmd:text("Below are all options specific to models") 44 | cmd:text("**_____________________________**") 45 | cmd:text("") 46 |  47 | cmd:text("") 48 | cmd:text("**Data options**") 49 | cmd:text("") 50 | cmd:option('-data_file', 'data/demo-train.hdf5', 'Path to the training *.hdf5 file') 51 | cmd:option('-val_data_file','data/demo-val.hdf5', 'Path to validation *.hdf5 file') 52 | cmd:option('-save_file', 'demo-seq2seq_lstm', 'Save file name (model will be saved as savefile_epochX_PPL.t7 where X is the X-th epoch and PPL is the validation perplexity') 53 | cmd:option('-train_from', '', 'If training from a checkpoint then this is the path to the pretrained model.') 54 |  55 | cmd:text("") 56 | cmd:text("**Model options**") 57 | cmd:text("") 58 |  59 | cmd:option('-num_layers', 2, 'Number of
layers in the LSTM encoder/decoder') 60 | cmd:option('-hidden_size', 300, 'Size of LSTM hidden states') 61 | cmd:option('-word_vec_size', 300, 'Word embedding sizes') 62 | cmd:option('-layer_type', 'lstm', 'Recurrent layer type (rnn, gru, lstm, fast)') 63 | cmd:option('-model_type', 'red', 'Model structure (red, hred)') 64 | 65 | 66 | cmd:text("") 67 | cmd:text("**Optimization options**") 68 | cmd:text("") 69 | 70 | cmd:option('-num_epochs', 10, 'Number of training epochs') 71 | cmd:option('-start_epoch', 1, 'If loading from a checkpoint, the epoch from which to start') 72 | cmd:option('-param_init', 0.1, 'Parameters are initialized over uniform distribution with support (-param_init, param_init)') 73 | cmd:option('-learning_rate', .01, 'Starting learning rate') 74 | cmd:option('-ada_grad', true, 'When true, update parameters using adagrad algorithm') 75 | cmd:option('-max_grad_norm', 5, 'If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm') 76 | cmd:option('-dropout', 0.3, 'Dropout probability. Dropout is applied between vertical LSTM stacks.') 77 | cmd:option('-lr_decay', 0.5, 'Decay learning rate by this much if (i) perplexity does not decrease on the validation set or (ii) epoch has gone past the start_decay_at_limit') 78 | cmd:option('-start_decay_at', 9, 'Start decay after this epoch') 79 | cmd:option('-fix_word_vecs', 0, 'If = 1, fix lookup table word embeddings') 80 | cmd:option('-beam_k', 5, 'K value to use with beam search') 81 | cmd:option('-max_bleu', 4, 'The number of n-grams used in calculating the bleu score') 82 | cmd:option('-pre_word_vecs', '', 'If a valid path is specified, then this will load pretrained word embeddings (hdf5 file) on the encoder side. See README for specific formatting instructions.') 83 | 84 | cmd:text("") 85 | cmd:text("**Other options**") 86 | cmd:text("") 87 | 88 | -- GPU (not supported on servers) 89 | cmd:option('-gpuid', -1, 'Which gpu to use. 
-1 = use CPU') 90 | cmd:option('-gpuid2', -1, 'If this is >= 0, then the model will use two GPUs whereby the encoder is on the first GPU and the decoder is on the second GPU. This will allow you to train with bigger batches/models.') 91 | 92 | -- Bookkeeping 93 | cmd:option('-save_every', 1, 'Save every this many epochs') 94 | cmd:option('-print_every', 5, 'Print stats after this many batches') 95 | cmd:option('-seed', 3435, 'Seed for random initialization') 96 | 97 | 98 | -- Parallel options 99 | cmd:option('-n_proc', 4, 'The number of processes to farm out') 100 | cmd:option('-remote', false, 'When true, the farmed out processes are run on remote servers. This overrides localhost') 101 | cmd:option('-localhost', false, 'When true, the farmed out processes are run on localhost. ') 102 | 103 | cmd:option('-torch_path', '/Users/michaelfarrell/torch/install/bin/th', 'The path to the torch directory on the client computers') 104 | cmd:option('-extension', '', 'The location from the home directory to the lua-lua folder on the client computer') 105 | cmd:option('-username', 'michaelfarrell', 'The username for connecting used for connecting to remote clients') 106 | 107 | -- Parse arguments 108 | opt = cmd:parse(arg) 109 | torch.manualSeed(opt.seed) 110 | 111 | -- Indicate we are running things in parallel 112 | opt.parallel = true 113 | 114 | -- The print function 115 | opt.print = parallel.print 116 | 117 | -- Add on location to path of new class if not already in path 118 | package.path = opt.add_to_path .. 
package.path 119 | 120 | -- Main server function, initializes and runs 121 | function server_main() 122 | -- Load in the class type 123 | server = require(opt.server_class) 124 | 125 | -- Print from parent process 126 | parallel.print('Im the parent, my ID is: ', parallel.id, ' and my IP: ', parallel.ip) 127 | 128 | -- Create a new server 129 | param_server = server.new(opt) 130 | 131 | -- Run the server 132 | param_server:run() 133 | 134 | end 135 | 136 | -- Protected execution of parllalel script: 137 | ok, err = pcall(server_main) 138 | if not ok then print(err) parallel.close() end 139 | -------------------------------------------------------------------------------- /lua-lua/setup_image.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # setup_image.sh 4 | # 5 | # This is a bash script that is used to setup an image on the google cloud server 6 | # it copies over the startup script, runs the script, disconnects and reconnects, 7 | # then reruns the startup script 8 | 9 | # Copy over the startup script 10 | scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey startup.sh $USERNAME@$EXTERNAL_IP:~/ 11 | 12 | # Run the startup script on the server 13 | echo "bash startup.sh" | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 14 | 15 | # Disconnect from the server, reconnect and finish running last things needed for initialization 16 | echo "bash startup.sh; " | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP 17 | -------------------------------------------------------------------------------- /lua-lua/startup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # gcloud_startup.sh 4 | # 5 | # This is a bash script that is used to setup a google cloud server. 
This script 6 | # will install the following on the server: 7 | # - git 8 | # - luarocks 9 | # - pip 10 | # - torch 11 | # - lua-parallel (local version) 12 | # - rnn (torch) 13 | # - hdf5 (torch) 14 | # - anaconda 15 | # - h5py 16 | # The script will also clone the Distributed-SGD repo onto the server 17 |  18 | # Ensure that git is installed 19 | if hash git &> /dev/null 20 | then 21 |     echo -e "\033[0;32mgit installed\033[0m" 22 | else 23 |     echo -e "\033[0;34mInstalling git ...\033[0m" 24 |     (echo "Y" | sudo apt-get install git) > /dev/null 25 | fi 26 |  27 | # Ensure that luarocks is installed 28 | if hash luarocks &> /dev/null 29 | then 30 |     echo -e "\033[0;32mluarocks installed\033[0m" 31 | else 32 |     echo -e "\033[0;34mInstalling luarocks ...\033[0m" 33 |     (echo "Y" | sudo apt-get install luarocks) &> /dev/null 34 | fi 35 |  36 | # Ensure that pip is installed 37 | if hash pip &> /dev/null 38 | then 39 |     echo -e "\033[0;32mpython-pip installed\033[0m" 40 | else 41 |     echo -e "\033[0;34mInstalling python-pip ...\033[0m" 42 |     (echo "Y" | sudo apt-get install python-pip) > /dev/null 43 | fi 44 |  45 | source ~/.profile 46 |  47 | # Ensure that torch is installed 48 | if hash th &> /dev/null 49 | then 50 |     echo -e "\033[0;32mtorch installed\033[0m" 51 | else 52 |     echo -e "\033[0;34mInstalling torch ...\033[0m" 53 |     git clone https://github.com/torch/distro.git ~/torch --recursive &> /dev/null 54 |     cd ~/torch 55 |     bash install-deps > /dev/null 2>&1 56 |     echo "yes" | ./install.sh > /dev/null 2>&1 57 |     cd ..
58 | source ~/.profile 59 | fi 60 | 61 | # Ensure that rnn is installed 62 | if (luarocks list | grep -q rnn) &> /dev/null 63 | then 64 | echo -e "\033[0;32mrnn installed\033[0m" 65 | else 66 | echo -e "\033[0;34mInstalling rnn ...\033[0m" 67 | luarocks install rnn &> /dev/null 68 | fi 69 | 70 | # Ensure that torch-hdf5 is installed 71 | if (luarocks list | grep -q hdf5) &> /dev/null 72 | then 73 | echo -e "\033[0;32mhdf5 installed\033[0m" 74 | else 75 | echo -e "\033[0;34mInstalling hdf5 ...\033[0m" 76 | echo "Y" | sudo apt-get install libhdf5-serial-dev hdf5-tools > /dev/null 77 | git clone https://github.com/deepmind/torch-hdf5.git &> /dev/null 78 | cd torch-hdf5 79 | luarocks make hdf5-0-0.rockspec &> /dev/null 80 | cd .. 81 | fi 82 | 83 | # Make sure that the Distributed SGD is downloaded and isntalled 84 | if [ -e "Distributed-SGD" ] 85 | then 86 | # Update the repos 87 | echo -e "\033[0;34mPulling Distributed-SGD repo changes ...\033[0m" 88 | cd Distributed-SGD 89 | git pull &> /dev/null 90 | cd lua-lua/End-To-End-Generative-Dialogue 91 | echo -e "\033[0;34mPulling End-To-End-Generative-Dialogue repo changes ...\033[0m" 92 | git pull origin master &> /dev/null 93 | 94 | cd ../../.. 
95 | else 96 |     # Clone repo and install parallel 97 |     echo -e "\033[0;34mCloning repo Distributed-SGD ...\033[0m" 98 |     git clone --recursive https://github.com/michaelfarrell76/Distributed-SGD.git &> /dev/null 99 |     cd Distributed-SGD/lua-lua 100 |     bash install_parallel.sh 101 |     cd ../../ 102 | fi 103 |  104 | # Ensure that anaconda is installed 105 | if [ -e "anaconda2" ] 106 | then 107 |     echo -e "\033[0;32manaconda installed\033[0m" 108 |     echo -e "\033[0;34mInstalling h5py ...\033[0m" 109 |  110 |     # Install hdf5 for python 111 |     echo "y" | conda install h5py &> /dev/null 112 | else 113 |     echo -e "\033[0;34mDownloading anaconda ...\033[0m" 114 |     wget http://repo.continuum.io/archive/Anaconda2-4.0.0-Linux-x86_64.sh &> /dev/null 115 |     echo -e "\033[0;34mInstalling anaconda ...\033[0m" 116 |     bash Anaconda2-4.0.0-Linux-x86_64.sh -b > /dev/null 117 |     rm Anaconda2-4.0.0-Linux-x86_64.sh 118 |     echo 'export PATH="/home/michaelfarrell/anaconda2/bin:$PATH"' >> .bashrc 119 |     echo -e "\033[0;33mIn order for python to be run, you must logout and log back in\033[0m" 120 | fi 121 |  122 |  -------------------------------------------------------------------------------- /python-python/README.md: -------------------------------------------------------------------------------- 1 | # Distributed-SGD for Python 2 | An implementation of distributed stochastic gradient descent in python. Clients can be local and remote. For this task, you can download the data from http://www.vision.caltech.edu/Image_Datasets/Caltech101/. 3 |  4 | ## Requirements 5 |  6 | This code is written entirely in Python, and an installation of gRPC, Numpy, Scipy, and Autograd are necessary. These packages can be easily installed through PIP using the following commands. 7 |  8 | ```bash 9 | $ pip install numpy 10 | $ pip install scipy 11 | $ pip install autograd 12 | $ pip install grpcio 13 | ``` 14 |  15 | For launching the code remotely, we will be working with Google Cloud Compute.
In order to interact with GCloud instances, please install the GCloud sdk. This is located here: https://cloud.google.com/sdk/. 16 | 17 | ## Directory Table of Contents 18 | ``` 19 | . 20 | ├── 101_ObjectCategories # Folder holding the raw data from the 101_ObjectCategories 21 | |-- data # Folder holding the processed data 22 | ├── client.py # Python script used to initiate a client 23 | |── server.py # Python script to manually initiate a server 24 | ├── dist_sgd_pb2.py # Automatically compiled protobufs for the parameter server 25 | ├── README.md # Python usage 26 | ├── images(16).npy # Extremely small dataset included for reference 27 | ├── output_labels(16).npy # Classifications of each image for the extremely small dataset 28 | ├── nnet # Folder that includes a module for a convolution neural net 29 | ├── protobuf_utils # Folder that includes utilities for manipulating tensor protobuffers 30 | ├── run_codegen.sh # Shell command used to generates the protobuffers 31 | └── start.sh # Script that launches client.py on when running within gCloud 32 | ``` 33 | 34 | ## Description 35 | 36 | ## Local Usage Instructions 37 | To launch clients locally, in three different terminals, simply run: 38 | ```bash 39 | $ python client.py --id 1 40 | $ python client.py --id 2 41 | $ python client.py --id 3 42 | ``` 43 | 44 | #### Remote Usage Instructions 45 | 46 | ##### Create a baseline startup image 47 | 48 | We only have to setup and install everything once, after which we can clone that image repeatedly when we launch VMs. 49 | 50 | ###### Create the image 51 | - Click on the 'VM Instances' tab 52 | - Create Instance 53 | - Give the instance a name i.e. 
'train-conv-nn' 54 | - Set the zone to us-central1-b 55 | - Choose 2vCPU highmem as machine type 56 | - Under boot disk click change 57 | - Choose Ubuntu 14.04 LTS 58 | - At the bottom change size to 30 GB and click 'select' 59 | - Allow HTTP traffic 60 | - Allow HTTPS traffic 61 | - Click 'Management, disk, networking, SSH keys' to dropdown more options 62 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted' 63 | - Click 'Create' and you should see your new instance listed in the table 64 |  65 | ###### Setup the disk 66 | - Run the command gcloud init and log into your Google Cloud account 67 | - Run the command to SSH into your instance: 68 | ```bash 69 | $ gcloud compute ssh train-conv-nn --zone us-central1-b 70 | ``` 71 | - After logging in, we can clone the repository and install the necessary requirements. 72 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard 73 | - Go to the 'VM Dashboard' 74 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted. 75 |  76 | ###### Create the image 77 |  78 | - Click on the 'Images' tab 79 | - 'Create Image' 80 | - Give it a name i.e. 'train-conv-image' 81 | - Under Source-Disk, choose the disk that you just created 82 | - Create 83 |  84 | ##### Generate an 'Instance Template' 85 | - Click on the 'Instance templates' tab 86 | - Create new 87 | - Name the template i.e. 'train-conv-template' 88 | - Under 'Boot Disk' click change 89 | - At the top click 'Your image' 90 | - Choose the image you just created i.e.
'train-conv-image' 91 | - Set size to 30 GB 92 | - Select 93 | - Allow HTTP traffic 94 | - Allow HTTPS traffic 95 | - Under more->Management, include cd ~/distributed-sgd/python-python; sh start.sh 96 | in startup script 97 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted' 98 | - Create 99 | 100 | ##### Generate an 'Instance Group' 101 | - Go to the "Instance groups" tab 102 | - Create instance group 103 | - Give the group a name, i.e. 'train-conv-group' 104 | - Give a description 105 | - Set zone to us-central1-b 106 | - Use instance template 107 | - Choose the template you just made i.e. 'train-conv-template' 108 | - Set the number of instances 109 | - Create 110 | - Wait for the instances to launch 111 | - Once there is a green checkmark, click on the new instance 112 | 113 | All instances in the instance group are now running the python client.py command and will begin training. 114 | SSH into any of the instances to see their progress. 115 | 116 | ## Acknowledgments 117 | 118 | Our implementation adapts code for the convolutional neural net from the Autograd convolution neural net example: 119 | 120 | * [Autograd](https://github.com/HIPS/autograd) -------------------------------------------------------------------------------- /python-python/client.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------ 2 | # Implements a client that runs backpropogation on batches 3 | # provided by the server. If no server exists, then Paxos 4 | # is called to generate a server. 
5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | from grpc.beta import implementations 10 | import time 11 | import sys 12 | 13 | import dist_sgd_pb2 14 | import argparse 15 | import traceback 16 | 17 | import autograd.numpy as np 18 | import autograd.numpy.random as npr 19 | from autograd import grad 20 | 21 | from nnet.neural_net import * 22 | from protobuf_utils.utils import * 23 | from server_utils.utils import * 24 | 25 | from server import serve 26 | from paxos import run_paxos 27 | import subprocess 28 | 29 | 30 | # Loads in a really small version of the data that could fit in Github. 31 | # It will train extremely quickly as a result. 32 | images_fname = 'data/images(16).npy' 33 | labels_fname = 'data/output_labels(16).npy' 34 | 35 | _TIMEOUT_SECONDS = 20 36 | TENSOR_TIMEOUT_SECONDS = 60 37 | SERVER_PORT = 50051 38 | 39 | # Loops through all possible addressses that are part of the instance 40 | # group if this is launched on a remote server. Loops through all possible 41 | # addresses that are part of the local server as well. 
42 | # Determines whether or not a server exists by trying to connect with the 43 | # a predefined port on the server 44 | def find_server(local_id=None): 45 | TOT_ATTEMPTS = 1 46 | for i in range(TOT_ATTEMPTS): 47 | # Generates local address information 48 | local_address = gen_local_address(local_id) 49 | server_addresses = gen_server_addresses(local_id, local_address) 50 | server_addresses.remove(local_address) 51 | 52 | # Loops through all the servers and tries to makes the server stub 53 | for server_address in server_addresses: 54 | if local_id is not None: 55 | channel = implementations.insecure_channel('localhost', SERVER_PORT) 56 | else: 57 | channel = implementations.insecure_channel(server_address, SERVER_PORT) 58 | stub = dist_sgd_pb2.beta_create_ParamFeeder_stub(channel) 59 | try: 60 | # Attempts to ping the server to see if the port is open 61 | response = stub.ping(dist_sgd_pb2.empty(), _TIMEOUT_SECONDS) 62 | 63 | # If the PING succeeds, then it is the server 64 | return server_address 65 | 66 | except Exception as e: 67 | # Log any network or expiration errors we run into 68 | if ('ExpirationError' in str(e) or 'NetworkError' in str(e)): 69 | log_info(str(e)) 70 | continue 71 | else: 72 | # More severe error, should log and crash 73 | traceback.print_exc() 74 | sys.exit(1) 75 | time.sleep(1 * TOT_ATTEMPTS) 76 | return '' 77 | 78 | # After determining the correct server, generate the stub for it 79 | def connect_server_stub(server_addr, local_id): 80 | if local_id is not None: 81 | channel = implementations.insecure_channel('localhost', SERVER_PORT) 82 | else: 83 | channel = implementations.insecure_channel(server_addr, SERVER_PORT) 84 | stub = dist_sgd_pb2.beta_create_ParamFeeder_stub(channel) 85 | return stub 86 | 87 | 88 | # Main function of the client that loops forever. Receieves parameters and 89 | # batch information from the server. 
# Main function of the client that loops forever. Receives parameters and
# batch information from the server. Calculates gradients and sends them
# to the server.
def run(local_id=None):
    """Client main loop.

    Discovers the parameter server (via Paxos, falling back to pinging all
    known addresses), then repeatedly: fetches the current weight vector,
    computes the gradient for the batch the server assigned, and streams
    the gradient back. If this process is elected server it transforms
    into one instead. Returns when the server signals completion
    (data_indx == -2) or after a server failover; the caller re-invokes it.
    """
    # Load and process Caltech data
    train_images, train_labels, test_images, test_labels = load_caltech100(images_fname, labels_fname)
    image_input_d = train_images.shape[1]

    # Network parameters
    layer_sizes = [image_input_d, 800, 600, 400, 350, 250, 101]
    L2_reg = 1.0

    # Training parameters (param_scale/momentum/num_epochs mirror the
    # server-side configuration; only batch_size is used on the client)
    param_scale = 0.1
    momentum = 0.9
    batch_size = 256
    num_epochs = 50

    # Make neural net functions
    N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    loss_grad = grad(loss_fun)

    # Batch index slices into the training set; the server refers to
    # batches by index into this list
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    cur_dir = np.zeros(N_weights)

    # Previous batch for the purpose of timing
    prev_data_indx = -1

    # Number of consecutive expirations, used to detect server failure
    consec_expiration = 0

    # Most recent parameter vector received from the server. Initialized so
    # the failover path below cannot hit a NameError when a failure occurs
    # before the first successful parameter fetch.
    W = None

    # Determine the server address by running Paxos or pinging all addresses
    server_addr = ''
    while server_addr == '':
        server_addr = run_paxos(local_id)
        if server_addr == '':
            server_addr = find_server(local_id)
    log_info('Server address is ' + server_addr)

    # If this client is selected to be server, then transform into a server
    if server_addr == gen_local_address(local_id):
        log_info('Transforming into the server')
        try:
            serve(server_addr, None, prev_data_indx, local_id)
        except KeyboardInterrupt:
            log_info('interrupted')
            sys.exit(0)
        return

    # Generates the server stub and connects with it
    stub = connect_server_stub(server_addr, local_id)
    client_id = 0

    log_info('Data loaded and connected to server:')

    try:
        # Gets the next batch that it should run
        response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
        # data_indx == -2 is the server's "training finished" sentinel
        while response.data_indx != -2:
            client_id = response.client_id
            # data_indx == -1 means no batch is available yet; poll until
            # the server hands us our first batch
            while response.data_indx == -1:
                time.sleep(5)
                log_info('Waiting for server to send next batch')
                response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
            log_info('Processing parameters in batch %d!' % response.data_indx)

            # Reassemble the parameter vector, which arrives as a stream
            # of serialized sub-tensor chunks
            get_parameters_time = time.time()
            W_bytes = ''
            W_subtensors_iter = stub.SendParams(dist_sgd_pb2.ClientInfo(client_id=client_id), TENSOR_TIMEOUT_SECONDS)
            for W_subtensor_pb in W_subtensors_iter:
                W_bytes = W_bytes + W_subtensor_pb.tensor_content
            W = convert_bytes_to_array(W_bytes)
            log_info('Received parameters in {0:.2f}s'.format(time.time() - get_parameters_time))

            # Calculate the gradients
            grad_start = time.time()
            grad_W = loss_grad(W, train_images[batch_idxs[response.data_indx]], train_labels[batch_idxs[response.data_indx]])
            log_info('Done calculating gradients in {0:.2f}s'.format(time.time() - grad_start))

            # Serialize the gradients
            tensor_compress_start = time.time()
            tensor_bytes = convert_array_to_bytes(grad_W)
            tensor_iterator = convert_tensor_iter(tensor_bytes, response.data_indx)
            log_info('Done compressing gradients in {0:.2f}s'.format(time.time() - tensor_compress_start))

            # Send the gradients
            send_grad_start = time.time()
            stub.GetUpdates(tensor_iterator, _TIMEOUT_SECONDS)
            log_info('Done sending gradients through in {0:.2f}s'.format(time.time() - send_grad_start))

            # Get the next batch to process
            prev_data_indx = response.data_indx
            response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)

            # A full round trip succeeded, so the server is alive again
            consec_expiration = 0
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        if 'ExpirationError' in str(e) or 'NetworkError' in str(e):
            # Number of consecutive timeouts tolerated before we declare
            # the server dead
            SERVER_CONSEC_FAILURE = 2
            # Count the failures of the server
            consec_expiration += 1

            # If consecutive failures exceed a predefined value, then we look
            # for the server by pinging available instances or by restarting
            # Paxos
            if consec_expiration == SERVER_CONSEC_FAILURE:
                log_info('Failure to connect to server_stub. Starting Paxos')
                # Bug fix: clear the stale address first. Previously
                # server_addr still held the dead server's address, so the
                # discovery loop below was skipped entirely and the client
                # never re-ran Paxos or re-pinged the group.
                server_addr = ''
                while server_addr == '':
                    server_addr = run_paxos(local_id)
                    if server_addr == '':
                        server_addr = find_server(local_id)
                # Becomes the server if it is chosen to be the server,
                # seeding it with the last parameters we received
                # (None if we never completed a fetch)
                if server_addr == gen_local_address(local_id):
                    serve(server_addr, W, prev_data_indx, local_id)
                    return
                # Bug fix: connect_server_stub requires local_id as well;
                # the old call raised TypeError on every reconnect
                stub = connect_server_stub(server_addr, local_id)
        else:
            log_info(traceback.print_exc())
            sys.exit(0)

if __name__ == '__main__':
    log_info('Starting client')
    parser = argparse.ArgumentParser()
    parser.add_argument('--id')
    args = parser.parse_args()

    # Local id is only used if running the machine locally
    local_id = args.id
    if local_id is not None:
        local_id = int(local_id)
        assert(local_id > 0)
    while True:
        run(local_id)
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/data/output_labels(16).npy -------------------------------------------------------------------------------- /python-python/dist_sgd_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: dist_sgd.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='dist_sgd.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0e\x64ist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 
\x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _SUBTENSOR = _descriptor.Descriptor( 30 | name='SubTensor', 31 | full_name='dist_sgd.SubTensor', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0, 38 | number=1, type=5, cpp_type=1, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1, 45 | number=2, type=5, cpp_type=1, label=1, 46 | has_default_value=False, default_value=0, 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2, 52 | number=3, type=12, cpp_type=9, label=1, 53 | has_default_value=False, default_value=_b(""), 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor( 58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3, 59 | number=4, type=5, cpp_type=1, label=1, 60 | has_default_value=False, default_value=0, 61 | message_type=None, enum_type=None, containing_type=None, 62 | is_extension=False, 
extension_scope=None, 63 | options=None), 64 | ], 65 | extensions=[ 66 | ], 67 | nested_types=[], 68 | enum_types=[ 69 | ], 70 | options=None, 71 | is_extendable=False, 72 | syntax='proto3', 73 | extension_ranges=[], 74 | oneofs=[ 75 | ], 76 | serialized_start=28, 77 | serialized_end=124, 78 | ) 79 | 80 | 81 | _CLIENTINFO = _descriptor.Descriptor( 82 | name='ClientInfo', 83 | full_name='dist_sgd.ClientInfo', 84 | filename=None, 85 | file=DESCRIPTOR, 86 | containing_type=None, 87 | fields=[ 88 | _descriptor.FieldDescriptor( 89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0, 90 | number=1, type=5, cpp_type=1, label=1, 91 | has_default_value=False, default_value=0, 92 | message_type=None, enum_type=None, containing_type=None, 93 | is_extension=False, extension_scope=None, 94 | options=None), 95 | ], 96 | extensions=[ 97 | ], 98 | nested_types=[], 99 | enum_types=[ 100 | ], 101 | options=None, 102 | is_extendable=False, 103 | syntax='proto3', 104 | extension_ranges=[], 105 | oneofs=[ 106 | ], 107 | serialized_start=126, 108 | serialized_end=157, 109 | ) 110 | 111 | 112 | _STATUSCODE = _descriptor.Descriptor( 113 | name='StatusCode', 114 | full_name='dist_sgd.StatusCode', 115 | filename=None, 116 | file=DESCRIPTOR, 117 | containing_type=None, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='status', full_name='dist_sgd.StatusCode.status', index=0, 121 | number=1, type=5, cpp_type=1, label=1, 122 | has_default_value=False, default_value=0, 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=159, 139 | serialized_end=187, 140 | ) 141 | 142 | 143 | _PREVBATCH = _descriptor.Descriptor( 144 | name='PrevBatch', 145 | 
full_name='dist_sgd.PrevBatch', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0, 152 | number=1, type=5, cpp_type=1, label=1, 153 | has_default_value=False, default_value=0, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | _descriptor.FieldDescriptor( 158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1, 159 | number=2, type=5, cpp_type=1, label=1, 160 | has_default_value=False, default_value=0, 161 | message_type=None, enum_type=None, containing_type=None, 162 | is_extension=False, extension_scope=None, 163 | options=None), 164 | ], 165 | extensions=[ 166 | ], 167 | nested_types=[], 168 | enum_types=[ 169 | ], 170 | options=None, 171 | is_extendable=False, 172 | syntax='proto3', 173 | extension_ranges=[], 174 | oneofs=[ 175 | ], 176 | serialized_start=189, 177 | serialized_end=243, 178 | ) 179 | 180 | 181 | _NEXTBATCH = _descriptor.Descriptor( 182 | name='NextBatch', 183 | full_name='dist_sgd.NextBatch', 184 | filename=None, 185 | file=DESCRIPTOR, 186 | containing_type=None, 187 | fields=[ 188 | _descriptor.FieldDescriptor( 189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0, 190 | number=1, type=5, cpp_type=1, label=1, 191 | has_default_value=False, default_value=0, 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | _descriptor.FieldDescriptor( 196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1, 197 | number=2, type=5, cpp_type=1, label=1, 198 | has_default_value=False, default_value=0, 199 | message_type=None, enum_type=None, containing_type=None, 200 | is_extension=False, extension_scope=None, 201 | options=None), 202 | ], 203 | extensions=[ 204 | ], 205 | 
nested_types=[], 206 | enum_types=[ 207 | ], 208 | options=None, 209 | is_extendable=False, 210 | syntax='proto3', 211 | extension_ranges=[], 212 | oneofs=[ 213 | ], 214 | serialized_start=245, 215 | serialized_end=294, 216 | ) 217 | 218 | 219 | _EMPTY = _descriptor.Descriptor( 220 | name='empty', 221 | full_name='dist_sgd.empty', 222 | filename=None, 223 | file=DESCRIPTOR, 224 | containing_type=None, 225 | fields=[ 226 | ], 227 | extensions=[ 228 | ], 229 | nested_types=[], 230 | enum_types=[ 231 | ], 232 | options=None, 233 | is_extendable=False, 234 | syntax='proto3', 235 | extension_ranges=[], 236 | oneofs=[ 237 | ], 238 | serialized_start=296, 239 | serialized_end=303, 240 | ) 241 | 242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR 243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO 244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE 245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH 246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH 247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 248 | 249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict( 250 | DESCRIPTOR = _SUBTENSOR, 251 | __module__ = 'dist_sgd_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor) 253 | )) 254 | _sym_db.RegisterMessage(SubTensor) 255 | 256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict( 257 | DESCRIPTOR = _CLIENTINFO, 258 | __module__ = 'dist_sgd_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo) 260 | )) 261 | _sym_db.RegisterMessage(ClientInfo) 262 | 263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict( 264 | DESCRIPTOR = _STATUSCODE, 265 | __module__ = 'dist_sgd_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode) 267 | )) 268 | _sym_db.RegisterMessage(StatusCode) 269 | 270 | PrevBatch = 
_reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict( 271 | DESCRIPTOR = _PREVBATCH, 272 | __module__ = 'dist_sgd_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch) 274 | )) 275 | _sym_db.RegisterMessage(PrevBatch) 276 | 277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict( 278 | DESCRIPTOR = _NEXTBATCH, 279 | __module__ = 'dist_sgd_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch) 281 | )) 282 | _sym_db.RegisterMessage(NextBatch) 283 | 284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 285 | DESCRIPTOR = _EMPTY, 286 | __module__ = 'dist_sgd_pb2' 287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 288 | )) 289 | _sym_db.RegisterMessage(empty) 290 | 291 | 292 | DESCRIPTOR.has_options = True 293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 294 | import abc 295 | import six 296 | from grpc.beta import implementations as beta_implementations 297 | from grpc.framework.common import cardinality 298 | from grpc.framework.interfaces.face import utilities as face_utilities 299 | 300 | class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)): 301 | """""" 302 | @abc.abstractmethod 303 | def SendParams(self, request, context): 304 | raise NotImplementedError() 305 | @abc.abstractmethod 306 | def SendNextBatch(self, request, context): 307 | raise NotImplementedError() 308 | @abc.abstractmethod 309 | def GetUpdates(self, request_iterator, context): 310 | raise NotImplementedError() 311 | @abc.abstractmethod 312 | def ping(self, request, context): 313 | raise NotImplementedError() 314 | 315 | class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)): 316 | """The interface to which stubs will conform.""" 317 | @abc.abstractmethod 318 | def SendParams(self, request, timeout): 319 | raise NotImplementedError() 320 | 
@abc.abstractmethod 321 | def SendNextBatch(self, request, timeout): 322 | raise NotImplementedError() 323 | SendNextBatch.future = None 324 | @abc.abstractmethod 325 | def GetUpdates(self, request_iterator, timeout): 326 | raise NotImplementedError() 327 | GetUpdates.future = None 328 | @abc.abstractmethod 329 | def ping(self, request, timeout): 330 | raise NotImplementedError() 331 | ping.future = None 332 | 333 | def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 334 | import dist_sgd_pb2 335 | import dist_sgd_pb2 336 | import dist_sgd_pb2 337 | import dist_sgd_pb2 338 | import dist_sgd_pb2 339 | import dist_sgd_pb2 340 | import dist_sgd_pb2 341 | import dist_sgd_pb2 342 | request_deserializers = { 343 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.FromString, 344 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.FromString, 345 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.FromString, 346 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString, 347 | } 348 | response_serializers = { 349 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.SerializeToString, 350 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.SerializeToString, 351 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.SerializeToString, 352 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString, 353 | } 354 | method_implementations = { 355 | ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates), 356 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch), 357 | ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams), 358 | ('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 359 | } 360 | server_options = 
beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 361 | return beta_implementations.server(method_implementations, options=server_options) 362 | 363 | def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 364 | import dist_sgd_pb2 365 | import dist_sgd_pb2 366 | import dist_sgd_pb2 367 | import dist_sgd_pb2 368 | import dist_sgd_pb2 369 | import dist_sgd_pb2 370 | import dist_sgd_pb2 371 | import dist_sgd_pb2 372 | request_serializers = { 373 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.SerializeToString, 374 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.SerializeToString, 375 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.SerializeToString, 376 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString, 377 | } 378 | response_deserializers = { 379 | ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.FromString, 380 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.FromString, 381 | ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.FromString, 382 | ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString, 383 | } 384 | cardinalities = { 385 | 'GetUpdates': cardinality.Cardinality.STREAM_UNARY, 386 | 'SendNextBatch': cardinality.Cardinality.UNARY_UNARY, 387 | 'SendParams': cardinality.Cardinality.UNARY_STREAM, 388 | 'ping': cardinality.Cardinality.UNARY_UNARY, 389 | } 390 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 391 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, 
options=stub_options) 392 | # @@protoc_insertion_point(module_scope) 393 | -------------------------------------------------------------------------------- /python-python/image_classes.txt: -------------------------------------------------------------------------------- 1 | 0,accordion 2 | 1,airplanes 3 | 2,anchor 4 | 3,ant 5 | 4,BACKGROUND_Google 6 | 5,barrel 7 | 6,bass 8 | 7,beaver 9 | 8,binocular 10 | 9,bonsai 11 | 10,brain 12 | 11,brontosaurus 13 | 12,buddha 14 | 13,butterfly 15 | 14,camera 16 | 15,cannon 17 | 16,car_side 18 | 17,ceiling_fan 19 | 18,cellphone 20 | 19,chair 21 | 20,chandelier 22 | 21,cougar_body 23 | 22,cougar_face 24 | 23,crab 25 | 24,crayfish 26 | 25,crocodile 27 | 26,crocodile_head 28 | 27,cup 29 | 28,dalmatian 30 | 29,dollar_bill 31 | 30,dolphin 32 | 31,dragonfly 33 | 32,electric_guitar 34 | 33,elephant 35 | 34,emu 36 | 35,euphonium 37 | 36,ewer 38 | 37,Faces 39 | 38,Faces_easy 40 | 39,ferry 41 | 40,flamingo 42 | 41,flamingo_head 43 | 42,garfield 44 | 43,gerenuk 45 | 44,gramophone 46 | 45,grand_piano 47 | 46,hawksbill 48 | 47,headphone 49 | 48,hedgehog 50 | 49,helicopter 51 | 50,ibis 52 | 51,inline_skate 53 | 52,joshua_tree 54 | 53,kangaroo 55 | 54,ketch 56 | 55,lamp 57 | 56,laptop 58 | 57,Leopards 59 | 58,llama 60 | 59,lobster 61 | 60,lotus 62 | 61,mandolin 63 | 62,mayfly 64 | 63,menorah 65 | 64,metronome 66 | 65,minaret 67 | 66,Motorbikes 68 | 67,nautilus 69 | 68,octopus 70 | 69,okapi 71 | 70,pagoda 72 | 71,panda 73 | 72,pigeon 74 | 73,pizza 75 | 74,platypus 76 | 75,pyramid 77 | 76,revolver 78 | 77,rhino 79 | 78,rooster 80 | 79,saxophone 81 | 80,schooner 82 | 81,scissors 83 | 82,scorpion 84 | 83,sea_horse 85 | 84,snoopy 86 | 85,soccer_ball 87 | 86,stapler 88 | 87,starfish 89 | 88,stegosaurus 90 | 89,stop_sign 91 | 90,strawberry 92 | 91,sunflower 93 | 92,tick 94 | 93,trilobite 95 | 94,umbrella 96 | 95,watch 97 | 96,water_lilly 98 | 97,wheelchair 99 | 98,wild_cat 100 | 99,windsor_chair 101 | 100,wrench 102 | 101,yin_yang 103 | 
-------------------------------------------------------------------------------- /python-python/neural_net.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from scipy.ndimage import imread 4 | from scipy.misc import imresize 5 | from sklearn.cross_validation import train_test_split 6 | from os import listdir 7 | from os.path import isfile, join 8 | import autograd.numpy as np 9 | import autograd.numpy.random as npr 10 | from autograd.scipy.misc import logsumexp 11 | from autograd import grad 12 | from autograd.util import quick_grad_check 13 | import gc 14 | import resource 15 | from sys import getsizeof 16 | import os 17 | import psutil 18 | 19 | # {0: 'accordion', 1: 'airplanes', 2: 'anchor', 3: 'ant', 4: 'BACKGROUND_Google', 5: 'barrel', 6: 'bass', 7: 'beaver', 8: 'binocular', 9: 'bonsai', 10: 'brain', 11: 'brontosaurus', 12: 'buddha', 13: 'butterfly', 14: 'camera', 15: 'cannon', 16: 'car_side', 17: 'ceiling_fan', 18: 'cellphone', 19: 'chair', 20: 'chandelier', 21: 'cougar_body', 22: 'cougar_face', 23: 'crab', 24: 'crayfish', 25: 'crocodile', 26: 'crocodile_head', 27: 'cup', 28: 'dalmatian', 29: 'dollar_bill', 30: 'dolphin', 31: 'dragonfly', 32: 'electric_guitar', 33: 'elephant', 34: 'emu', 35: 'euphonium', 36: 'ewer', 37: 'Faces', 38: 'Faces_easy', 39: 'ferry', 40: 'flamingo', 41: 'flamingo_head', 42: 'garfield', 43: 'gerenuk', 44: 'gramophone', 45: 'grand_piano', 46: 'hawksbill', 47: 'headphone', 48: 'hedgehog', 49: 'helicopter', 50: 'ibis', 51: 'inline_skate', 52: 'joshua_tree', 53: 'kangaroo', 54: 'ketch', 55: 'lamp', 56: 'laptop', 57: 'Leopards', 58: 'llama', 59: 'lobster', 60: 'lotus', 61: 'mandolin', 62: 'mayfly', 63: 'menorah', 64: 'metronome', 65: 'minaret', 66: 'Motorbikes', 67: 'nautilus', 68: 'octopus', 69: 'okapi', 70: 'pagoda', 71: 'panda', 72: 'pigeon', 73: 'pizza', 74: 'platypus', 75: 'pyramid', 76: 'revolver', 77: 'rhino', 
78: 'rooster', 79: 'saxophone', 80: 'schooner', 81: 'scissors', 82: 'scorpion', 83: 'sea_horse', 84: 'snoopy', 85: 'soccer_ball', 86: 'stapler', 87: 'starfish', 88: 'stegosaurus', 89: 'stop_sign', 90: 'strawberry', 91: 'sunflower', 92: 'tick', 93: 'trilobite', 94: 'umbrella', 95: 'watch', 96: 'water_lilly', 97: 'wheelchair', 98: 'wild_cat', 99: 'windsor_chair', 100: 'wrench', 101: 'yin_yang'} 20 | 21 | images_fname = 'images(128).npy' 22 | output_labels_fname = 'output_labels(128).npy' 23 | 24 | def make_nn_funs(layer_sizes, L2_reg): 25 | shapes = zip(layer_sizes[:-1], layer_sizes[1:]) 26 | N = sum((m+1)*n for m, n in shapes) 27 | 28 | def unpack_layers(W_vect): 29 | for m, n in shapes: 30 | yield W_vect[:m*n].reshape((m,n)), W_vect[m*n:m*n+n] 31 | W_vect = W_vect[(m+1)*n:] 32 | 33 | def predictions(W_vect, inputs): 34 | for W, b in unpack_layers(W_vect): 35 | outputs = np.dot(inputs, W) + b 36 | inputs = np.tanh(outputs) 37 | return outputs - logsumexp(outputs, axis=1, keepdims=True) 38 | 39 | def loss(W_vect, X, T): 40 | log_prior = -L2_reg * np.dot(W_vect, W_vect) 41 | log_lik = np.sum(predictions(W_vect, X) * T) 42 | return - log_prior - log_lik 43 | 44 | def frac_err(W_vect, X, T): 45 | return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1)) 46 | 47 | return N, predictions, loss, frac_err 48 | 49 | def convert_bw_to_rgb(im): 50 | im.resize((im.shape[0], im.shape[1], 1)) 51 | return np.repeat(im.astype(np.uint8), 3, 2) 52 | 53 | def standarizeImage(im): 54 | if len(im.shape) < 3: 55 | im = convert_bw_to_rgb(im) 56 | im = np.array(im, 'float32') 57 | if im.shape[0] != 64: 58 | im = imresize(im, (64, 64, 3)) 59 | if np.amax(im) > 1.1: 60 | im = im / 255.0 61 | assert((np.amax(im) > 0.01) & (np.amax(im) <= 1)) 62 | assert((np.amin(im) >= 0.00)) 63 | return im 64 | 65 | def gen_data(): 66 | category_paths = [f for f in listdir('101_ObjectCategories/')] 67 | image_paths = [f for f in listdir('101_ObjectCategories/menorah/') if 
isfile(join('101_ObjectCategories/menorah/', f))] 68 | 69 | images = [] 70 | output_labels = [] 71 | # Include all categories with mappings to the integer representing the category 72 | categories_dict = {} 73 | 74 | category = 0 75 | for category_path in category_paths: 76 | image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')] 77 | for image_path in image_paths: 78 | im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path)) 79 | if im.shape == (64, 64, 3): 80 | images.append(im) 81 | output_labels.append(category) 82 | categories_dict[category] = category_path 83 | category = category + 1 84 | 85 | images = np.array(images) 86 | partial_flatten = lambda x : np.reshape(x, (x.shape[0], np.prod(x.shape[1:]))) 87 | images = partial_flatten(images) 88 | 89 | np.save('images.npy', images) 90 | np.save('output_labels.npy', output_labels) 91 | 92 | def make_batches(N_data, batch_size): 93 | return [slice(i, min(i+batch_size, N_data)) 94 | for i in range(0, N_data, batch_size)] 95 | 96 | def load_caltech100(): 97 | # gen_data() 98 | one_hot = lambda x, K: np.array(x[:,None] == np.arange(K)[None, :], dtype=int) 99 | images = np.load(images_fname) 100 | output_labels = np.load(output_labels_fname) 101 | train_images, valid_images, train_labels, valid_labels = train_test_split(images, output_labels, test_size=0.20, random_state=1729) 102 | train_labels = one_hot(train_labels, 101) 103 | valid_labels = one_hot(valid_labels, 101) 104 | # import bpdb; bpdb.set_trace() 105 | return train_images, train_labels, valid_images, valid_labels 106 | 107 | if __name__ == '__main__': 108 | 109 | print(resource.getrusage(resource.RUSAGE_SELF)) 110 | process = psutil.Process(os.getpid()) 111 | print (process.memory_info().rss) 112 | 113 | # Load and process Caltech data 114 | train_images, train_labels, test_images, test_labels = load_caltech100() 115 | image_input_d = train_images.shape[1] 116 | 117 | # Network parameters 118 | 
layer_sizes = [image_input_d, 1500, 650, 101] 119 | L2_reg = 1.0 120 | 121 | # Training parameters 122 | param_scale = 0.1 123 | learning_rate = 1e-3 124 | momentum = 0.9 125 | batch_size = 256 126 | num_epochs = 50 127 | 128 | # Make neural net functions 129 | N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg) 130 | loss_grad = grad(loss_fun) 131 | 132 | # Initialize weights 133 | rs = npr.RandomState() 134 | W = rs.randn(N_weights) * param_scale 135 | 136 | # Check the gradients numerically, just to be safe 137 | # quick_grad_check(loss_fun, W, (train_images, train_labels)) 138 | 139 | print(" Epoch | Train err | Test err ") 140 | 141 | def print_perf(epoch, W): 142 | test_perf = frac_err(W, test_images, test_labels) 143 | train_perf = frac_err(W, train_images, train_labels) 144 | print("{0:15}|{1:15}|{2:15}".format(epoch, train_perf, test_perf)) 145 | 146 | # Train with sgd 147 | batch_idxs = make_batches(train_images.shape[0], batch_size) 148 | import bpdb; bpdb.set_trace() 149 | cur_dir = np.zeros(N_weights) 150 | 151 | for epoch in range(num_epochs): 152 | print_perf(epoch, W) 153 | for idxs in batch_idxs: 154 | grad_W = loss_grad(W, train_images[idxs], train_labels[idxs]) 155 | print('----------------------------') 156 | print(getsizeof(grad_W)) 157 | #print(process.memory_info().rss) 158 | #print(resource.getrusage(resource.RUSAGE_SELF)) 159 | gc.collect() 160 | #print(process.memory_info().rss) 161 | cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W 162 | W -= learning_rate * cur_dir -------------------------------------------------------------------------------- /python-python/nnet/__init__.py: -------------------------------------------------------------------------------- 1 | # Default python file required for initializing the module for 2 | # neural net class. More documentation included in the next file. 
# Set up a basic neural net, adapted from Ryan Adams's Autograd example:
# https://github.com/twitter/torch-autograd/blob/master/examples/train-mnist-cnn.lua
#
# We apply this model to the Caltech 101 dataset rather than the MNIST dataset
# to increase the difficulty of the task.

def make_nn_funs(layer_sizes, L2_reg):
    """Build a fully-connected net as functions over a flat weight vector.

    layer_sizes: list of layer widths, e.g. [input_d, h1, h2, n_classes].
    L2_reg: L2 penalty coefficient applied to the whole weight vector.

    Returns (N, predictions, loss, frac_err) where N is the total number
    of parameters and the three functions all take the flat weight vector
    as their first argument.
    """
    # Materialize the pairs: zip() is a one-shot iterator in Python 3, and
    # `shapes` is consumed once computing N and again in unpack_layers.
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    N = sum((m + 1) * n for m, n in shapes)

    def unpack_layers(W_vect):
        # Slice successive (W, b) pairs off the front of the flat vector.
        for m, n in shapes:
            yield W_vect[:m * n].reshape((m, n)), W_vect[m * n:m * n + n]
            W_vect = W_vect[(m + 1) * n:]

    def predictions(W_vect, inputs):
        # tanh hidden layers; final layer is log-softmax (normalized log-probs).
        for W, b in unpack_layers(W_vect):
            outputs = np.dot(inputs, W) + b
            inputs = np.tanh(outputs)
        return outputs - logsumexp(outputs, axis=1, keepdims=True)

    def loss(W_vect, X, T):
        # Negative (log prior + log likelihood); T is one-hot targets.
        log_prior = -L2_reg * np.dot(W_vect.T, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return - log_prior - log_lik

    def frac_err(W_vect, X, T):
        # Fraction of rows whose argmax prediction differs from the target.
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return N, predictions, loss, frac_err

def convert_bw_to_rgb(im):
    """Expand a 2-D grayscale image to 3 identical RGB channels."""
    im.resize((im.shape[0], im.shape[1], 1))
    return np.repeat(im.astype(np.uint8), 3, 2)

def standarizeImage(im):
    """Normalize an image to a 64x64x3 float32 array with values in (0, 1]."""
    if len(im.shape) < 3:
        im = convert_bw_to_rgb(im)
    im = np.array(im, 'float32')
    if im.shape[0] != 64:
        im = imresize(im, (64, 64, 3))
    # Heuristic: pixel values above 1.1 mean the image is still 0-255 scaled.
    if np.amax(im) > 1.1:
        im = im / 255.0
    assert((np.amax(im) > 0.01) & (np.amax(im) <= 1))
    assert((np.amin(im) >= 0.00))
    return im

def gen_data():
    """Read Caltech-101 images from disk, flatten them, and save .npy files."""
    category_paths = [f for f in listdir('101_ObjectCategories/')]

    images = []
    output_labels = []
    # Map integer category ids back to category directory names.
    categories_dict = {}

    category = 0
    for category_path in category_paths:
        image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')]
        for image_path in image_paths:
            im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path))
            if im.shape == (64, 64, 3):
                images.append(im)
                output_labels.append(category)
                categories_dict[category] = category_path
        category = category + 1

    images = np.array(images)
    # Flatten each image to one row: (n_images, 64*64*3).
    partial_flatten = lambda x: np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))
    images = partial_flatten(images)

    np.save('images(64).npy', images)
    np.save('output_labels(64).npy', output_labels)

def make_batches(N_data, batch_size):
    """Return a list of slices covering range(N_data) in batch_size steps."""
    return [slice(i, min(i + batch_size, N_data))
            for i in range(0, N_data, batch_size)]

def load_caltech100(images_fname, labels_fname):
    """Load saved image/label arrays and return an 80/20 train/valid split.

    Labels are returned one-hot encoded over 101 classes.
    """
    # if images(64).npy or output_labels(64).npy missing then
    #     print('Generating data because it does not exist. Note that this may take a while')
    #     gen_data()
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    images = np.load(images_fname)
    # (Previously this file loaded labels_fname twice; once is enough.)
    output_labels = np.load(labels_fname)
    # Fixed seed so every worker sees the same split.
    train_images, valid_images, train_labels, valid_labels = train_test_split(
        images, output_labels, test_size=0.20, random_state=1729)
    train_labels = one_hot(train_labels, 101)
    valid_labels = one_hot(valid_labels, 101)
    return train_images, train_labels, valid_images, valid_labels
5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | from grpc.beta import implementations 10 | import time 11 | import sys 12 | from threading import Thread 13 | 14 | import paxos_pb2 15 | import argparse 16 | import traceback 17 | 18 | import autograd.numpy as np 19 | import autograd.numpy.random as npr 20 | from autograd import grad 21 | import random 22 | 23 | from protobuf_utils.utils import * 24 | from server_utils.utils import * 25 | 26 | import subprocess 27 | 28 | _TIMEOUT_SECONDS = 4 29 | PAXOS_PORT_STR = 50052 30 | 31 | # Actual implementation of the PaxosServer that is used to communicate between the clients. 32 | # Paxos is called to determine the future main server from amongst many different clients. 33 | class PaxosServer(paxos_pb2.BetaPaxosServerServicer): 34 | def __init__(self, hostname): 35 | # Initial consensus value is none, this will be the server 36 | self.new_server = '' 37 | self.consensus_value = None 38 | self.consensus_reached = False 39 | 40 | # Values for paxos 41 | self.n = random.random() 42 | self.prop_n = 0 43 | self.v = '' 44 | self.n_v = 0 45 | 46 | # Exponential backoff to prevent spamming other servers 47 | # Randomness is introduced to help Paxos converge quicker 48 | self.backoff = (1 * random.gauss(1, 0.25)) 49 | if self.backoff < 0: 50 | self.backoff = 1 51 | 52 | # Saves the server's address as well 53 | self.address = hostname 54 | 55 | # Runs the prepare phase of the Paxos algorithm 56 | def prepare(self, request, context): 57 | # Update the highest seen proposal 58 | if request.n > self.prop_n: 59 | self.prop_n = request.n 60 | # Returns an acknowledgement containing highest accepted proposal 61 | return paxos_pb2.ack(n=self.n, v=self.v, n_v=self.n_v) 62 | 63 | # Accepts the proposal if it is higher than 64 | def accept(self, request, context): 65 | if request.n >= self.prop_n: 66 | self.n_v = request.n 67 | 
self.v = request.v 68 | self.n = request.n 69 | return paxos_pb2.acquiescence(accept_bool=True) 70 | else: 71 | return paxos_pb2.acquiescence(accept_bool=False) 72 | 73 | # Notifies the server that consensus has been reached 74 | def accepted(self, request, context): 75 | self.consensus_reached = True 76 | self.new_server = request.v 77 | return paxos_pb2.empty() 78 | 79 | # Ping function to allow confirmation between PaxosServer that they 80 | # are still running 81 | def ping(self, request, context): 82 | return paxos_pb2.empty() 83 | 84 | # Runs the PaxosServer. Checks periodically to see if a consensus has 85 | # been reached. 86 | def run_server(server, paxos_server): 87 | server.start() 88 | while True: 89 | time.sleep(0.1) 90 | try: 91 | if paxos_server.consensus_reached: 92 | if paxos_server.new_server != '': 93 | log_info('Consensus reached, server shutting down') 94 | # Wait briefly for the consensus message to propogate out 95 | time.sleep(5) 96 | server.stop(0) 97 | break 98 | time.sleep(1) 99 | except KeyboardInterrupt: 100 | server.stop(0) 101 | 102 | # Actually instantiates the Paxos Server according to a defined port 103 | def create_server(hostname, local_id): 104 | # Allow argument that allows this parameter to be changsed 105 | paxos_server = PaxosServer(hostname) 106 | server = paxos_pb2.beta_create_PaxosServer_server(paxos_server) 107 | if local_id is None: 108 | server.add_insecure_port(hostname + ':' + str(PAXOS_PORT_STR)) 109 | else: 110 | server.add_insecure_port(hostname) 111 | return paxos_server, server 112 | 113 | # Attempts to send proposals to all the other servers 114 | def send_proposals(server_stubs, self_paxos_server): 115 | # Increments the proposal number from the previous one that it sends out 116 | self_paxos_server.n = self_paxos_server.n * (1 + random.random()) 117 | self_paxos_server.v = self_paxos_server.address 118 | n_proposal = self_paxos_server.n 119 | value = self_paxos_server.address 120 | log_info('Making a proposal 
from {0} for n = {1} '.format(self_paxos_server.address, n_proposal)) 121 | 122 | # Track the failures of the proposals 123 | n_so_far = 0 124 | failed = False 125 | responded = 0 126 | 127 | for server_stub in server_stubs: 128 | # Makes the connection to the server 129 | try: 130 | # gRPC call to other Paxos Servers to see if they acceept the proposal 131 | response = server_stub.prepare(paxos_pb2.proposal(n=n_proposal), _TIMEOUT_SECONDS) 132 | 133 | # Sees a higher n value then it's current value and immediately stops the process 134 | if response.n >= n_proposal: 135 | failed = True 136 | log_info('Proposal ' + str(n_proposal) + ' failed') 137 | break 138 | else: 139 | # If the response is positive, then it notes the positive response 140 | if response.n_v > n_so_far: 141 | n_so_far = response.n 142 | value = response.v 143 | responded += 1 144 | except Exception as e: 145 | if ('ExpirationError' in str(e)): 146 | log_info('Failure to connect to server_stub') 147 | continue 148 | else: 149 | # More severe error, should log and crash 150 | traceback.print_exc() 151 | sys.exit(1) 152 | 153 | # No proposals have been sent so far, suggests its own IP 154 | if value is None: 155 | value = self_paxos_server.address 156 | 157 | # If it does not have a majority of responses, Paxos fails 158 | if responded < len(server_stubs) / 2.0: 159 | failed = True 160 | 161 | return(failed, n_proposal, value) 162 | 163 | # Requests that the other Paxos Server accepts the proposal 164 | def request_accept(server_stubs, self_paxos_server, n_proposal, value): 165 | accepted = 0 166 | for stub in server_stubs: 167 | try: 168 | response = stub.accept(paxos_pb2.request_acceptance(n=n_proposal, v=value), _TIMEOUT_SECONDS) 169 | except Exception as e: 170 | traceback.print_exc() 171 | return False 172 | if response.accept_bool: 173 | accepted += 1 174 | 175 | # If the majority accept the proposal, then it passes 176 | if accepted > len(server_stubs) / 2.0: 177 | log_info('Proposal 
accepted') 178 | return True 179 | else: 180 | log_info('Proposal {0} rejected with value {1}'.format(n_proposal, value)) 181 | return False 182 | 183 | # Checks to ensure that all the stubs are currently available by pinging them 184 | # If more than half of them are available, it begins Paxos. Otherwise, it waits. 185 | def check_stubs_up(stubs): 186 | responses = 0 187 | for stub in stubs: 188 | try: 189 | response = stub.ping(paxos_pb2.empty(), _TIMEOUT_SECONDS) 190 | responses += 1 191 | except Exception as e: 192 | if ('ExpirationError' in str(e)): 193 | log_info('Failure to connect to server_stub during startup') 194 | continue 195 | else: 196 | # More severe error, should log and crash 197 | traceback.print_exc() 198 | sys.exit(1) 199 | if responses < len(stubs) / 2: 200 | return False 201 | else: 202 | return True 203 | 204 | # Make sure that all machines are aware that the Paxos algorithm is finishing 205 | # Not all machines are aware that the server has failed at the same time. Could 206 | # be in the middle of calculating gradients or waiting to be timed out. 
207 | def gen_server_stubs(self_paxos_server, local_id): 208 | TOT_ATTEMPTS = 3 209 | for i in range(TOT_ATTEMPTS): 210 | server_addresses = gen_server_addresses(local_id, self_paxos_server.address) 211 | print(server_addresses) 212 | server_addresses.remove(self_paxos_server.address) 213 | stubs = [] 214 | for server_address in server_addresses: 215 | if not self_paxos_server.consensus_reached: 216 | if local_id is not None: 217 | server_port = int(server_address[-5:]) 218 | channel = implementations.insecure_channel('localhost', server_port) 219 | else: 220 | channel = implementations.insecure_channel(server_address, PAXOS_PORT_STR) 221 | 222 | stub = paxos_pb2.beta_create_PaxosServer_stub(channel) 223 | stubs.append(stub) 224 | all_stubs_responsive = check_stubs_up(stubs) 225 | if all_stubs_responsive: 226 | return stubs 227 | time.sleep(1 * TOT_ATTEMPTS) 228 | return None 229 | 230 | # Sends to all servers that consensus was reached and a server was chosen. 231 | def broadcast_consensus(server_stubs, self_paxos_server, value): 232 | for stub in server_stubs: 233 | response = stub.accepted(paxos_pb2.consensus(n=self_paxos_server.n, v=value), 2 * _TIMEOUT_SECONDS) 234 | 235 | # Begins the Paxos protocol 236 | def start_paxos(server_stubs, self_paxos_server): 237 | proposal_failed, n_proposal, value = send_proposals(server_stubs, self_paxos_server) 238 | if not proposal_failed and not self_paxos_server.consensus_reached: 239 | # Have everyone accept the proposal 240 | accepted = request_accept(server_stubs, self_paxos_server, n_proposal, value) 241 | if accepted and not self_paxos_server.consensus_reached: 242 | # If accepted, let everyone know that the server has been chosen 243 | broadcast_consensus(server_stubs, self_paxos_server, value) 244 | self_paxos_server.new_server = value 245 | self_paxos_server.consensus_reached = True 246 | return True 247 | 248 | # If proposal failed, backoff to try again later 249 | self_paxos_server.backoff = 
self_paxos_server.backoff * (1 + 10 * random.random()) 250 | return False 251 | 252 | # Client loops and runs the paxos algorithm every few seconds 253 | def paxos_loop(self_paxos_server, local_id): 254 | time_slept = 0 255 | send_proposal_time = self_paxos_server.backoff 256 | 257 | while not self_paxos_server.consensus_reached: 258 | time.sleep(0.1) 259 | time_slept += 0.1 260 | 261 | # Send a proposal at allocated time 262 | if time_slept > send_proposal_time and not self_paxos_server.consensus_reached: 263 | time.sleep(random.random()) 264 | server_stubs = gen_server_stubs(self_paxos_server, local_id) 265 | if server_stubs is None: 266 | self_paxos_server.new_server = '' 267 | break 268 | start_paxos(server_stubs, self_paxos_server) 269 | send_proposal_time = (random.gauss(1, 0.25) * self_paxos_server.backoff) 270 | time_slept = 0 271 | 272 | # If proposal fails, revert to checking for a server 273 | if send_proposal_time > 60: 274 | self_paxos_server.consensus_reached = True 275 | self_paxos_server.consensus_value = '' 276 | break 277 | 278 | # This is the final function that exterior functions like client.py will call 279 | def run_paxos(local_id=None): 280 | # Generates the host name 281 | hostname = gen_local_address(local_id) 282 | log_info(hostname + ' called to run Paxos for determining the server') 283 | 284 | # Generates the server 285 | paxos_server, server = create_server(hostname, local_id) 286 | try: 287 | # Launch the server on a separate thread 288 | Thread(target=run_server, args=(server,paxos_server,)).start() 289 | start_paxos = time.time() 290 | 291 | # Begin to run Paxos 292 | paxos_loop(paxos_server, local_id) 293 | if paxos_server.new_server != '': 294 | log_info('Done, new server is: {0} finished paxos in {1:2}s'.format(paxos_server.new_server, time.time()-start_paxos)) 295 | else: 296 | # New server is empty only when a suitable server was not found after a predefined amount of time 297 | log_info('Failure to connect to other allocated 
instances. Stopping paxos.') 298 | except KeyboardInterrupt: 299 | sys.exit(0) 300 | finally: 301 | paxos_server.consensus_reached = True 302 | server.stop(0) 303 | return paxos_server.new_server 304 | 305 | if __name__ == '__main__': 306 | parser = argparse.ArgumentParser() 307 | parser.add_argument('--id') 308 | args = parser.parse_args() 309 | local_id = args.id 310 | if local_id is not None: 311 | local_id = int(local_id) 312 | assert(local_id > 0) 313 | log_info(run_paxos(local_id)) 314 | -------------------------------------------------------------------------------- /python-python/paxos_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: paxos.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='paxos.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x0bpaxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 
\x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _ACK = _descriptor.Descriptor( 30 | name='ack', 31 | full_name='dist_sgd.ack', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='n', full_name='dist_sgd.ack.n', index=0, 38 | number=1, type=2, cpp_type=6, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='v', full_name='dist_sgd.ack.v', index=1, 45 | number=2, type=9, cpp_type=9, label=1, 46 | has_default_value=False, default_value=_b("").decode('utf-8'), 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2, 52 | number=3, type=2, cpp_type=6, label=1, 53 | has_default_value=False, default_value=0, 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | ], 58 | extensions=[ 59 | ], 60 | nested_types=[], 61 | enum_types=[ 62 | ], 63 | options=None, 64 | is_extendable=False, 65 | syntax='proto3', 66 | extension_ranges=[], 67 | oneofs=[ 68 | ], 69 | serialized_start=25, 70 | serialized_end=65, 71 | ) 72 | 73 | 74 | _PROPOSAL = _descriptor.Descriptor( 75 | name='proposal', 76 | full_name='dist_sgd.proposal', 77 | 
filename=None, 78 | file=DESCRIPTOR, 79 | containing_type=None, 80 | fields=[ 81 | _descriptor.FieldDescriptor( 82 | name='n', full_name='dist_sgd.proposal.n', index=0, 83 | number=1, type=2, cpp_type=6, label=1, 84 | has_default_value=False, default_value=0, 85 | message_type=None, enum_type=None, containing_type=None, 86 | is_extension=False, extension_scope=None, 87 | options=None), 88 | ], 89 | extensions=[ 90 | ], 91 | nested_types=[], 92 | enum_types=[ 93 | ], 94 | options=None, 95 | is_extendable=False, 96 | syntax='proto3', 97 | extension_ranges=[], 98 | oneofs=[ 99 | ], 100 | serialized_start=67, 101 | serialized_end=88, 102 | ) 103 | 104 | 105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor( 106 | name='request_acceptance', 107 | full_name='dist_sgd.request_acceptance', 108 | filename=None, 109 | file=DESCRIPTOR, 110 | containing_type=None, 111 | fields=[ 112 | _descriptor.FieldDescriptor( 113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0, 114 | number=1, type=2, cpp_type=6, label=1, 115 | has_default_value=False, default_value=0, 116 | message_type=None, enum_type=None, containing_type=None, 117 | is_extension=False, extension_scope=None, 118 | options=None), 119 | _descriptor.FieldDescriptor( 120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1, 121 | number=2, type=9, cpp_type=9, label=1, 122 | has_default_value=False, default_value=_b("").decode('utf-8'), 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=90, 139 | serialized_end=132, 140 | ) 141 | 142 | 143 | _ACQUIESCENCE = _descriptor.Descriptor( 144 | name='acquiescence', 145 | full_name='dist_sgd.acquiescence', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | 
containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0, 152 | number=1, type=8, cpp_type=7, label=1, 153 | has_default_value=False, default_value=False, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | ], 158 | extensions=[ 159 | ], 160 | nested_types=[], 161 | enum_types=[ 162 | ], 163 | options=None, 164 | is_extendable=False, 165 | syntax='proto3', 166 | extension_ranges=[], 167 | oneofs=[ 168 | ], 169 | serialized_start=134, 170 | serialized_end=169, 171 | ) 172 | 173 | 174 | _CONSENSUS = _descriptor.Descriptor( 175 | name='consensus', 176 | full_name='dist_sgd.consensus', 177 | filename=None, 178 | file=DESCRIPTOR, 179 | containing_type=None, 180 | fields=[ 181 | _descriptor.FieldDescriptor( 182 | name='n', full_name='dist_sgd.consensus.n', index=0, 183 | number=1, type=2, cpp_type=6, label=1, 184 | has_default_value=False, default_value=0, 185 | message_type=None, enum_type=None, containing_type=None, 186 | is_extension=False, extension_scope=None, 187 | options=None), 188 | _descriptor.FieldDescriptor( 189 | name='v', full_name='dist_sgd.consensus.v', index=1, 190 | number=2, type=9, cpp_type=9, label=1, 191 | has_default_value=False, default_value=_b("").decode('utf-8'), 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | ], 196 | extensions=[ 197 | ], 198 | nested_types=[], 199 | enum_types=[ 200 | ], 201 | options=None, 202 | is_extendable=False, 203 | syntax='proto3', 204 | extension_ranges=[], 205 | oneofs=[ 206 | ], 207 | serialized_start=171, 208 | serialized_end=204, 209 | ) 210 | 211 | 212 | _EMPTY = _descriptor.Descriptor( 213 | name='empty', 214 | full_name='dist_sgd.empty', 215 | filename=None, 216 | file=DESCRIPTOR, 217 | containing_type=None, 218 | fields=[ 219 | ], 220 | 
extensions=[ 221 | ], 222 | nested_types=[], 223 | enum_types=[ 224 | ], 225 | options=None, 226 | is_extendable=False, 227 | syntax='proto3', 228 | extension_ranges=[], 229 | oneofs=[ 230 | ], 231 | serialized_start=206, 232 | serialized_end=213, 233 | ) 234 | 235 | DESCRIPTOR.message_types_by_name['ack'] = _ACK 236 | DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL 237 | DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE 238 | DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE 239 | DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS 240 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 241 | 242 | ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict( 243 | DESCRIPTOR = _ACK, 244 | __module__ = 'paxos_pb2' 245 | # @@protoc_insertion_point(class_scope:dist_sgd.ack) 246 | )) 247 | _sym_db.RegisterMessage(ack) 248 | 249 | proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict( 250 | DESCRIPTOR = _PROPOSAL, 251 | __module__ = 'paxos_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.proposal) 253 | )) 254 | _sym_db.RegisterMessage(proposal) 255 | 256 | request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict( 257 | DESCRIPTOR = _REQUEST_ACCEPTANCE, 258 | __module__ = 'paxos_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance) 260 | )) 261 | _sym_db.RegisterMessage(request_acceptance) 262 | 263 | acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict( 264 | DESCRIPTOR = _ACQUIESCENCE, 265 | __module__ = 'paxos_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence) 267 | )) 268 | _sym_db.RegisterMessage(acquiescence) 269 | 270 | consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict( 271 | DESCRIPTOR = _CONSENSUS, 272 | __module__ = 'paxos_pb2' 273 | # 
@@protoc_insertion_point(class_scope:dist_sgd.consensus) 274 | )) 275 | _sym_db.RegisterMessage(consensus) 276 | 277 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 278 | DESCRIPTOR = _EMPTY, 279 | __module__ = 'paxos_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 281 | )) 282 | _sym_db.RegisterMessage(empty) 283 | 284 | 285 | DESCRIPTOR.has_options = True 286 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 287 | import abc 288 | import six 289 | from grpc.beta import implementations as beta_implementations 290 | from grpc.framework.common import cardinality 291 | from grpc.framework.interfaces.face import utilities as face_utilities 292 | 293 | class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)): 294 | """""" 295 | @abc.abstractmethod 296 | def prepare(self, request, context): 297 | raise NotImplementedError() 298 | @abc.abstractmethod 299 | def accept(self, request, context): 300 | raise NotImplementedError() 301 | @abc.abstractmethod 302 | def accepted(self, request, context): 303 | raise NotImplementedError() 304 | @abc.abstractmethod 305 | def ping(self, request, context): 306 | raise NotImplementedError() 307 | 308 | class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)): 309 | """The interface to which stubs will conform.""" 310 | @abc.abstractmethod 311 | def prepare(self, request, timeout): 312 | raise NotImplementedError() 313 | prepare.future = None 314 | @abc.abstractmethod 315 | def accept(self, request, timeout): 316 | raise NotImplementedError() 317 | accept.future = None 318 | @abc.abstractmethod 319 | def accepted(self, request, timeout): 320 | raise NotImplementedError() 321 | accepted.future = None 322 | @abc.abstractmethod 323 | def ping(self, request, timeout): 324 | raise NotImplementedError() 325 | ping.future = None 326 | 327 | def beta_create_PaxosServer_server(servicer, 
pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 328 | import paxos_pb2 329 | import paxos_pb2 330 | import paxos_pb2 331 | import paxos_pb2 332 | import paxos_pb2 333 | import paxos_pb2 334 | import paxos_pb2 335 | import paxos_pb2 336 | request_deserializers = { 337 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.FromString, 338 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.FromString, 339 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString, 340 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.FromString, 341 | } 342 | response_serializers = { 343 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.SerializeToString, 344 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.SerializeToString, 345 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString, 346 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.SerializeToString, 347 | } 348 | method_implementations = { 349 | ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept), 350 | ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted), 351 | ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 352 | ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare), 353 | } 354 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 355 | return beta_implementations.server(method_implementations, options=server_options) 356 | 357 | def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 358 | import paxos_pb2 359 | import paxos_pb2 360 | import paxos_pb2 361 | import paxos_pb2 362 | import paxos_pb2 363 | import paxos_pb2 364 | import paxos_pb2 
365 | import paxos_pb2 366 | request_serializers = { 367 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.SerializeToString, 368 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.SerializeToString, 369 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString, 370 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.SerializeToString, 371 | } 372 | response_deserializers = { 373 | ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.FromString, 374 | ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.FromString, 375 | ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString, 376 | ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.FromString, 377 | } 378 | cardinalities = { 379 | 'accept': cardinality.Cardinality.UNARY_UNARY, 380 | 'accepted': cardinality.Cardinality.UNARY_UNARY, 381 | 'ping': cardinality.Cardinality.UNARY_UNARY, 382 | 'prepare': cardinality.Cardinality.UNARY_UNARY, 383 | } 384 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 385 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options) 386 | # @@protoc_insertion_point(module_scope) 387 | -------------------------------------------------------------------------------- /python-python/protobuf_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Default python file required for initializing the module for 2 | # protobuffer utilities. More documentation included in the next file. 
def convert_array_to_bytes(params):
    """Serialize a numpy parameter vector to raw bytes.

    float64 input is downcast to float32 first so that the wire format is
    always 4-byte floats (convert_bytes_to_array assumes float32).
    """
    if (params.dtype == np.float64):
        params = params.astype(np.float32)
    # tobytes() is the non-deprecated spelling of tostring().
    param_bytes = params.tobytes()
    return param_bytes

def convert_bytes_to_array(param_bytes):
    """Inverse of convert_array_to_bytes: bytes -> 1-D float32 array."""
    # frombuffer() replaces the deprecated fromstring(); it returns a
    # read-only view, so copy() to keep the result writable as before.
    params = np.frombuffer(param_bytes, dtype=np.float32).copy()
    return params

def convert_tensor_iter(tensor_bytes, data_indx):
    """Yield dist_sgd_pb2.SubTensor messages chunking tensor_bytes.

    Each chunk carries the total byte length, a 1-based chunk counter, the
    chunk payload, and the batch index (used server-side for staleness).
    """
    # NOTE(review): 524228 is probably a typo for 524288 (512 KiB); kept
    # as-is since both ends only rely on concatenation order — confirm.
    CHUNK_SIZE = 524228
    tensor_bytes_len = len(tensor_bytes)
    tensor_chunk_count = 0
    while len(tensor_bytes):
        tensor_chunk_count += 1
        tensor_content = tensor_bytes[:CHUNK_SIZE]
        tensor_bytes = tensor_bytes[CHUNK_SIZE:]
        yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx = data_indx)
-------------------------------------------------------------------------------- 1 | //Protocol buffers for project 2 | 3 | syntax = "proto3"; 4 | 5 | package dist_sgd; 6 | 7 | option java_multiple_files = true; 8 | option java_package = "io.dist_sgd"; 9 | option java_outer_classname = "DistSGD"; 10 | //option objc_class_prefix = "DSG"; 11 | 12 | // Main server for passing infromation around 13 | service ParamFeeder { 14 | // Sends the parameters back and forth between server and client 15 | rpc SendParams (ClientInfo) returns (stream SubTensor) {} 16 | 17 | // Sends information about the next batch 18 | rpc SendNextBatch (PrevBatch) returns (NextBatch) {} 19 | 20 | // Gets gardient updates from client servers 21 | rpc GetUpdates (stream SubTensor) returns (StatusCode) {} 22 | 23 | // This call simply makes sure that all machines have begun to run Paxos. 24 | rpc ping (empty) returns (empty) {} 25 | 26 | } 27 | 28 | message SubTensor { 29 | // Length of the tensor getting passed 30 | int32 tensor_len = 1; 31 | 32 | // Current chunk of the tensor 33 | int32 tensor_chunk = 2; 34 | 35 | // Serialized tensor getting passed 36 | bytes tensor_content = 3; 37 | 38 | // Batch for gradient update, used to determine whether or not 39 | // the gradient is stale and should be thrown out 40 | int32 data_indx = 4; 41 | } 42 | 43 | // Later on we can extend client info to include information about processing speed, etc. 
44 | message ClientInfo { 45 | int32 client_id = 1; 46 | } 47 | 48 | // Includes information about sucesss and failure 49 | message StatusCode { 50 | int32 status = 1; 51 | } 52 | 53 | message PrevBatch { 54 | int32 client_id = 1; 55 | 56 | int32 prev_data_indx = 2; 57 | } 58 | 59 | message NextBatch { 60 | int32 client_id = 1; 61 | 62 | int32 data_indx = 2; 63 | } 64 | 65 | message empty {} -------------------------------------------------------------------------------- /python-python/protos/dist_sgd_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: protos/dist_sgd.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='protos/dist_sgd.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x15protos/dist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 
\x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _SUBTENSOR = _descriptor.Descriptor( 30 | name='SubTensor', 31 | full_name='dist_sgd.SubTensor', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0, 38 | number=1, type=5, cpp_type=1, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1, 45 | number=2, type=5, cpp_type=1, label=1, 46 | has_default_value=False, default_value=0, 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2, 52 | number=3, type=12, cpp_type=9, label=1, 53 | has_default_value=False, default_value=_b(""), 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | _descriptor.FieldDescriptor( 58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3, 59 | number=4, type=5, cpp_type=1, label=1, 60 | has_default_value=False, default_value=0, 61 | message_type=None, enum_type=None, containing_type=None, 62 | is_extension=False, 
extension_scope=None, 63 | options=None), 64 | ], 65 | extensions=[ 66 | ], 67 | nested_types=[], 68 | enum_types=[ 69 | ], 70 | options=None, 71 | is_extendable=False, 72 | syntax='proto3', 73 | extension_ranges=[], 74 | oneofs=[ 75 | ], 76 | serialized_start=35, 77 | serialized_end=131, 78 | ) 79 | 80 | 81 | _CLIENTINFO = _descriptor.Descriptor( 82 | name='ClientInfo', 83 | full_name='dist_sgd.ClientInfo', 84 | filename=None, 85 | file=DESCRIPTOR, 86 | containing_type=None, 87 | fields=[ 88 | _descriptor.FieldDescriptor( 89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0, 90 | number=1, type=5, cpp_type=1, label=1, 91 | has_default_value=False, default_value=0, 92 | message_type=None, enum_type=None, containing_type=None, 93 | is_extension=False, extension_scope=None, 94 | options=None), 95 | ], 96 | extensions=[ 97 | ], 98 | nested_types=[], 99 | enum_types=[ 100 | ], 101 | options=None, 102 | is_extendable=False, 103 | syntax='proto3', 104 | extension_ranges=[], 105 | oneofs=[ 106 | ], 107 | serialized_start=133, 108 | serialized_end=164, 109 | ) 110 | 111 | 112 | _STATUSCODE = _descriptor.Descriptor( 113 | name='StatusCode', 114 | full_name='dist_sgd.StatusCode', 115 | filename=None, 116 | file=DESCRIPTOR, 117 | containing_type=None, 118 | fields=[ 119 | _descriptor.FieldDescriptor( 120 | name='status', full_name='dist_sgd.StatusCode.status', index=0, 121 | number=1, type=5, cpp_type=1, label=1, 122 | has_default_value=False, default_value=0, 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=166, 139 | serialized_end=194, 140 | ) 141 | 142 | 143 | _PREVBATCH = _descriptor.Descriptor( 144 | name='PrevBatch', 145 | 
full_name='dist_sgd.PrevBatch', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0, 152 | number=1, type=5, cpp_type=1, label=1, 153 | has_default_value=False, default_value=0, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | _descriptor.FieldDescriptor( 158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1, 159 | number=2, type=5, cpp_type=1, label=1, 160 | has_default_value=False, default_value=0, 161 | message_type=None, enum_type=None, containing_type=None, 162 | is_extension=False, extension_scope=None, 163 | options=None), 164 | ], 165 | extensions=[ 166 | ], 167 | nested_types=[], 168 | enum_types=[ 169 | ], 170 | options=None, 171 | is_extendable=False, 172 | syntax='proto3', 173 | extension_ranges=[], 174 | oneofs=[ 175 | ], 176 | serialized_start=196, 177 | serialized_end=250, 178 | ) 179 | 180 | 181 | _NEXTBATCH = _descriptor.Descriptor( 182 | name='NextBatch', 183 | full_name='dist_sgd.NextBatch', 184 | filename=None, 185 | file=DESCRIPTOR, 186 | containing_type=None, 187 | fields=[ 188 | _descriptor.FieldDescriptor( 189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0, 190 | number=1, type=5, cpp_type=1, label=1, 191 | has_default_value=False, default_value=0, 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | _descriptor.FieldDescriptor( 196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1, 197 | number=2, type=5, cpp_type=1, label=1, 198 | has_default_value=False, default_value=0, 199 | message_type=None, enum_type=None, containing_type=None, 200 | is_extension=False, extension_scope=None, 201 | options=None), 202 | ], 203 | extensions=[ 204 | ], 205 | 
nested_types=[], 206 | enum_types=[ 207 | ], 208 | options=None, 209 | is_extendable=False, 210 | syntax='proto3', 211 | extension_ranges=[], 212 | oneofs=[ 213 | ], 214 | serialized_start=252, 215 | serialized_end=301, 216 | ) 217 | 218 | 219 | _EMPTY = _descriptor.Descriptor( 220 | name='empty', 221 | full_name='dist_sgd.empty', 222 | filename=None, 223 | file=DESCRIPTOR, 224 | containing_type=None, 225 | fields=[ 226 | ], 227 | extensions=[ 228 | ], 229 | nested_types=[], 230 | enum_types=[ 231 | ], 232 | options=None, 233 | is_extendable=False, 234 | syntax='proto3', 235 | extension_ranges=[], 236 | oneofs=[ 237 | ], 238 | serialized_start=303, 239 | serialized_end=310, 240 | ) 241 | 242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR 243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO 244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE 245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH 246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH 247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 248 | 249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict( 250 | DESCRIPTOR = _SUBTENSOR, 251 | __module__ = 'protos.dist_sgd_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor) 253 | )) 254 | _sym_db.RegisterMessage(SubTensor) 255 | 256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict( 257 | DESCRIPTOR = _CLIENTINFO, 258 | __module__ = 'protos.dist_sgd_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo) 260 | )) 261 | _sym_db.RegisterMessage(ClientInfo) 262 | 263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict( 264 | DESCRIPTOR = _STATUSCODE, 265 | __module__ = 'protos.dist_sgd_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode) 267 | )) 268 | _sym_db.RegisterMessage(StatusCode) 269 | 270 | PrevBatch = 
_reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict( 271 | DESCRIPTOR = _PREVBATCH, 272 | __module__ = 'protos.dist_sgd_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch) 274 | )) 275 | _sym_db.RegisterMessage(PrevBatch) 276 | 277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict( 278 | DESCRIPTOR = _NEXTBATCH, 279 | __module__ = 'protos.dist_sgd_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch) 281 | )) 282 | _sym_db.RegisterMessage(NextBatch) 283 | 284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 285 | DESCRIPTOR = _EMPTY, 286 | __module__ = 'protos.dist_sgd_pb2' 287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 288 | )) 289 | _sym_db.RegisterMessage(empty) 290 | 291 | 292 | DESCRIPTOR.has_options = True 293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 294 | import abc 295 | import six 296 | from grpc.beta import implementations as beta_implementations 297 | from grpc.framework.common import cardinality 298 | from grpc.framework.interfaces.face import utilities as face_utilities 299 | 300 | class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)): 301 | """""" 302 | @abc.abstractmethod 303 | def SendParams(self, request, context): 304 | raise NotImplementedError() 305 | @abc.abstractmethod 306 | def SendNextBatch(self, request, context): 307 | raise NotImplementedError() 308 | @abc.abstractmethod 309 | def GetUpdates(self, request_iterator, context): 310 | raise NotImplementedError() 311 | @abc.abstractmethod 312 | def ping(self, request, context): 313 | raise NotImplementedError() 314 | 315 | class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)): 316 | """The interface to which stubs will conform.""" 317 | @abc.abstractmethod 318 | def SendParams(self, request, timeout): 319 | raise 
NotImplementedError() 320 | @abc.abstractmethod 321 | def SendNextBatch(self, request, timeout): 322 | raise NotImplementedError() 323 | SendNextBatch.future = None 324 | @abc.abstractmethod 325 | def GetUpdates(self, request_iterator, timeout): 326 | raise NotImplementedError() 327 | GetUpdates.future = None 328 | @abc.abstractmethod 329 | def ping(self, request, timeout): 330 | raise NotImplementedError() 331 | ping.future = None 332 | 333 | def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 334 | import protos.dist_sgd_pb2 335 | import protos.dist_sgd_pb2 336 | import protos.dist_sgd_pb2 337 | import protos.dist_sgd_pb2 338 | import protos.dist_sgd_pb2 339 | import protos.dist_sgd_pb2 340 | import protos.dist_sgd_pb2 341 | import protos.dist_sgd_pb2 342 | request_deserializers = { 343 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.FromString, 344 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.FromString, 345 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.FromString, 346 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString, 347 | } 348 | response_serializers = { 349 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.SerializeToString, 350 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.SerializeToString, 351 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.SerializeToString, 352 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString, 353 | } 354 | method_implementations = { 355 | ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates), 356 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch), 357 | ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams), 358 | 
('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 359 | } 360 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 361 | return beta_implementations.server(method_implementations, options=server_options) 362 | 363 | def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 364 | import protos.dist_sgd_pb2 365 | import protos.dist_sgd_pb2 366 | import protos.dist_sgd_pb2 367 | import protos.dist_sgd_pb2 368 | import protos.dist_sgd_pb2 369 | import protos.dist_sgd_pb2 370 | import protos.dist_sgd_pb2 371 | import protos.dist_sgd_pb2 372 | request_serializers = { 373 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.SerializeToString, 374 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.SerializeToString, 375 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.SerializeToString, 376 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString, 377 | } 378 | response_deserializers = { 379 | ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.FromString, 380 | ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.FromString, 381 | ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.FromString, 382 | ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString, 383 | } 384 | cardinalities = { 385 | 'GetUpdates': cardinality.Cardinality.STREAM_UNARY, 386 | 'SendNextBatch': cardinality.Cardinality.UNARY_UNARY, 387 | 'SendParams': cardinality.Cardinality.UNARY_STREAM, 388 | 'ping': cardinality.Cardinality.UNARY_UNARY, 389 | } 390 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, 
request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 391 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, options=stub_options) 392 | # @@protoc_insertion_point(module_scope) 393 | -------------------------------------------------------------------------------- /python-python/protos/paxos.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto3"; 2 | 3 | package dist_sgd; 4 | 5 | option java_multiple_files = true; 6 | option java_package = "io.dist_sgd"; 7 | option java_outer_classname = "DistSGD"; 8 | 9 | // After getting a majority of proposals without getting rejected, the 10 | // machine chooses an IP from the most recent acknowledgements or one 11 | // that it generates itself and calls accept on all servers. 12 | 13 | // The servers return whether or not they accept. If majority of servers accept, 14 | // then it calls consensus on all servers and sends them the decided upon IP address 15 | // that is server 16 | 17 | // TECHNICALLY, each machine should call consensus 18 | 19 | // Main server for running the Paxos Algorithm. Everyone hosts this server on 20 | // their localhost. Used for sending and receiving messages for coordinating Paxos. 21 | service PaxosServer { 22 | 23 | // The machine sends each server a proposal. The server then 24 | // sends an acknowledgement accepting or rejecting the proposal. 25 | rpc prepare(proposal) returns (ack) {} 26 | 27 | // Requests that people accept the proposal 28 | rpc accept(request_acceptance) returns (acquiescence) {} 29 | 30 | // Notified that consensus has been achieved about a server 31 | // Technically each server should broadcast that it accepted the consensus 32 | rpc accepted (consensus) returns (empty) {} 33 | 34 | // This call simply makes sure that all machines have begun to run Paxos. 
35 | rpc ping (empty) returns (empty) {} 36 | } 37 | 38 | message ack { 39 | float n = 1; 40 | string v = 2; 41 | float n_v = 3; 42 | } 43 | 44 | message proposal { 45 | float n = 1; 46 | } 47 | 48 | message request_acceptance{ 49 | float n = 1; 50 | string v = 2; 51 | } 52 | 53 | message acquiescence { 54 | bool accept_bool = 1; 55 | } 56 | 57 | message consensus { 58 | float n = 1; 59 | string v = 2; 60 | } 61 | 62 | message empty { 63 | } -------------------------------------------------------------------------------- /python-python/protos/paxos_pb2.py: -------------------------------------------------------------------------------- 1 | # Generated by the protocol buffer compiler. DO NOT EDIT! 2 | # source: protos/paxos.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='protos/paxos.proto', 20 | package='dist_sgd', 21 | syntax='proto3', 22 | serialized_pb=_b('\n\x12protos/paxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 
\x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3') 23 | ) 24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 25 | 26 | 27 | 28 | 29 | _ACK = _descriptor.Descriptor( 30 | name='ack', 31 | full_name='dist_sgd.ack', 32 | filename=None, 33 | file=DESCRIPTOR, 34 | containing_type=None, 35 | fields=[ 36 | _descriptor.FieldDescriptor( 37 | name='n', full_name='dist_sgd.ack.n', index=0, 38 | number=1, type=2, cpp_type=6, label=1, 39 | has_default_value=False, default_value=0, 40 | message_type=None, enum_type=None, containing_type=None, 41 | is_extension=False, extension_scope=None, 42 | options=None), 43 | _descriptor.FieldDescriptor( 44 | name='v', full_name='dist_sgd.ack.v', index=1, 45 | number=2, type=9, cpp_type=9, label=1, 46 | has_default_value=False, default_value=_b("").decode('utf-8'), 47 | message_type=None, enum_type=None, containing_type=None, 48 | is_extension=False, extension_scope=None, 49 | options=None), 50 | _descriptor.FieldDescriptor( 51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2, 52 | number=3, type=2, cpp_type=6, label=1, 53 | has_default_value=False, default_value=0, 54 | message_type=None, enum_type=None, containing_type=None, 55 | is_extension=False, extension_scope=None, 56 | options=None), 57 | ], 58 | extensions=[ 59 | ], 60 | nested_types=[], 61 | enum_types=[ 62 | ], 63 | options=None, 64 | is_extendable=False, 65 | syntax='proto3', 66 | extension_ranges=[], 67 | oneofs=[ 68 | ], 69 | serialized_start=32, 70 | serialized_end=72, 71 | ) 72 | 73 | 74 | _PROPOSAL = _descriptor.Descriptor( 75 | name='proposal', 76 | full_name='dist_sgd.proposal', 77 | 
filename=None, 78 | file=DESCRIPTOR, 79 | containing_type=None, 80 | fields=[ 81 | _descriptor.FieldDescriptor( 82 | name='n', full_name='dist_sgd.proposal.n', index=0, 83 | number=1, type=2, cpp_type=6, label=1, 84 | has_default_value=False, default_value=0, 85 | message_type=None, enum_type=None, containing_type=None, 86 | is_extension=False, extension_scope=None, 87 | options=None), 88 | ], 89 | extensions=[ 90 | ], 91 | nested_types=[], 92 | enum_types=[ 93 | ], 94 | options=None, 95 | is_extendable=False, 96 | syntax='proto3', 97 | extension_ranges=[], 98 | oneofs=[ 99 | ], 100 | serialized_start=74, 101 | serialized_end=95, 102 | ) 103 | 104 | 105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor( 106 | name='request_acceptance', 107 | full_name='dist_sgd.request_acceptance', 108 | filename=None, 109 | file=DESCRIPTOR, 110 | containing_type=None, 111 | fields=[ 112 | _descriptor.FieldDescriptor( 113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0, 114 | number=1, type=2, cpp_type=6, label=1, 115 | has_default_value=False, default_value=0, 116 | message_type=None, enum_type=None, containing_type=None, 117 | is_extension=False, extension_scope=None, 118 | options=None), 119 | _descriptor.FieldDescriptor( 120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1, 121 | number=2, type=9, cpp_type=9, label=1, 122 | has_default_value=False, default_value=_b("").decode('utf-8'), 123 | message_type=None, enum_type=None, containing_type=None, 124 | is_extension=False, extension_scope=None, 125 | options=None), 126 | ], 127 | extensions=[ 128 | ], 129 | nested_types=[], 130 | enum_types=[ 131 | ], 132 | options=None, 133 | is_extendable=False, 134 | syntax='proto3', 135 | extension_ranges=[], 136 | oneofs=[ 137 | ], 138 | serialized_start=97, 139 | serialized_end=139, 140 | ) 141 | 142 | 143 | _ACQUIESCENCE = _descriptor.Descriptor( 144 | name='acquiescence', 145 | full_name='dist_sgd.acquiescence', 146 | filename=None, 147 | file=DESCRIPTOR, 148 | 
containing_type=None, 149 | fields=[ 150 | _descriptor.FieldDescriptor( 151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0, 152 | number=1, type=8, cpp_type=7, label=1, 153 | has_default_value=False, default_value=False, 154 | message_type=None, enum_type=None, containing_type=None, 155 | is_extension=False, extension_scope=None, 156 | options=None), 157 | ], 158 | extensions=[ 159 | ], 160 | nested_types=[], 161 | enum_types=[ 162 | ], 163 | options=None, 164 | is_extendable=False, 165 | syntax='proto3', 166 | extension_ranges=[], 167 | oneofs=[ 168 | ], 169 | serialized_start=141, 170 | serialized_end=176, 171 | ) 172 | 173 | 174 | _CONSENSUS = _descriptor.Descriptor( 175 | name='consensus', 176 | full_name='dist_sgd.consensus', 177 | filename=None, 178 | file=DESCRIPTOR, 179 | containing_type=None, 180 | fields=[ 181 | _descriptor.FieldDescriptor( 182 | name='n', full_name='dist_sgd.consensus.n', index=0, 183 | number=1, type=2, cpp_type=6, label=1, 184 | has_default_value=False, default_value=0, 185 | message_type=None, enum_type=None, containing_type=None, 186 | is_extension=False, extension_scope=None, 187 | options=None), 188 | _descriptor.FieldDescriptor( 189 | name='v', full_name='dist_sgd.consensus.v', index=1, 190 | number=2, type=9, cpp_type=9, label=1, 191 | has_default_value=False, default_value=_b("").decode('utf-8'), 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | ], 196 | extensions=[ 197 | ], 198 | nested_types=[], 199 | enum_types=[ 200 | ], 201 | options=None, 202 | is_extendable=False, 203 | syntax='proto3', 204 | extension_ranges=[], 205 | oneofs=[ 206 | ], 207 | serialized_start=178, 208 | serialized_end=211, 209 | ) 210 | 211 | 212 | _EMPTY = _descriptor.Descriptor( 213 | name='empty', 214 | full_name='dist_sgd.empty', 215 | filename=None, 216 | file=DESCRIPTOR, 217 | containing_type=None, 218 | fields=[ 219 | ], 220 | 
extensions=[ 221 | ], 222 | nested_types=[], 223 | enum_types=[ 224 | ], 225 | options=None, 226 | is_extendable=False, 227 | syntax='proto3', 228 | extension_ranges=[], 229 | oneofs=[ 230 | ], 231 | serialized_start=213, 232 | serialized_end=220, 233 | ) 234 | 235 | DESCRIPTOR.message_types_by_name['ack'] = _ACK 236 | DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL 237 | DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE 238 | DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE 239 | DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS 240 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY 241 | 242 | ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict( 243 | DESCRIPTOR = _ACK, 244 | __module__ = 'protos.paxos_pb2' 245 | # @@protoc_insertion_point(class_scope:dist_sgd.ack) 246 | )) 247 | _sym_db.RegisterMessage(ack) 248 | 249 | proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict( 250 | DESCRIPTOR = _PROPOSAL, 251 | __module__ = 'protos.paxos_pb2' 252 | # @@protoc_insertion_point(class_scope:dist_sgd.proposal) 253 | )) 254 | _sym_db.RegisterMessage(proposal) 255 | 256 | request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict( 257 | DESCRIPTOR = _REQUEST_ACCEPTANCE, 258 | __module__ = 'protos.paxos_pb2' 259 | # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance) 260 | )) 261 | _sym_db.RegisterMessage(request_acceptance) 262 | 263 | acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict( 264 | DESCRIPTOR = _ACQUIESCENCE, 265 | __module__ = 'protos.paxos_pb2' 266 | # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence) 267 | )) 268 | _sym_db.RegisterMessage(acquiescence) 269 | 270 | consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict( 271 | DESCRIPTOR = _CONSENSUS, 272 | __module__ = 
'protos.paxos_pb2' 273 | # @@protoc_insertion_point(class_scope:dist_sgd.consensus) 274 | )) 275 | _sym_db.RegisterMessage(consensus) 276 | 277 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict( 278 | DESCRIPTOR = _EMPTY, 279 | __module__ = 'protos.paxos_pb2' 280 | # @@protoc_insertion_point(class_scope:dist_sgd.empty) 281 | )) 282 | _sym_db.RegisterMessage(empty) 283 | 284 | 285 | DESCRIPTOR.has_options = True 286 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001')) 287 | import abc 288 | import six 289 | from grpc.beta import implementations as beta_implementations 290 | from grpc.framework.common import cardinality 291 | from grpc.framework.interfaces.face import utilities as face_utilities 292 | 293 | class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)): 294 | """""" 295 | @abc.abstractmethod 296 | def prepare(self, request, context): 297 | raise NotImplementedError() 298 | @abc.abstractmethod 299 | def accept(self, request, context): 300 | raise NotImplementedError() 301 | @abc.abstractmethod 302 | def accepted(self, request, context): 303 | raise NotImplementedError() 304 | @abc.abstractmethod 305 | def ping(self, request, context): 306 | raise NotImplementedError() 307 | 308 | class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)): 309 | """The interface to which stubs will conform.""" 310 | @abc.abstractmethod 311 | def prepare(self, request, timeout): 312 | raise NotImplementedError() 313 | prepare.future = None 314 | @abc.abstractmethod 315 | def accept(self, request, timeout): 316 | raise NotImplementedError() 317 | accept.future = None 318 | @abc.abstractmethod 319 | def accepted(self, request, timeout): 320 | raise NotImplementedError() 321 | accepted.future = None 322 | @abc.abstractmethod 323 | def ping(self, request, timeout): 324 | raise NotImplementedError() 325 | ping.future = None 326 | 327 | def 
beta_create_PaxosServer_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None): 328 | import protos.paxos_pb2 329 | import protos.paxos_pb2 330 | import protos.paxos_pb2 331 | import protos.paxos_pb2 332 | import protos.paxos_pb2 333 | import protos.paxos_pb2 334 | import protos.paxos_pb2 335 | import protos.paxos_pb2 336 | request_deserializers = { 337 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.FromString, 338 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.FromString, 339 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString, 340 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.FromString, 341 | } 342 | response_serializers = { 343 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.SerializeToString, 344 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.SerializeToString, 345 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString, 346 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.SerializeToString, 347 | } 348 | method_implementations = { 349 | ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept), 350 | ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted), 351 | ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping), 352 | ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare), 353 | } 354 | server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout) 355 | return beta_implementations.server(method_implementations, options=server_options) 356 | 357 | def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None): 358 | 
import protos.paxos_pb2 359 | import protos.paxos_pb2 360 | import protos.paxos_pb2 361 | import protos.paxos_pb2 362 | import protos.paxos_pb2 363 | import protos.paxos_pb2 364 | import protos.paxos_pb2 365 | import protos.paxos_pb2 366 | request_serializers = { 367 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.SerializeToString, 368 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.SerializeToString, 369 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString, 370 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.SerializeToString, 371 | } 372 | response_deserializers = { 373 | ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.FromString, 374 | ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.FromString, 375 | ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString, 376 | ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.FromString, 377 | } 378 | cardinalities = { 379 | 'accept': cardinality.Cardinality.UNARY_UNARY, 380 | 'accepted': cardinality.Cardinality.UNARY_UNARY, 381 | 'ping': cardinality.Cardinality.UNARY_UNARY, 382 | 'prepare': cardinality.Cardinality.UNARY_UNARY, 383 | } 384 | stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size) 385 | return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options) 386 | # @@protoc_insertion_point(module_scope) 387 | -------------------------------------------------------------------------------- /python-python/run_codegen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015, Google Inc. 3 | # All rights reserved. 
4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are 7 | # met: 8 | # 9 | # * Redistributions of source code must retain the above copyright 10 | # notice, this list of conditions and the following disclaimer. 11 | # * Redistributions in binary form must reproduce the above 12 | # copyright notice, this list of conditions and the following disclaimer 13 | # in the documentation and/or other materials provided with the 14 | # distribution. 15 | # * Neither the name of Google Inc. nor the names of its 16 | # contributors may be used to endorse or promote products derived from 17 | # this software without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | 31 | # Runs the protoc with gRPC plugin to generate protocol messages and gRPC stubs. 32 | protoc -I . --python_out=. --grpc_out=. --plugin=protoc-gen-grpc=`which grpc_python_plugin` ./protos/dist_sgd.proto 33 | protoc -I . --python_out=. --grpc_out=. 
--plugin=protoc-gen-grpc=`which grpc_python_plugin` ./protos/paxos.proto 34 | -------------------------------------------------------------------------------- /python-python/server.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------ 2 | # Implements a parameter server. The server takes parameter updates in and 3 | # sends back the most up to date parameters. This server also keeps track of 4 | # the current training/test error. 5 | # ------------------------------------------------------------ 6 | 7 | from __future__ import absolute_import 8 | from __future__ import print_function 9 | import time 10 | 11 | import dist_sgd_pb2 12 | from sets import Set 13 | 14 | import autograd.numpy as np 15 | import autograd.numpy.random as npr 16 | from autograd import grad 17 | 18 | from nnet.neural_net import * 19 | from protobuf_utils.utils import * 20 | from server_utils.utils import * 21 | 22 | import traceback 23 | 24 | _ONE_DAY_IN_SECONDS = 60 * 60 * 24 25 | 26 | _REQUIRED_CHILDREN = 1 27 | 28 | # Data files that we are training from. This is the small demo set. 
29 | images_fname = 'data/images(16).npy' 30 | labels_fname = 'data/output_labels(16).npy' 31 | 32 | class ParamFeeder(dist_sgd_pb2.BetaParamFeederServicer): 33 | def __init__(self, W = None, prevBatch=None): 34 | # Keeps track of all child IDs that it has seen so far 35 | self.child_ids = Set([]) 36 | 37 | # Load and process Caltech data 38 | self.train_images, self.train_labels, self.test_images, self.test_labels = load_caltech100(images_fname, labels_fname) 39 | self.image_input_d = self.train_images.shape[1] 40 | 41 | # Network parameters 42 | self.layer_sizes = [self.image_input_d, 800, 600, 400, 350, 250, 101] 43 | 44 | # Training parameters 45 | self.param_scale = 0.1 46 | self.learning_rate = 1e-5 47 | self.momentum = 0.9 48 | self.batch_size = 256 49 | self.num_epochs = 50 50 | self.L2_reg = 1.0 51 | 52 | # Make neural net functions 53 | self.N_weights, self.pred_fun, self.loss_fun, self.frac_err = make_nn_funs(self.layer_sizes, self.L2_reg) 54 | self.loss_grad = grad(self.loss_fun) 55 | 56 | # Initialize weights 57 | if W is None: 58 | rs = npr.RandomState() 59 | self.W = rs.randn(self.N_weights) * self.param_scale 60 | else: 61 | # Passed in weights 62 | self.W = W 63 | self.param_len = self.W.shape[0] 64 | log_info("# of parameters:") 65 | log_info(self.param_len) 66 | 67 | # Train with sgd 68 | self.batch_idxs = make_batches(self.train_images.shape[0], self.batch_size) 69 | 70 | # Set the current batch to zero unless it has been passed in 71 | self.epoch = 0 72 | if prevBatch is None: 73 | self.batch_num = 0 74 | else: 75 | self.batch_num = prevBatch 76 | self.n_batches = len(self.batch_idxs) 77 | 78 | # Initialize information about the clients 79 | self.n_childs = 0 80 | self.max_client_id = 0 81 | 82 | # Intializes starting information about training 83 | self.prev_test_perf = 1 84 | 85 | # The batches that are currently being processed 86 | self.batches_processing = {} 87 | 88 | # The batches that were failed to process, model training machine may 
have failed 89 | # Send these batches to a new machine 90 | self.batches_unprocessed = [] 91 | 92 | log_info('Data loaded on server, waiting for clients....') 93 | log_info('Number of child processes: 0') 94 | 95 | # Logs the current performance of the model. Called once per epoch. 96 | def log_info_perf(self, epoch): 97 | test_perf = self.frac_err(self.W, self.test_images, self.test_labels) 98 | train_perf = self.frac_err(self.W, self.train_images, self.train_labels) 99 | if test_perf > self.prev_test_perf: 100 | self.learning_rate = 0.1 * self.learning_rate 101 | self.prev_test_perf = test_perf 102 | log_info("Epoch {0}, TrainErr {1:5}, TestErr {2:5}, LR {3:2}".format(self.epoch, train_perf, test_perf, self.learning_rate)) 103 | 104 | # Streams updates from the client. 105 | def GetUpdates(self, request_iterator, context): 106 | tensor_bytes = '' 107 | for subtensor in request_iterator: 108 | tensor_bytes = tensor_bytes + subtensor.tensor_content 109 | 110 | # Serialize the tensor 111 | grad_W = convert_bytes_to_array(tensor_bytes) 112 | 113 | # Gradient descent 114 | self.W -= 0.5 * self.learning_rate * grad_W 115 | 116 | return dist_sgd_pb2.StatusCode(status=1) 117 | 118 | # Sends the next batch that the client should process 119 | def SendNextBatch(self, request, context): 120 | # Figure out what the maximum client_id is. If client_id does not exist, 121 | # assigns the client a new id. 
122 | if request.client_id == 0: 123 | self.max_client_id += 1 124 | request.client_id = self.max_client_id 125 | else: 126 | self.max_client_id = max(request.client_id, self.max_client_id) 127 | 128 | # Does not start until a sufficient number of child processes exists 129 | self.child_ids.add(request.client_id) 130 | if len(self.child_ids) != self.n_childs: 131 | self.n_childs = len(self.child_ids) 132 | log_info('Number of child processes: ' + str(len(self.child_ids))) 133 | if len(self.child_ids) < _REQUIRED_CHILDREN: 134 | return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = -1) 135 | 136 | # Logs information about previous batch timing 137 | if request.prev_data_indx != -1: 138 | log_info('Time taken to process batch {0} was {1:.2f} by client {2}'.format(request.prev_data_indx, (time.time() - self.batches_processing[request.prev_data_indx]), request.client_id)) 139 | del self.batches_processing[request.prev_data_indx] 140 | 141 | # log_info epoch information if we've hit the end of an epoch 142 | if self.batch_num == self.n_batches: 143 | self.batch_num, self.epoch = 0, self.epoch + 1 144 | self.log_info_perf(self.epoch) 145 | 146 | # Takes any previously failed batches first, otherwise takes next batch 147 | if self.batches_unprocessed != []: 148 | cur_batchnum = self.batches_unprocessed.pop(0) 149 | else: 150 | cur_batchnum, self.batch_num = self.batch_num, self.batch_num + 1 151 | 152 | # Save the time that the next batch was sent out on the server 153 | self.batches_processing[cur_batchnum] = time.time() 154 | 155 | return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = cur_batchnum) 156 | 157 | # This sends the parameters from the server to the client by converting the tensor into a 158 | # protobuffer and streaming it 159 | def SendParams(self, request, context): 160 | CHUNK_SIZE = 524228 161 | tensor_bytes = convert_array_to_bytes(self.W) 162 | tensor_bytes_len = len(tensor_bytes) 163 | tensor_chunk_count = 0 164 | 
try: 165 | while len(tensor_bytes): 166 | tensor_chunk_count += 1 167 | tensor_content = tensor_bytes[:CHUNK_SIZE] 168 | tensor_bytes = tensor_bytes[CHUNK_SIZE:] 169 | yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx= -1) 170 | except Exception, e: 171 | traceback.print_exc() 172 | 173 | # Function to ping the server to see if it is available 174 | def ping(self, request, context): 175 | return dist_sgd_pb2.empty() 176 | 177 | # Main function that is called to instantiate the server and have 178 | # it connect and send or receive parameters from clients. 179 | def serve(hostname, W = None, prev_batch = None, local_id = None): 180 | # Set up the server on port 50051 181 | hostname = '[::]:50051' 182 | BATCH_TRAIN_TIMEOUT = 60 183 | 184 | # Instantiate the server and add the port 185 | param_feeder = ParamFeeder(W, prev_batch) 186 | server = dist_sgd_pb2.beta_create_ParamFeeder_server(param_feeder) 187 | server.add_insecure_port(hostname) 188 | 189 | # Begin the server 190 | server.start() 191 | try: 192 | while True: 193 | time.sleep(BATCH_TRAIN_TIMEOUT) 194 | 195 | except KeyboardInterrupt: 196 | server.stop(0) 197 | raise KeyboardInterrupt 198 | 199 | if __name__ == '__main__': 200 | serve('[::]:50051') -------------------------------------------------------------------------------- /python-python/start.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #!
/home/candokevin/anaconda2/bin/python 3 | cd /home/candokevin/stash/distributed-sgd/python-python 4 | git pull 5 | rm /home/candokevin/log.txt 6 | while true; do 7 | python client.py >> /home/candokevin/log.txt 8 | done 9 | -------------------------------------------------------------------------------- /slides/.Rhistory: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/.Rhistory -------------------------------------------------------------------------------- /slides/common_slides.sty: -------------------------------------------------------------------------------- 1 | \setbeamertemplate{navigation symbols}{} 2 | \let\tempone\itemize 3 | \let\temptwo\enditemize 4 | \renewenvironment{itemize}{\tempone\addtolength{\itemsep}{0.5\baselineskip}}{\temptwo} 5 | % \usepackage{beamerthemeshadow} 6 | \usepackage{ulem} 7 | % \usepackage{movie15} 8 | \usepackage{mathpazo} 9 | % \usepackage{palatino} 10 | 11 | \usepackage{tikz} 12 | \usepackage{hyperref} 13 | \usepackage{natbib} 14 | \usepackage{pgffor} 15 | \usepackage{booktabs} 16 | \usepackage{amssymb} 17 | \usepackage{tikz,etoolbox} 18 | \usepackage{subcaption} 19 | \usepackage{url} 20 | \usepackage{pgf} 21 | \usepackage{latexsym} 22 | \usepackage{amsfonts} 23 | \usepackage{amssymb} 24 | \usepackage{amsthm} 25 | \usepackage{algorithm} 26 | \usepackage{amsmath} 27 | \usepackage{tabularx} 28 | \usepackage{mathtools} 29 | \usepackage{algorithm} 30 | \usepackage{algpseudocode} 31 | 32 | \usetikzlibrary{arrows,positioning,automata,positioning,spy,matrix,scopes,chains} 33 | 34 | \setbeamersize{text margin left=6mm} 35 | \setbeamersize{text margin right=6mm} 36 | \renewcommand{\insertnavigation}[1]{} 37 | \setbeamertemplate{headline}{} 38 | \setbeamertemplate{footline}{} 39 | % \usefonttheme{professionalfonts} 40 | % make itemize things larger 41 | %\setbeamerfont*{itemize/enumerate 
body}{size=\Large} 42 | %\setbeamerfont*{itemize/enumerate subbody}{size=\large} 43 | \setbeamercovered{transparent} 44 | \mode 45 | %\mode 46 | \linespread{1.25} 47 | 48 | \usepackage{color} 49 | \usepackage{multirow} 50 | \usepackage{rotating} 51 | \usepackage[all,dvips]{xy} 52 | \usepackage{colortbl} 53 | \usepackage{graphicx} 54 | \usepackage{verbatim} 55 | \usepackage{framed} 56 | \usepackage{natbib} 57 | \usepackage[labelformat=empty]{caption} 58 | \newcommand{\air}{\vspace{0.25cm}} 59 | % \newcommand{\mair}{\vspace{-0.25cm}} 60 | 61 | \setbeamertemplate{navigation symbols}{}%remove navigation symbols 62 | \renewcommand{\rmdefault}{crm} 63 | \newcommand{\lnbrack}{{\normalfont [}} 64 | \newcommand{\rnbrack}{{\normalfont ]}\thinspace} 65 | \newcommand{\lbbrack}{\textcolor{red}{\textbf{[}}} 66 | \newcommand{\rbbrack}{\textcolor{red}{\textbf{]}}\thinspace} 67 | \definecolor{vermillion}{RGB}{213,94,0} 68 | 69 | \definecolor{orange}{RGB}{230,159,0} 70 | \definecolor{skyblue}{RGB}{86,180,233} 71 | \definecolor{bluegreen}{RGB}{0,158,115} 72 | \definecolor{myyellow}{RGB}{240,228,66} % i dunno if this is the same as standard yellow 73 | \definecolor{myblue}{RGB}{0,114,178} 74 | \definecolor{vermillion}{RGB}{213,94,0} 75 | \definecolor{redpurple}{RGB}{204,121,167} 76 | \definecolor{lightgrey}{RGB}{234,234,234} 77 | 78 | \AtBeginSection[] 79 | { 80 | \begin{frame} 81 | \frametitle{Contents} 82 | \tableofcontents[currentsection] 83 | \end{frame} 84 | } 85 | % \AtBeginSection[]{ 86 | % \begin{frame} 87 | % \vfill 88 | % \centering 89 | % \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title} 90 | % \usebeamerfont{title}\insertsectionhead\par% 91 | % \end{beamercolorbox} 92 | % \vfill 93 | % \end{frame} 94 | % } 95 | 96 | \newcommand{\boldA}{\mathbf{A}} 97 | \newcommand{\boldB}{\mathbf{B}} 98 | \newcommand{\boldC}{\mathbf{C}} 99 | \newcommand{\boldD}{\mathbf{D}} 100 | \newcommand{\boldE}{\mathbf{E}} 101 | \newcommand{\boldF}{\mathbf{F}} 102 | 
\newcommand{\boldG}{\mathbf{G}} 103 | \newcommand{\boldH}{\mathbf{H}} 104 | \newcommand{\boldI}{\mathbf{I}} 105 | \newcommand{\boldJ}{\mathbf{J}} 106 | \newcommand{\boldK}{\mathbf{K}} 107 | \newcommand{\boldL}{\mathbf{L}} 108 | \newcommand{\boldM}{\mathbf{M}} 109 | \newcommand{\boldN}{\mathbf{N}} 110 | \newcommand{\boldO}{\mathbf{O}} 111 | \newcommand{\boldP}{\mathbf{P}} 112 | \newcommand{\boldQ}{\mathbf{Q}} 113 | \newcommand{\boldR}{\mathbf{R}} 114 | \newcommand{\boldS}{\mathbf{S}} 115 | \newcommand{\boldT}{\mathbf{T}} 116 | \newcommand{\boldU}{\mathbf{U}} 117 | \newcommand{\boldV}{\mathbf{V}} 118 | \newcommand{\boldW}{\mathbf{W}} 119 | \newcommand{\boldX}{\mathbf{X}} 120 | \newcommand{\boldY}{\mathbf{Y}} 121 | \newcommand{\boldZ}{\mathbf{Z}} 122 | \newcommand{\bolda}{\mathbf{a}} 123 | \newcommand{\boldb}{\mathbf{b}} 124 | \newcommand{\boldc}{\mathbf{c}} 125 | \newcommand{\boldd}{\mathbf{d}} 126 | \newcommand{\bolde}{\mathbf{e}} 127 | \newcommand{\boldf}{\mathbf{f}} 128 | \newcommand{\boldg}{\mathbf{g}} 129 | \newcommand{\boldh}{\mathbf{h}} 130 | \newcommand{\boldi}{\mathbf{i}} 131 | \newcommand{\boldj}{\mathbf{j}} 132 | \newcommand{\boldk}{\mathbf{k}} 133 | \newcommand{\boldl}{\mathbf{l}} 134 | \newcommand{\boldm}{\mathbf{m}} 135 | \newcommand{\boldn}{\mathbf{n}} 136 | \newcommand{\boldo}{\mathbf{o}} 137 | \newcommand{\boldp}{\mathbf{p}} 138 | \newcommand{\boldq}{\mathbf{q}} 139 | \newcommand{\boldr}{\mathbf{r}} 140 | \newcommand{\bolds}{\mathbf{s}} 141 | \newcommand{\boldt}{\mathbf{t}} 142 | \newcommand{\boldu}{\mathbf{u}} 143 | \newcommand{\boldv}{\mathbf{v}} 144 | \newcommand{\boldw}{\mathbf{w}} 145 | \newcommand{\boldx}{\mathbf{x}} 146 | \newcommand{\boldy}{\mathbf{y}} 147 | \newcommand{\boldz}{\mathbf{z}} 148 | 149 | \newcommand{\bolddelta}{\boldsymbol{\delta}} 150 | \newcommand{\indicator}{\mathbf{1}} 151 | \newcommand{\mcA}{\mathcal{A}} 152 | \newcommand{\mcB}{\mathcal{B}} 153 | \newcommand{\mcC}{\mathcal{C}} 154 | \newcommand{\mcD}{\mathcal{D}} 155 | 
\newcommand{\mcE}{\mathcal{E}} 156 | \newcommand{\mcF}{\mathcal{F}} 157 | \newcommand{\mcG}{\mathcal{G}} 158 | \newcommand{\mcH}{\mathcal{H}} 159 | \newcommand{\mcI}{\mathcal{I}} 160 | \newcommand{\mcJ}{\mathcal{J}} 161 | \newcommand{\mcK}{\mathcal{K}} 162 | \newcommand{\mcL}{\mathcal{L}} 163 | \newcommand{\mcM}{\mathcal{M}} 164 | \newcommand{\mcN}{\mathcal{N}} 165 | \newcommand{\mcO}{\mathcal{O}} 166 | \newcommand{\mcP}{\mathcal{P}} 167 | \newcommand{\mcQ}{\mathcal{Q}} 168 | \newcommand{\mcR}{\mathcal{R}} 169 | \newcommand{\mcS}{\mathcal{S}} 170 | \newcommand{\mcT}{\mathcal{T}} 171 | \newcommand{\mcU}{\mathcal{U}} 172 | \newcommand{\mcV}{\mathcal{V}} 173 | \newcommand{\mcW}{\mathcal{W}} 174 | \newcommand{\mcX}{\mathcal{X}} 175 | \newcommand{\mcY}{\mathcal{Y}} 176 | \newcommand{\mcZ}{\mathcal{Z}} 177 | 178 | \newcommand{\reals}{\ensuremath{\mathbb{R}}} 179 | \newcommand{\integers}{\ensuremath{\mathbb{Z}}} 180 | \newcommand{\rationals}{\ensuremath{\mathbb{Q}}} 181 | \newcommand{\naturals}{\ensuremath{\mathbb{N}}} 182 | \newcommand{\trans}{\ensuremath{\mathsf{T}}} 183 | \newcommand{\ident}{\mathbf{I}} 184 | \newcommand{\bzero}{\mathbf{0}} 185 | 186 | \newcommand{\balpha}{\boldsymbol{\alpha}} 187 | \newcommand{\bbeta}{\boldsymbol{\beta}} 188 | \newcommand{\boldeta}{\boldsymbol{\eta}} 189 | \newcommand{\bkappa}{\boldsymbol{\kappa}} 190 | \newcommand{\bgamma}{\boldsymbol{\gamma}} 191 | \newcommand{\bmu}{\boldsymbol{\mu}} 192 | \newcommand{\bphi}{\boldsymbol{\phi}} 193 | \newcommand{\bpi}{\boldsymbol{\pi}} 194 | \newcommand{\bpsi}{\boldsymbol{\psi}} 195 | \newcommand{\bsigma}{\boldsymbol{\sigma}} 196 | \newcommand{\btheta}{\boldsymbol{\theta}} 197 | \newcommand{\bxi}{\boldsymbol{\xi}} 198 | \newcommand{\bGamma}{\boldsymbol{\Gamma}} 199 | \newcommand{\bLambda}{\boldsymbol{\Lambda}} 200 | \newcommand{\bOmega}{\boldsymbol{\Omega}} 201 | \newcommand{\bPhi}{\boldsymbol{\Phi}} 202 | \newcommand{\bPi}{\boldsymbol{\Pi}} 203 | \newcommand{\bPsi}{\boldsymbol{\Psi}} 204 | 
\newcommand{\bSigma}{\boldsymbol{\Sigma}} 205 | \newcommand{\bTheta}{\boldsymbol{\Theta}} 206 | \newcommand{\bUpsilon}{\boldsymbol{\Upsilon}} 207 | \newcommand{\bXi}{\boldsymbol{\Xi}} 208 | \newcommand{\bepsilon}{\boldsymbol{\epsilon}} 209 | 210 | \def\argmin{\operatornamewithlimits{arg\,min}} 211 | \def\argmax{\operatornamewithlimits{arg\,max}} 212 | \def\softmax{\operatornamewithlimits{softmax}} 213 | \def\relu{\operatornamewithlimits{ReLU}} 214 | 215 | \newcommand{\given}{\,|\,} 216 | \newcommand{\distNorm}{\mathcal{N}} 217 | 218 | 219 | 220 | \newcommand{\din}{{d_{\mathrm{in}}}} 221 | \newcommand{\dhid}{{d_{\mathrm{hid}}}} 222 | \newcommand{\dwin}{{d_{\mathrm{win}}}} 223 | \newcommand{\dout}{{d_{\mathrm{out}}}} 224 | \newcommand{\demb}{{d_{\mathrm{emb}}}} 225 | 226 | \algtext*{EndWhile}% Remove "end while" text 227 | \algtext*{EndFor}% Remove "end while" text 228 | \algtext*{EndIf}% Remove "end if" text 229 | \algtext*{EndProcedure}% Remove "end while" text 230 | -------------------------------------------------------------------------------- /slides/img/2d_func.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/2d_func.jpg -------------------------------------------------------------------------------- /slides/img/dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dataset.png -------------------------------------------------------------------------------- /slides/img/deep_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/deep_learning.png 
-------------------------------------------------------------------------------- /slides/img/dist_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_16.png -------------------------------------------------------------------------------- /slides/img/dist_train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_train.png -------------------------------------------------------------------------------- /slides/img/downpour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/downpour.png -------------------------------------------------------------------------------- /slides/img/gRPC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/gRPC.png -------------------------------------------------------------------------------- /slides/img/large_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/large_data.png -------------------------------------------------------------------------------- /slides/img/lin_v_nonlin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/lin_v_nonlin.png 
-------------------------------------------------------------------------------- /slides/img/sandblaster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/sandblaster.png -------------------------------------------------------------------------------- /slides/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/main.pdf -------------------------------------------------------------------------------- /slides/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{./common_slides} 3 | \usepackage[absolute,overlay]{textpos} 4 | \usepackage{graphicx} 5 | 6 | 7 | \title{ Distributed Stochastic Gradient Descent } 8 | 9 | \author{Kevin Yang and Michael Farrell} 10 | \begin{document} 11 | 12 | \begin{frame} 13 | \titlepage 14 | \end{frame} 15 | 16 | \begin{frame}{Motivation - Deep Learning} 17 | 18 | \begin{columns}[T] % align columns 19 | \begin{column}{.48\textwidth} 20 | \begin{itemize} 21 | \item Deep-Learning 22 | \begin{itemize} 23 | \item Objective: Learn a complicated, non-linear function that minimizes some loss function 24 | \end{itemize} 25 | \item Why do we need deep models? 26 | \begin{itemize} 27 | \item The class of linear functions is inadequate for many problems.
28 | \end{itemize} 29 | \end{itemize} 30 | \end{column}% 31 | \hfill% 32 | \begin{column}{.48\textwidth} 33 | \begin{figure} 34 | \includegraphics[scale = .35]{./img/deep_learning} 35 | \caption{\scalebox{.3}{http://www.rsipvision.com/exploring-deep-learning/}} 36 | \end{figure} 37 | \begin{figure} 38 | \includegraphics[scale = .17]{./img/lin_v_nonlin} 39 | \caption{\scalebox{.3}{http://sebastianraschka.com/Articles/2014{\_}naive{\_}bayes{\_}1.html}} 40 | \end{figure} 41 | \end{column}% 42 | \end{columns} 43 | \end{frame} 44 | 45 | \begin{frame}{Motivation - Deep Learning} 46 | \begin{itemize} 47 | \item How do we learn these deep models? 48 | \begin{itemize} 49 | \item Choose a random example 50 | \item Run the neural network on the example 51 | \item Adjust the parameters of the network such that our loss function is minimized more than it was before 52 | \item Repeat 53 | \end{itemize} 54 | \pause 55 | \item Difficulties? 56 | \begin{itemize} 57 | \item Local Minima 58 | \item Non-convexity 59 | \item Neural Networks can have millions or even billions of parameters 60 | \end{itemize} 61 | \end{itemize} 62 | \begin{textblock*}{5cm}(8cm,.5cm) % {block width} (coords) 63 | \includegraphics[scale = .3]{./img/2d_func} 64 | \end{textblock*} 65 | \end{frame} 66 | 67 | \begin{frame}{Motivation - SGD} 68 | \begin{itemize} 69 | \item How do we maximize our reward function? 
70 | \begin{itemize} 71 | \item One common technique is Stochastic Gradient Descent 72 | \item $\mathbf w$ is the vector of parameters for the model 73 | \item $\eta$ is the learning rate 74 | \item $\mathbf f(\mathbf w)$ is the loss function evaluated with the current parameters $\mathbf w$ 75 | \item 76 | \begin{algorithmic} 77 | \State $\mathbf w \gets \mathbf 0$ 78 | \While {$\mathbf f(\mathbf w)$ is not minimized} 79 | \For {$i = 1, n$} 80 | \State $\mathbf w \gets \mathbf w - \eta\nabla f(\mathbf w)$ 81 | \EndFor 82 | \EndWhile 83 | 84 | \end{algorithmic} 85 | \item As the number of training examples, $n$, and the number of parameters, $|\mathbf w|$, increases, this algorithm quickly becomes very slow... 86 | \end{itemize} 87 | \end{itemize} 88 | \end{frame} 89 | 90 | \begin{frame}{Motivation - Distributed SGD} 91 | \begin{itemize} 92 | \item Since some of these models take days/weeks/months to run, we would hope that we could use a distributed computing cluster in order to parallelize this process. 93 | \pause 94 | \item Learn from Google! 95 | \begin{itemize} 96 | \item DistBelief- 2012 97 | \begin{itemize} 98 | \item Downpour SGD 99 | \item Sandblaster L-BFGS 100 | \end{itemize} 101 | \item TensorFlow- 2015 102 | \begin{itemize} 103 | \item gRPC 104 | \end{itemize} 105 | \end{itemize} 106 | \end{itemize} 107 | 108 | \end{frame} 109 | 110 | \begin{frame}{DistBelief - Downpour SGD} 111 | \begin{itemize} 112 | \item ``An asynchronous stochastic gradient descent procedure supporting a large number of model replicas." \footnote{Diagram taken from Dean et al. 
\it{Large Scale Distributed Deep Networks} 113 | } 114 | \end{itemize} 115 | $$\includegraphics[scale = .5]{./img/downpour}$$ 116 | \end{frame} 117 | 118 | \begin{frame}{DistBelief - Sandblaster L-BFGS} 119 | \begin{itemize} 120 | \item ``A framework that supports a variety of distributed batch optimization procedures, including a distributed implementation of L-BFGS" \footnote{Diagram taken from Dean et al. \it{Large Scale Distributed Deep Networks}} 121 | \end{itemize} 122 | $$\includegraphics[scale = .5]{./img/sandblaster}$$ 123 | \end{frame} 124 | 125 | \begin{frame}{TensorFlow-GRPC} 126 | \begin{itemize} 127 | \item Second Generation ML Model focused on distributing models to CPUs and GPUs 128 | \item Uses the high performance RPC framework (GRPC \footnote{Diagram taken from http://www.grpc.io/}) in order to communicate between separate processes 129 | \begin{itemize} 130 | \item Uses Protocol Buffers -v3 131 | \item C-based 132 | \item Client-server stubs in 10+ languages and counting 133 | \end{itemize} 134 | \end{itemize} 135 | $$\includegraphics[scale = .2]{./img/gRPC}$$ 136 | \end{frame} 137 | 138 | \begin{frame}{DistBelief/TensorFlow Summary} 139 | \begin{itemize} 140 | \item TensorFlow is basically the second version of DistBelief that is approximately twice as fast and much more user-friendly. 141 | \item Results from DistBelief \footnote{Diagram taken from Dean et al. \it{Large Scale Distributed Deep Networks}}: 142 | \end{itemize} 143 | $$\includegraphics[scale = .18]{./img/dist_train}\includegraphics[scale = .18]{./img/dist_16}$$ 144 | \end{frame} 145 | 146 | \begin{frame}{Our Project} 147 | \begin{itemize} 148 | \item We frequently run into scenarios where we have a model that trains incredibly slowly on our local machines. As a consequence, we hope to benefit from additional cloud computing resources and build our own Distributed SGD system based on DistBelief and TensorFlow systems.
149 | \begin{itemize} 150 | \item The Distributed SGD system will have the user give a function that returns the outputs of a model, a function that returns the gradients of a model, and the number of machines to train the model on. 151 | \item Use GRPC with Protocol Buffers to communicate between processes, similar to TensorFlow. 152 | \item Implement Downpour-SGD which seems to be the most effective model with limited resources. 153 | \end{itemize} 154 | \end{itemize} 155 | \end{frame} 156 | 157 | 158 | \begin{frame}{Our Example} 159 | \begin{itemize} 160 | \item To test our system, we're working with the Caltech 101 Computational Vision dataset \footnote{L. Fei-Fei, R. Fergus and P. Perona. \it{Learning generative visual models 161 | from few training examples: an incremental Bayesian approach tested on 162 | 101 object categories.}}. In this dataset, there are about 20,000 pictures of objects in 101 categories. All of these images are around 300 x 200 pixels in size. 163 | \item We've implemented a convolutional neural net that tries to classify what object is represented in the image. 164 | 165 | $$\includegraphics[scale = .30]{./img/dataset.png}$$ 166 | \end{itemize} 167 | \end{frame} 168 | 169 | \begin{frame}{Computational Resources} 170 | \begin{itemize} 171 | \item We are using Google Cloud Compute Engine to set up VMs and run the code. To run classification on our image dataset, we're using small instances with 6GB of RAM with 2 cores. This has a rate of 7.8 cents per hour. 172 | \item On a machine of this size, running 10 epochs of gradient descent takes 56 minutes. 173 | \item To streamline things, we've preconfigured images of a parameter server and model training server that are already set up with relevant code, tools, and libraries. 174 | \item As a result, setting up and launching the compute instances necessary for model training takes only a couple lines.
175 | \end{itemize} 176 | \end{frame} 177 | 178 | \begin{frame}{Implementing Downpour-SGD} 179 | \begin{itemize} 180 | \item The Downpour-SGD requires the passing of parameters and parameter updates between processes. In our example, we have 74,770,901 parameters and the size of our parameters is 0.5GB. 181 | \item Bottleneck here is the network. Parameters can be $>>$0.5Gb. 182 | \item We can leverage the fact that some of these models are extremely sparse 183 | \begin{itemize} 184 | \item only send parameters updated 185 | \item only update parameters every $n_x$ times 186 | \end{itemize} 187 | \item Explore protocol buffer streams 188 | \end{itemize} 189 | $$\includegraphics[scale = .27]{./img/large_data}$$ 190 | \end{frame} 191 | 192 | \begin{frame}{Main Distributed System Challenges} 193 | \begin{itemize} 194 | \item Network Issues 195 | \begin{itemize} 196 | \item We have to deal with network latency and try to reduce transportation cost as much as possible in order for our models to train properly. 197 | \item We would like to experiment with a couple different RPCs to optimize the speed of our system. 198 | \end{itemize} 199 | \item Fault tolerance 200 | \begin{itemize} 201 | \item We need to make our system as resilient as possible against failures. Because all of these machines are doing a lot of computation while running gradient descent and manipulating parameters, these systems are bound to fail with relatively high frequency. 202 | \item Having methods in place to detect and remedy the failure of parameter servers and model replicas will be critical. 203 | \end{itemize} 204 | 205 | \end{itemize} 206 | \end{frame} 207 | 208 | \end{document} 209 | --------------------------------------------------------------------------------