├── .gitignore
├── .gitmodules
├── CS262%20Final%20Project
├── common.sty
├── images
│ ├── downpour.png
│ ├── locally.png
│ ├── remotely.png
│ ├── sgd_results.png
│ └── speeds.png
├── main.aux
├── main.bcf
├── main.bib
├── main.log
├── main.out
├── main.pdf
├── main.run.xml
├── main.synctex.gz
└── main.tex
├── Dev-Notebook-Kevin.md
├── Dev-Notebook-Mike.md
├── README.md
├── client_list.txt
├── lua-lua
├── README.md
├── cleanup.py
├── copy_files.py
├── data
│ ├── demo-train.hdf5
│ ├── demo-val.hdf5
│ ├── demo.src.dict
│ ├── demo.targ.dict
│ ├── src-train.txt
│ ├── src-val.txt
│ ├── targ-train.txt
│ └── targ-val.txt
├── demo_server.lua
├── gcloud_commands.txt
├── install_parallel.sh
├── locally.png
├── outputs
│ ├── 104.154.239.139
│ │ ├── ada_4_rem.png
│ │ ├── ada_4_rem.txt
│ │ ├── ada_8_rem.png
│ │ └── ada_8_rem.txt
│ ├── 104.197.106.197
│ │ ├── ada_2_rem.png
│ │ └── ada_2_rem.txt
│ ├── 104.197.222.148
│ │ ├── ada_2.txt
│ │ ├── ada_2_loc.png
│ │ ├── ada_2_loc.txt
│ │ └── reg_2.txt
│ ├── 104.197.250.103
│ │ ├── reg_1.txt
│ │ ├── reg_2.txt
│ │ ├── reg_2_loc.png
│ │ └── reg_2_loc.txt
│ ├── 130.211.192.196
│ │ ├── reg_1_loc.png
│ │ ├── reg_1_loc.txt
│ │ └── reg_2.txt
│ └── 130.211.204.149
│ │ ├── ada_1.txt
│ │ ├── ada_1_loc.png
│ │ ├── ada_1_loc.txt
│ │ └── reg_2.txt
├── parallel
│ └── init.lua
├── parse_outputs.py
├── remotely.png
├── server.lua
├── setup_image.sh
└── startup.sh
├── python-python
├── README.md
├── client.py
├── data
│ ├── images(16).npy
│ └── output_labels(16).npy
├── dist_sgd_pb2.py
├── image_classes.txt
├── neural_net.py
├── nnet
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── neural_net.py
│ └── neural_net.pyc
├── paxos.py
├── paxos_pb2.py
├── protobuf_utils
│ ├── __init__.py
│ ├── __init__.pyc
│ ├── utils.py
│ └── utils.pyc
├── protos
│ ├── dist_sgd.proto
│ ├── dist_sgd_pb2.py
│ ├── paxos.proto
│ └── paxos_pb2.py
├── run_codegen.sh
├── server.py
└── start.sh
└── slides
├── .Rhistory
├── common_slides.sty
├── img
├── 2d_func.jpg
├── dataset.png
├── deep_learning.png
├── dist_16.png
├── dist_train.png
├── downpour.png
├── gRPC.png
├── large_data.png
├── lin_v_nonlin.png
└── sandblaster.png
├── main.pdf
└── main.tex
/.gitignore:
--------------------------------------------------------------------------------
1 | # Annoying files
2 | .DS_Store
3 | .ipynb_checkpoints
4 | Icon
5 |
6 | # large data files
7 | basic/output_labels(128).npy
8 |
9 | # Install files
10 | install/
11 |
12 | # Model saves
13 | *.t7
14 |
15 |
16 | # Annoying text files
17 | slides/main.aux
18 | slides/main.log
19 | slides/main.nav
20 | slides/main.out
21 | slides/main.snm
22 | slides/main.synctex.gz
23 | slides/main.toc
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "lua-lua/End-To-End-Generative-Dialogue"]
2 | path = lua-lua/End-To-End-Generative-Dialogue
3 | url = https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue.git
4 |
--------------------------------------------------------------------------------
/CS262%20Final%20Project/common.sty:
--------------------------------------------------------------------------------
1 | \usepackage{amsmath}
2 | \usepackage{amssymb}
3 | \usepackage{url}
4 | \usepackage{mathpazo}
5 | \usepackage{palatino}
6 | \usepackage{fullpage,graphicx}
7 | \usepackage{tikz}
8 | \usepackage{tikz-qtree}
9 | \usepackage[font={it}]{caption}
10 | % \usepackage[right, mathlines]{lineno}
11 |
12 | \usepackage[procnames]{listings}
13 | \usepackage{color}
14 |
15 | \definecolor{keywords}{RGB}{255,0,90}
16 | \definecolor{comments}{RGB}{0,0,113}
17 | \definecolor{red}{RGB}{160,0,0}
18 | \definecolor{green}{RGB}{0,150,0}
19 |
20 | \lstset{language=Python,
21 | basicstyle=\ttfamily\small,
22 | keywordstyle=\color{keywords},
23 | commentstyle=\color{comments},
24 | stringstyle=\color{red},
25 | showstringspaces=false,
26 | identifierstyle=\color{green},
27 | procnamekeys={def,class}}
28 |
29 | % \linenumbers
30 |
31 | \usetikzlibrary{shapes.geometric}
32 | \usetikzlibrary{patterns}
33 | \usetikzlibrary{matrix}
34 | \usetikzlibrary{automata}
35 | \usepackage{booktabs}
36 |
37 | % \pagestyle{empty}
38 | \pagenumbering{arabic}
39 | \usepackage{subfig}
40 | \usepackage{comment}
41 |
42 | \newcommand{\boldA}{\boldsymbol{A}}
43 | \newcommand{\boldB}{\boldsymbol{B}}
44 | \newcommand{\boldC}{\boldsymbol{C}}
45 | \newcommand{\boldD}{\boldsymbol{D}}
46 | \newcommand{\boldE}{\boldsymbol{E}}
47 | \newcommand{\boldF}{\boldsymbol{F}}
48 | \newcommand{\boldG}{\boldsymbol{G}}
49 | \newcommand{\boldH}{\boldsymbol{H}}
50 | \newcommand{\boldI}{\boldsymbol{I}}
51 | \newcommand{\boldJ}{\boldsymbol{J}}
52 | \newcommand{\boldK}{\boldsymbol{K}}
53 | \newcommand{\boldL}{\boldsymbol{L}}
54 | \newcommand{\boldM}{\boldsymbol{M}}
55 | \newcommand{\boldN}{\boldsymbol{N}}
56 | \newcommand{\boldO}{\boldsymbol{O}}
57 | \newcommand{\boldP}{\boldsymbol{P}}
58 | \newcommand{\boldQ}{\boldsymbol{Q}}
59 | \newcommand{\boldR}{\boldsymbol{R}}
60 | \newcommand{\boldS}{\boldsymbol{S}}
61 | \newcommand{\boldT}{\boldsymbol{T}}
62 | \newcommand{\boldU}{\boldsymbol{U}}
63 | \newcommand{\boldV}{\boldsymbol{V}}
64 | \newcommand{\boldW}{\boldsymbol{W}}
65 | \newcommand{\boldX}{\boldsymbol{X}}
66 | \newcommand{\boldY}{\boldsymbol{Y}}
67 | \newcommand{\boldZ}{\boldsymbol{Z}}
68 | \newcommand{\bolda}{\boldsymbol{a}}
69 | \newcommand{\boldb}{\boldsymbol{b}}
70 | \newcommand{\boldc}{\boldsymbol{c}}
71 | \newcommand{\boldd}{\boldsymbol{d}}
72 | \newcommand{\bolde}{\boldsymbol{e}}
73 | \newcommand{\boldf}{\boldsymbol{f}}
74 | \newcommand{\boldg}{\boldsymbol{g}}
75 | \newcommand{\boldh}{\boldsymbol{h}}
76 | \newcommand{\boldi}{\boldsymbol{i}}
77 | \newcommand{\boldj}{\boldsymbol{j}}
78 | \newcommand{\boldk}{\boldsymbol{k}}
79 | \newcommand{\boldl}{\boldsymbol{l}}
80 | \newcommand{\boldm}{\boldsymbol{m}}
81 | \newcommand{\boldn}{\boldsymbol{n}}
82 | \newcommand{\boldo}{\boldsymbol{o}}
83 | \newcommand{\boldp}{\boldsymbol{p}}
84 | \newcommand{\boldq}{\boldsymbol{q}}
85 | \newcommand{\boldr}{\boldsymbol{r}}
86 | \newcommand{\bolds}{\boldsymbol{s}}
87 | \newcommand{\boldt}{\boldsymbol{t}}
88 | \newcommand{\boldu}{\boldsymbol{u}}
89 | \newcommand{\boldv}{\boldsymbol{v}}
90 | \newcommand{\boldw}{\boldsymbol{w}}
91 | \newcommand{\boldx}{\boldsymbol{x}}
92 | \newcommand{\boldy}{\boldsymbol{y}}
93 | \newcommand{\boldz}{\boldsymbol{z}}
94 |
95 | \newcommand{\mcA}{\mathcal{A}}
96 | \newcommand{\mcB}{\mathcal{B}}
97 | \newcommand{\mcC}{\mathcal{C}}
98 | \newcommand{\mcD}{\mathcal{D}}
99 | \newcommand{\mcE}{\mathcal{E}}
100 | \newcommand{\mcF}{\mathcal{F}}
101 | \newcommand{\mcG}{\mathcal{G}}
102 | \newcommand{\mcH}{\mathcal{H}}
103 | \newcommand{\mcI}{\mathcal{I}}
104 | \newcommand{\mcJ}{\mathcal{J}}
105 | \newcommand{\mcK}{\mathcal{K}}
106 | \newcommand{\mcL}{\mathcal{L}}
107 | \newcommand{\mcM}{\mathcal{M}}
108 | \newcommand{\mcN}{\mathcal{N}}
109 | \newcommand{\mcO}{\mathcal{O}}
110 | \newcommand{\mcP}{\mathcal{P}}
111 | \newcommand{\mcQ}{\mathcal{Q}}
112 | \newcommand{\mcR}{\mathcal{R}}
113 | \newcommand{\mcS}{\mathcal{S}}
114 | \newcommand{\mcT}{\mathcal{T}}
115 | \newcommand{\mcU}{\mathcal{U}}
116 | \newcommand{\mcV}{\mathcal{V}}
117 | \newcommand{\mcW}{\mathcal{W}}
118 | \newcommand{\mcX}{\mathcal{X}}
119 | \newcommand{\mcY}{\mathcal{Y}}
120 | \newcommand{\mcZ}{\mathcal{Z}}
121 |
122 | \newcommand{\reals}{\ensuremath{\mathbb{R}}}
123 | \newcommand{\integers}{\ensuremath{\mathbb{Z}}}
124 | \newcommand{\rationals}{\ensuremath{\mathbb{Q}}}
125 | \newcommand{\naturals}{\ensuremath{\mathbb{N}}}
126 | \newcommand{\trans}{\ensuremath{\mathsf{T}}}
127 | \newcommand{\ident}{\boldsymbol{I}}
128 | \newcommand{\bzero}{\boldsymbol{0}}
129 |
130 | \newcommand{\balpha}{\boldsymbol{\alpha}}
131 | \newcommand{\bbeta}{\boldsymbol{\beta}}
132 | \newcommand{\boldeta}{\boldsymbol{\eta}}
133 | \newcommand{\bkappa}{\boldsymbol{\kappa}}
134 | \newcommand{\bgamma}{\boldsymbol{\gamma}}
135 | \newcommand{\bmu}{\boldsymbol{\mu}}
136 | \newcommand{\bphi}{\boldsymbol{\phi}}
137 | \newcommand{\bpi}{\boldsymbol{\pi}}
138 | \newcommand{\bpsi}{\boldsymbol{\psi}}
139 | \newcommand{\bsigma}{\boldsymbol{\sigma}}
140 | \newcommand{\btheta}{\boldsymbol{\theta}}
141 | \newcommand{\bxi}{\boldsymbol{\xi}}
142 | \newcommand{\bGamma}{\boldsymbol{\Gamma}}
143 | \newcommand{\bLambda}{\boldsymbol{\Lambda}}
144 | \newcommand{\bOmega}{\boldsymbol{\Omega}}
145 | \newcommand{\bPhi}{\boldsymbol{\Phi}}
146 | \newcommand{\bPi}{\boldsymbol{\Pi}}
147 | \newcommand{\bPsi}{\boldsymbol{\Psi}}
148 | \newcommand{\bSigma}{\boldsymbol{\Sigma}}
149 | \newcommand{\bTheta}{\boldsymbol{\Theta}}
150 | \newcommand{\bUpsilon}{\boldsymbol{\Upsilon}}
151 | \newcommand{\bXi}{\boldsymbol{\Xi}}
152 | \newcommand{\bepsilon}{\boldsymbol{\epsilon}}
153 |
154 | \def\argmin{\operatornamewithlimits{arg\,min}}
155 | \def\argmax{\operatornamewithlimits{arg\,max}}
156 |
157 | \newcommand{\given}{\,|\,}
158 | \newcommand{\distNorm}{\mathcal{N}}
159 |
160 |
161 | \usepackage{tabularx}
162 | \usepackage{algorithm}
163 | \usepackage{algpseudocode}
164 |
165 | \newcommand{\msc}[1]{\mathrm{\textsc{#1}}}
166 | \newcommand{\air}{\vspace{0.5cm}}
167 |
168 | \algtext*{EndWhile}% Remove "end while" text
169 | \algtext*{EndFor}% Remove "end for" text
170 | \algtext*{EndIf}% Remove "end if" text
171 | \algtext*{EndProcedure}% Remove "end procedure" text
172 |
173 | \newtheorem{theorem}{Theorem}
174 | \newtheorem{defn}{Definition}
175 |
176 | \newcommand{\Scribe}[1]{\def\ScribeStr{Scribe: #1}}
177 | \newcommand{\Scribes}[1]{\def\ScribeStr{Scribes: #1}}
178 | \newcommand{\Lecturer}[1]{\def\LecStr{Lecturer: #1}}
179 | \newcommand{\Lecturers}[1]{\def\LecStr{Lecturers: #1}}
180 | \newcommand{\LectureNumber}[1]{\def\LecNum{#1}}
181 | \newcommand{\LectureDate}[1]{\def\LecDate{#1}}
182 | \newcommand{\LectureTitle}[1]{\def\LecTitle{#1}}
183 |
184 | \newdimen\headerwidth
185 |
186 | \newcommand{\MakeScribeTop}{
187 | \noindent
188 | \begin{center}
189 | \framebox{
190 | \vbox{
191 | \headerwidth=\textwidth
192 | % \advance\headerwidth by -0.22in
193 | \hbox to \headerwidth {{\bf Artificial Intelligence \hfill (Harvard CS182, Fall 2015)} }
194 | \vspace{4mm}
195 | \hbox to \headerwidth {{\Large \hfill {\LecTitle} \hfill}}
196 | \vspace{2mm}
197 | \hbox to \headerwidth {\hfill \LecDate \hfill}
198 | \vspace{2mm}
199 | \hbox to \headerwidth {{\it \hfill \LecStr \hfill }}
200 | }
201 | }
202 | \end{center}
203 | \vspace*{4mm}}
204 |
205 |
206 | \newcommand*{\QED}{\hfill\ensuremath{\square}}%
207 |
208 | \newtheorem{exercise}[theorem]{Question}
209 | \let\checkmark\undefined
210 |
211 | \newcommand{\exinline}[1]{(\refstepcounter{theorem}Question~\thetheorem\label{#1})}
212 |
213 | \usepackage[utf8]{inputenc}
214 |
215 | % \DeclareUnicodeCharacter{2693}{\anchor}
216 | \usepackage{bbding}
217 | \usepackage{soul}
218 |
219 | \ifthenelse{\isundefined{\StudentVersion}}{
220 | \newcommand{\censor}[1]{
221 | {\small \textcolor{red}{\SunshineOpenCircled}} \textcolor{red}{#1}
222 | }
223 | \newcommand{\censorm}[1]{
224 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \textcolor{red}{#1}
225 | }
226 |
227 | }{
228 | \DeclareRobustCommand*\censor{%
229 | {\small \textcolor{red}{\SunshineOpenCircled}}
230 | \SOUL@setup%
231 | \def\SOUL@everytoken{\phantom{\the\SOUL@token}}%
232 | \def\SOUL@everyhyphen{%
233 | \discretionary{%
234 | \SOUL@setkern\SOUL@hyphkern%
235 | \phantom{\SOUL@sethyphenchar}%
236 | }{}{}%
237 | }%
238 | \def\SOUL@everyexhyphen##1{%
239 | \SOUL@setkern\SOUL@hyphkern%
240 | \hbox{\phantom{##1}}%
241 | \discretionary{}{}{%
242 | \SOUL@setkern\SOUL@charkern%
243 | }%
244 | }%
245 | \SOUL@%
246 | }
247 | \newcommand{\censorm}[1]{
248 | \hbox{{\small \textcolor{red}{\SunshineOpenCircled}}} \hspace*{5cm}
249 | }
250 |
251 | }
252 |
253 |
254 |
255 |
256 | \newcommand{\bolddelta}{\boldsymbol{\delta}}
257 | \newcommand{\indicator}{\mathbf{1}}
258 |
259 |
260 | \def\argmin{\operatornamewithlimits{arg\,min}}
261 | \def\argmax{\operatornamewithlimits{arg\,max}}
262 | \def\softmax{\operatornamewithlimits{softmax}}
263 | \def\relu{\operatornamewithlimits{ReLU}}
264 |
265 | \newcommand{\din}{{d_{\mathrm{in}}}}
266 | \newcommand{\dout}{{d_{\mathrm{out}}}}
--------------------------------------------------------------------------------
/CS262%20Final%20Project/images/downpour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/downpour.png
--------------------------------------------------------------------------------
/CS262%20Final%20Project/images/locally.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/locally.png
--------------------------------------------------------------------------------
/CS262%20Final%20Project/images/remotely.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/remotely.png
--------------------------------------------------------------------------------
/CS262%20Final%20Project/images/sgd_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/sgd_results.png
--------------------------------------------------------------------------------
/CS262%20Final%20Project/images/speeds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/images/speeds.png
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.aux:
--------------------------------------------------------------------------------
1 | \relax
2 | \providecommand\hyper@newdestlabel[2]{}
3 | \providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
4 | \HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
5 | \global\let\oldcontentsline\contentsline
6 | \gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
7 | \global\let\oldnewlabel\newlabel
8 | \gdef\newlabel#1#2{\newlabelxx{#1}#2}
9 | \gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
10 | \AtEndDocument{\ifx\hyper@anchor\@undefined
11 | \let\contentsline\oldcontentsline
12 | \let\newlabel\oldnewlabel
13 | \fi}
14 | \fi}
15 | \global\let\hyper@last\relax
16 | \gdef\HyperFirstAtBeginDocument#1{#1}
17 | \providecommand\HyField@AuxAddToFields[1]{}
18 | \providecommand\HyField@AuxAddToCoFields[2]{}
19 | \abx@aux@sortscheme{ynt}
20 | \abx@aux@cite{bengio-emb}
21 | \abx@aux@cite{distbelief}
22 | \@writefile{toc}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax }
23 | \@writefile{lof}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax }
24 | \@writefile{lot}{\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax }
25 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}}
26 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {2}Background on Downpour SGD}{1}{section.2}}
27 | \abx@aux@cite{tensorflow}
28 | \abx@aux@cite{protobuf}
29 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }}{2}{figure.caption.1}}
30 | \providecommand*\caption@xref[2]{\@setref\relax\@undefined{#1}}
31 | \newlabel{fig:downpour}{{1}{2}{A graphic modeling the functionality of Downpour SGD \cite {distbelief}\relax }{figure.caption.1}{}}
32 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {3}Motivation}{2}{section.3}}
33 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {4}Challenges}{3}{section.4}}
34 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {5}Methods and Design}{3}{section.5}}
35 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }}{4}{figure.caption.2}}
36 | \newlabel{fig:local}{{2}{4}{On left, transfer speeds for different amounts of parameters. On right, transfer speeds based on chunk size while streaming the parameters.\relax }{figure.caption.2}{}}
37 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {6}Results and Discussion}{6}{section.6}}
38 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces On left, training classification error rates per epoch. On right, training classification error rates over time.\relax }}{6}{figure.caption.3}}
39 | \newlabel{fig:local}{{3}{6}{On left, training classification error rates per epoch. On right, training classification error rates over time.\relax }{figure.caption.3}{}}
40 | \abx@aux@cite{adagrad}
41 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {7}Applying SGD in Lua/Torch}{7}{section.7}}
42 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces The results of running our rnn model for 7 epochs locally.\relax }}{7}{figure.caption.4}}
43 | \newlabel{fig:local}{{4}{7}{The results of running our rnn model for 7 epochs locally.\relax }{figure.caption.4}{}}
44 | \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces The result of running our rnn model for 10 epochs remotely.\relax }}{9}{figure.caption.5}}
45 | \newlabel{fig:remote}{{5}{9}{The result of running our rnn model for 10 epochs remotely.\relax }{figure.caption.5}{}}
46 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {8}Conclusion}{9}{section.8}}
47 | \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {9}Code}{9}{section.9}}
48 |
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.bib:
--------------------------------------------------------------------------------
1 | @inproceedings{distbelief,
2 | title = {Large Scale Distributed Deep Networks},
3 | author = {Jeffrey Dean and Greg S. Corrado and Rajat Monga and Kai Chen and Matthieu Devin and Quoc V. Le and Mark Z. Mao and Marc’Aurelio Ranzato and Andrew Senior and Paul Tucker and Ke Yang and Andrew Y. Ng},
4 | year = 2012,
5 | booktitle = {NIPS}
6 | }
7 | @article{bengio-emb,
8 | author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian},
9 | title = {A Neural Probabilistic Language Model},
10 | journal = {J. Mach. Learn. Res.},
11 | issue_date = {3/1/2003},
12 | volume = {3},
13 | month = mar,
14 | year = {2003},
15 | issn = {1532-4435},
16 | pages = {1137--1155},
17 | numpages = {19},
18 | url = {http://dl.acm.org/citation.cfm?id=944919.944966},
19 | acmid = {944966},
20 | publisher = {JMLR.org},
21 | }
22 |
23 | @article{tensorflow,
24 | author = {Mart{\'{\i}}n Abadi and
25 | Ashish Agarwal and
26 | Paul Barham and
27 | Eugene Brevdo and
28 | Zhifeng Chen and
29 | Craig Citro and
30 | Gregory S. Corrado and
31 | Andy Davis and
32 | Jeffrey Dean and
33 | Matthieu Devin and
34 | Sanjay Ghemawat and
35 | Ian J. Goodfellow and
36 | Andrew Harp and
37 | Geoffrey Irving and
38 | Michael Isard and
39 | Yangqing Jia and
40 | Rafal J{\'{o}}zefowicz and
41 | Lukasz Kaiser and
42 | Manjunath Kudlur and
43 | Josh Levenberg and
44 | Dan Mane and
45 | Rajat Monga and
46 | Sherry Moore and
47 | Derek Gordon Murray and
48 | Chris Olah and
49 | Mike Schuster and
50 | Jonathon Shlens and
51 | Benoit Steiner and
52 | Ilya Sutskever and
53 | Kunal Talwar and
54 | Paul A. Tucker and
55 | Vincent Vanhoucke and
56 | Vijay Vasudevan and
57 | Fernanda B. Vi{\'{e}}gas and
58 | Oriol Vinyals and
59 | Pete Warden and
60 | Martin Wattenberg and
61 | Martin Wicke and
62 | Yuan Yu and
63 | Xiaoqiang Zheng},
64 | title = {TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed
65 | Systems},
66 | journal = {CoRR},
67 | volume = {abs/1603.04467},
68 | year = {2016},
69 | url = {http://arxiv.org/abs/1603.04467},
70 | timestamp = {Sun, 03 Apr 2016 11:52:22 +0200},
71 | biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/AbadiABBCCCDDDG16},
72 | bibsource = {dblp computer science bibliography, http://dblp.org}
73 | }
74 | @MISC{protobuf,
75 | title={Protocol Buffers},
76 | author={Kenton Varda},
77 | howpublished={\url{http://code.google.com/apis/protocolbuffers/}},
78 | }
79 | @techreport{adagrad,
80 | Author = {Duchi, John and Hazan, Elad and Singer, Yoram},
81 | Title = {Adaptive Subgradient Methods for Online Learning and Stochastic Optimization},
82 | Institution = {EECS Department, University of California, Berkeley},
83 | Year = {2010},
84 | Month = {Mar},
85 | URL = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2010/EECS-2010-24.html},
86 | Number = {UCB/EECS-2010-24},
87 | Abstract = {We present a new family of subgradient methods that dynamically incorporate knowledge of the geometry of the data observed in earlier iterations to perform more informative gradient-based learning. Metaphorically, the adaptation allows us to find needles in haystacks in the form of very predictive but rarely seen features. Our paradigm stems from recent advances in stochastic optimization and online learning which employ proximal functions to control the gradient steps of the algorithm. We describe and analyze an apparatus for adaptively modifying the proximal function, which significantly simplifies setting a learning rate and results in regret guarantees that are provably as good as the best proximal function that can be chosen in hindsight. We give several efficient algorithms for empirical risk minimization problems with common and important regularization functions and domain constraints. We experimentally study our theoretical analysis and show that adaptive subgradient methods significantly outperform state-of-the-art, yet non-adaptive, subgradient algorithms.}
88 | }
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.out:
--------------------------------------------------------------------------------
1 | \BOOKMARK [1][-]{section.1}{Introduction}{}% 1
2 | \BOOKMARK [1][-]{section.2}{Background on Downpour SGD}{}% 2
3 | \BOOKMARK [1][-]{section.3}{Motivation}{}% 3
4 | \BOOKMARK [1][-]{section.4}{Challenges}{}% 4
5 | \BOOKMARK [1][-]{section.5}{Methods and Design}{}% 5
6 | \BOOKMARK [1][-]{section.6}{Results and Discussion}{}% 6
7 | \BOOKMARK [1][-]{section.7}{Applying SGD in Lua/Torch}{}% 7
8 | \BOOKMARK [1][-]{section.8}{Conclusion}{}% 8
9 | \BOOKMARK [1][-]{section.9}{Code}{}% 9
10 |
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.pdf
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.run.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
23 |
28 |
33 |
36 |
39 |
42 | ]>
43 |
44 |
45 | latex
46 |
47 | main.bcf
48 |
49 |
50 | main.bbl
51 |
52 |
53 | blx-dm.def
54 | blx-compat.def
55 | biblatex.def
56 | alphabetic.bbx
57 | standard.bbx
58 | alphabetic.cbx
59 | biblatex.cfg
60 | english.lbx
61 |
62 |
63 |
64 | biber
65 |
66 | biber
67 | main
68 |
69 |
70 | main.bcf
71 |
72 |
75 |
76 | main.bbl
77 |
78 |
79 | main.bcf
80 |
81 |
82 | main.bib
83 |
84 |
85 |
86 |
--------------------------------------------------------------------------------
/CS262%20Final%20Project/main.synctex.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/CS262%20Final%20Project/main.synctex.gz
--------------------------------------------------------------------------------
/Dev-Notebook-Kevin.md:
--------------------------------------------------------------------------------
1 |
2 | Played around with autograd in python. Looking for a reasonable toy dataset to test sgd on distributed system
3 | Looked into the convolutional network example for autograd https://github.com/HIPS/autograd/blob/master/examples/neural_net.py
4 | This ended up being perfect because it spits out a long vector of gradients that it uses
5 | Looking for a far heavier dataset. MNIST runs in 1-2 minutes.
6 | Found Caltech 101, built some preprocessing code, modified some of the code for the neural network
7 | Needed to downsize the images substantially. 240 x 240 is around 12 GB of data. Shrunk it down to 128 x 128, making it 4 GB of data. New gradients are around 0.5 GB. This makes network speeds pretty prohibitive though.
8 | Epochs take a couple minutes to run. Batches take around 10-15 seconds each. Seems rather reasonable.
9 |
10 |
11 | Looking into Azure for launching VMs
12 | Discovered CLI for Azure
13 | Set up 5 different accounts all using the Bizspark subscription. One email account also has a free subscription activated.
14 | Emails and passwords are listed below:
15 |
16 | (candokevin2@hotmail.com, cs262michaelkevin)
17 | (candokevin3@hotmail.com, cs262michaelkevin)
18 |
19 |
20 | Log into portal.azure.com to interact more with the system
21 |
22 | Received instructions from Mike on how to setup grpc. For replicability on later Linux VMs we launch, I've documented the steps
23 | I took below:
24 |
25 | Set up Protobufs 3.0.0
26 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip
27 | ./autogen.sh
28 | ./configure
29 | make
30 | make check
31 | make install
32 |
33 | Set up grpc
34 | git clone https://github.com/grpc/grpc.git
35 | sudo make grpc_python_plugin
36 | sudo vim /etc/paths, add the line /Users/candokevin/stash/grpc/bins/opt
37 |
38 |
39 | It might be a good idea to look into Docker containers, and Docker networks for launching and setting up VMs.
40 |
41 | This site suggests that Google Compute might actually be the best platform for this
42 | https://gigaom.com/2014/04/12/need-for-speed-testing-the-networking-performance-of-the-top-4-cloud-providers/
43 | https://cloudplatform.googleblog.com/2014/04/enter-andromeda-zone-google-cloud-platforms-latest-networking-stack.html
44 | Get started, generate a project ID
45 | Network speed is critical considering how huge our gradients may be.
46 |
47 | Persistent 10GB disk for saving the state of machine
48 | Allows you to save the state of a machine
49 |
50 | gcloud compute instances create example-instance --image test-image --zone us-central1-b
51 | gcloud compute ssh large-example-instance --zone
52 | gcloud compute copy-files /Users/candokevin/stash/distributed-sgd/scp extra-large-example-instance:~/scp/ --zone us-central1-b
53 |
54 |
55 | Generate some code that performs the following
56 |
57 | Initializes the parameters to some certain set of values
58 | Updates parameters given some gradient
59 | Sends parameters to different servers
60 |
--------------------------------------------------------------------------------
/Dev-Notebook-Mike.md:
--------------------------------------------------------------------------------
1 | - need to install proto3 protocol buffers
2 |
3 | download link:
4 | https://github.com/google/protobuf/releases/download/v3.0.0-beta-2/protobuf-python-3.0.0-beta-2.zip
5 |
6 | https://github.com/google/protobuf
7 |
8 | example:
9 | https://github.com/grpc/grpc/tree/release-0_13/examples/python/helloworld
10 |
11 | cd into directory
12 | brew update && brew remove gmp && brew install gmp && brew link gmp
13 |
14 | ./autogen.sh
15 |
16 | ./configure
17 |
18 | make
19 |
20 | make check
21 |
22 | make install
23 |
24 | example usage
25 | protoc -I=$SRC_DIR --python_out=$DST_DIR $SRC_DIR/addressbook.proto
26 |
27 | - installed grpc according to the following instructions listed here: https://github.com/grpc/grpc/tree/release-0_13/examples/python an outline of the command I ran are the following:
28 |
29 | sudo pip install grpcio
30 |
31 | git clone https://github.com/grpc/grpc
32 |
33 | - We can test to see if the helloworld example works:
34 |
35 | cd grpc/examples/python/helloworld
36 |
37 | - Run the server
38 |
39 | python2.7 greeter_server.py &
40 |
41 | - Run the client
42 |
43 | python2.7 greeter_client.py
44 |
45 | - You should see the output "Greeter client received: Hello, you!"
46 |
47 | Instead, going to copy the necessary files into our directory and have a small running example.
48 |
49 | in the folder Distributed-SGD/helloworld:
50 |
51 | have the files:
52 |
53 | greeter_client.py
54 | greeter_server.py
55 |
56 |
57 | sudo pip install grpcio --upgrade
58 |
59 |
60 |
61 |
62 |
63 | HOW I GOT IT TO WORK
64 | Used this link:
65 | https://github.com/grpc/homebrew-grpc
66 |
67 |
68 | curl -fsSL https://goo.gl/getgrpc | bash -
69 |
70 | virtualenv venv
71 | source venv/bin/activate
72 |
73 | curl -fsSL https://goo.gl/getgrpc | bash -s python
74 |
75 | cd venv
76 |
77 | git clone https://github.com/grpc/grpc.git
78 |
79 | cd grpc
80 |
81 | make grpc_python_plugin
82 |
83 |
84 |
85 |
86 | here we go:
87 |
88 | cd /usr/local/
89 | mkdir manual
90 | cd manual
91 |
92 | curl -fsSL https://goo.gl/getgrpc | bash -
93 |
94 | virtualenv venv
95 |
96 | source venv/bin/activate
97 |
98 | curl -fsSL https://goo.gl/getgrpc | bash -s python
99 |
100 | pip install numpy
101 | pip install scipy
102 | sudo pip install pillow
103 | pip install sklearn
104 | pip install autograd
105 |
106 | cd venv
107 |
108 | git clone https://github.com/grpc/grpc.git
109 | cd grpc
110 |
111 | make grpc_python_plugin
112 |
113 | sudo vim /etc/paths
114 |
115 | and add the line:
116 |
117 | /usr/local/manual/venv/grpc/bins/opt
118 |
119 |
120 |
121 | BEFORE RUNNING ANYTHING
122 |
123 | source /usr/local/manual/venv/bin/activate
124 |
125 |
126 | Important links:
127 | https://github.com/grpc/homebrew-grpc
128 | https://docs.docker.com/engine/userguide/networking/
129 | http://www.bpython-interpreter.org
130 | https://github.com/mila-udem/fuel
131 |
132 |
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed-SGD
2 |
3 | An implementation of distributed stochastic gradient descent for both local and remote clients.
4 |
5 | The [paper](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/CS262%2520Final%2520Project/main.pdf) describing this project.
6 |
7 | ## Usage
8 |
9 | The usage varies depending on the native language.
10 |
11 | Please see the respective directory for the language you are interested in:
12 |
13 | Usage in [python](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/python-python)
14 |
15 | Usage in [lua/torch](https://github.com/michaelfarrell76/Distributed-SGD/tree/master/lua-lua)
16 |
17 | ## Table of Contents
18 | ```
19 | .
20 | ├── lua-lua # Implementation of Distributed SGD in lua/torch
21 | ├── python-python # Implementation of Distributed SGD in python
22 | ├── slides # presentation slides about this project
23 | ├── .gitignore
24 | ├── .gitmodules
25 | ├── Dev-Notebook-Kevin.md # Development notes
26 | ├── Dev-Notebook-Mike.md
27 | ├── README.md
28 | └── client_list.txt # List of available server ip addresses
29 | ```
30 |
31 |
32 | ## Primary contributors
33 |
34 | [Kevin Yang](https://github.com/kyang01)
35 |
36 | [Michael Farrell](https://github.com/michaelfarrell76)
37 |
38 |
--------------------------------------------------------------------------------
/client_list.txt:
--------------------------------------------------------------------------------
1 | 130.211.204.149
2 | 104.197.250.103
3 | 130.211.192.196
4 | 104.197.222.148
5 | 104.197.106.197
6 | 104.197.167.23
7 | 104.154.239.139
8 | 130.211.206.66
9 | 104.197.137.32
10 | 104.197.174.106
11 |
--------------------------------------------------------------------------------
/lua-lua/README.md:
--------------------------------------------------------------------------------
1 | # Distributed-SGD: lua-lua
2 | An implementation of distributed stochastic gradient descent in lua/torch. Clients can be local and remote.
3 |
4 | ## Requirements
5 |
6 | This code is written in Lua, and an installation of [Torch](https://github.com/torch/torch7/) is assumed. Training requires a few packages which can easily be installed through [LuaRocks](https://github.com/keplerproject/luarocks) (which comes with a Torch installation). Datasets are formatted and loaded using [hdf5](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), which can be installed using this [guide](https://github.com/deepmind/torch-hdf5/blob/master/doc/usage.md).
7 |
8 | Once torch and torch-hdf5 are installed, use luarocks to install the other dependencies used in the example:
9 |
10 | ```bash
11 | $ luarocks install nn
12 | $ luarocks install rnn
13 | ```
14 | If you want to train on an Nvidia GPU using CUDA, you'll need to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit) as well as the `cutorch` and `cunn` packages:
15 | ```bash
16 | $ luarocks install cutorch
17 | $ luarocks install cunn
18 | ```
19 | We need to ensure that our local version of parallel is installed. This can be done with a short bash script from the lua-lua folder:
20 | ```bash
21 | $ cd lua-lua
22 | $ bash install_parallel.sh
23 | ```
24 |
25 | ## Directory Table of Contents
26 | ```
27 | .
28 | ├── data # Folder holding data used for demo
29 | ├── parallel # Folder containing the changes we added to the parallel class
30 | ├── End-To-End-Generative-Dialogue # Folder of our other repo containing the code used in demo
31 | ├── README.md # lua-lua usage
32 | ├── server.lua # Main server file
33 | ├── README.md
34 | ├── startup.sh # Startup script for remote gcloud servers
35 | ├── setup_image.sh # Script that copies startup.sh to remote server and calls startup.sh
36 | ├── install_parallel.sh # script that installs our version of parallel
37 | └── demo_server.lua # A demo class that implements the server
38 | ```
39 |
40 | ## Description
41 |
42 | ## Demo-Usage
43 | Code is run from the lua-lua folder:
44 | ```bash
45 | $ cd lua-lua
46 | ```
47 |
48 | #### Local
49 |
50 | To run a worker with 2 parallel clients on your own machine:
51 | ```bash
52 | $ th server.lua -n_proc 2
53 | ```
54 |
55 | #### Remote - localhost
56 |
57 | In order to get the demo to connect through localhost rather than simply forking, we must first setup an .ssh key for this project.
58 |
59 | Note: This is basically doing the same thing as [local](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#local), except we now connect to the clients through localhost. This is a good tool to use to debug problems with clients running on remote servers.
60 |
61 | ##### Generate ssh key
62 | Replace USERNAME with your username on the computer you want to connect to:
63 | ```bash
64 | $ USERNAME=michaelfarrell
65 | $ ssh-keygen -t rsa -f ~/.ssh/dist-sgd-sshkey -C $USERNAME
66 | ```
67 | Hit enter twice and a key should have been generated.
68 |
69 | ##### Add ssh-key to authorized keys
70 |
71 | In order to connect to clients through localhost, we must add the key to our list of authorized_keys:
72 | ```bash
73 | $ cat ~/.ssh/dist-sgd-sshkey.pub >> ~/.ssh/authorized_keys
74 | $ chmod og-wx ~/.ssh/authorized_keys
75 | ```
76 |
77 | ##### Allow ssh connections
78 |
79 | In order to connect through localhost, you must allow your computer to allow incoming ssh connections.
80 |
81 | On a Mac, this can be done by going to:
82 |
83 | System Preferences > Sharing
84 |
85 | and checking the 'Remote Login' box
86 |
87 |
88 | ##### Connect via localhost
89 |
90 | You can now communicate over localhost using the command:
91 |
92 | ```bash
93 | $ EXTENSION=Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/
94 | $ TORCH_PATH=/Users/michaelfarrell/torch/install/bin/th
95 | $ th server.lua -n_proc 4 -localhost -extension $EXTENSION -torch_path $TORCH_PATH
96 | ```
97 | where $EXTENSION is the relative path to the lua-lua folder from your home directory and $TORCH_PATH is the absolute path to torch on your computer
98 |
99 | #### Remote - gcloud
100 |
101 | Instead of having the client programs running on your own computer, you can farm them out to any number of remote computers. Below is a description of how to setup remote clients using google cloud (gcloud offers 60 day free trials with $300 worth of credit).
102 |
103 | ##### Adding ssh key to gcloud servers
104 |
105 | We have to allow our gcloud servers to accept incoming ssh connections from our computer.
106 |
107 | If you have yet to do so, [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key)
108 |
109 | Once you have created the key print it out:
110 |
111 | ```bash
112 | $ cat ~/.ssh/dist-sgd-sshkey.pub
113 | ```
114 |
115 | Next you must add the key to the set of public keys :
116 | - Login to your google compute account.
117 | - Go to compute engine dashboard
118 | - Go to the metadata tab
119 | - Go to ssh-key subtab
120 | - Click edit
121 | - Add the key you copied as a new line
122 |
123 | Restrict external access to the key:
124 | ```bash
125 | $ chmod 400 ~/.ssh/dist-sgd-sshkey
126 | ```
127 |
128 | ##### Create a baseline startup image
129 |
130 | We only have to setup and install everything once, after which we can clone that client.
131 |
132 | ###### Create the image
133 | - Click on the 'VM Instances' tab
134 | - Create Instance
135 | - Give the instance a name i.e. 'demo-baseline'
136 | - Set the zone to us-central1-b
137 | - Choose 8vCPU highmem as machine type
138 | - Under boot disk click change
139 | - Choose Ubuntu 14.04 LTS
140 | - At the bottom change size to 30 GB and click 'select'
141 | - Allow HTTP traffic
142 | - Allow HTTPS traffic
143 | - Click 'Management, disk, networking, SSH keys' to dropdown more options
144 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted'
145 | - Click 'Create' and you should see your new instance listed in the table
146 |
147 | ###### Allow tcp connections
148 | - Wait for the VM instance to startup (indicated by a green check next to the instance)
149 | - Under the 'network' column, click 'default'
150 | - Go to 'Firewall rules' and Add a new rule
151 | - Set name to be 'all'
152 | - Set source filter to allow from any source
153 | - Under allowed protocols, put 'tcp:0-65535; udp:0-65535; icmp'
154 | - Create
155 |
156 | ###### Setup the disk
157 | - Return to the 'VM instances' tab
158 | - Grab the external IP address for the instance
159 | ```bash
160 | $ EXTERNAL_IP=104.154.48.250
161 | $ USERNAME=michaelfarrell
162 | ```
163 | - Next you must modify the 'startup.sh' script to also include any additional installs that you may need on the server. This script is run from the home directory of the remote client. To run the demo, you do not need to modify this script.
164 | - Next you must modify the 'setup_image.sh' script so that it correctly calls your startup.sh script on the remote server. If you did not change 'startup.sh' script, you should probably not be changing this script either.
165 | - Setup the image:
166 | ```bash
167 | $ source setup_image.sh
168 | ```
169 | Note you can connect to the server:
170 | ```bash
171 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP
172 | ```
173 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard
174 | - Go to the 'VM Dashboard'
175 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted.
176 |
177 | ###### Create the image
178 |
179 | - Click on the 'Images' tab
180 | - 'Create Image'
181 | - Give it a name i.e. 'demo-image'
182 | - Under Source-Disk, choose the disk that you just created
183 | - Create
184 |
185 | ##### Generate an 'Instance Template'
186 | - Click on the 'Instance templates' tab
187 | - Create new
188 | - Name the template i.e. 'demo-template'
189 | - Under 'Boot Disk' click change
190 | - At the top click 'Your image'
191 | - Choose the image you just created i.e. 'demo-image'
192 | - Set size to 30 GB
193 | - Select
194 | - Allow HTTP traffic
195 | - Allow HTTPS traffic
196 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted'
197 | - Create
198 |
199 | ##### Generate an 'Instance Group'
200 | - Go to the "Instance groups" tab
201 | - Create instance group
202 | - Give the group a name, i.e. 'demo-group'
203 | - Give a description
204 | - Set zone to us-central1-b
205 | - Use instance template
206 | - Choose the template you just made i.e. 'demo-template'
207 | - Set the number of instances
208 | - Create
209 | - Wait for the instances to launch
210 | - Once there is a green checkmark, click on the new instance
211 |
212 | ##### Adding remote clients
213 | You will want to add your list of client servers to the file 'client_list.txt' where each line in the file is one of the external ip addresses located in the Instance group you are currently using. You will need to copy this list of files to the computer that you are going to use as the main parameter server. Choose an IP from the freshly updated 'client_list.txt' and set the $SERVER_IP environment variable:
214 | ```bash
215 | $ SERVER_IP=130.211.160.115
216 | ```
217 | Copy over 'client_list.txt' to the main server:
218 | ```bash
219 | $ scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey ../client_list.txt $USERNAME@$SERVER_IP:~/Distributed-SGD
220 | ```
221 |
222 | ##### Connecting to gcloud servers
223 |
224 | You can connect to one of the servers by running:
225 | ```bash
226 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP
227 | ```
228 | Note: the flag `-o "StrictHostKeyChecking no"` automatically adds the host to your list and does not prompt confirmation.
229 |
230 | If you get an error like this:
231 | ```bash
232 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
233 | @ WARNING: REMOTE HOST IDENTIFICATION HAS CHANGED! @
234 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
235 | ```
236 | then you'll want to
237 | ```bash
238 | $ vim ~/.ssh/known_hosts
239 | ```
240 | and delete the last few lines that were added. They should look like some ip address and then something that starts with AAAA. You can delete lines in vim by typing 'dd' to delete the current line. This can happen when you restart the servers and they change ip addresses, among other things.
241 |
242 | ##### Adding ssh keys again
243 |
244 | If the servers have been initialized, you will first want to connect to the computer above that you chose to be the main server
245 | ```bash
246 | $ ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$SERVER_IP
247 | ```
248 |
249 | Once connected, you need to again setup an ssh key from the computer that you are using as the client.
250 |
251 | 1) [generate an ssh-key](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#generate-ssh-key)
252 |
253 | 2) [add key to gcloud server account](https://github.com/michaelfarrell76/Distributed-SGD/blob/master/lua-lua/README.md#adding-ssh-key-to-gcloud-servers)
254 |
255 | ##### Running on remote servers:
256 |
257 | Once this is done, you can run the server with remote gcloud clients using the command:
258 | ```bash
259 | $ cd Distributed-SGD/lua-lua
260 | $ EXTENSION=Distributed-SGD/lua-lua/
261 | $ TORCH_PATH=/home/michaelfarrell/torch/install/bin/th
262 | $ th server.lua -n_proc 4 -remote -extension $EXTENSION -torch_path $TORCH_PATH
263 |
264 | ```
265 |
266 | ## For Personal Usage
267 |
268 | If you wish to extend this demo to work with your own SGD model you must simply create a new server class specific to your task, replacing the 'demo_server' class. Use the file 'demo_server.lua' as an example. The server only needs to have __init(opt) and run() functions defined in order to work. Once this class is properly defined (i.e. named 'new_server'), you can run the following to initiate your task:
269 |
270 | ```bash
271 | $ NEW_SERVER_NAME=new_server
272 | $ th server.lua -server_class $NEW_SERVER_NAME # Plus Additional arguments
273 |
274 | ```
275 |
276 | When developing, all command line arguments should be added in the file server.lua. Look at the command arguments
277 | ```bash
278 | $ th server.lua --help
279 | ```
280 | that already exist and use those names when developing your model. If you need an additional command line argument, add it in server.lua. Other than this, there should be no reason to edit the server.lua file.
281 |
282 | If you are having your clients run remotely, you may also need to modify 'startup.sh' and 'setup_image.sh' so that they set up the server environments according to the specifications that you need.
283 |
284 |
285 | ## TODO
286 | - Document data folder and include description in demo-usage about what the demo is
287 | - Add in documentation of how the data needs to be formatted in order to run the demo
288 | - Finish description
289 | - Finish Acknowledgements
290 | - Add in proto implementation
291 | - Add in git pull at startup
292 | - add way to catch if failure down and reset
293 | - maybe add paxos if kevin is successful
294 | - try adding protobufs
295 | - get results
296 | - Add in additional catches for errors like add to path
297 |
298 |
299 | ## Acknowledgments
300 | This example is also a part of another one of our repos: https://github.com/michaelfarrell76/End-To-End-Generative-Dialogue
301 |
302 | Our implementation utilizes code from the following:
303 |
304 | * [Yoon Kim's seq2seq-attn repo](https://github.com/harvardnlp/seq2seq-attn)
305 | * [Element rnn library](https://github.com/Element-Research/rnn)
306 | * [Facebook's neural attention model](https://github.com/facebook/NAMAS)
307 |
--------------------------------------------------------------------------------
/lua-lua/cleanup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """Copy files from servers
5 | """
6 |
7 | import sys
8 | import os
9 | import time
10 |
11 |
def child(ip_addr):
    """Run the remote cleanup command chain on one client server, then exit.

    Forked-child body: ensures the local outputs/<ip> directory exists,
    pipes a cleanup/update command list into an ssh session on the remote
    host, and terminates this process without returning to the caller.
    """
    out_dir = 'outputs/' + ip_addr
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Kill any running training processes on the client and pull the
    # latest code in both repos before exiting the remote shell.
    cmd = '(echo " echo starting; pkill torch; pkill lua; cd Distributed-SGD/lua-lua/; git pull; cd End-To-End-Generative-Dialogue/; git pull origin master; exit") | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s' % ip_addr
    os.system(cmd)
    # Hard-exit the forked child so it never falls back into main()'s loop.
    os._exit(0)
17 |
18 |
def main(arguments):
    """Fork one cleanup child per IP address listed in ../client_list.txt.

    Each child runs `child(ip)` (ssh cleanup) and exits via os._exit; the
    parent reaps every child so no zombies are left behind and the script
    does not terminate before the remote cleanup commands have finished.

    :param arguments: unused command-line arguments (kept for interface
                      compatibility with `sys.argv[1:]`).
    """
    with open('../client_list.txt') as f:
        if not os.path.exists('outputs'):
            os.makedirs('outputs')
        pids = []
        for line in f:
            ip_addr = line.strip()
            if not ip_addr:
                # Skip blank lines instead of forking a useless child.
                continue
            newpid = os.fork()
            if newpid == 0:
                child(ip_addr)  # never returns: child() calls os._exit(0)
            pids.append(newpid)
        # Originally the parent never waited, leaving zombie processes and
        # racing the children's ssh sessions; reap them all explicitly.
        for pid in pids:
            os.waitpid(pid, 0)
33 |
34 |
# Script entry point: forward the command-line arguments (minus the
# program name) to main() and propagate its return value as exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
--------------------------------------------------------------------------------
/lua-lua/copy_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """Copy files from servers
5 | """
6 |
7 | import sys
8 | import os
9 | import time
10 |
11 |
def child(ip_addr):
    """Fetch result .txt files from one remote server via scp, then exit.

    Forked-child body: makes sure the local outputs/<ip> directory exists,
    copies the remote run outputs into the local outputs tree, and
    terminates this process without returning to the caller.
    """
    dest = 'outputs/' + ip_addr
    if not os.path.exists(dest):
        os.makedirs(dest)
    # scp output silenced; results land in the local outputs/<ip>/ folder.
    cmd = 'scp -r -i ~/.ssh/dist-sgd-sshkey michaelfarrell@%s:~/Distributed-SGD/lua-lua/*.txt ~/Desktop/GoogleDrive/FinalProject/Distributed-SGD/lua-lua/outputs/%s/ &> /dev/null' % (ip_addr, ip_addr)
    os.system(cmd)
    # Hard-exit the forked child so it never falls back into main()'s loop.
    os._exit(0)
19 |
20 |
def main(arguments):
    """Fork one scp child per IP address listed in ../client_list.txt.

    Each child runs `child(ip)` (scp copy) and exits via os._exit; the
    parent reaps every child so no zombies are left behind and the script
    does not terminate before the file transfers have finished.

    :param arguments: unused command-line arguments (kept for interface
                      compatibility with `sys.argv[1:]`).
    """
    with open('../client_list.txt') as f:
        if not os.path.exists('outputs'):
            os.makedirs('outputs')
        pids = []
        for line in f:
            ip_addr = line.strip()
            if not ip_addr:
                # Skip blank lines instead of forking a useless child.
                continue
            newpid = os.fork()
            if newpid == 0:
                child(ip_addr)  # never returns: child() calls os._exit(0)
            pids.append(newpid)
        # Originally the parent never waited, leaving zombie processes and
        # racing the children's scp transfers; reap them all explicitly.
        for pid in pids:
            os.waitpid(pid, 0)
35 |
36 |
if __name__ == '__main__':
    status = main(sys.argv[1:])
    # Give the forked scp children a moment to finish before the process
    # exits. (Originally `time.sleep(5)` ran at module import time, i.e.
    # *before* main() forked anything, so it never actually waited for
    # the transfers.)
    time.sleep(5)
    sys.exit(status)
--------------------------------------------------------------------------------
/lua-lua/data/demo-train.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/data/demo-train.hdf5
--------------------------------------------------------------------------------
/lua-lua/data/demo-val.hdf5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/data/demo-val.hdf5
--------------------------------------------------------------------------------
/lua-lua/demo_server.lua:
--------------------------------------------------------------------------------
1 | ------------------------------------------------------------------------
2 | -- demo_server.lua
3 | --
4 | -- This is the example of a class that is used to implement a sever in
5 | -- server.lua. This class has an _init(opt) function that takes in
6 | -- the global parameters, loads in the data and builds the model on
7 | -- the parameter server. The class also has a run() function that
8 | -- forks out the child clients and executes the function 'worker'
9 | -- on each corresponding client.
10 | --
11 | -- If you wish to develop your own SGD model, create a new class that is
12 | -- similar to this.
13 | ------------------------------------------------------------------------
14 | local demo_server = torch.class('demo_server')
15 |
16 | ------------
17 | -- Worker code
18 | ------------
-- Client-side loop executed on every forked worker process.
--
-- Protocol with the parameter server (see demo_server:run()):
--   * package 0: carries {cmd, arg, ext}; the worker parses the global
--     options, loads the data and builds the model, then replies with a
--     confirmation string.
--   * every later package: carries {parameters, index}; the worker copies
--     the fresh parameters into its model, trains on the batch at `index`
--     via train_ind(), and sends the resulting package back.
-- The loop terminates when the parent yields the string 'break'.
function worker()
    -- Used to check that required files exist before requiring them
    require "lfs"

    -- Used to update package.path with the extension sent by the server
    require 'package'

    -- Alert successfully started up
    parallel.print('Im a worker, my ID is: ', parallel.id, ' and my IP: ', parallel.ip)

    -- Global flag indicating this process is a child worker
    ischild = true

    -- Extension to lua-lua folder from home directory. Set to no extension as default;
    -- overwritten by pkg.ext from the first package.
    ext = ""

    -- Number of packages received so far (0 => next package is the init one)
    local n_pkg = 0
    while true do

        -- Allow the parent to terminate the child
        m = parallel.yield()
        if m == 'break' then break end

        -- Receive data from the parameter server
        local pkg = parallel.parent:receive()

        -- Make sure to clean everything up since big files are being passed
        io.write('.') io.flush()
        collectgarbage()

        if n_pkg == 0 then
            -- This is the first time receiving a package, it has the globals

            -- Receive and parse global parameters
            parallel.print('Recieved initialization parameters')
            cmd, arg, ext = pkg.cmd, pkg.arg, pkg.ext
            opt = cmd:parse(arg)

            -- Update path so requires below can resolve the shared sources
            package.path = opt.add_to_path .. package.path

            -- Add in additional necessary parameters
            opt.print = parallel.print
            opt.parallel = true

            -- Library used to handle data types; bail out early with a clear
            -- message if the sibling repo is not checked out on this client
            local data_loc = ext .. 'End-To-End-Generative-Dialogue/src/data'
            if not lfs.attributes(data_loc .. '.lua') then
                print('The file data.lua could not be found in ' .. data_loc .. '.lua')
                os.exit()
            end
            data = require(data_loc)

            -- Load in helper functions for this model defined in
            -- End-To-End-Generative-Dialogue (provides load_data(), build(),
            -- train_ind(), ... as globals once executed)
            local model_funcs_loc = ext .. "End-To-End-Generative-Dialogue/src/model_functions.lua"
            if not lfs.attributes(model_funcs_loc) then
                print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
                os.exit()
            end
            funcs = loadfile(model_funcs_loc)
            funcs()

            -- Change the locations of the datafiles based on new extension
            opt.data_file = ext .. opt.data_file
            opt.val_data_file = ext .. opt.val_data_file

            -- Point the word-vector file to the right place if it exists
            -- NOTE(review): this prepends opt.extension while the data files
            -- above use ext (= pkg.ext) — confirm the two are meant to match.
            if opt.pre_word_vecs ~= "" then
                opt.pre_word_vecs = opt.extension .. opt.pre_word_vecs
            end

            -- Load in data to client
            train_data, valid_data, opt = load_data(opt)

            -- Build the model on the client
            model, criterion = build()

            -- Confirm successful initialization to the server
            parallel.parent:send('Received parameters and loaded data successfully')
        else
            parallel.print('received params from batch with index: ', pkg.index)

            -- Load in the parameters sent from the parent
            for i = 1, #model.params do
                model.params[i]:copy(pkg.parameters[i])
            end

            -- Train the model on the batch at the given index; train_ind()
            -- returns the package (presumably the gradients) to ship back
            local pkg_o = train_ind(pkg.index, model, criterion, train_data)

            -- Send the result back to the parameter server
            parallel.print('sending back derivative for batch with index: ', pkg.index)
            parallel.parent:send(pkg_o)
        end
        n_pkg = n_pkg + 1
    end
end
120 |
121 |
122 | ------------
123 | -- Server class
124 | ------------
125 |
-- Initialization function for the server object. Here we load in the data,
-- build our model, and then register any remote client machines when the
-- -remote or -localhost command-line flags are set.
--
-- opt: parsed command-line options table (from server.lua); must provide
--      remote/localhost flags and, for remote runs, username and torch_path.
function demo_server:__init(opt)
    -- Save the command line options
    self.opt = opt

    -- Used to check that required files exist before requiring them
    require "lfs"

    -- Library used to handle data types; bail out early with a clear
    -- message if the sibling repo is not checked out next to this folder
    local data_loc = 'End-To-End-Generative-Dialogue/src/data'
    if not lfs.attributes(data_loc .. '.lua') then
        print('The file data.lua could not be found in ' .. data_loc .. '.lua')
        os.exit()
    end
    data = require(data_loc)

    -- Load in helper functions for this model defined in
    -- End-To-End-Generative-Dialogue (provides the load_data(), build()
    -- and train() globals used below and in run())
    local model_funcs_loc = "End-To-End-Generative-Dialogue/src/model_functions.lua"
    if not lfs.attributes(model_funcs_loc) then
        print('The file model_functions.lua could not be found in ' .. model_funcs_loc)
        os.exit()
    end
    funcs = loadfile(model_funcs_loc)
    funcs()

    -- Load in the data
    self:load_data()

    -- Setup and build the model
    self:build()

    -- Add remote computers if necessary
    if self.opt.remote then
        parallel.print('Runnings clients remotely')

        -- Open the list of client ip addresses (one address per line)
        local fh,err = io.open("../client_list.txt")
        if err then print("../client_list.txt not found"); return; end

        -- Register every listed machine, line by line
        while true do
            local line = fh:read()
            if line == nil then break end
            local addr = self.opt.username .. '@' .. line
            addr = string.gsub(addr, "\n", "") -- remove line breaks

            -- Add the remote server by ip address; connect over ssh with the
            -- project key and skip host-key confirmation prompts
            parallel.addremote( {ip=addr, cores=4, lua=self.opt.torch_path, protocol='ssh -ttq -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
            parallel.print('Adding address ', addr)
        end
    elseif opt.localhost then
        -- Has remote clients launched through localhost (useful to debug
        -- remote-style runs without real remote machines)
        parallel.print('Running clients through localhost')

        parallel.addremote({ip='localhost', cores=4, lua=self.opt.torch_path, protocol='ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey'})
    end
end
184 |
-- Main function that runs the server. Here the child clients are forked off
-- and the code in the 'worker' function is sent to the clients to be run.
-- Once the connection is established, :send() and :receive() are used to
-- pass parameters between the clients and the server.
function demo_server:run()
    parallel.print('Forking ', self.opt.n_proc, ' processes')
    parallel.sfork(self.opt.n_proc)
    parallel.print('Forked')

    -- exec worker code in each process
    parallel.children:exec(worker)
    parallel.print('Finished telling workers to execute')

    -- Send the global parameters to the children as their first package.
    -- NOTE(review): cmd and arg are globals not defined in this file —
    -- presumably set by server.lua when parsing the command line; confirm.
    parallel.children:join()
    parallel.print('Sending parameters to children')
    parallel.children:send({cmd = cmd, arg = arg, ext = self.opt.extension})

    -- Get the initialization confirmations from the children
    replies = parallel.children:receive()
    parallel.print('Replies from children', replies)

    -- Train the model (train() comes from model_functions.lua, loaded in __init)
    train(self.model, self.criterion, self.train_data, self.valid_data)
    parallel.print('Finished training the model')

    -- sync/terminate when all workers are done ('break' ends the worker loop)
    parallel.children:join('break')
    parallel.print('All processes terminated')
end
215 |
-- Populate self.train_data and self.valid_data (and refresh self.opt) by
-- delegating to the global load_data() helper defined in
-- "End-To-End-Generative-Dialogue/src/model_functions.lua".
function demo_server:load_data()
    local train, valid, options = load_data(self.opt)
    self.train_data = train
    self.valid_data = valid
    self.opt = options
end
222 |
-- Populate self.model and self.criterion by delegating to the global
-- build() helper defined in
-- "End-To-End-Generative-Dialogue/src/model_functions.lua".
function demo_server:build()
    local mdl, crit = build()
    self.model = mdl
    self.criterion = crit
end
228 |
229 | -- Return the server
230 | return demo_server
231 |
--------------------------------------------------------------------------------
/lua-lua/gcloud_commands.txt:
--------------------------------------------------------------------------------
1 | th server.lua -n_proc 2 -remote -extension $EXTENSION -add_to_path '/home/michaelfarrell/Distributed-SGD/lua-lua/End-To-End-Generative-Dialogue/src/?.lua;' -torch_path $TORCH_PATH | tee ada_2_rem.txt
2 |
3 |
4 |
5 | th server.lua -n_proc 4 -remote -extension $EXTENSION -add_to_path '/home/michaelfarrell/Distributed-SGD/lua-lua/End-To-End-Generative-Dialogue/src/?.lua;' -torch_path $TORCH_PATH | tee ada_4_rem.txt
6 |
7 | th server.lua -n_proc 8 -remote -extension $EXTENSION -add_to_path '/home/michaelfarrell/Distributed-SGD/lua-lua/End-To-End-Generative-Dialogue/src/?.lua;' -torch_path $TORCH_PATH -num_epochs 20 | tee ada_8_rem.txt
8 |
9 | th server.lua -n_proc 2 -ada_grad -learning_rate .1 | tee reg_2_loc.txt
10 |
11 | cd End-To-End-Generative-Dialogue/; git pull origin master; cd ..
--------------------------------------------------------------------------------
/lua-lua/install_parallel.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Install script for parallel that uses the local file lua-lua/parallel/init.lua.
# Clones clementfarabet/lua---parallel into ../install, overwrites its
# init.lua with our patched copy, and rebuilds the rock via luarocks.
# Must be run from inside the lua-lua folder.
#

# Move into the install directory (created as a sibling of lua-lua)
cd ..
if [ -e "install" ]
then
    echo -e "\033[0;32minstall folder exists\033[0m"
else
    echo -e "\033[0;34mMaking install repo ...\033[0m"
    mkdir install
fi
cd install

# Ensure that parallel is downloaded and installed with the local version
if [ -e "lua---parallel" ]
then
    echo -e "\033[0;32mparallel exists\033[0m"
else
    # Fixed typo in the user-facing message ("Cloining" -> "Cloning")
    echo -e "\033[0;34mCloning Parallel Repo ...\033[0m"
    git clone https://github.com/clementfarabet/lua---parallel.git &> /dev/null
fi

# Guard the cd: if the clone failed (e.g. no network), don't copy/build
# in the wrong directory.
cd lua---parallel || exit 1
echo -e "\033[0;34mCopying local init.lua file for parallel...\033[0m"
cp ../../lua-lua/parallel/init.lua .
echo -e "\033[0;34mBuilding local version of parallel...\033[0m"
luarocks remove parallel &> /dev/null
luarocks make &> /dev/null
echo -e "\033[0;32mInstall complete\033[0m"
--------------------------------------------------------------------------------
/lua-lua/locally.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/locally.png
--------------------------------------------------------------------------------
/lua-lua/outputs/104.154.239.139/ada_4_rem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_4_rem.png
--------------------------------------------------------------------------------
/lua-lua/outputs/104.154.239.139/ada_8_rem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.154.239.139/ada_8_rem.png
--------------------------------------------------------------------------------
/lua-lua/outputs/104.197.106.197/ada_2_rem.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.106.197/ada_2_rem.png
--------------------------------------------------------------------------------
/lua-lua/outputs/104.197.222.148/ada_2_loc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.222.148/ada_2_loc.png
--------------------------------------------------------------------------------
/lua-lua/outputs/104.197.222.148/reg_2.txt:
--------------------------------------------------------------------------------
1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144
2 | Loading data...
3 | Source vocab size: 28721, Target vocab size: 42787
4 | Source max sent len: 52, Target max sent len: 52
5 | Done loading data!
6 |
7 | Building model with specs:
8 | Layer type: lstm
9 | Model type: red
10 | Embedding size: 300
11 | Hidden layer size: 300
12 | Number of layers: 2
13 | Number of parameters: 37219687
14 |
15 | Forking 2 processes
16 | Forked
17 | Finished telling workers to execute
18 | Sending parameters to children
19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144
20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144
21 | . Recieved initialization parameters
22 | Recieved initialization parameters
23 | Replies from children {
24 | 1 : "Received parameters and loaded data successfully"
25 | 2 : "Received parameters and loaded data successfully"
26 | }
27 | Beginning training...
28 | Loading data...
29 | Source vocab size: 28721, Target vocab size: 42787
30 | Source max sent len: 52, Target max sent len: 52
31 | Done loading data!
32 |
33 | Building model with specs:
34 | Layer type: lstm
35 | Model type: red
36 | Embedding size: 300
37 | Hidden layer size: 300
38 | Number of layers: 2
39 | Number of parameters: 37219687
40 |
41 | . Loading data...
42 | Source vocab size: 28721, Target vocab size: 42787
43 | Source max sent len: 52, Target max sent len: 52
44 | Done loading data!
45 |
46 | Building model with specs:
47 | Layer type: lstm
48 | Model type: red
49 | Embedding size: 300
50 | Hidden layer size: 300
51 | Number of layers: 2
52 | Number of parameters: 37219687
53 |
54 | . received params from batch with index: 21
55 | sending back derivative for batch with index: 21
56 |
--------------------------------------------------------------------------------
/lua-lua/outputs/104.197.250.103/reg_2.txt:
--------------------------------------------------------------------------------
1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144
2 | Loading data...
3 | Source vocab size: 28721, Target vocab size: 42787
4 | Source max sent len: 52, Target max sent len: 52
5 | Done loading data!
6 |
7 | Building model with specs:
8 | Layer type: lstm
9 | Model type: red
10 | Embedding size: 300
11 | Hidden layer size: 300
12 | Number of layers: 2
13 | Number of parameters: 37219687
14 |
15 | Forking 2 processes
16 | Forked
17 | Finished telling workers to execute
18 | Sending parameters to children
19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144
20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144
21 | . Recieved initialization parameters
22 | Recieved initialization parameters
23 | Replies from children {
24 | 1 : "Received parameters and loaded data successfully"
25 | 2 : "Received parameters and loaded data successfully"
26 | }
27 | Beginning training...
28 | Loading data...
29 | Source vocab size: 28721, Target vocab size: 42787
30 | Source max sent len: 52, Target max sent len: 52
31 | Done loading data!
32 |
33 | Building model with specs:
34 | Layer type: lstm
35 | Model type: red
36 | Embedding size: 300
37 | Hidden layer size: 300
38 | Number of layers: 2
39 | Number of parameters: 37219687
40 |
41 | . Loading data...
42 | Source vocab size: 28721, Target vocab size: 42787
43 | Source max sent len: 52, Target max sent len: 52
44 | Done loading data!
45 |
46 | Building model with specs:
47 | Layer type: lstm
48 | Model type: red
49 | Embedding size: 300
50 | Hidden layer size: 300
51 | Number of layers: 2
52 | Number of parameters: 37219687
53 |
54 | . received params from batch with index: 21
55 | sending back derivative for batch with index: 21
56 |
--------------------------------------------------------------------------------
/lua-lua/outputs/104.197.250.103/reg_2_loc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/104.197.250.103/reg_2_loc.png
--------------------------------------------------------------------------------
/lua-lua/outputs/130.211.192.196/reg_1_loc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.192.196/reg_1_loc.png
--------------------------------------------------------------------------------
/lua-lua/outputs/130.211.192.196/reg_2.txt:
--------------------------------------------------------------------------------
1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144
2 | Loading data...
3 | Source vocab size: 28721, Target vocab size: 42787
4 | Source max sent len: 52, Target max sent len: 52
5 | Done loading data!
6 |
7 | Building model with specs:
8 | Layer type: lstm
9 | Model type: red
10 | Embedding size: 300
11 | Hidden layer size: 300
12 | Number of layers: 2
13 | Number of parameters: 37219687
14 |
15 | Forking 2 processes
16 | Forked
17 | Finished telling workers to execute
18 | Sending parameters to children
19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144
20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144
21 | . Recieved initialization parameters
22 | Recieved initialization parameters
23 | Replies from children {
24 | 1 : "Received parameters and loaded data successfully"
25 | 2 : "Received parameters and loaded data successfully"
26 | }
27 | Beginning training...
28 | Loading data...
29 | Source vocab size: 28721, Target vocab size: 42787
30 | Source max sent len: 52, Target max sent len: 52
31 | Done loading data!
32 |
33 | Building model with specs:
34 | Layer type: lstm
35 | Model type: red
36 | Embedding size: 300
37 | Hidden layer size: 300
38 | Number of layers: 2
39 | Number of parameters: 37219687
40 |
41 | . Loading data...
42 | Source vocab size: 28721, Target vocab size: 42787
43 | Source max sent len: 52, Target max sent len: 52
44 | Done loading data!
45 |
46 | Building model with specs:
47 | Layer type: lstm
48 | Model type: red
49 | Embedding size: 300
50 | Hidden layer size: 300
51 | Number of layers: 2
52 | Number of parameters: 37219687
53 |
54 | . received params from batch with index: 21
55 | sending back derivative for batch with index: 21
56 |
--------------------------------------------------------------------------------
/lua-lua/outputs/130.211.204.149/ada_1_loc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/outputs/130.211.204.149/ada_1_loc.png
--------------------------------------------------------------------------------
/lua-lua/outputs/130.211.204.149/reg_2.txt:
--------------------------------------------------------------------------------
1 | Im the parent, my ID is: 0 and my IP: 10.251.50.144
2 | Loading data...
3 | Source vocab size: 28721, Target vocab size: 42787
4 | Source max sent len: 52, Target max sent len: 52
5 | Done loading data!
6 |
7 | Building model with specs:
8 | Layer type: lstm
9 | Model type: red
10 | Embedding size: 300
11 | Hidden layer size: 300
12 | Number of layers: 2
13 | Number of parameters: 37219687
14 |
15 | Forking 2 processes
16 | Forked
17 | Finished telling workers to execute
18 | Sending parameters to children
19 | Im a worker, my ID is: 2 and my IP: 10.251.50.144
20 | . Im a worker, my ID is: 1 and my IP: 10.251.50.144
21 | . Recieved initialization parameters
22 | Recieved initialization parameters
23 | Replies from children {
24 | 1 : "Received parameters and loaded data successfully"
25 | 2 : "Received parameters and loaded data successfully"
26 | }
27 | Beginning training...
28 | Loading data...
29 | Source vocab size: 28721, Target vocab size: 42787
30 | Source max sent len: 52, Target max sent len: 52
31 | Done loading data!
32 |
33 | Building model with specs:
34 | Layer type: lstm
35 | Model type: red
36 | Embedding size: 300
37 | Hidden layer size: 300
38 | Number of layers: 2
39 | Number of parameters: 37219687
40 |
41 | . Loading data...
42 | Source vocab size: 28721, Target vocab size: 42787
43 | Source max sent len: 52, Target max sent len: 52
44 | Done loading data!
45 |
46 | Building model with specs:
47 | Layer type: lstm
48 | Model type: red
49 | Embedding size: 300
50 | Hidden layer size: 300
51 | Number of layers: 2
52 | Number of parameters: 37219687
53 |
54 | . received params from batch with index: 21
55 | sending back derivative for batch with index: 21
56 |
--------------------------------------------------------------------------------
/lua-lua/parse_outputs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | """Copy files from servers
6 | """
7 |
8 | from __future__ import print_function
9 |
10 | import sys
11 | import os
12 | import re
13 | import numpy as np
14 | import warnings
15 | warnings.filterwarnings("ignore", category=UserWarning)
16 | import matplotlib.pyplot as plt
17 |
18 |
class Print:
    """Tiny console helper: prints text wrapped in ANSI color escapes, no newline."""

    def _colored(self, code, prt):
        # Emit `prt` wrapped in the ANSI escape for `code`, then reset (00m).
        print("\033[{}m{}\033[00m".format(code, prt), end="")

    def red(self, prt): self._colored(91, prt)
    def green(self, prt): self._colored(92, prt)
    def yellow(self, prt): self._colored(93, prt)
    def lightpurple(self, prt): self._colored(94, prt)
    def purple(self, prt): self._colored(95, prt)
    def cyan(self, prt): self._colored(96, prt)
    def lightgray(self, prt): self._colored(97, prt)
    def black(self, prt): self._colored(98, prt)
28 |
class Result:
    """Parsed results for one output file (one SGD run on one server).

    The filename is expected to look like ``<sgd>_<nproc>_<loc>.txt`` inside
    a directory named after the server's IP address, e.g.
    ``outputs/1.2.3.4/ada_2_rem.txt``.
    """

    def __init__(self, floc):
        # floc: path to the output .txt file; run metadata is parsed from it.
        self.results = []
        self.floc = floc
        self.loc_split = floc.split('/')
        self.fname = self.loc_split[-1]
        self.ip_addr = self.loc_split[-2]
        self.no_ext = self.fname.split('.')[0]
        self.ada_grad, self.n_proc, self.loc = self.no_ext.split('_')
        self.n_proc = int(self.n_proc)

        # 'ada' prefix in the filename means adagrad was used for this run.
        if self.ada_grad == 'ada':
            self.ada_grad = 'ada grad SGD'
        else:
            self.ada_grad = 'simple SGD'

        # 'rem' suffix means the run used remote workers.
        if self.loc == 'rem':
            self.loc = 'remotely'
        else:
            self.loc = 'locally'

        self.description = '%d processes, %s, running %s' % (self.n_proc, self.ada_grad, self.loc)

    def add_result(self, result):
        """Append one parsed DataPoint to this run's results."""
        self.results.append(result)

    def get_data(self, max_epoch, min_t):
        """Return (times, log_perplexities) filtered by epoch/time bounds.

        max_epoch: keep points with epoch <= max_epoch (None = no bound).
        min_t: keep points with time_ellapse >= min_t (None = no bound).
        """
        # Filter once instead of re-evaluating the same predicate in two
        # separate comprehensions (the original duplicated the condition).
        kept = [r for r in self.results
                if (max_epoch is None or r.epoch <= max_epoch)
                and (min_t is None or r.time_ellapse >= min_t)]
        return [r.time_ellapse for r in kept], [np.log(r.perplexity) for r in kept]

    def graph(self, close = True, out_name = None, max_epoch = None, min_t = None):
        """Plot log-perplexity vs. elapsed time; optionally save and reset the figure."""
        times, log_perps = self.get_data(max_epoch, min_t)

        plt.ylabel('Log perplexity')
        plt.xlabel('Time (s)')

        plt.title(self.description)
        plt.plot(times, log_perps, label = self.description)

        if close:
            if out_name is None:
                # Default: save a .png next to the parsed .txt file.
                out_name = "/".join(self.loc_split[:-1]) + '/' + self.no_ext + '.png'

            plt.savefig(out_name)
            plt.clf()
            plt.cla()
            plt.close()

    def display(self):
        """Pretty-print the run metadata and every parsed data point to stdout."""
        Print().green('Results for file %s \n' % self.floc)

        Print().lightpurple('Number of processes: ')
        print(self.n_proc)

        Print().lightpurple('SGD type: ')
        print(self.ada_grad)

        Print().lightpurple('Running location: ')
        print(self.loc)

        Print().lightpurple('Server: ')
        print(self.ip_addr)

        if len(self.results) == 0:
            Print().red('No results\n')
            return

        Print().lightpurple('Number of batches: ')
        print(self.results[0].n_batch)

        # Print a yellow epoch header whenever the epoch number changes.
        epoch = -1
        for result in self.results:
            if result.epoch != epoch:
                epoch = result.epoch
                Print().yellow('Epoch: %d\n' % epoch)
            result.display()
105 |
class DataPoint:
    """One training-progress line parsed from a worker/server log.

    Expected line shape (produced by the trainer):
      'Epoch: E, Batch: B/N, Batch size: S, LR: L, PPL: P, |Param|: ...,
       Training: <speed> total/source/target ... <elapsed-seconds>'
    """

    def __init__(self, line):
        # Store the line itself
        self.line = line

        # The epoch we're on
        self.epoch = int(self.clean_match('Epoch: (.*?), Batch:', line))

        # Current batch, total number of batches, current batch size
        self.batch_str = self.clean_match('Batch: (.*?), Batch size:', line)
        batch_splt = str.split(self.batch_str, '/')
        self.batch, self.n_batch = [int(ind) for ind in batch_splt]
        self.batch_size = int(self.clean_match('Batch size: (.*?), LR:', line))

        self.learning_rate = float(self.clean_match('LR: (.*?), PPL: ', line))

        # FIX: the '|' characters must be escaped — unescaped they act as
        # regex alternation ('PPL: (.*?), ' OR 'Param' OR ':'), and the
        # original only extracted the right value by accident.
        self.perplexity = float(self.clean_match(r'PPL: (.*?), \|Param\|:', line))

        self.speed = self.clean_match('Training: (.*?) total/source/target', line)

        # Elapsed wall-clock time is the last whitespace-separated token.
        self.time_ellapse = int(str.split(line)[-1])

    def clean_match(self, pattern, string):
        """Return the first non-empty capture of `pattern` in `string`.

        Uses a list comprehension instead of filter(...)[0]: on Python 3
        filter() returns an iterator, which is not subscriptable, so the
        original crashed with a TypeError. Raises IndexError when nothing
        matches.
        """
        res = re.findall(pattern, string)
        return [m for m in res if m != ''][0]

    def display(self):
        """Print a one-line summary of this data point (no trailing newline)."""
        args = (self.batch, self.perplexity, self.time_ellapse)
        print('Batch: %d, perplexity: %.2f, time: %d\n' % args, end = "")
135 |
class Results:
    """A collection of Result objects that can be graphed onto shared axes."""

    def __init__(self):
        self.results = []

    def add_result(self, result):
        """Track another parsed Result."""
        self.results.append(result)

    def graph(self, location = None, max_epoch = None, min_t = None):
        """Overlay every matching run on one figure and save it to disk.

        location: only plot runs whose loc equals it ('locally'/'remotely');
                  None plots every run and saves to 'All.png'.
        """
        matching = (r for r in self.results
                    if location is None or r.loc == location)
        for result in matching:
            # close=False keeps all curves on the same axes until we save.
            result.graph(close = False, max_epoch = max_epoch, min_t = min_t)

        out_name = "All.png" if location is None else location + ".png"
        plt.title(location)
        plt.legend(bbox_to_anchor=(1.05, 1))

        plt.savefig(out_name)
        plt.clf()
        plt.cla()
        plt.close()
158 |
159 |
160 |
def process_file(path_to_file):
    """Parse one output file into a Result, then print and graph it.

    Lines containing 'total/source/target' are the trainer's progress
    lines; each one becomes a DataPoint attached to the returned Result.
    """
    result = Result(path_to_file)
    with open(path_to_file) as f:
        progress_lines = (l for l in f if 'total/source/target' in l)
        for line in progress_lines:
            # Parse the line and attach it to this run's results.
            result.add_result(DataPoint(line))

    result.display()
    result.graph()
    return result
175 |
176 |
177 |
def main(arguments):
    """Continuously sync output files from the servers and re-plot them.

    Every cycle: pull fresh logs down via copy_files.py, parse every
    '<sgd>_<nproc>_<loc>.txt' under outputs/, then regenerate the
    combined 'locally' and 'remotely' graphs. Runs forever.
    """
    import time

    while True:
        print('Copying over files')
        # Pull the latest output files from the servers.
        os.system('python copy_files.py')

        time.sleep(3)

        # Fresh container each pass so stale runs disappear from the graphs.
        results = Results()

        # os.walk yields (dirpath, dirnames, filenames); one directory per
        # server IP address lives under outputs/.
        for dirpath, _dirnames, _filenames in os.walk('outputs'):
            for fname in os.listdir(dirpath):
                # Only files named like '<sgd>_<nproc>_<loc>.txt'.
                if fname.endswith(".txt") and len(fname.split('_')) == 3:
                    full_path = dirpath + '/' + fname
                    results.add_result(process_file(full_path))

        results.graph(location = 'locally', max_epoch = 7)
        results.graph(location = 'remotely', min_t = 50, max_epoch = 10)
        time.sleep(20)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
--------------------------------------------------------------------------------
/lua-lua/remotely.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/lua-lua/remotely.png
--------------------------------------------------------------------------------
/lua-lua/server.lua:
--------------------------------------------------------------------------------
------------------------------------------------------------------------
-- server.lua
--
-- A general Distributed SGD Parameter server written in lua/torch
--
-- This is a general parameter server file. It takes in the command line
-- options that are necessary to launch the server. The server
-- will be of the class: 'server_class'. The 'server_class' must
-- be a class defined with two required functions: :__init() and :run()
-- This file will load in the class, make a new object via
-- the __init() function, and then call the run() function inside
-- a protected loop
--
-- The 'add_to_path' option is a string that will be prepended onto the
-- Lua package path before requiring the new 'server_class'
--
-- Run
--    th server.lua --help
-- to see a full list of options for the parameter server
------------------------------------------------------------------------

-- Library used to run clients in parallel
require 'parallel'

-- Used to update the path variable
require 'package'

------------
-- Options
------------

cmd = torch.CmdLine()

cmd:text("")
cmd:text("**General options**")
cmd:text("")

cmd:option('-server_class', 'demo_server', 'Class name to use')
cmd:option('-add_to_path' , './End-To-End-Generative-Dialogue/src/?.lua;', 'A string that will be appended on to the front of the path')

cmd:text("")
cmd:text("**_____________________________**")
cmd:text("Below are all options specific to models")
cmd:text("**_____________________________**")
cmd:text("")

cmd:text("")
cmd:text("**Data options**")
cmd:text("")
cmd:option('-data_file', 'data/demo-train.hdf5', 'Path to the training *.hdf5 file')
cmd:option('-val_data_file','data/demo-val.hdf5', 'Path to validation *.hdf5 file')
cmd:option('-save_file', 'demo-seq2seq_lstm', 'Save file name (model will be saved as savefile_epochX_PPL.t7 where X is the X-th epoch and PPL is the validation perplexity')
cmd:option('-train_from', '', 'If training from a checkpoint then this is the path to the pretrained model.')

cmd:text("")
cmd:text("**Model options**")
cmd:text("")

cmd:option('-num_layers', 2, 'Number of layers in the LSTM encoder/decoder')
cmd:option('-hidden_size', 300, 'Size of LSTM hidden states')
cmd:option('-word_vec_size', 300, 'Word embedding sizes')
cmd:option('-layer_type', 'lstm', 'Recurrent layer type (rnn, gru, lstm, fast)')
cmd:option('-model_type', 'red', 'Model structure (red, hred)')


cmd:text("")
cmd:text("**Optimization options**")
cmd:text("")

cmd:option('-num_epochs', 10, 'Number of training epochs')
cmd:option('-start_epoch', 1, 'If loading from a checkpoint, the epoch from which to start')
cmd:option('-param_init', 0.1, 'Parameters are initialized over uniform distribution with support (-param_init, param_init)')
cmd:option('-learning_rate', .01, 'Starting learning rate')
cmd:option('-ada_grad', true, 'When true, update parameters using adagrad algorithm')
cmd:option('-max_grad_norm', 5, 'If the norm of the gradient vector exceeds this, renormalize it to have the norm equal to max_grad_norm')
cmd:option('-dropout', 0.3, 'Dropout probability. Dropout is applied between vertical LSTM stacks.')
cmd:option('-lr_decay', 0.5, 'Decay learning rate by this much if (i) perplexity does not decrease on the validation set or (ii) epoch has gone past the start_decay_at_limit')
cmd:option('-start_decay_at', 9, 'Start decay after this epoch')
cmd:option('-fix_word_vecs', 0, 'If = 1, fix lookup table word embeddings')
cmd:option('-beam_k', 5, 'K value to use with beam search')
cmd:option('-max_bleu', 4, 'The number of n-grams used in calculating the bleu score')
cmd:option('-pre_word_vecs', '', 'If a valid path is specified, then this will load pretrained word embeddings (hdf5 file) on the encoder side. See README for specific formatting instructions.')

cmd:text("")
cmd:text("**Other options**")
cmd:text("")

-- GPU (not supported on servers)
cmd:option('-gpuid', -1, 'Which gpu to use. -1 = use CPU')
cmd:option('-gpuid2', -1, 'If this is >= 0, then the model will use two GPUs whereby the encoder is on the first GPU and the decoder is on the second GPU. This will allow you to train with bigger batches/models.')

-- Bookkeeping
cmd:option('-save_every', 1, 'Save every this many epochs')
cmd:option('-print_every', 5, 'Print stats after this many batches')
cmd:option('-seed', 3435, 'Seed for random initialization')


-- Parallel options
cmd:option('-n_proc', 4, 'The number of processes to farm out')
cmd:option('-remote', false, 'When true, the farmed out processes are run on remote servers. This overrides localhost')
cmd:option('-localhost', false, 'When true, the farmed out processes are run on localhost. ')

cmd:option('-torch_path', '/Users/michaelfarrell/torch/install/bin/th', 'The path to the torch directory on the client computers')
cmd:option('-extension', '', 'The location from the home directory to the lua-lua folder on the client computer')
cmd:option('-username', 'michaelfarrell', 'The username for connecting used for connecting to remote clients')

-- Parse arguments
opt = cmd:parse(arg)
torch.manualSeed(opt.seed)

-- Indicate we are running things in parallel
opt.parallel = true

-- The print function
opt.print = parallel.print

-- Add on location to path of new class if not already in path
-- (note: the string is prepended, so it takes priority over existing entries)
package.path = opt.add_to_path .. package.path

-- Main server function, initializes and runs
function server_main()
    -- Load in the class type
    server = require(opt.server_class)

    -- Print from parent process
    parallel.print('Im the parent, my ID is: ', parallel.id, ' and my IP: ', parallel.ip)

    -- Create a new server
    param_server = server.new(opt)

    -- Run the server
    param_server:run()

end

-- Protected execution of parallel script: on any error, print it and
-- shut down the parallel workers cleanly instead of leaving them orphaned
ok, err = pcall(server_main)
if not ok then print(err) parallel.close() end

--------------------------------------------------------------------------------
/lua-lua/setup_image.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# setup_image.sh
#
# This is a bash script that is used to setup an image on the google cloud server
# it copies over the startup script, runs the script, disconnects and reconnects,
# then reruns the startup script
#
# Expects USERNAME and EXTERNAL_IP to be set in the calling environment, and an
# ssh key at ~/.ssh/dist-sgd-sshkey that is authorized on the target host.

# Copy over the startup script
scp -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey startup.sh $USERNAME@$EXTERNAL_IP:~/

# Run the startup script on the server
echo "bash startup.sh" | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP

# Disconnect from the server, reconnect and finish running last things needed for initialization
# NOTE(review): presumably the second run is needed because the first run's
# PATH/profile changes only take effect in a fresh login shell — confirm
echo "bash startup.sh; " | ssh -o "StrictHostKeyChecking no" -i ~/.ssh/dist-sgd-sshkey $USERNAME@$EXTERNAL_IP

--------------------------------------------------------------------------------
/lua-lua/startup.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# gcloud_startup.sh
#
# This is a bash script that is used to setup a google cloud server. This script
# will install the following on the server:
#   - git
#   - luarocks
#   - pip
#   - torch
#   - lua-parallel (local version)
#   - rnn (torch)
#   - hdf5 (torch)
#   - anaconda
#   - h5py
# The script will also clone the Distributed-SGD repo onto the server

# Ensure that git is installed
if hash git &> /dev/null
then
    echo -e "\033[0;32mgit installed\033[0m"
else
    echo -e "\033[0;34mInstalling git ...\033[0m"
    (echo "Y" | sudo apt-get install git) > /dev/null
fi

# Ensure that luarocks is installed
if hash luarocks &> /dev/null
then
    echo -e "\033[0;32mluarocks installed\033[0m"
else
    echo -e "\033[0;34mInstalling luarocks ...\033[0m"
    (echo "Y" | sudo apt-get install luarocks) &> /dev/null
fi

# Ensure that pip is installed
if hash pip &> /dev/null
then
    echo -e "\033[0;32mpython-pip installed\033[0m"
else
    echo -e "\033[0;34mInstalling python-pip ...\033[0m"
    (echo "Y" | sudo apt-get install python-pip) > /dev/null
fi

source ~/.profile

# Ensure that torch is installed
if hash th &> /dev/null
then
    echo -e "\033[0;32mtorch installed\033[0m"
else
    echo -e "\033[0;34mInstalling torch ...\033[0m"
    git clone https://github.com/torch/distro.git ~/torch --recursive &> /dev/null
    cd ~/torch
    # FIX: was '2&>1 > /dev/null', which passed a literal argument "2" and
    # redirected output to a file named "1". The intended form silences both
    # stdout and stderr.
    bash install-deps > /dev/null 2>&1
    echo "yes" | ./install.sh > /dev/null 2>&1
    cd ..
    source ~/.profile
fi

# Ensure that rnn is installed
if (luarocks list | grep -q rnn) &> /dev/null
then
    echo -e "\033[0;32mrnn installed\033[0m"
else
    echo -e "\033[0;34mInstalling rnn ...\033[0m"
    luarocks install rnn &> /dev/null
fi

# Ensure that torch-hdf5 is installed
if (luarocks list | grep -q hdf5) &> /dev/null
then
    echo -e "\033[0;32mhdf5 installed\033[0m"
else
    echo -e "\033[0;34mInstalling hdf5 ...\033[0m"
    echo "Y" | sudo apt-get install libhdf5-serial-dev hdf5-tools > /dev/null
    git clone https://github.com/deepmind/torch-hdf5.git &> /dev/null
    cd torch-hdf5
    luarocks make hdf5-0-0.rockspec &> /dev/null
    cd ..
fi

# Make sure that the Distributed SGD is downloaded and installed
if [ -e "Distributed-SGD" ]
then
    # Update the repos
    echo -e "\033[0;34mPulling Distributed-SGD repo changes ...\033[0m"
    cd Distributed-SGD
    git pull &> /dev/null
    cd lua-lua/End-To-End-Generative-Dialogue
    echo -e "\033[0;34mPulling End-To-End-Generative-Dialogue repo changes ...\033[0m"
    git pull origin master &> /dev/null

    cd ../../..
else
    # Clone repo and install parallel
    echo -e "\033[0;34mCloning repo Distributed-SGD ...\033[0m"
    git clone --recursive https://github.com/michaelfarrell76/Distributed-SGD.git &> /dev/null
    cd Distributed-SGD/lua-lua
    bash install_parallel.sh
    cd ../../
fi

# Ensure that anaconda is installed
if [ -e "anaconda2" ]
then
    echo -e "\033[0;32manaconda installed\033[0m"
    echo -e "\033[0;34mInstalling h5py ...\033[0m"

    # Install hdf5 for python
    echo "y" | conda install h5py &> /dev/null
else
    echo -e "\033[0;34mDownloading anaconda ...\033[0m"
    wget http://repo.continuum.io/archive/Anaconda2-4.0.0-Linux-x86_64.sh &> /dev/null
    echo -e "\033[0;34mInstalling anaconda ...\033[0m"
    bash Anaconda2-4.0.0-Linux-x86_64.sh -b > /dev/null
    rm Anaconda2-4.0.0-Linux-x86_64.sh
    # FIX: was '> .bashrc', which truncated the user's entire .bashrc and
    # hard-coded /home/michaelfarrell. Append instead, and use $HOME so the
    # script works for any user (anaconda's -b install defaults to ~/anaconda2).
    echo 'export PATH="$HOME/anaconda2/bin:$PATH"' >> ~/.bashrc
    echo -e "\033[0;33mIn order for python to be run, you must logout and log back in\033[0m"
fi

--------------------------------------------------------------------------------
/python-python/README.md:
--------------------------------------------------------------------------------
1 | # Distributed-SGD for Python
2 | An implementation of distributed stochastic gradient descent in python. Clients can be local and remote. For this task, you can download the data from http://www.vision.caltech.edu/Image_Datasets/Caltech101/.
3 |
4 | ## Requirements
5 |
6 | This code is written entirely in Python, and an installation of gRPC, Numpy, Scipy, and Autograd are necessary. These packages can be easily installed through PIP using the following commands.
7 |
8 | ```bash
9 | $ pip install numpy
10 | $ pip install scipy
11 | $ pip install autograd
12 | $ pip install grpcio
13 | ```
14 |
15 | For launching the code remotely, we will be working with Google Cloud Compute. In order to interact with GCloud instances, please install the GCloud sdk. This is located here: https://cloud.google.com/sdk/.
16 |
17 | ## Directory Table of Contents
18 | ```
19 | .
20 | ├── 101_ObjectCategories # Folder holding the raw data from the 101_ObjectCategories
21 | ├── data                      # Folder holding the processed data
22 | ├── client.py # Python script used to initiate a client
23 | ├── server.py                 # Python script to manually initiate a server
24 | ├── dist_sgd_pb2.py # Automatically compiled protobufs for the parameter server
25 | ├── README.md # Python usage
26 | ├── images(16).npy # Extremely small dataset included for reference
27 | ├── output_labels(16).npy # Classifications of each image for the extremely small dataset
28 | ├── nnet # Folder that includes a module for a convolution neural net
29 | ├── protobuf_utils # Folder that includes utilities for manipulating tensor protobuffers
30 | ├── run_codegen.sh            # Shell command used to generate the protobuffers
31 | └── start.sh                  # Script that launches client.py when running within gCloud
32 | ```
33 |
34 | ## Description
35 |
36 | ## Local Usage Instructions
37 | To launch clients locally, in three different terminals, simply run:
38 | ```bash
39 | $ python client.py --id 1
40 | $ python client.py --id 2
41 | $ python client.py --id 3
42 | ```
43 |
44 | #### Remote Usage Instructions
45 |
46 | ##### Create a baseline startup image
47 |
48 | We only have to setup and install everything once, after which we can clone that image repeatedly when we launch VMs.
49 |
50 | ###### Create the instance
51 | - Click on the 'VM Instances' tab
52 | - Create Instance
53 | - Give the instance a name i.e. 'train-conv-nn'
54 | - Set the zone to us-central1-b
55 | - Choose 2vCPU highmem as machine type
56 | - Under boot disk click change
57 | - Choose Ubuntu 14.04 LTS
58 | - At the bottom change size to 30 GB and click 'select'
59 | - Allow HTTP traffic
60 | - Allow HTTPS traffic
61 | - Click 'Management, disk, networking, SSH keys' to dropdown more options
62 | - Under 'Disk' unclick 'Delete boot disk when instance is deleted'
63 | - Click 'Create' and you should see your new instance listed in the table
64 |
65 | ###### Setup the disk
66 | - Run the command gcloud init and log into your Google Cloud account
67 | - Run the command to SSH into your instance:
68 | ```bash
69 | $ gcloud compute ssh train-conv-nn --zone us-central1-b
70 | ```
71 | - After logging in, we can clone the repository and install the necessary requirements.
72 | - Once the server is setup to your liking, disconnect from the server and return to your google cloud dashboard
73 | - Go to the 'VM Dashboard'
74 | - Click on the instance you just setup, and delete it. This should remove the instance and save it as a disk. If you click on the 'disks' tab, you should see the instance name you just deleted.
75 |
76 | ###### Create the image
77 |
78 | - Click on the 'Images' tab
79 | - 'Create Image'
80 | - Give it a name i.e. 'train-conv-image'
81 | - Under Source-Disk, choose the disk that you just created
82 | - Create
83 |
84 | ##### Generate an 'Instance Template'
85 | - Click on the 'Instance templates' tab
86 | - Create new
87 | - Name the template i.e. 'train-conv-template'
88 | - Under 'Boot Disk' click change
89 | - At the top click 'Your image'
90 | - Choose the image you just created i.e. 'train-conv-image'
91 | - Set size to 30 GB
92 | - Select
93 | - Allow HTTP traffic
94 | - Allow HTTPS traffic
95 | - Under more->Management, include cd ~/distributed-sgd/python-python; sh start.sh
96 | in startup script
97 | - Under more->Disks, unclick 'Delete boot disk when instance is deleted'
98 | - Create
99 |
100 | ##### Generate an 'Instance Group'
101 | - Go to the "Instance groups" tab
102 | - Create instance group
103 | - Give the group a name, i.e. 'train-conv-group'
104 | - Give a description
105 | - Set zone to us-central1-b
106 | - Use instance template
107 | - Choose the template you just made i.e. 'train-conv-template'
108 | - Set the number of instances
109 | - Create
110 | - Wait for the instances to launch
111 | - Once there is a green checkmark, click on the new instance
112 |
113 | All instances in the instance group are now running the python client.py command and will begin training.
114 | SSH into any of the instances to see their progress.
115 |
116 | ## Acknowledgments
117 |
118 | Our implementation adapts code for the convolutional neural net from the Autograd convolution neural net example:
119 |
120 | * [Autograd](https://github.com/HIPS/autograd)
--------------------------------------------------------------------------------
/python-python/client.py:
--------------------------------------------------------------------------------
# ------------------------------------------------------------
# Implements a client that runs backpropagation on batches
# provided by the server. If no server exists, then Paxos
# is called to generate a server.
# ------------------------------------------------------------
6 |
7 | from __future__ import print_function
8 | from __future__ import absolute_import
9 | from grpc.beta import implementations
10 | import time
11 | import sys
12 |
13 | import dist_sgd_pb2
14 | import argparse
15 | import traceback
16 |
17 | import autograd.numpy as np
18 | import autograd.numpy.random as npr
19 | from autograd import grad
20 |
21 | from nnet.neural_net import *
22 | from protobuf_utils.utils import *
23 | from server_utils.utils import *
24 |
25 | from server import serve
26 | from paxos import run_paxos
27 | import subprocess
28 |
29 |
# Loads in a really small version of the data that could fit in GitHub.
# It will train extremely quickly as a result.
images_fname = 'data/images(16).npy'
labels_fname = 'data/output_labels(16).npy'

# Timeout (seconds) for short unary RPCs (ping, SendNextBatch, GetUpdates).
_TIMEOUT_SECONDS = 20
# Longer timeout for streaming the full parameter tensor via SendParams.
TENSOR_TIMEOUT_SECONDS = 60
# Port every server process listens on.
SERVER_PORT = 50051
38 |
# Loops through all possible addresses that are part of the instance
# group if this is launched on a remote server. Loops through all possible
# addresses that are part of the local server as well.
# Determines whether or not a server exists by trying to connect to
# a predefined port on the server.
def find_server(local_id=None):
    """Probe every known peer address and return the first one whose gRPC
    ping succeeds; return '' when no server answers.

    When local_id is given (several clients on one machine) the channel is
    always opened against 'localhost'; otherwise against the candidate
    address itself.
    """
    TOT_ATTEMPTS = 1
    for attempt in range(TOT_ATTEMPTS):
        # Build the candidate list, excluding our own address.
        own_address = gen_local_address(local_id)
        candidates = gen_server_addresses(local_id, own_address)
        candidates.remove(own_address)

        # Try each candidate in turn with a short-lived stub.
        for candidate in candidates:
            host = 'localhost' if local_id is not None else candidate
            stub = dist_sgd_pb2.beta_create_ParamFeeder_stub(
                implementations.insecure_channel(host, SERVER_PORT))
            try:
                # An answered ping identifies the running server.
                stub.ping(dist_sgd_pb2.empty(), _TIMEOUT_SECONDS)
                return candidate
            except Exception as e:
                if 'ExpirationError' in str(e) or 'NetworkError' in str(e):
                    # Expected when the peer is down; move on to the next one.
                    log_info(str(e))
                    continue
                # Anything else is unexpected: dump the traceback and abort.
                traceback.print_exc()
                sys.exit(1)
        time.sleep(1 * TOT_ATTEMPTS)
    return ''
77 |
78 | # After determining the correct server, generate the stub for it
def connect_server_stub(server_addr, local_id):
    """Open an insecure channel to the chosen server and wrap it in a
    ParamFeeder stub. Local runs (local_id set) always dial 'localhost'."""
    host = 'localhost' if local_id is not None else server_addr
    channel = implementations.insecure_channel(host, SERVER_PORT)
    return dist_sgd_pb2.beta_create_ParamFeeder_stub(channel)
86 |
87 |
# Main function of the client that loops forever. Receives parameters and
# batch information from the server. Calculates gradients and sends them
# to the server.
def run(local_id = None):
    """Main client loop: locate (or become) the parameter server, then
    repeatedly fetch parameters and a batch assignment, compute gradients,
    and stream them back to the server.

    Args:
        local_id: integer id when running several clients on one machine;
            None when each client runs on its own host.
    """
    # Load and process Caltech data
    train_images, train_labels, test_images, test_labels = load_caltech100(images_fname, labels_fname)
    image_input_d = train_images.shape[1]

    # Network parameters
    layer_sizes = [image_input_d, 800, 600, 400, 350, 250, 101]

    L2_reg = 1.0

    # Training parameters
    param_scale = 0.1
    momentum = 0.9
    batch_size = 256
    num_epochs = 50

    # Make neural net functions
    N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    loss_grad = grad(loss_fun)

    # Train with sgd
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    cur_dir = np.zeros(N_weights)

    # Previous batch for the purpose of timing
    prev_data_indx = -1

    # Number of consecutive expirations, used to detect server failure
    consec_expiration = 0

    # Last parameter vector received from the server; None until the first
    # SendParams round-trip completes. Passed to serve() on failover.
    # (Fix: previously W could be unbound if the very first RPC failed.)
    W = None

    # Determine the server address by running Paxos or pinging all addresses
    server_addr = ''
    while server_addr == '':
        server_addr = run_paxos(local_id)
        if server_addr == '':
            server_addr = find_server(local_id)
    log_info('Server address is ' + server_addr)

    # If this client is selected to be server, then transform into a server
    if server_addr == gen_local_address(local_id):
        log_info('Transforming into the server')
        try:
            serve(server_addr, None, prev_data_indx, local_id)
        except KeyboardInterrupt as e:
            log_info('interrupted')
            sys.exit(0)
        return

    # Generates the server stub and connects with it
    stub = connect_server_stub(server_addr, local_id)
    client_id = 0

    log_info('Data loaded and connected to server:')

    try:
        # Gets the next batch that it should run
        response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
        # data_indx == -2 signals that training is complete.
        while response.data_indx != -2:
            client_id = response.client_id
            # data_indx == -1 means "no work yet": poll until a batch arrives.
            while response.data_indx == -1:
                time.sleep(5)
                log_info('Waiting for server to send next batch')
                response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)
            log_info('Processing parameters in batch %d!' % response.data_indx)

            # Reassemble the flat parameter vector W from streamed chunks.
            get_parameters_time = time.time()
            W_bytes = ''
            W_subtensors_iter = stub.SendParams(dist_sgd_pb2.ClientInfo(client_id=client_id), TENSOR_TIMEOUT_SECONDS)
            for W_subtensor_pb in W_subtensors_iter:
                W_bytes = W_bytes + W_subtensor_pb.tensor_content
            W = convert_bytes_to_array(W_bytes)
            log_info('Received parameters in {0:.2f}s'.format(time.time() - get_parameters_time))

            # Calculate the gradients
            grad_start = time.time()
            grad_W = loss_grad(W, train_images[batch_idxs[response.data_indx]], train_labels[batch_idxs[response.data_indx]])
            log_info('Done calculating gradients in {0:.2f}s'.format(time.time() - grad_start))

            # Serialize the gradients
            tensor_compress_start = time.time()
            tensor_bytes = convert_array_to_bytes(grad_W)
            tensor_iterator = convert_tensor_iter(tensor_bytes, response.data_indx)
            log_info('Done compressing gradients in {0:.2f}s'.format(time.time() - tensor_compress_start))

            # Send the gradients
            send_grad_start = time.time()
            stub.GetUpdates(tensor_iterator, _TIMEOUT_SECONDS)
            log_info('Done sending gradients through in {0:.2f}s'.format(time.time() - send_grad_start))

            # Get the next batch to process
            prev_data_indx = response.data_indx
            response = stub.SendNextBatch(dist_sgd_pb2.PrevBatch(client_id=client_id, prev_data_indx=prev_data_indx), _TIMEOUT_SECONDS)

            consec_expiration = 0
    except KeyboardInterrupt as e:
        sys.exit(1)
    except Exception as e:
        if ('ExpirationError' in str(e) or 'NetworkError' in str(e)):
            SERVER_CONSEC_FAILURE = 2
            # Count the failures of the server
            consec_expiration += 1

            # NOTE(review): consec_expiration restarts at 0 on every run()
            # call and only one increment can happen per call, so this
            # threshold of 2 is never reached within a single call — confirm
            # the intended failure-detection semantics against the caller's
            # retry loop in __main__.
            if consec_expiration == SERVER_CONSEC_FAILURE:
                log_info('Failure to connect to server_stub. Starting Paxos')
                # Fix: reset server_addr so the discovery loop below actually
                # runs; it previously still held the dead server's address,
                # which skipped the loop and reconnected to the dead host.
                server_addr = ''
                # Launches paxos and then looks for the server
                while server_addr == '':
                    server_addr = run_paxos(local_id)
                    if server_addr == '':
                        server_addr = find_server(local_id)
                # Generates the server if it is chosen to be the server
                if server_addr == gen_local_address(local_id):
                    serve(server_addr, W, prev_data_indx, local_id)
                    return
                # Connects to the server.
                # Fix: connect_server_stub requires local_id; it was called
                # with a single argument, raising TypeError on failover.
                stub = connect_server_stub(server_addr, local_id)
        else:
            # Fix: format_exc() returns the traceback string; print_exc()
            # returns None and would log the string 'None'.
            log_info(traceback.format_exc())
            sys.exit(0)
213 |
if __name__ == '__main__':
    log_info('Starting client')
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--id')
    parsed_args = arg_parser.parse_args()

    # Local id is only used if running the machine locally
    local_id = parsed_args.id
    if local_id is not None:
        local_id = int(local_id)
        assert local_id > 0
    # run() returns whenever the connection to the server is lost;
    # keep restarting it forever.
    while True:
        run(local_id)
--------------------------------------------------------------------------------
/python-python/data/images(16).npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/data/images(16).npy
--------------------------------------------------------------------------------
/python-python/data/output_labels(16).npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/data/output_labels(16).npy
--------------------------------------------------------------------------------
/python-python/dist_sgd_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: dist_sgd.proto
3 |
4 | import sys
5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | from google.protobuf import descriptor_pb2
11 | # @@protoc_insertion_point(imports)
12 |
13 | _sym_db = _symbol_database.Default()
14 |
15 |
16 |
17 |
18 | DESCRIPTOR = _descriptor.FileDescriptor(
19 | name='dist_sgd.proto',
20 | package='dist_sgd',
21 | syntax='proto3',
22 | serialized_pb=_b('\n\x0e\x64ist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 \x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3')
23 | )
24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
25 |
26 |
27 |
28 |
29 | _SUBTENSOR = _descriptor.Descriptor(
30 | name='SubTensor',
31 | full_name='dist_sgd.SubTensor',
32 | filename=None,
33 | file=DESCRIPTOR,
34 | containing_type=None,
35 | fields=[
36 | _descriptor.FieldDescriptor(
37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0,
38 | number=1, type=5, cpp_type=1, label=1,
39 | has_default_value=False, default_value=0,
40 | message_type=None, enum_type=None, containing_type=None,
41 | is_extension=False, extension_scope=None,
42 | options=None),
43 | _descriptor.FieldDescriptor(
44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1,
45 | number=2, type=5, cpp_type=1, label=1,
46 | has_default_value=False, default_value=0,
47 | message_type=None, enum_type=None, containing_type=None,
48 | is_extension=False, extension_scope=None,
49 | options=None),
50 | _descriptor.FieldDescriptor(
51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2,
52 | number=3, type=12, cpp_type=9, label=1,
53 | has_default_value=False, default_value=_b(""),
54 | message_type=None, enum_type=None, containing_type=None,
55 | is_extension=False, extension_scope=None,
56 | options=None),
57 | _descriptor.FieldDescriptor(
58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3,
59 | number=4, type=5, cpp_type=1, label=1,
60 | has_default_value=False, default_value=0,
61 | message_type=None, enum_type=None, containing_type=None,
62 | is_extension=False, extension_scope=None,
63 | options=None),
64 | ],
65 | extensions=[
66 | ],
67 | nested_types=[],
68 | enum_types=[
69 | ],
70 | options=None,
71 | is_extendable=False,
72 | syntax='proto3',
73 | extension_ranges=[],
74 | oneofs=[
75 | ],
76 | serialized_start=28,
77 | serialized_end=124,
78 | )
79 |
80 |
81 | _CLIENTINFO = _descriptor.Descriptor(
82 | name='ClientInfo',
83 | full_name='dist_sgd.ClientInfo',
84 | filename=None,
85 | file=DESCRIPTOR,
86 | containing_type=None,
87 | fields=[
88 | _descriptor.FieldDescriptor(
89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0,
90 | number=1, type=5, cpp_type=1, label=1,
91 | has_default_value=False, default_value=0,
92 | message_type=None, enum_type=None, containing_type=None,
93 | is_extension=False, extension_scope=None,
94 | options=None),
95 | ],
96 | extensions=[
97 | ],
98 | nested_types=[],
99 | enum_types=[
100 | ],
101 | options=None,
102 | is_extendable=False,
103 | syntax='proto3',
104 | extension_ranges=[],
105 | oneofs=[
106 | ],
107 | serialized_start=126,
108 | serialized_end=157,
109 | )
110 |
111 |
112 | _STATUSCODE = _descriptor.Descriptor(
113 | name='StatusCode',
114 | full_name='dist_sgd.StatusCode',
115 | filename=None,
116 | file=DESCRIPTOR,
117 | containing_type=None,
118 | fields=[
119 | _descriptor.FieldDescriptor(
120 | name='status', full_name='dist_sgd.StatusCode.status', index=0,
121 | number=1, type=5, cpp_type=1, label=1,
122 | has_default_value=False, default_value=0,
123 | message_type=None, enum_type=None, containing_type=None,
124 | is_extension=False, extension_scope=None,
125 | options=None),
126 | ],
127 | extensions=[
128 | ],
129 | nested_types=[],
130 | enum_types=[
131 | ],
132 | options=None,
133 | is_extendable=False,
134 | syntax='proto3',
135 | extension_ranges=[],
136 | oneofs=[
137 | ],
138 | serialized_start=159,
139 | serialized_end=187,
140 | )
141 |
142 |
143 | _PREVBATCH = _descriptor.Descriptor(
144 | name='PrevBatch',
145 | full_name='dist_sgd.PrevBatch',
146 | filename=None,
147 | file=DESCRIPTOR,
148 | containing_type=None,
149 | fields=[
150 | _descriptor.FieldDescriptor(
151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0,
152 | number=1, type=5, cpp_type=1, label=1,
153 | has_default_value=False, default_value=0,
154 | message_type=None, enum_type=None, containing_type=None,
155 | is_extension=False, extension_scope=None,
156 | options=None),
157 | _descriptor.FieldDescriptor(
158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1,
159 | number=2, type=5, cpp_type=1, label=1,
160 | has_default_value=False, default_value=0,
161 | message_type=None, enum_type=None, containing_type=None,
162 | is_extension=False, extension_scope=None,
163 | options=None),
164 | ],
165 | extensions=[
166 | ],
167 | nested_types=[],
168 | enum_types=[
169 | ],
170 | options=None,
171 | is_extendable=False,
172 | syntax='proto3',
173 | extension_ranges=[],
174 | oneofs=[
175 | ],
176 | serialized_start=189,
177 | serialized_end=243,
178 | )
179 |
180 |
181 | _NEXTBATCH = _descriptor.Descriptor(
182 | name='NextBatch',
183 | full_name='dist_sgd.NextBatch',
184 | filename=None,
185 | file=DESCRIPTOR,
186 | containing_type=None,
187 | fields=[
188 | _descriptor.FieldDescriptor(
189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0,
190 | number=1, type=5, cpp_type=1, label=1,
191 | has_default_value=False, default_value=0,
192 | message_type=None, enum_type=None, containing_type=None,
193 | is_extension=False, extension_scope=None,
194 | options=None),
195 | _descriptor.FieldDescriptor(
196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1,
197 | number=2, type=5, cpp_type=1, label=1,
198 | has_default_value=False, default_value=0,
199 | message_type=None, enum_type=None, containing_type=None,
200 | is_extension=False, extension_scope=None,
201 | options=None),
202 | ],
203 | extensions=[
204 | ],
205 | nested_types=[],
206 | enum_types=[
207 | ],
208 | options=None,
209 | is_extendable=False,
210 | syntax='proto3',
211 | extension_ranges=[],
212 | oneofs=[
213 | ],
214 | serialized_start=245,
215 | serialized_end=294,
216 | )
217 |
218 |
219 | _EMPTY = _descriptor.Descriptor(
220 | name='empty',
221 | full_name='dist_sgd.empty',
222 | filename=None,
223 | file=DESCRIPTOR,
224 | containing_type=None,
225 | fields=[
226 | ],
227 | extensions=[
228 | ],
229 | nested_types=[],
230 | enum_types=[
231 | ],
232 | options=None,
233 | is_extendable=False,
234 | syntax='proto3',
235 | extension_ranges=[],
236 | oneofs=[
237 | ],
238 | serialized_start=296,
239 | serialized_end=303,
240 | )
241 |
242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR
243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO
244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE
245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH
246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH
247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY
248 |
249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict(
250 | DESCRIPTOR = _SUBTENSOR,
251 | __module__ = 'dist_sgd_pb2'
252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor)
253 | ))
254 | _sym_db.RegisterMessage(SubTensor)
255 |
256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict(
257 | DESCRIPTOR = _CLIENTINFO,
258 | __module__ = 'dist_sgd_pb2'
259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo)
260 | ))
261 | _sym_db.RegisterMessage(ClientInfo)
262 |
263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict(
264 | DESCRIPTOR = _STATUSCODE,
265 | __module__ = 'dist_sgd_pb2'
266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode)
267 | ))
268 | _sym_db.RegisterMessage(StatusCode)
269 |
270 | PrevBatch = _reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict(
271 | DESCRIPTOR = _PREVBATCH,
272 | __module__ = 'dist_sgd_pb2'
273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch)
274 | ))
275 | _sym_db.RegisterMessage(PrevBatch)
276 |
277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict(
278 | DESCRIPTOR = _NEXTBATCH,
279 | __module__ = 'dist_sgd_pb2'
280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch)
281 | ))
282 | _sym_db.RegisterMessage(NextBatch)
283 |
284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict(
285 | DESCRIPTOR = _EMPTY,
286 | __module__ = 'dist_sgd_pb2'
287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty)
288 | ))
289 | _sym_db.RegisterMessage(empty)
290 |
291 |
292 | DESCRIPTOR.has_options = True
293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001'))
294 | import abc
295 | import six
296 | from grpc.beta import implementations as beta_implementations
297 | from grpc.framework.common import cardinality
298 | from grpc.framework.interfaces.face import utilities as face_utilities
299 |
class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)):
  """Abstract service interface for dist_sgd.ParamFeeder.

  NOTE(review): this module is protoc-generated ("DO NOT EDIT" header);
  prefer regenerating from protos/dist_sgd.proto over hand edits.
  """
  @abc.abstractmethod
  def SendParams(self, request, context):
    # ClientInfo -> stream of SubTensor (server-streaming parameter download).
    raise NotImplementedError()
  @abc.abstractmethod
  def SendNextBatch(self, request, context):
    # PrevBatch -> NextBatch (unary batch assignment).
    raise NotImplementedError()
  @abc.abstractmethod
  def GetUpdates(self, request_iterator, context):
    # stream of SubTensor -> StatusCode (client-streaming gradient upload).
    raise NotImplementedError()
  @abc.abstractmethod
  def ping(self, request, context):
    # empty -> empty liveness probe.
    raise NotImplementedError()
314 |
class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)):
  """The interface to which stubs will conform."""
  @abc.abstractmethod
  def SendParams(self, request, timeout):
    # Server-streaming call; the beta codegen emits no .future for it.
    raise NotImplementedError()
  @abc.abstractmethod
  def SendNextBatch(self, request, timeout):
    raise NotImplementedError()
  SendNextBatch.future = None
  @abc.abstractmethod
  def GetUpdates(self, request_iterator, timeout):
    raise NotImplementedError()
  GetUpdates.future = None
  @abc.abstractmethod
  def ping(self, request, timeout):
    raise NotImplementedError()
  ping.future = None
332 |
def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None):
  """Build a beta-API gRPC server wired to the given ParamFeeder servicer.

  NOTE(review): protoc-generated code ("DO NOT EDIT" header) — regenerate
  from protos/dist_sgd.proto instead of hand-editing.
  """
  # The duplicated imports below are an artifact of the beta-API code
  # generator (one per referenced message type); harmless, leave as-is.
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  # How to decode each incoming request, keyed by (service, method).
  request_deserializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.FromString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.FromString,
    ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.FromString,
    ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString,
  }
  # How to encode each outgoing response.
  response_serializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.SerializeToString,
    ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString,
  }
  # RPC shapes: GetUpdates is client-streaming, SendParams server-streaming,
  # the other two unary-unary.
  method_implementations = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates),
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch),
    ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams),
    ('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping),
  }
  server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout)
  return beta_implementations.server(method_implementations, options=server_options)
362 |
def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None):
  """Build a beta-API dynamic client stub for dist_sgd.ParamFeeder.

  NOTE(review): protoc-generated code ("DO NOT EDIT" header) — regenerate
  from protos/dist_sgd.proto instead of hand-editing.
  """
  # The duplicated imports below are an artifact of the beta-API code
  # generator (one per referenced message type); harmless, leave as-is.
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  import dist_sgd_pb2
  # How to encode each outgoing request, keyed by (service, method).
  request_serializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.SubTensor.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.PrevBatch.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.ClientInfo.SerializeToString,
    ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.SerializeToString,
  }
  # How to decode each incoming response.
  response_deserializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): dist_sgd_pb2.StatusCode.FromString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): dist_sgd_pb2.NextBatch.FromString,
    ('dist_sgd.ParamFeeder', 'SendParams'): dist_sgd_pb2.SubTensor.FromString,
    ('dist_sgd.ParamFeeder', 'ping'): dist_sgd_pb2.empty.FromString,
  }
  # RPC shape (streaming vs unary) per method.
  cardinalities = {
    'GetUpdates': cardinality.Cardinality.STREAM_UNARY,
    'SendNextBatch': cardinality.Cardinality.UNARY_UNARY,
    'SendParams': cardinality.Cardinality.UNARY_STREAM,
    'ping': cardinality.Cardinality.UNARY_UNARY,
  }
  stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size)
  return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, options=stub_options)
392 | # @@protoc_insertion_point(module_scope)
393 |
--------------------------------------------------------------------------------
/python-python/image_classes.txt:
--------------------------------------------------------------------------------
1 | 0,accordion
2 | 1,airplanes
3 | 2,anchor
4 | 3,ant
5 | 4,BACKGROUND_Google
6 | 5,barrel
7 | 6,bass
8 | 7,beaver
9 | 8,binocular
10 | 9,bonsai
11 | 10,brain
12 | 11,brontosaurus
13 | 12,buddha
14 | 13,butterfly
15 | 14,camera
16 | 15,cannon
17 | 16,car_side
18 | 17,ceiling_fan
19 | 18,cellphone
20 | 19,chair
21 | 20,chandelier
22 | 21,cougar_body
23 | 22,cougar_face
24 | 23,crab
25 | 24,crayfish
26 | 25,crocodile
27 | 26,crocodile_head
28 | 27,cup
29 | 28,dalmatian
30 | 29,dollar_bill
31 | 30,dolphin
32 | 31,dragonfly
33 | 32,electric_guitar
34 | 33,elephant
35 | 34,emu
36 | 35,euphonium
37 | 36,ewer
38 | 37,Faces
39 | 38,Faces_easy
40 | 39,ferry
41 | 40,flamingo
42 | 41,flamingo_head
43 | 42,garfield
44 | 43,gerenuk
45 | 44,gramophone
46 | 45,grand_piano
47 | 46,hawksbill
48 | 47,headphone
49 | 48,hedgehog
50 | 49,helicopter
51 | 50,ibis
52 | 51,inline_skate
53 | 52,joshua_tree
54 | 53,kangaroo
55 | 54,ketch
56 | 55,lamp
57 | 56,laptop
58 | 57,Leopards
59 | 58,llama
60 | 59,lobster
61 | 60,lotus
62 | 61,mandolin
63 | 62,mayfly
64 | 63,menorah
65 | 64,metronome
66 | 65,minaret
67 | 66,Motorbikes
68 | 67,nautilus
69 | 68,octopus
70 | 69,okapi
71 | 70,pagoda
72 | 71,panda
73 | 72,pigeon
74 | 73,pizza
75 | 74,platypus
76 | 75,pyramid
77 | 76,revolver
78 | 77,rhino
79 | 78,rooster
80 | 79,saxophone
81 | 80,schooner
82 | 81,scissors
83 | 82,scorpion
84 | 83,sea_horse
85 | 84,snoopy
86 | 85,soccer_ball
87 | 86,stapler
88 | 87,starfish
89 | 88,stegosaurus
90 | 89,stop_sign
91 | 90,strawberry
92 | 91,sunflower
93 | 92,tick
94 | 93,trilobite
95 | 94,umbrella
96 | 95,watch
97 | 96,water_lilly
98 | 97,wheelchair
99 | 98,wild_cat
100 | 99,windsor_chair
101 | 100,wrench
102 | 101,yin_yang
103 |
--------------------------------------------------------------------------------
/python-python/neural_net.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import print_function
3 | from scipy.ndimage import imread
4 | from scipy.misc import imresize
5 | from sklearn.cross_validation import train_test_split
6 | from os import listdir
7 | from os.path import isfile, join
8 | import autograd.numpy as np
9 | import autograd.numpy.random as npr
10 | from autograd.scipy.misc import logsumexp
11 | from autograd import grad
12 | from autograd.util import quick_grad_check
13 | import gc
14 | import resource
15 | from sys import getsizeof
16 | import os
17 | import psutil
18 |
19 | # {0: 'accordion', 1: 'airplanes', 2: 'anchor', 3: 'ant', 4: 'BACKGROUND_Google', 5: 'barrel', 6: 'bass', 7: 'beaver', 8: 'binocular', 9: 'bonsai', 10: 'brain', 11: 'brontosaurus', 12: 'buddha', 13: 'butterfly', 14: 'camera', 15: 'cannon', 16: 'car_side', 17: 'ceiling_fan', 18: 'cellphone', 19: 'chair', 20: 'chandelier', 21: 'cougar_body', 22: 'cougar_face', 23: 'crab', 24: 'crayfish', 25: 'crocodile', 26: 'crocodile_head', 27: 'cup', 28: 'dalmatian', 29: 'dollar_bill', 30: 'dolphin', 31: 'dragonfly', 32: 'electric_guitar', 33: 'elephant', 34: 'emu', 35: 'euphonium', 36: 'ewer', 37: 'Faces', 38: 'Faces_easy', 39: 'ferry', 40: 'flamingo', 41: 'flamingo_head', 42: 'garfield', 43: 'gerenuk', 44: 'gramophone', 45: 'grand_piano', 46: 'hawksbill', 47: 'headphone', 48: 'hedgehog', 49: 'helicopter', 50: 'ibis', 51: 'inline_skate', 52: 'joshua_tree', 53: 'kangaroo', 54: 'ketch', 55: 'lamp', 56: 'laptop', 57: 'Leopards', 58: 'llama', 59: 'lobster', 60: 'lotus', 61: 'mandolin', 62: 'mayfly', 63: 'menorah', 64: 'metronome', 65: 'minaret', 66: 'Motorbikes', 67: 'nautilus', 68: 'octopus', 69: 'okapi', 70: 'pagoda', 71: 'panda', 72: 'pigeon', 73: 'pizza', 74: 'platypus', 75: 'pyramid', 76: 'revolver', 77: 'rhino', 78: 'rooster', 79: 'saxophone', 80: 'schooner', 81: 'scissors', 82: 'scorpion', 83: 'sea_horse', 84: 'snoopy', 85: 'soccer_ball', 86: 'stapler', 87: 'starfish', 88: 'stegosaurus', 89: 'stop_sign', 90: 'strawberry', 91: 'sunflower', 92: 'tick', 93: 'trilobite', 94: 'umbrella', 95: 'watch', 96: 'water_lilly', 97: 'wheelchair', 98: 'wild_cat', 99: 'windsor_chair', 100: 'wrench', 101: 'yin_yang'}
20 |
# Default .npy files consumed by load_caltech100(); produced by gen_data().
# The "(128)" suffix presumably marks a 128-image subset — confirm.
images_fname = 'images(128).npy'
output_labels_fname = 'output_labels(128).npy'
23 |
def make_nn_funs(layer_sizes, L2_reg):
    """Build a fully-connected tanh network over a flat weight vector.

    Args:
        layer_sizes: list of layer widths, input first, output last.
        L2_reg: L2 regularization strength applied to the whole vector.

    Returns:
        (N, predictions, loss, frac_err) where N is the total number of
        weights (including biases), predictions(W, X) returns per-class
        log-probabilities, loss(W, X, T) is the regularized negative
        log-likelihood for one-hot targets T, and frac_err(W, X, T) the
        misclassification rate.
    """
    # Fix: materialize the pairs; on Python 3 a bare zip iterator would be
    # exhausted by the sum() below, leaving unpack_layers with nothing.
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # (m+1)*n per layer: m*n weights plus n biases.
    N = sum((m+1)*n for m, n in shapes)

    def unpack_layers(W_vect):
        # Yield (W, b) per layer, consuming the flat vector front to back.
        for m, n in shapes:
            yield W_vect[:m*n].reshape((m,n)), W_vect[m*n:m*n+n]
            W_vect = W_vect[(m+1)*n:]

    def predictions(W_vect, inputs):
        for W, b in unpack_layers(W_vect):
            outputs = np.dot(inputs, W) + b
            inputs = np.tanh(outputs)
        # Log-softmax of the final layer's pre-activation outputs.
        return outputs - logsumexp(outputs, axis=1, keepdims=True)

    def loss(W_vect, X, T):
        log_prior = -L2_reg * np.dot(W_vect, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return - log_prior - log_lik

    def frac_err(W_vect, X, T):
        # Fraction of rows whose predicted class differs from the target.
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return N, predictions, loss, frac_err
48 |
def convert_bw_to_rgb(im):
    """Convert a 2-D grayscale image into an (H, W, 3) uint8 RGB array.

    Fix: the original used ndarray.resize, which mutates the caller's array
    in place and raises ValueError on arrays that do not own their data
    (e.g. slices or views). Build the result from a new trailing axis instead;
    the returned values are identical.
    """
    return np.repeat(im.astype(np.uint8)[:, :, None], 3, axis=2)
52 |
def standarizeImage(im):
    """Normalize an image to a 64x64x3 float32 array with max value in (0.01, 1]."""
    if im.ndim < 3:
        # Grayscale input: replicate into three channels first.
        im = convert_bw_to_rgb(im)
    im = np.asarray(im).astype('float32')
    if im.shape[0] != 64:
        im = imresize(im, (64, 64, 3))
    if np.amax(im) > 1.1:
        # Values look like a 0..255 byte range; rescale into 0..1.
        im = im / 255.0
    assert 0.01 < np.amax(im) <= 1
    assert np.amin(im) >= 0.0
    return im
64 |
def gen_data():
    """Build flattened image arrays from the raw Caltech-101 directory tree.

    Walks every category folder under 101_ObjectCategories/, standardizes
    each image to 64x64x3, flattens each image into a single row, and saves
    'images.npy' / 'output_labels.npy' into the working directory.
    """
    category_paths = [f for f in listdir('101_ObjectCategories/')]
    # NOTE(review): this initial menorah-only image_paths value is
    # overwritten unconditionally in the loop below — looks like dead code.
    image_paths = [f for f in listdir('101_ObjectCategories/menorah/') if isfile(join('101_ObjectCategories/menorah/', f))]

    images = []
    output_labels = []
    # Include all categories with mappings to the integer representing the category
    categories_dict = {}

    category = 0
    for category_path in category_paths:
        image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')]
        for image_path in image_paths:
            im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path))
            # Only keep images that standarizeImage coerced to 64x64 RGB.
            if im.shape == (64, 64, 3):
                images.append(im)
                output_labels.append(category)
        categories_dict[category] = category_path
        category = category + 1

    images = np.array(images)
    # Flatten each image to one row: (N, 64*64*3).
    partial_flatten = lambda x : np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))
    images = partial_flatten(images)

    # NOTE(review): categories_dict is built but never saved or returned —
    # confirm whether it should be persisted alongside the arrays.
    np.save('images.npy', images)
    np.save('output_labels.npy', output_labels)
91 |
def make_batches(N_data, batch_size):
    """Split range(N_data) into consecutive slices of at most batch_size."""
    slices = []
    for start in range(0, N_data, batch_size):
        stop = min(start + batch_size, N_data)
        slices.append(slice(start, stop))
    return slices
95 |
def load_caltech100(images_path=None, labels_path=None):
    """Load the Caltech-101 arrays and return an 80/20 train/validation split.

    Args:
        images_path: .npy file of flattened images; defaults to the
            module-level images_fname.
        labels_path: .npy file of integer labels; defaults to the
            module-level output_labels_fname.

    Returns:
        (train_images, train_labels, valid_images, valid_labels) with the
        labels one-hot encoded over 101 classes.

    Fix: the distributed client (client.py) calls this with two positional
    file names, but the previous zero-argument signature made that call a
    TypeError. The optional parameters keep the old zero-arg use working.
    """
    # gen_data()  # one-time regeneration of the .npy files from raw images
    if images_path is None:
        images_path = images_fname
    if labels_path is None:
        labels_path = output_labels_fname
    # One-hot encode integer labels over K classes.
    one_hot = lambda x, K: np.array(x[:,None] == np.arange(K)[None, :], dtype=int)
    images = np.load(images_path)
    output_labels = np.load(labels_path)
    # Fixed random_state keeps the split identical across all clients.
    train_images, valid_images, train_labels, valid_labels = train_test_split(images, output_labels, test_size=0.20, random_state=1729)
    train_labels = one_hot(train_labels, 101)
    valid_labels = one_hot(valid_labels, 101)
    return train_images, train_labels, valid_images, valid_labels
106 |
if __name__ == '__main__':
    # Log baseline memory usage before loading the dataset.
    print(resource.getrusage(resource.RUSAGE_SELF))
    process = psutil.Process(os.getpid())
    print(process.memory_info().rss)

    # Load and process Caltech data
    train_images, train_labels, test_images, test_labels = load_caltech100()
    image_input_d = train_images.shape[1]

    # Network parameters
    layer_sizes = [image_input_d, 1500, 650, 101]
    L2_reg = 1.0

    # Training parameters
    param_scale = 0.1
    learning_rate = 1e-3
    momentum = 0.9
    batch_size = 256
    num_epochs = 50

    # Make neural net functions
    N_weights, pred_fun, loss_fun, frac_err = make_nn_funs(layer_sizes, L2_reg)
    loss_grad = grad(loss_fun)

    # Initialize weights with small random values.
    rs = npr.RandomState()
    W = rs.randn(N_weights) * param_scale

    # Check the gradients numerically, just to be safe
    # quick_grad_check(loss_fun, W, (train_images, train_labels))

    print(" Epoch | Train err | Test err ")

    def print_perf(epoch, W):
        # Report train/test misclassification rates for the current weights.
        test_perf = frac_err(W, test_images, test_labels)
        train_perf = frac_err(W, train_images, train_labels)
        print("{0:15}|{1:15}|{2:15}".format(epoch, train_perf, test_perf))

    # Train with SGD + momentum.
    # Fix: removed leftover `import bpdb; bpdb.set_trace()` which dropped
    # into a debugger and halted training unconditionally.
    batch_idxs = make_batches(train_images.shape[0], batch_size)
    cur_dir = np.zeros(N_weights)

    for epoch in range(num_epochs):
        print_perf(epoch, W)
        for idxs in batch_idxs:
            grad_W = loss_grad(W, train_images[idxs], train_labels[idxs])
            print('----------------------------')
            print(getsizeof(grad_W))
            #print(process.memory_info().rss)
            #print(resource.getrusage(resource.RUSAGE_SELF))
            # Reclaim autograd's intermediate allocations between batches.
            gc.collect()
            #print(process.memory_info().rss)
            # Momentum update: exponential moving average of gradients.
            cur_dir = momentum * cur_dir + (1.0 - momentum) * grad_W
            W -= learning_rate * cur_dir
--------------------------------------------------------------------------------
/python-python/nnet/__init__.py:
--------------------------------------------------------------------------------
# Package initializer required so Python treats this directory as the
# `nnet` module. See neural_net.py in this package for the implementation.
--------------------------------------------------------------------------------
/python-python/nnet/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/nnet/__init__.pyc
--------------------------------------------------------------------------------
/python-python/nnet/neural_net.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import print_function
3 | import time
4 |
5 | from scipy.ndimage import imread
6 | from scipy.misc import imresize
7 | from sklearn.cross_validation import train_test_split
8 | from autograd.scipy.misc import logsumexp
9 |
10 | from os import listdir
11 | from os.path import isfile, join
12 |
13 | import autograd.numpy as np
14 | import autograd.numpy.random as npr
15 | from autograd import grad
16 |
17 | import traceback
18 |
# Sets up a basic fully-connected neural net, adapted from Ryan Adams's
# Autograd example (compare the torch-autograd MNIST example below):
# https://github.com/twitter/torch-autograd/blob/master/examples/train-mnist-cnn.lua

# We apply this model to the Caltech 101 dataset rather than the MNIST dataset
# to increase the difficulty of the task
def make_nn_funs(layer_sizes, L2_reg):
    """Build a fully-connected tanh network over flat weight vectors.

    Args:
        layer_sizes: list of layer widths, input first, classes last.
        L2_reg: L2 penalty coefficient applied to the whole weight vector.

    Returns:
        (N, predictions, loss, frac_err) where N is the total number of
        parameters, predictions returns per-class log-probabilities,
        loss is the regularized negative log-likelihood, and frac_err is
        the misclassification rate.
    """
    # Fix: materialize the pairs -- a bare zip() is a one-shot iterator in
    # Python 3, so sum() below would exhaust it before unpack_layers runs.
    shapes = list(zip(layer_sizes[:-1], layer_sizes[1:]))
    # Each layer stores an (m x n) weight matrix plus n biases.
    N = sum((m+1)*n for m, n in shapes)

    def unpack_layers(W_vect):
        # Successively peel (W, b) for each layer off the flat vector.
        for m, n in shapes:
            yield W_vect[:m*n].reshape((m,n)), W_vect[m*n:m*n+n]
            W_vect = W_vect[(m+1)*n:]

    def predictions(W_vect, inputs):
        # Forward pass; tanh between layers, log-softmax on the output.
        for W, b in unpack_layers(W_vect):
            outputs = np.dot(inputs, W) + b
            inputs = np.tanh(outputs)
        return outputs - logsumexp(outputs, axis=1, keepdims=True)

    def loss(W_vect, X, T):
        # Regularized negative log-likelihood (T is one-hot).
        log_prior = -L2_reg * np.dot(W_vect.T, W_vect)
        log_lik = np.sum(predictions(W_vect, X) * T)
        return - log_prior - log_lik

    def frac_err(W_vect, X, T):
        # Fraction of rows where the arg-max class disagrees with the label.
        return np.mean(np.argmax(T, axis=1) != np.argmax(predictions(W_vect, X), axis=1))

    return N, predictions, loss, frac_err
49 |
def convert_bw_to_rgb(im):
    """Convert a 2-D grayscale image to an (H, W, 3) uint8 array.

    Fix: uses reshape instead of in-place ndarray.resize -- resize raises
    ValueError when the array is referenced elsewhere (which it is, since
    the caller still holds a reference), while reshape is always safe and
    produces the same (H, W, 1) layout for an owned contiguous array.
    """
    im = im.reshape((im.shape[0], im.shape[1], 1))
    # Replicate the single channel three times along the channel axis.
    return np.repeat(im.astype(np.uint8), 3, 2)
53 |
def standarizeImage(im):
    """Standardize an image to shape (64, 64, 3), float32, values in (0, 1]."""
    if len(im.shape) < 3:
        # Grayscale (2-D) image: expand to three identical channels.
        im = convert_bw_to_rgb(im)
    im = np.array(im, 'float32')
    if im.shape[0] != 64:
        # NOTE(review): only the height is checked; assumes imresize brings
        # the whole array to (64, 64, 3) -- confirm for non-square inputs.
        im = imresize(im, (64, 64, 3))
    if np.amax(im) > 1.1:
        # Values look like 0..255 pixel data; rescale into 0..1.
        im = im / 255.0
    # Sanity checks: image must not be all-dark and must be non-negative.
    assert((np.amax(im) > 0.01) & (np.amax(im) <= 1))
    assert((np.amin(im) >= 0.00))
    return im
65 |
def gen_data():
    """Build the Caltech-101 image matrix and label vector and save them.

    Walks 101_ObjectCategories/<category>/, standardizes every image to
    64x64x3, skips images that cannot be standardized to that shape,
    flattens each image to a row vector, and writes images(64).npy and
    output_labels(64).npy to the current directory.
    """
    # NOTE(review): assumes every entry of 101_ObjectCategories/ is a
    # category directory -- confirm no stray files live there.
    category_paths = [f for f in listdir('101_ObjectCategories/')]

    images = []
    output_labels = []
    # Maps each integer class id to its category (directory) name.
    # NOTE(review): built but never saved or returned.
    categories_dict = {}

    category = 0
    for category_path in category_paths:
        image_paths = [f for f in listdir('101_ObjectCategories/' + category_path + '/')]
        for image_path in image_paths:
            im = standarizeImage(imread('101_ObjectCategories/' + category_path + '/' + image_path))
            if im.shape == (64, 64, 3):
                images.append(im)
                output_labels.append(category)
        categories_dict[category] = category_path
        category = category + 1

    # Flatten each (64, 64, 3) image into a single row vector.
    images = np.array(images)
    partial_flatten = lambda x: np.reshape(x, (x.shape[0], np.prod(x.shape[1:])))
    images = partial_flatten(images)

    np.save('images(64).npy', images)
    np.save('output_labels(64).npy', output_labels)
92 |
def make_batches(N_data, batch_size):
    """Split indices 0..N_data-1 into slice objects of at most batch_size each."""
    starts = range(0, N_data, batch_size)
    return [slice(s, min(s + batch_size, N_data)) for s in starts]
96 |
def load_caltech100(images_fname, labels_fname):
    """Load saved Caltech-101 arrays and split 80/20 into train/validation.

    Args:
        images_fname: path to the flattened-images .npy file.
        labels_fname: path to the integer-labels .npy file.

    Returns:
        (train_images, train_labels, valid_images, valid_labels) with the
        label arrays one-hot encoded over the 101 classes.
    """
    # TODO(review): regenerate via gen_data() when the .npy files are
    # missing instead of failing in np.load.
    # gen_data()
    one_hot = lambda x, K: np.array(x[:, None] == np.arange(K)[None, :], dtype=int)
    images = np.load(images_fname)
    # Fix: the labels were loaded twice on consecutive lines; load once.
    output_labels = np.load(labels_fname)
    # Fixed seed keeps the split reproducible across runs.
    train_images, valid_images, train_labels, valid_labels = train_test_split(images, output_labels, test_size=0.20, random_state=1729)
    train_labels = one_hot(train_labels, 101)
    valid_labels = one_hot(valid_labels, 101)
    return train_images, train_labels, valid_images, valid_labels
109 |
--------------------------------------------------------------------------------
/python-python/nnet/neural_net.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/nnet/neural_net.pyc
--------------------------------------------------------------------------------
/python-python/paxos.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------
2 | # Implements a Paxos server and runs Paxos with this server.
3 | # This function is called through run_paxos if the client_server
4 | # has gone down.
5 | # ------------------------------------------------------------
6 |
7 | from __future__ import print_function
8 | from __future__ import absolute_import
9 | from grpc.beta import implementations
10 | import time
11 | import sys
12 | from threading import Thread
13 |
14 | import paxos_pb2
15 | import argparse
16 | import traceback
17 |
18 | import autograd.numpy as np
19 | import autograd.numpy.random as npr
20 | from autograd import grad
21 | import random
22 |
23 | from protobuf_utils.utils import *
24 | from server_utils.utils import *
25 |
26 | import subprocess
27 |
# Timeout (seconds) applied to every gRPC stub call in this module.
_TIMEOUT_SECONDS = 4
# Port used for Paxos RPC. NOTE(review): an int despite the _STR suffix;
# it is formatted with str() at each use site.
PAXOS_PORT_STR = 50052
30 |
# Actual implementation of the PaxosServer that is used to communicate between the clients.
# Paxos is called to determine the future main server from amongst many different clients.
class PaxosServer(paxos_pb2.BetaPaxosServerServicer):
    """One Paxos node: acts as an acceptor for peers' proposals and records
    the consensus result (the address of the new main server)."""

    def __init__(self, hostname):
        # Initial consensus value is none, this will be the server
        self.new_server = ''
        # NOTE(review): consensus_value is written (on timeout in
        # paxos_loop) but never read anywhere in this file -- confirm.
        self.consensus_value = None
        self.consensus_reached = False

        # Values for paxos
        self.n = random.random()  # this node's own proposal number (randomized to break ties)
        self.prop_n = 0  # highest proposal number seen so far
        self.v = ''  # value of the most recently accepted proposal
        self.n_v = 0  # proposal number under which self.v was accepted

        # Exponential backoff to prevent spamming other servers
        # Randomness is introduced to help Paxos converge quicker
        self.backoff = (1 * random.gauss(1, 0.25))
        if self.backoff < 0:
            self.backoff = 1

        # Saves the server's address as well
        self.address = hostname

    # Runs the prepare phase of the Paxos algorithm
    def prepare(self, request, context):
        # Update the highest seen proposal
        if request.n > self.prop_n:
            self.prop_n = request.n
        # Returns an acknowledgement containing highest accepted proposal.
        # NOTE(review): the ack is sent even when request.n < self.prop_n;
        # the proposer detects rejection by comparing numbers -- confirm.
        return paxos_pb2.ack(n=self.n, v=self.v, n_v=self.n_v)

    # Accepts the proposal if its number is at least the highest seen
    def accept(self, request, context):
        if request.n >= self.prop_n:
            self.n_v = request.n
            self.v = request.v
            self.n = request.n
            return paxos_pb2.acquiescence(accept_bool=True)
        else:
            return paxos_pb2.acquiescence(accept_bool=False)

    # Notifies the server that consensus has been reached
    def accepted(self, request, context):
        self.consensus_reached = True
        self.new_server = request.v
        return paxos_pb2.empty()

    # Ping function to allow confirmation between PaxosServer that they
    # are still running
    def ping(self, request, context):
        return paxos_pb2.empty()
83 |
# Hosts the gRPC endpoint and polls the Paxos state, shutting the
# server down once a consensus value has been recorded.
def run_server(server, paxos_server):
    """Start `server` and block until `paxos_server` reports consensus."""
    server.start()
    done = False
    while not done:
        time.sleep(0.1)
        try:
            done = paxos_server.consensus_reached
            if done:
                if paxos_server.new_server != '':
                    log_info('Consensus reached, server shutting down')
                    # Linger so the consensus broadcast can propagate out.
                    time.sleep(5)
                server.stop(0)
            else:
                time.sleep(1)
        except KeyboardInterrupt:
            server.stop(0)
101 |
# Instantiates the Paxos servicer and binds it to its listening address.
def create_server(hostname, local_id):
    """Return (paxos_server, grpc_server) bound to the node's address.

    TODO(review): the port could be made configurable via an argument.
    """
    paxos_server = PaxosServer(hostname)
    server = paxos_pb2.beta_create_PaxosServer_server(paxos_server)
    # Local runs encode the port inside `hostname` itself; remote runs
    # append the shared Paxos port.
    if local_id is None:
        address = hostname + ':' + str(PAXOS_PORT_STR)
    else:
        address = hostname
    server.add_insecure_port(address)
    return paxos_server, server
112 |
# Attempts to send proposals to all the other servers
def send_proposals(server_stubs, self_paxos_server):
    """Run the Paxos prepare phase against every peer.

    Returns:
        (failed, n_proposal, value): `failed` is True when the proposal was
        outbid or a majority did not respond; `value` is the value of the
        highest-numbered already-accepted proposal seen, or our own address.
    """
    # Increments the proposal number from the previous one that it sends out
    self_paxos_server.n = self_paxos_server.n * (1 + random.random())
    self_paxos_server.v = self_paxos_server.address
    n_proposal = self_paxos_server.n
    value = self_paxos_server.address
    log_info('Making a proposal from {0} for n = {1} '.format(self_paxos_server.address, n_proposal))

    # Track the highest accepted proposal number reported by any acceptor,
    # whether this proposal has failed, and how many peers responded.
    n_so_far = 0
    failed = False
    responded = 0

    for server_stub in server_stubs:
        # Makes the connection to the server
        try:
            # gRPC call to other Paxos Servers to see if they accept the proposal
            response = server_stub.prepare(paxos_pb2.proposal(n=n_proposal), _TIMEOUT_SECONDS)

            # A peer already saw a proposal number at least as high as ours:
            # this proposal is dead, stop immediately.
            if response.n >= n_proposal:
                failed = True
                log_info('Proposal ' + str(n_proposal) + ' failed')
                break
            else:
                # Adopt the value of the highest-numbered proposal any
                # acceptor has already accepted (Paxos safety requirement).
                # Fix: track response.n_v (the accepted number we just
                # compared) rather than response.n, the peer's own number.
                if response.n_v > n_so_far:
                    n_so_far = response.n_v
                    value = response.v
                responded += 1
        except Exception as e:
            if ('ExpirationError' in str(e)):
                log_info('Failure to connect to server_stub')
                continue
            else:
                # More severe error, should log and crash
                traceback.print_exc()
                sys.exit(1)

    # (A dead `if value is None` fallback was removed: value is always
    # initialized to our own address above.)

    # If it does not have a majority of responses, Paxos fails
    if responded < len(server_stubs) / 2.0:
        failed = True

    return (failed, n_proposal, value)
162 |
# Asks every peer to accept the proposal; True on a strict majority.
def request_accept(server_stubs, self_paxos_server, n_proposal, value):
    """Run the accept phase for (n_proposal, value) across all stubs."""
    accept_count = 0
    for stub in server_stubs:
        try:
            reply = stub.accept(paxos_pb2.request_acceptance(n=n_proposal, v=value), _TIMEOUT_SECONDS)
        except Exception:
            # Any RPC failure here aborts the whole accept round.
            traceback.print_exc()
            return False
        if reply.accept_bool:
            accept_count += 1

    # If the majority accept the proposal, then it passes
    if accept_count > len(server_stubs) / 2.0:
        log_info('Proposal accepted')
        return True
    log_info('Proposal {0} rejected with value {1}'.format(n_proposal, value))
    return False
182 |
# Pings every stub to see who is reachable; returns True when at least
# half of them respond, which is the signal to begin Paxos.
def check_stubs_up(stubs):
    """Return True when at least half of `stubs` answer a ping."""
    alive = 0
    for stub in stubs:
        try:
            stub.ping(paxos_pb2.empty(), _TIMEOUT_SECONDS)
            alive += 1
        except Exception as e:
            if 'ExpirationError' not in str(e):
                # More severe error, should log and crash
                traceback.print_exc()
                sys.exit(1)
            log_info('Failure to connect to server_stub during startup')
    return alive >= len(stubs) / 2
203 |
# Make sure that all machines are aware that the Paxos algorithm is finishing
# Not all machines are aware that the server has failed at the same time. Could
# be in the middle of calculating gradients or waiting to be timed out.
def gen_server_stubs(self_paxos_server, local_id):
    """Build stubs for every peer, retrying until enough of them respond.

    Returns the list of stubs, or None when no sufficient set of peers
    could be reached after TOT_ATTEMPTS tries.
    """
    TOT_ATTEMPTS = 3
    for _ in range(TOT_ATTEMPTS):
        server_addresses = gen_server_addresses(local_id, self_paxos_server.address)
        # Fix: removed a leftover debug print of the address list.
        server_addresses.remove(self_paxos_server.address)
        stubs = []
        for server_address in server_addresses:
            # Stop building stubs as soon as consensus is observed.
            if not self_paxos_server.consensus_reached:
                if local_id is not None:
                    # Local mode: the last five characters of the address
                    # encode the port on localhost.
                    server_port = int(server_address[-5:])
                    channel = implementations.insecure_channel('localhost', server_port)
                else:
                    channel = implementations.insecure_channel(server_address, PAXOS_PORT_STR)

                stub = paxos_pb2.beta_create_PaxosServer_stub(channel)
                stubs.append(stub)
        all_stubs_responsive = check_stubs_up(stubs)
        if all_stubs_responsive:
            return stubs
        time.sleep(1 * TOT_ATTEMPTS)
    return None
229 |
# Tells every peer that `value` won the election; uses a doubled timeout
# since this message must land.
def broadcast_consensus(server_stubs, self_paxos_server, value):
    """Notify all peers that consensus was reached on `value`."""
    for stub in server_stubs:
        stub.accepted(paxos_pb2.consensus(n=self_paxos_server.n, v=value), 2 * _TIMEOUT_SECONDS)
234 |
# Begins the Paxos protocol
def start_paxos(server_stubs, self_paxos_server):
    """Run one full Paxos round (prepare -> accept -> broadcast).

    Returns True when this round reached consensus; otherwise grows the
    backoff (so the next attempt waits longer) and returns False.
    """
    proposal_failed, n_proposal, value = send_proposals(server_stubs, self_paxos_server)
    if not proposal_failed and not self_paxos_server.consensus_reached:
        # Have everyone accept the proposal
        accepted = request_accept(server_stubs, self_paxos_server, n_proposal, value)
        if accepted and not self_paxos_server.consensus_reached:
            # If accepted, let everyone know that the server has been chosen
            broadcast_consensus(server_stubs, self_paxos_server, value)
            self_paxos_server.new_server = value
            self_paxos_server.consensus_reached = True
            return True

    # If proposal failed, backoff to try again later
    self_paxos_server.backoff = self_paxos_server.backoff * (1 + 10 * random.random())
    return False
251 |
# Client loops and runs the paxos algorithm every few seconds
def paxos_loop(self_paxos_server, local_id):
    """Drive Paxos rounds until consensus is reached or we give up.

    Sleeps in 0.1s increments, firing a proposal round roughly every
    `backoff` seconds (with jitter). Gives up when peers are unreachable
    or when the backoff has grown past 60 seconds.
    """
    time_slept = 0
    send_proposal_time = self_paxos_server.backoff

    while not self_paxos_server.consensus_reached:
        time.sleep(0.1)
        time_slept += 0.1

        # Send a proposal at allocated time
        if time_slept > send_proposal_time and not self_paxos_server.consensus_reached:
            time.sleep(random.random())
            server_stubs = gen_server_stubs(self_paxos_server, local_id)
            if server_stubs is None:
                self_paxos_server.new_server = ''
                break
            start_paxos(server_stubs, self_paxos_server)
            send_proposal_time = (random.gauss(1, 0.25) * self_paxos_server.backoff)
            time_slept = 0

        # If proposals keep failing (backoff grew past a minute), stop.
        if send_proposal_time > 60:
            self_paxos_server.consensus_reached = True
            # Fix: clear new_server -- the field run_paxos actually checks --
            # instead of the never-read consensus_value attribute.
            self_paxos_server.new_server = ''
            break
277 |
# This is the final function that exterior functions like client.py will call
def run_paxos(local_id=None):
    """Launch a Paxos node, run the protocol, and return the elected server.

    Args:
        local_id: 1-based index when several nodes share one machine
            (ports derived from the id); None for one-node-per-host mode.

    Returns:
        The address of the elected server, or '' when no consensus was
        reached.
    """
    # Generates the host name
    hostname = gen_local_address(local_id)
    log_info(hostname + ' called to run Paxos for determining the server')

    # Generates the server
    paxos_server, server = create_server(hostname, local_id)
    try:
        # Launch the server on a separate thread
        Thread(target=run_server, args=(server, paxos_server,)).start()
        # Fix: renamed from `start_paxos`, which shadowed the module-level
        # start_paxos() function and was confusing to read.
        paxos_start_time = time.time()

        # Begin to run Paxos
        paxos_loop(paxos_server, local_id)
        if paxos_server.new_server != '':
            log_info('Done, new server is: {0} finished paxos in {1:2}s'.format(paxos_server.new_server, time.time() - paxos_start_time))
        else:
            # New server is empty only when a suitable server was not found after a predefined amount of time
            log_info('Failure to connect to other allocated instances. Stopping paxos.')
    except KeyboardInterrupt:
        sys.exit(0)
    finally:
        # Ensure the background server thread observes shutdown.
        paxos_server.consensus_reached = True
        server.stop(0)
    return paxos_server.new_server
304 |
if __name__ == '__main__':
    # Parse the optional --id flag: a 1-based index used when several
    # Paxos nodes share one machine.
    parser = argparse.ArgumentParser()
    parser.add_argument('--id')
    local_id = parser.parse_args().id
    if local_id is not None:
        local_id = int(local_id)
        assert local_id > 0
    log_info(run_paxos(local_id))
314 |
--------------------------------------------------------------------------------
/python-python/paxos_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: paxos.proto
3 |
4 | import sys
5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | from google.protobuf import descriptor_pb2
11 | # @@protoc_insertion_point(imports)
12 |
13 | _sym_db = _symbol_database.Default()
14 |
15 |
16 |
17 |
18 | DESCRIPTOR = _descriptor.FileDescriptor(
19 | name='paxos.proto',
20 | package='dist_sgd',
21 | syntax='proto3',
22 | serialized_pb=_b('\n\x0bpaxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3')
23 | )
24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
25 |
26 |
27 |
28 |
29 | _ACK = _descriptor.Descriptor(
30 | name='ack',
31 | full_name='dist_sgd.ack',
32 | filename=None,
33 | file=DESCRIPTOR,
34 | containing_type=None,
35 | fields=[
36 | _descriptor.FieldDescriptor(
37 | name='n', full_name='dist_sgd.ack.n', index=0,
38 | number=1, type=2, cpp_type=6, label=1,
39 | has_default_value=False, default_value=0,
40 | message_type=None, enum_type=None, containing_type=None,
41 | is_extension=False, extension_scope=None,
42 | options=None),
43 | _descriptor.FieldDescriptor(
44 | name='v', full_name='dist_sgd.ack.v', index=1,
45 | number=2, type=9, cpp_type=9, label=1,
46 | has_default_value=False, default_value=_b("").decode('utf-8'),
47 | message_type=None, enum_type=None, containing_type=None,
48 | is_extension=False, extension_scope=None,
49 | options=None),
50 | _descriptor.FieldDescriptor(
51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2,
52 | number=3, type=2, cpp_type=6, label=1,
53 | has_default_value=False, default_value=0,
54 | message_type=None, enum_type=None, containing_type=None,
55 | is_extension=False, extension_scope=None,
56 | options=None),
57 | ],
58 | extensions=[
59 | ],
60 | nested_types=[],
61 | enum_types=[
62 | ],
63 | options=None,
64 | is_extendable=False,
65 | syntax='proto3',
66 | extension_ranges=[],
67 | oneofs=[
68 | ],
69 | serialized_start=25,
70 | serialized_end=65,
71 | )
72 |
73 |
74 | _PROPOSAL = _descriptor.Descriptor(
75 | name='proposal',
76 | full_name='dist_sgd.proposal',
77 | filename=None,
78 | file=DESCRIPTOR,
79 | containing_type=None,
80 | fields=[
81 | _descriptor.FieldDescriptor(
82 | name='n', full_name='dist_sgd.proposal.n', index=0,
83 | number=1, type=2, cpp_type=6, label=1,
84 | has_default_value=False, default_value=0,
85 | message_type=None, enum_type=None, containing_type=None,
86 | is_extension=False, extension_scope=None,
87 | options=None),
88 | ],
89 | extensions=[
90 | ],
91 | nested_types=[],
92 | enum_types=[
93 | ],
94 | options=None,
95 | is_extendable=False,
96 | syntax='proto3',
97 | extension_ranges=[],
98 | oneofs=[
99 | ],
100 | serialized_start=67,
101 | serialized_end=88,
102 | )
103 |
104 |
105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor(
106 | name='request_acceptance',
107 | full_name='dist_sgd.request_acceptance',
108 | filename=None,
109 | file=DESCRIPTOR,
110 | containing_type=None,
111 | fields=[
112 | _descriptor.FieldDescriptor(
113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0,
114 | number=1, type=2, cpp_type=6, label=1,
115 | has_default_value=False, default_value=0,
116 | message_type=None, enum_type=None, containing_type=None,
117 | is_extension=False, extension_scope=None,
118 | options=None),
119 | _descriptor.FieldDescriptor(
120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1,
121 | number=2, type=9, cpp_type=9, label=1,
122 | has_default_value=False, default_value=_b("").decode('utf-8'),
123 | message_type=None, enum_type=None, containing_type=None,
124 | is_extension=False, extension_scope=None,
125 | options=None),
126 | ],
127 | extensions=[
128 | ],
129 | nested_types=[],
130 | enum_types=[
131 | ],
132 | options=None,
133 | is_extendable=False,
134 | syntax='proto3',
135 | extension_ranges=[],
136 | oneofs=[
137 | ],
138 | serialized_start=90,
139 | serialized_end=132,
140 | )
141 |
142 |
143 | _ACQUIESCENCE = _descriptor.Descriptor(
144 | name='acquiescence',
145 | full_name='dist_sgd.acquiescence',
146 | filename=None,
147 | file=DESCRIPTOR,
148 | containing_type=None,
149 | fields=[
150 | _descriptor.FieldDescriptor(
151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0,
152 | number=1, type=8, cpp_type=7, label=1,
153 | has_default_value=False, default_value=False,
154 | message_type=None, enum_type=None, containing_type=None,
155 | is_extension=False, extension_scope=None,
156 | options=None),
157 | ],
158 | extensions=[
159 | ],
160 | nested_types=[],
161 | enum_types=[
162 | ],
163 | options=None,
164 | is_extendable=False,
165 | syntax='proto3',
166 | extension_ranges=[],
167 | oneofs=[
168 | ],
169 | serialized_start=134,
170 | serialized_end=169,
171 | )
172 |
173 |
174 | _CONSENSUS = _descriptor.Descriptor(
175 | name='consensus',
176 | full_name='dist_sgd.consensus',
177 | filename=None,
178 | file=DESCRIPTOR,
179 | containing_type=None,
180 | fields=[
181 | _descriptor.FieldDescriptor(
182 | name='n', full_name='dist_sgd.consensus.n', index=0,
183 | number=1, type=2, cpp_type=6, label=1,
184 | has_default_value=False, default_value=0,
185 | message_type=None, enum_type=None, containing_type=None,
186 | is_extension=False, extension_scope=None,
187 | options=None),
188 | _descriptor.FieldDescriptor(
189 | name='v', full_name='dist_sgd.consensus.v', index=1,
190 | number=2, type=9, cpp_type=9, label=1,
191 | has_default_value=False, default_value=_b("").decode('utf-8'),
192 | message_type=None, enum_type=None, containing_type=None,
193 | is_extension=False, extension_scope=None,
194 | options=None),
195 | ],
196 | extensions=[
197 | ],
198 | nested_types=[],
199 | enum_types=[
200 | ],
201 | options=None,
202 | is_extendable=False,
203 | syntax='proto3',
204 | extension_ranges=[],
205 | oneofs=[
206 | ],
207 | serialized_start=171,
208 | serialized_end=204,
209 | )
210 |
211 |
212 | _EMPTY = _descriptor.Descriptor(
213 | name='empty',
214 | full_name='dist_sgd.empty',
215 | filename=None,
216 | file=DESCRIPTOR,
217 | containing_type=None,
218 | fields=[
219 | ],
220 | extensions=[
221 | ],
222 | nested_types=[],
223 | enum_types=[
224 | ],
225 | options=None,
226 | is_extendable=False,
227 | syntax='proto3',
228 | extension_ranges=[],
229 | oneofs=[
230 | ],
231 | serialized_start=206,
232 | serialized_end=213,
233 | )
234 |
235 | DESCRIPTOR.message_types_by_name['ack'] = _ACK
236 | DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL
237 | DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE
238 | DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE
239 | DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS
240 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY
241 |
242 | ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict(
243 | DESCRIPTOR = _ACK,
244 | __module__ = 'paxos_pb2'
245 | # @@protoc_insertion_point(class_scope:dist_sgd.ack)
246 | ))
247 | _sym_db.RegisterMessage(ack)
248 |
249 | proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict(
250 | DESCRIPTOR = _PROPOSAL,
251 | __module__ = 'paxos_pb2'
252 | # @@protoc_insertion_point(class_scope:dist_sgd.proposal)
253 | ))
254 | _sym_db.RegisterMessage(proposal)
255 |
256 | request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict(
257 | DESCRIPTOR = _REQUEST_ACCEPTANCE,
258 | __module__ = 'paxos_pb2'
259 | # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance)
260 | ))
261 | _sym_db.RegisterMessage(request_acceptance)
262 |
263 | acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict(
264 | DESCRIPTOR = _ACQUIESCENCE,
265 | __module__ = 'paxos_pb2'
266 | # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence)
267 | ))
268 | _sym_db.RegisterMessage(acquiescence)
269 |
270 | consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict(
271 | DESCRIPTOR = _CONSENSUS,
272 | __module__ = 'paxos_pb2'
273 | # @@protoc_insertion_point(class_scope:dist_sgd.consensus)
274 | ))
275 | _sym_db.RegisterMessage(consensus)
276 |
277 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict(
278 | DESCRIPTOR = _EMPTY,
279 | __module__ = 'paxos_pb2'
280 | # @@protoc_insertion_point(class_scope:dist_sgd.empty)
281 | ))
282 | _sym_db.RegisterMessage(empty)
283 |
284 |
285 | DESCRIPTOR.has_options = True
286 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001'))
287 | import abc
288 | import six
289 | from grpc.beta import implementations as beta_implementations
290 | from grpc.framework.common import cardinality
291 | from grpc.framework.interfaces.face import utilities as face_utilities
292 |
class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)):
  """Generated abstract base for dist_sgd.PaxosServer service handlers.

  Subclasses implement the four RPCs (prepare/accept/accepted/ping).
  Emitted by protoc's grpc beta plugin; do not edit by hand --
  regenerate from protos/paxos.proto instead.
  """
  @abc.abstractmethod
  def prepare(self, request, context):
    raise NotImplementedError()
  @abc.abstractmethod
  def accept(self, request, context):
    raise NotImplementedError()
  @abc.abstractmethod
  def accepted(self, request, context):
    raise NotImplementedError()
  @abc.abstractmethod
  def ping(self, request, context):
    raise NotImplementedError()
307 |
class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)):
  """The interface to which stubs will conform."""
  # Generated code (grpc beta plugin); do not edit by hand. Each method
  # also carries a `.future` attribute (async variant), initialized to
  # None here and filled in by the dynamic stub at runtime.
  @abc.abstractmethod
  def prepare(self, request, timeout):
    raise NotImplementedError()
  prepare.future = None
  @abc.abstractmethod
  def accept(self, request, timeout):
    raise NotImplementedError()
  accept.future = None
  @abc.abstractmethod
  def accepted(self, request, timeout):
    raise NotImplementedError()
  accepted.future = None
  @abc.abstractmethod
  def ping(self, request, timeout):
    raise NotImplementedError()
  ping.future = None
326 |
# Generated factory: builds a beta-API gRPC server wired to `servicer`.
# Do not edit by hand -- regenerate from protos/paxos.proto.
def beta_create_PaxosServer_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None):
  # The repeated self-imports below are an artifact of the code generator.
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  import paxos_pb2
  # Per-method wire (de)serializers for the four unary-unary RPCs.
  request_deserializers = {
    ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.FromString,
    ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.FromString,
    ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.FromString,
  }
  response_serializers = {
    ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.SerializeToString,
    ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.SerializeToString,
  }
  method_implementations = {
    ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept),
    ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted),
    ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping),
    ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare),
  }
  server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout)
  return beta_implementations.server(method_implementations, options=server_options)
356 |
def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None):
  """Create a beta-gRPC dynamic stub for the dist_sgd.PaxosServer service.

  Args:
    channel: the beta channel to invoke RPCs over.
    host: optional host name override.
    metadata_transformer: optional callable applied to outgoing metadata.
    pool, pool_size: optional thread pool / pool size.

  Returns:
    A dynamic stub exposing prepare, accept, accepted and ping.
  """
  # The protoc beta plugin emitted this import eight times; once is
  # sufficient (Python imports are idempotent, so behavior is unchanged).
  import paxos_pb2
  request_serializers = {
    ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.request_acceptance.SerializeToString,
    ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.consensus.SerializeToString,
    ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.proposal.SerializeToString,
  }
  response_deserializers = {
    ('dist_sgd.PaxosServer', 'accept'): paxos_pb2.acquiescence.FromString,
    ('dist_sgd.PaxosServer', 'accepted'): paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'ping'): paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'prepare'): paxos_pb2.ack.FromString,
  }
  # All four Paxos RPCs are plain request/response (unary-unary).
  cardinalities = {
    'accept': cardinality.Cardinality.UNARY_UNARY,
    'accepted': cardinality.Cardinality.UNARY_UNARY,
    'ping': cardinality.Cardinality.UNARY_UNARY,
    'prepare': cardinality.Cardinality.UNARY_UNARY,
  }
  stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size)
  return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options)
386 | # @@protoc_insertion_point(module_scope)
387 |
--------------------------------------------------------------------------------
/python-python/protobuf_utils/__init__.py:
--------------------------------------------------------------------------------
# Package initializer required so Python treats protobuf_utils as a module.
# The protocol-buffer serialization helpers themselves live in utils.py.
--------------------------------------------------------------------------------
/python-python/protobuf_utils/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/protobuf_utils/__init__.pyc
--------------------------------------------------------------------------------
/python-python/protobuf_utils/utils.py:
--------------------------------------------------------------------------------
1 | import autograd.numpy as np
2 | import autograd.numpy.random as npr
3 | from autograd import grad
4 | import dist_sgd_pb2
5 |
def convert_array_to_bytes(params):
    """Serialize a NumPy parameter array to raw float32 bytes.

    float64 input is downcast to float32 first, halving the payload sent
    over the wire; other dtypes are serialized as-is.

    Args:
        params: a NumPy ndarray of model parameters.

    Returns:
        The array's data as a bytes object.
    """
    if (params.dtype == np.float64):
        params = params.astype(np.float32)
    # ndarray.tostring() was deprecated and removed in NumPy 2.0;
    # tobytes() is the identically-behaving replacement.
    param_bytes = params.tobytes()
    return param_bytes
11 |
def convert_bytes_to_array(param_bytes):
    """Deserialize bytes from convert_array_to_bytes back to a float32 array.

    Args:
        param_bytes: raw bytes holding packed float32 values.

    Returns:
        A writable 1-D np.float32 ndarray.
    """
    # np.fromstring is deprecated for binary input; np.frombuffer is the
    # supported equivalent. frombuffer returns a read-only view sharing
    # memory with param_bytes, so copy() to keep fromstring's writable,
    # independent-result behavior for callers that mutate the array.
    params = np.frombuffer(param_bytes, dtype=np.float32).copy()
    return params
15 |
def convert_tensor_iter(tensor_bytes, data_indx):
    """Yield SubTensor protobuf messages chunking tensor_bytes for streaming.

    Each yielded SubTensor carries the total byte length (tensor_len), a
    1-based chunk counter (tensor_chunk), up to CHUNK_SIZE bytes of payload
    (tensor_content), and the batch index (data_indx) used by the server to
    detect stale gradients.
    """
    # NOTE(review): 524228 looks like a typo for 524288 (512 KiB = 2**19).
    # Harmless either way — the receiver reassembles using tensor_len, not
    # the chunk size — but worth confirming against any message-size limit.
    CHUNK_SIZE = 524228
    tensor_bytes_len = len(tensor_bytes)
    tensor_chunk_count = 0
    # Consume tensor_bytes front-to-back, CHUNK_SIZE bytes per message.
    while len(tensor_bytes):
        tensor_chunk_count += 1
        tensor_content = tensor_bytes[:CHUNK_SIZE]
        tensor_bytes = tensor_bytes[CHUNK_SIZE:]
        yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx = data_indx)
--------------------------------------------------------------------------------
/python-python/protobuf_utils/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/python-python/protobuf_utils/utils.pyc
--------------------------------------------------------------------------------
/python-python/protos/dist_sgd.proto:
--------------------------------------------------------------------------------
1 | //Protocol buffers for project
2 |
3 | syntax = "proto3";
4 |
5 | package dist_sgd;
6 |
7 | option java_multiple_files = true;
8 | option java_package = "io.dist_sgd";
9 | option java_outer_classname = "DistSGD";
10 | //option objc_class_prefix = "DSG";
11 |
// Main server for passing information around
service ParamFeeder {
    // Sends the parameters back and forth between server and client
    rpc SendParams (ClientInfo) returns (stream SubTensor) {}

    // Sends information about the next batch
    rpc SendNextBatch (PrevBatch) returns (NextBatch) {}

    // Gets gradient updates from client servers
    rpc GetUpdates (stream SubTensor) returns (StatusCode) {}

    // This call simply makes sure that all machines have begun to run Paxos.
    rpc ping (empty) returns (empty) {}

}
27 |
28 | message SubTensor {
29 | // Length of the tensor getting passed
30 | int32 tensor_len = 1;
31 |
32 | // Current chunk of the tensor
33 | int32 tensor_chunk = 2;
34 |
35 | // Serialized tensor getting passed
36 | bytes tensor_content = 3;
37 |
38 | // Batch for gradient update, used to determine whether or not
39 | // the gradient is stale and should be thrown out
40 | int32 data_indx = 4;
41 | }
42 |
43 | // Later on we can extend client info to include information about processing speed, etc.
44 | message ClientInfo {
45 | int32 client_id = 1;
46 | }
47 |
// Includes information about success and failure
message StatusCode {
    int32 status = 1;
}
52 |
53 | message PrevBatch {
54 | int32 client_id = 1;
55 |
56 | int32 prev_data_indx = 2;
57 | }
58 |
59 | message NextBatch {
60 | int32 client_id = 1;
61 |
62 | int32 data_indx = 2;
63 | }
64 |
65 | message empty {}
--------------------------------------------------------------------------------
/python-python/protos/dist_sgd_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: protos/dist_sgd.proto
3 |
4 | import sys
5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | from google.protobuf import descriptor_pb2
11 | # @@protoc_insertion_point(imports)
12 |
13 | _sym_db = _symbol_database.Default()
14 |
15 |
16 |
17 |
18 | DESCRIPTOR = _descriptor.FileDescriptor(
19 | name='protos/dist_sgd.proto',
20 | package='dist_sgd',
21 | syntax='proto3',
22 | serialized_pb=_b('\n\x15protos/dist_sgd.proto\x12\x08\x64ist_sgd\"`\n\tSubTensor\x12\x12\n\ntensor_len\x18\x01 \x01(\x05\x12\x14\n\x0ctensor_chunk\x18\x02 \x01(\x05\x12\x16\n\x0etensor_content\x18\x03 \x01(\x0c\x12\x11\n\tdata_indx\x18\x04 \x01(\x05\"\x1f\n\nClientInfo\x12\x11\n\tclient_id\x18\x01 \x01(\x05\"\x1c\n\nStatusCode\x12\x0e\n\x06status\x18\x01 \x01(\x05\"6\n\tPrevBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x16\n\x0eprev_data_indx\x18\x02 \x01(\x05\"1\n\tNextBatch\x12\x11\n\tclient_id\x18\x01 \x01(\x05\x12\x11\n\tdata_indx\x18\x02 \x01(\x05\"\x07\n\x05\x65mpty2\xf0\x01\n\x0bParamFeeder\x12;\n\nSendParams\x12\x14.dist_sgd.ClientInfo\x1a\x13.dist_sgd.SubTensor\"\x00\x30\x01\x12;\n\rSendNextBatch\x12\x13.dist_sgd.PrevBatch\x1a\x13.dist_sgd.NextBatch\"\x00\x12;\n\nGetUpdates\x12\x13.dist_sgd.SubTensor\x1a\x14.dist_sgd.StatusCode\"\x00(\x01\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3')
23 | )
24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
25 |
26 |
27 |
28 |
29 | _SUBTENSOR = _descriptor.Descriptor(
30 | name='SubTensor',
31 | full_name='dist_sgd.SubTensor',
32 | filename=None,
33 | file=DESCRIPTOR,
34 | containing_type=None,
35 | fields=[
36 | _descriptor.FieldDescriptor(
37 | name='tensor_len', full_name='dist_sgd.SubTensor.tensor_len', index=0,
38 | number=1, type=5, cpp_type=1, label=1,
39 | has_default_value=False, default_value=0,
40 | message_type=None, enum_type=None, containing_type=None,
41 | is_extension=False, extension_scope=None,
42 | options=None),
43 | _descriptor.FieldDescriptor(
44 | name='tensor_chunk', full_name='dist_sgd.SubTensor.tensor_chunk', index=1,
45 | number=2, type=5, cpp_type=1, label=1,
46 | has_default_value=False, default_value=0,
47 | message_type=None, enum_type=None, containing_type=None,
48 | is_extension=False, extension_scope=None,
49 | options=None),
50 | _descriptor.FieldDescriptor(
51 | name='tensor_content', full_name='dist_sgd.SubTensor.tensor_content', index=2,
52 | number=3, type=12, cpp_type=9, label=1,
53 | has_default_value=False, default_value=_b(""),
54 | message_type=None, enum_type=None, containing_type=None,
55 | is_extension=False, extension_scope=None,
56 | options=None),
57 | _descriptor.FieldDescriptor(
58 | name='data_indx', full_name='dist_sgd.SubTensor.data_indx', index=3,
59 | number=4, type=5, cpp_type=1, label=1,
60 | has_default_value=False, default_value=0,
61 | message_type=None, enum_type=None, containing_type=None,
62 | is_extension=False, extension_scope=None,
63 | options=None),
64 | ],
65 | extensions=[
66 | ],
67 | nested_types=[],
68 | enum_types=[
69 | ],
70 | options=None,
71 | is_extendable=False,
72 | syntax='proto3',
73 | extension_ranges=[],
74 | oneofs=[
75 | ],
76 | serialized_start=35,
77 | serialized_end=131,
78 | )
79 |
80 |
81 | _CLIENTINFO = _descriptor.Descriptor(
82 | name='ClientInfo',
83 | full_name='dist_sgd.ClientInfo',
84 | filename=None,
85 | file=DESCRIPTOR,
86 | containing_type=None,
87 | fields=[
88 | _descriptor.FieldDescriptor(
89 | name='client_id', full_name='dist_sgd.ClientInfo.client_id', index=0,
90 | number=1, type=5, cpp_type=1, label=1,
91 | has_default_value=False, default_value=0,
92 | message_type=None, enum_type=None, containing_type=None,
93 | is_extension=False, extension_scope=None,
94 | options=None),
95 | ],
96 | extensions=[
97 | ],
98 | nested_types=[],
99 | enum_types=[
100 | ],
101 | options=None,
102 | is_extendable=False,
103 | syntax='proto3',
104 | extension_ranges=[],
105 | oneofs=[
106 | ],
107 | serialized_start=133,
108 | serialized_end=164,
109 | )
110 |
111 |
112 | _STATUSCODE = _descriptor.Descriptor(
113 | name='StatusCode',
114 | full_name='dist_sgd.StatusCode',
115 | filename=None,
116 | file=DESCRIPTOR,
117 | containing_type=None,
118 | fields=[
119 | _descriptor.FieldDescriptor(
120 | name='status', full_name='dist_sgd.StatusCode.status', index=0,
121 | number=1, type=5, cpp_type=1, label=1,
122 | has_default_value=False, default_value=0,
123 | message_type=None, enum_type=None, containing_type=None,
124 | is_extension=False, extension_scope=None,
125 | options=None),
126 | ],
127 | extensions=[
128 | ],
129 | nested_types=[],
130 | enum_types=[
131 | ],
132 | options=None,
133 | is_extendable=False,
134 | syntax='proto3',
135 | extension_ranges=[],
136 | oneofs=[
137 | ],
138 | serialized_start=166,
139 | serialized_end=194,
140 | )
141 |
142 |
143 | _PREVBATCH = _descriptor.Descriptor(
144 | name='PrevBatch',
145 | full_name='dist_sgd.PrevBatch',
146 | filename=None,
147 | file=DESCRIPTOR,
148 | containing_type=None,
149 | fields=[
150 | _descriptor.FieldDescriptor(
151 | name='client_id', full_name='dist_sgd.PrevBatch.client_id', index=0,
152 | number=1, type=5, cpp_type=1, label=1,
153 | has_default_value=False, default_value=0,
154 | message_type=None, enum_type=None, containing_type=None,
155 | is_extension=False, extension_scope=None,
156 | options=None),
157 | _descriptor.FieldDescriptor(
158 | name='prev_data_indx', full_name='dist_sgd.PrevBatch.prev_data_indx', index=1,
159 | number=2, type=5, cpp_type=1, label=1,
160 | has_default_value=False, default_value=0,
161 | message_type=None, enum_type=None, containing_type=None,
162 | is_extension=False, extension_scope=None,
163 | options=None),
164 | ],
165 | extensions=[
166 | ],
167 | nested_types=[],
168 | enum_types=[
169 | ],
170 | options=None,
171 | is_extendable=False,
172 | syntax='proto3',
173 | extension_ranges=[],
174 | oneofs=[
175 | ],
176 | serialized_start=196,
177 | serialized_end=250,
178 | )
179 |
180 |
181 | _NEXTBATCH = _descriptor.Descriptor(
182 | name='NextBatch',
183 | full_name='dist_sgd.NextBatch',
184 | filename=None,
185 | file=DESCRIPTOR,
186 | containing_type=None,
187 | fields=[
188 | _descriptor.FieldDescriptor(
189 | name='client_id', full_name='dist_sgd.NextBatch.client_id', index=0,
190 | number=1, type=5, cpp_type=1, label=1,
191 | has_default_value=False, default_value=0,
192 | message_type=None, enum_type=None, containing_type=None,
193 | is_extension=False, extension_scope=None,
194 | options=None),
195 | _descriptor.FieldDescriptor(
196 | name='data_indx', full_name='dist_sgd.NextBatch.data_indx', index=1,
197 | number=2, type=5, cpp_type=1, label=1,
198 | has_default_value=False, default_value=0,
199 | message_type=None, enum_type=None, containing_type=None,
200 | is_extension=False, extension_scope=None,
201 | options=None),
202 | ],
203 | extensions=[
204 | ],
205 | nested_types=[],
206 | enum_types=[
207 | ],
208 | options=None,
209 | is_extendable=False,
210 | syntax='proto3',
211 | extension_ranges=[],
212 | oneofs=[
213 | ],
214 | serialized_start=252,
215 | serialized_end=301,
216 | )
217 |
218 |
219 | _EMPTY = _descriptor.Descriptor(
220 | name='empty',
221 | full_name='dist_sgd.empty',
222 | filename=None,
223 | file=DESCRIPTOR,
224 | containing_type=None,
225 | fields=[
226 | ],
227 | extensions=[
228 | ],
229 | nested_types=[],
230 | enum_types=[
231 | ],
232 | options=None,
233 | is_extendable=False,
234 | syntax='proto3',
235 | extension_ranges=[],
236 | oneofs=[
237 | ],
238 | serialized_start=303,
239 | serialized_end=310,
240 | )
241 |
242 | DESCRIPTOR.message_types_by_name['SubTensor'] = _SUBTENSOR
243 | DESCRIPTOR.message_types_by_name['ClientInfo'] = _CLIENTINFO
244 | DESCRIPTOR.message_types_by_name['StatusCode'] = _STATUSCODE
245 | DESCRIPTOR.message_types_by_name['PrevBatch'] = _PREVBATCH
246 | DESCRIPTOR.message_types_by_name['NextBatch'] = _NEXTBATCH
247 | DESCRIPTOR.message_types_by_name['empty'] = _EMPTY
248 |
249 | SubTensor = _reflection.GeneratedProtocolMessageType('SubTensor', (_message.Message,), dict(
250 | DESCRIPTOR = _SUBTENSOR,
251 | __module__ = 'protos.dist_sgd_pb2'
252 | # @@protoc_insertion_point(class_scope:dist_sgd.SubTensor)
253 | ))
254 | _sym_db.RegisterMessage(SubTensor)
255 |
256 | ClientInfo = _reflection.GeneratedProtocolMessageType('ClientInfo', (_message.Message,), dict(
257 | DESCRIPTOR = _CLIENTINFO,
258 | __module__ = 'protos.dist_sgd_pb2'
259 | # @@protoc_insertion_point(class_scope:dist_sgd.ClientInfo)
260 | ))
261 | _sym_db.RegisterMessage(ClientInfo)
262 |
263 | StatusCode = _reflection.GeneratedProtocolMessageType('StatusCode', (_message.Message,), dict(
264 | DESCRIPTOR = _STATUSCODE,
265 | __module__ = 'protos.dist_sgd_pb2'
266 | # @@protoc_insertion_point(class_scope:dist_sgd.StatusCode)
267 | ))
268 | _sym_db.RegisterMessage(StatusCode)
269 |
270 | PrevBatch = _reflection.GeneratedProtocolMessageType('PrevBatch', (_message.Message,), dict(
271 | DESCRIPTOR = _PREVBATCH,
272 | __module__ = 'protos.dist_sgd_pb2'
273 | # @@protoc_insertion_point(class_scope:dist_sgd.PrevBatch)
274 | ))
275 | _sym_db.RegisterMessage(PrevBatch)
276 |
277 | NextBatch = _reflection.GeneratedProtocolMessageType('NextBatch', (_message.Message,), dict(
278 | DESCRIPTOR = _NEXTBATCH,
279 | __module__ = 'protos.dist_sgd_pb2'
280 | # @@protoc_insertion_point(class_scope:dist_sgd.NextBatch)
281 | ))
282 | _sym_db.RegisterMessage(NextBatch)
283 |
284 | empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict(
285 | DESCRIPTOR = _EMPTY,
286 | __module__ = 'protos.dist_sgd_pb2'
287 | # @@protoc_insertion_point(class_scope:dist_sgd.empty)
288 | ))
289 | _sym_db.RegisterMessage(empty)
290 |
291 |
292 | DESCRIPTOR.has_options = True
293 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001'))
294 | import abc
295 | import six
296 | from grpc.beta import implementations as beta_implementations
297 | from grpc.framework.common import cardinality
298 | from grpc.framework.interfaces.face import utilities as face_utilities
299 |
class BetaParamFeederServicer(six.with_metaclass(abc.ABCMeta, object)):
  """Abstract service-side interface for dist_sgd.ParamFeeder.

  Subclass, implement each RPC, and pass an instance to
  beta_create_ParamFeeder_server() below.
  """
  @abc.abstractmethod
  def SendParams(self, request, context):
    # Unary ClientInfo request -> stream of SubTensor responses
    # (see the serializer tables in beta_create_ParamFeeder_server).
    raise NotImplementedError()
  @abc.abstractmethod
  def SendNextBatch(self, request, context):
    # Unary PrevBatch request -> NextBatch response.
    raise NotImplementedError()
  @abc.abstractmethod
  def GetUpdates(self, request_iterator, context):
    # Stream of SubTensor requests -> single StatusCode response.
    raise NotImplementedError()
  @abc.abstractmethod
  def ping(self, request, context):
    # Liveness check: `empty` request, `empty` response.
    raise NotImplementedError()
314 |
class BetaParamFeederStub(six.with_metaclass(abc.ABCMeta, object)):
  """The interface to which stubs will conform."""
  # Abstract client-side stub for dist_sgd.ParamFeeder; concrete stubs come
  # from beta_create_ParamFeeder_stub below. The `.future` attributes are
  # placeholders the dynamic stub replaces with async invokers; SendParams
  # has none because it is a response-streaming call.
  @abc.abstractmethod
  def SendParams(self, request, timeout):
    # Unary ClientInfo request -> stream of SubTensor responses.
    raise NotImplementedError()
  @abc.abstractmethod
  def SendNextBatch(self, request, timeout):
    # Unary PrevBatch request -> NextBatch response.
    raise NotImplementedError()
  SendNextBatch.future = None
  @abc.abstractmethod
  def GetUpdates(self, request_iterator, timeout):
    # Stream of SubTensor requests -> single StatusCode response.
    raise NotImplementedError()
  GetUpdates.future = None
  @abc.abstractmethod
  def ping(self, request, timeout):
    # Liveness check: `empty` request, `empty` response.
    raise NotImplementedError()
  ping.future = None
332 |
def beta_create_ParamFeeder_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None):
  """Create a beta-gRPC server exposing the dist_sgd.ParamFeeder service.

  Args:
    servicer: object implementing SendParams, SendNextBatch, GetUpdates
      and ping (see BetaParamFeederServicer).
    pool, pool_size: optional thread pool / pool size for request handling.
    default_timeout, maximum_timeout: optional per-RPC timeout bounds.

  Returns:
    A beta_implementations server wired with per-method (de)serializers.
  """
  # The protoc beta plugin emitted this import eight times; once is
  # sufficient (Python imports are idempotent, so behavior is unchanged).
  import protos.dist_sgd_pb2
  request_deserializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.FromString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.FromString,
    ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.FromString,
    ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString,
  }
  response_serializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.SerializeToString,
    ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString,
  }
  # GetUpdates is client-streaming, SendParams is server-streaming,
  # the other two are plain unary-unary.
  method_implementations = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): face_utilities.stream_unary_inline(servicer.GetUpdates),
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): face_utilities.unary_unary_inline(servicer.SendNextBatch),
    ('dist_sgd.ParamFeeder', 'SendParams'): face_utilities.unary_stream_inline(servicer.SendParams),
    ('dist_sgd.ParamFeeder', 'ping'): face_utilities.unary_unary_inline(servicer.ping),
  }
  server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout)
  return beta_implementations.server(method_implementations, options=server_options)
362 |
def beta_create_ParamFeeder_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None):
  """Create a beta-gRPC dynamic stub for the dist_sgd.ParamFeeder service.

  Args:
    channel: the beta channel to invoke RPCs over.
    host: optional host name override.
    metadata_transformer: optional callable applied to outgoing metadata.
    pool, pool_size: optional thread pool / pool size.

  Returns:
    A dynamic stub exposing SendParams, SendNextBatch, GetUpdates and ping.
  """
  # The protoc beta plugin emitted this import eight times; once is
  # sufficient (Python imports are idempotent, so behavior is unchanged).
  import protos.dist_sgd_pb2
  request_serializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.SubTensor.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.PrevBatch.SerializeToString,
    ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.ClientInfo.SerializeToString,
    ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.SerializeToString,
  }
  response_deserializers = {
    ('dist_sgd.ParamFeeder', 'GetUpdates'): protos.dist_sgd_pb2.StatusCode.FromString,
    ('dist_sgd.ParamFeeder', 'SendNextBatch'): protos.dist_sgd_pb2.NextBatch.FromString,
    ('dist_sgd.ParamFeeder', 'SendParams'): protos.dist_sgd_pb2.SubTensor.FromString,
    ('dist_sgd.ParamFeeder', 'ping'): protos.dist_sgd_pb2.empty.FromString,
  }
  # GetUpdates is client-streaming, SendParams is server-streaming,
  # the other two are plain unary-unary.
  cardinalities = {
    'GetUpdates': cardinality.Cardinality.STREAM_UNARY,
    'SendNextBatch': cardinality.Cardinality.UNARY_UNARY,
    'SendParams': cardinality.Cardinality.UNARY_STREAM,
    'ping': cardinality.Cardinality.UNARY_UNARY,
  }
  stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size)
  return beta_implementations.dynamic_stub(channel, 'dist_sgd.ParamFeeder', cardinalities, options=stub_options)
392 | # @@protoc_insertion_point(module_scope)
393 |
--------------------------------------------------------------------------------
/python-python/protos/paxos.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto3";
2 |
3 | package dist_sgd;
4 |
5 | option java_multiple_files = true;
6 | option java_package = "io.dist_sgd";
7 | option java_outer_classname = "DistSGD";
8 |
9 | // After getting a majority of proposals without getting rejected, the
10 | // machine chooses an IP from the most recent acknowledgements or one
11 | // that it generates itself and calls accept on all servers.
12 |
// The servers return whether or not they accept. If a majority of servers
// accept, then the machine calls accepted on all servers and sends them the
// decided-upon IP address of the chosen server.
16 |
17 | // TECHNICALLY, each machine should call consensus
18 |
19 | // Main server for running the Paxos Algorithm. Everyone hosts this server on
20 | // their localhost. Used for sending and receiving messages for coordinating Paxos.
21 | service PaxosServer {
22 |
23 | // The machine sends each server a proposal. The server then
24 | // sends an acknowledgement accepting or rejecting the proposal.
25 | rpc prepare(proposal) returns (ack) {}
26 |
27 | // Requests that people accept the proposal
28 | rpc accept(request_acceptance) returns (acquiescence) {}
29 |
30 | // Notified that consensus has been achieved about a server
31 | // Technically each server should broadcast that it accepted the consensus
32 | rpc accepted (consensus) returns (empty) {}
33 |
34 | // This call simply makes sure that all machines have begun to run Paxos.
35 | rpc ping (empty) returns (empty) {}
36 | }
37 |
38 | message ack {
39 | float n = 1;
40 | string v = 2;
41 | float n_v = 3;
42 | }
43 |
44 | message proposal {
45 | float n = 1;
46 | }
47 |
48 | message request_acceptance{
49 | float n = 1;
50 | string v = 2;
51 | }
52 |
53 | message acquiescence {
54 | bool accept_bool = 1;
55 | }
56 |
57 | message consensus {
58 | float n = 1;
59 | string v = 2;
60 | }
61 |
62 | message empty {
63 | }
--------------------------------------------------------------------------------
/python-python/protos/paxos_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler. DO NOT EDIT!
2 | # source: protos/paxos.proto
3 |
4 | import sys
5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
6 | from google.protobuf import descriptor as _descriptor
7 | from google.protobuf import message as _message
8 | from google.protobuf import reflection as _reflection
9 | from google.protobuf import symbol_database as _symbol_database
10 | from google.protobuf import descriptor_pb2
11 | # @@protoc_insertion_point(imports)
12 |
13 | _sym_db = _symbol_database.Default()
14 |
15 |
16 |
17 |
18 | DESCRIPTOR = _descriptor.FileDescriptor(
19 | name='protos/paxos.proto',
20 | package='dist_sgd',
21 | syntax='proto3',
22 | serialized_pb=_b('\n\x12protos/paxos.proto\x12\x08\x64ist_sgd\"(\n\x03\x61\x63k\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\x12\x0b\n\x03n_v\x18\x03 \x01(\x02\"\x15\n\x08proposal\x12\t\n\x01n\x18\x01 \x01(\x02\"*\n\x12request_acceptance\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"#\n\x0c\x61\x63quiescence\x12\x13\n\x0b\x61\x63\x63\x65pt_bool\x18\x01 \x01(\x08\"!\n\tconsensus\x12\t\n\x01n\x18\x01 \x01(\x02\x12\t\n\x01v\x18\x02 \x01(\t\"\x07\n\x05\x65mpty2\xdf\x01\n\x0bPaxosServer\x12.\n\x07prepare\x12\x12.dist_sgd.proposal\x1a\r.dist_sgd.ack\"\x00\x12@\n\x06\x61\x63\x63\x65pt\x12\x1c.dist_sgd.request_acceptance\x1a\x16.dist_sgd.acquiescence\"\x00\x12\x32\n\x08\x61\x63\x63\x65pted\x12\x13.dist_sgd.consensus\x1a\x0f.dist_sgd.empty\"\x00\x12*\n\x04ping\x12\x0f.dist_sgd.empty\x1a\x0f.dist_sgd.empty\"\x00\x42\x18\n\x0bio.dist_sgdB\x07\x44istSGDP\x01\x62\x06proto3')
23 | )
24 | _sym_db.RegisterFileDescriptor(DESCRIPTOR)
25 |
26 |
27 |
28 |
29 | _ACK = _descriptor.Descriptor(
30 | name='ack',
31 | full_name='dist_sgd.ack',
32 | filename=None,
33 | file=DESCRIPTOR,
34 | containing_type=None,
35 | fields=[
36 | _descriptor.FieldDescriptor(
37 | name='n', full_name='dist_sgd.ack.n', index=0,
38 | number=1, type=2, cpp_type=6, label=1,
39 | has_default_value=False, default_value=0,
40 | message_type=None, enum_type=None, containing_type=None,
41 | is_extension=False, extension_scope=None,
42 | options=None),
43 | _descriptor.FieldDescriptor(
44 | name='v', full_name='dist_sgd.ack.v', index=1,
45 | number=2, type=9, cpp_type=9, label=1,
46 | has_default_value=False, default_value=_b("").decode('utf-8'),
47 | message_type=None, enum_type=None, containing_type=None,
48 | is_extension=False, extension_scope=None,
49 | options=None),
50 | _descriptor.FieldDescriptor(
51 | name='n_v', full_name='dist_sgd.ack.n_v', index=2,
52 | number=3, type=2, cpp_type=6, label=1,
53 | has_default_value=False, default_value=0,
54 | message_type=None, enum_type=None, containing_type=None,
55 | is_extension=False, extension_scope=None,
56 | options=None),
57 | ],
58 | extensions=[
59 | ],
60 | nested_types=[],
61 | enum_types=[
62 | ],
63 | options=None,
64 | is_extendable=False,
65 | syntax='proto3',
66 | extension_ranges=[],
67 | oneofs=[
68 | ],
69 | serialized_start=32,
70 | serialized_end=72,
71 | )
72 |
73 |
74 | _PROPOSAL = _descriptor.Descriptor(
75 | name='proposal',
76 | full_name='dist_sgd.proposal',
77 | filename=None,
78 | file=DESCRIPTOR,
79 | containing_type=None,
80 | fields=[
81 | _descriptor.FieldDescriptor(
82 | name='n', full_name='dist_sgd.proposal.n', index=0,
83 | number=1, type=2, cpp_type=6, label=1,
84 | has_default_value=False, default_value=0,
85 | message_type=None, enum_type=None, containing_type=None,
86 | is_extension=False, extension_scope=None,
87 | options=None),
88 | ],
89 | extensions=[
90 | ],
91 | nested_types=[],
92 | enum_types=[
93 | ],
94 | options=None,
95 | is_extendable=False,
96 | syntax='proto3',
97 | extension_ranges=[],
98 | oneofs=[
99 | ],
100 | serialized_start=74,
101 | serialized_end=95,
102 | )
103 |
104 |
105 | _REQUEST_ACCEPTANCE = _descriptor.Descriptor(
106 | name='request_acceptance',
107 | full_name='dist_sgd.request_acceptance',
108 | filename=None,
109 | file=DESCRIPTOR,
110 | containing_type=None,
111 | fields=[
112 | _descriptor.FieldDescriptor(
113 | name='n', full_name='dist_sgd.request_acceptance.n', index=0,
114 | number=1, type=2, cpp_type=6, label=1,
115 | has_default_value=False, default_value=0,
116 | message_type=None, enum_type=None, containing_type=None,
117 | is_extension=False, extension_scope=None,
118 | options=None),
119 | _descriptor.FieldDescriptor(
120 | name='v', full_name='dist_sgd.request_acceptance.v', index=1,
121 | number=2, type=9, cpp_type=9, label=1,
122 | has_default_value=False, default_value=_b("").decode('utf-8'),
123 | message_type=None, enum_type=None, containing_type=None,
124 | is_extension=False, extension_scope=None,
125 | options=None),
126 | ],
127 | extensions=[
128 | ],
129 | nested_types=[],
130 | enum_types=[
131 | ],
132 | options=None,
133 | is_extendable=False,
134 | syntax='proto3',
135 | extension_ranges=[],
136 | oneofs=[
137 | ],
138 | serialized_start=97,
139 | serialized_end=139,
140 | )
141 |
142 |
143 | _ACQUIESCENCE = _descriptor.Descriptor(
144 | name='acquiescence',
145 | full_name='dist_sgd.acquiescence',
146 | filename=None,
147 | file=DESCRIPTOR,
148 | containing_type=None,
149 | fields=[
150 | _descriptor.FieldDescriptor(
151 | name='accept_bool', full_name='dist_sgd.acquiescence.accept_bool', index=0,
152 | number=1, type=8, cpp_type=7, label=1,
153 | has_default_value=False, default_value=False,
154 | message_type=None, enum_type=None, containing_type=None,
155 | is_extension=False, extension_scope=None,
156 | options=None),
157 | ],
158 | extensions=[
159 | ],
160 | nested_types=[],
161 | enum_types=[
162 | ],
163 | options=None,
164 | is_extendable=False,
165 | syntax='proto3',
166 | extension_ranges=[],
167 | oneofs=[
168 | ],
169 | serialized_start=141,
170 | serialized_end=176,
171 | )
172 |
173 |
174 | _CONSENSUS = _descriptor.Descriptor(
175 | name='consensus',
176 | full_name='dist_sgd.consensus',
177 | filename=None,
178 | file=DESCRIPTOR,
179 | containing_type=None,
180 | fields=[
181 | _descriptor.FieldDescriptor(
182 | name='n', full_name='dist_sgd.consensus.n', index=0,
183 | number=1, type=2, cpp_type=6, label=1,
184 | has_default_value=False, default_value=0,
185 | message_type=None, enum_type=None, containing_type=None,
186 | is_extension=False, extension_scope=None,
187 | options=None),
188 | _descriptor.FieldDescriptor(
189 | name='v', full_name='dist_sgd.consensus.v', index=1,
190 | number=2, type=9, cpp_type=9, label=1,
191 | has_default_value=False, default_value=_b("").decode('utf-8'),
192 | message_type=None, enum_type=None, containing_type=None,
193 | is_extension=False, extension_scope=None,
194 | options=None),
195 | ],
196 | extensions=[
197 | ],
198 | nested_types=[],
199 | enum_types=[
200 | ],
201 | options=None,
202 | is_extendable=False,
203 | syntax='proto3',
204 | extension_ranges=[],
205 | oneofs=[
206 | ],
207 | serialized_start=178,
208 | serialized_end=211,
209 | )
210 |
211 |
212 | _EMPTY = _descriptor.Descriptor(
213 | name='empty',
214 | full_name='dist_sgd.empty',
215 | filename=None,
216 | file=DESCRIPTOR,
217 | containing_type=None,
218 | fields=[
219 | ],
220 | extensions=[
221 | ],
222 | nested_types=[],
223 | enum_types=[
224 | ],
225 | options=None,
226 | is_extendable=False,
227 | syntax='proto3',
228 | extension_ranges=[],
229 | oneofs=[
230 | ],
231 | serialized_start=213,
232 | serialized_end=220,
233 | )
234 |
# Register every message descriptor on the file descriptor so it can be looked
# up by name (generated by protoc — do not edit by hand).
DESCRIPTOR.message_types_by_name['ack'] = _ACK
DESCRIPTOR.message_types_by_name['proposal'] = _PROPOSAL
DESCRIPTOR.message_types_by_name['request_acceptance'] = _REQUEST_ACCEPTANCE
DESCRIPTOR.message_types_by_name['acquiescence'] = _ACQUIESCENCE
DESCRIPTOR.message_types_by_name['consensus'] = _CONSENSUS
DESCRIPTOR.message_types_by_name['empty'] = _EMPTY

# Build the concrete Python message classes from their descriptors and register
# each with the default symbol database.
ack = _reflection.GeneratedProtocolMessageType('ack', (_message.Message,), dict(
  DESCRIPTOR = _ACK,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.ack)
  ))
_sym_db.RegisterMessage(ack)

proposal = _reflection.GeneratedProtocolMessageType('proposal', (_message.Message,), dict(
  DESCRIPTOR = _PROPOSAL,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.proposal)
  ))
_sym_db.RegisterMessage(proposal)

request_acceptance = _reflection.GeneratedProtocolMessageType('request_acceptance', (_message.Message,), dict(
  DESCRIPTOR = _REQUEST_ACCEPTANCE,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.request_acceptance)
  ))
_sym_db.RegisterMessage(request_acceptance)

acquiescence = _reflection.GeneratedProtocolMessageType('acquiescence', (_message.Message,), dict(
  DESCRIPTOR = _ACQUIESCENCE,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.acquiescence)
  ))
_sym_db.RegisterMessage(acquiescence)

consensus = _reflection.GeneratedProtocolMessageType('consensus', (_message.Message,), dict(
  DESCRIPTOR = _CONSENSUS,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.consensus)
  ))
_sym_db.RegisterMessage(consensus)

empty = _reflection.GeneratedProtocolMessageType('empty', (_message.Message,), dict(
  DESCRIPTOR = _EMPTY,
  __module__ = 'protos.paxos_pb2'
  # @@protoc_insertion_point(class_scope:dist_sgd.empty)
  ))
_sym_db.RegisterMessage(empty)


# File-level options: the serialized bytes encode the Java package/classname
# options from the .proto file (java_package "io.dist_sgd", outer classname
# "DistSGD", java_multiple_files).
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013io.dist_sgdB\007DistSGDP\001'))
287 | import abc
288 | import six
289 | from grpc.beta import implementations as beta_implementations
290 | from grpc.framework.common import cardinality
291 | from grpc.framework.interfaces.face import utilities as face_utilities
292 |
class BetaPaxosServerServicer(six.with_metaclass(abc.ABCMeta, object)):
  """Abstract service interface for the dist_sgd.PaxosServer RPCs.

  Subclass and implement each method; all four are unary-unary handlers
  receiving a request message and a gRPC context.
  """
  @abc.abstractmethod
  def prepare(self, request, context):
    # Paxos phase 1: handle a `proposal` request; responds with an `ack`.
    raise NotImplementedError()
  @abc.abstractmethod
  def accept(self, request, context):
    # Paxos phase 2: handle a `request_acceptance`; responds with `acquiescence`.
    raise NotImplementedError()
  @abc.abstractmethod
  def accepted(self, request, context):
    # Learn phase: handle a `consensus` notification; responds with `empty`.
    raise NotImplementedError()
  @abc.abstractmethod
  def ping(self, request, context):
    # Liveness check: `empty` in, `empty` out.
    raise NotImplementedError()
307 |
class BetaPaxosServerStub(six.with_metaclass(abc.ABCMeta, object)):
  """The interface to which stubs will conform.

  Each method issues the corresponding unary-unary RPC synchronously; the
  `.future` attribute on each method is replaced by the runtime stub with an
  asynchronous variant returning a future.
  """
  @abc.abstractmethod
  def prepare(self, request, timeout):
    raise NotImplementedError()
  prepare.future = None
  @abc.abstractmethod
  def accept(self, request, timeout):
    raise NotImplementedError()
  accept.future = None
  @abc.abstractmethod
  def accepted(self, request, timeout):
    raise NotImplementedError()
  accepted.future = None
  @abc.abstractmethod
  def ping(self, request, timeout):
    raise NotImplementedError()
  ping.future = None
326 |
def beta_create_PaxosServer_server(servicer, pool=None, pool_size=None, default_timeout=None, maximum_timeout=None):
  """Builds a beta-API gRPC server for the dist_sgd.PaxosServer service.

  Args:
    servicer: a BetaPaxosServerServicer implementation handling the RPCs.
    pool / pool_size: optional thread pool (or size for a default pool).
    default_timeout / maximum_timeout: optional per-RPC timeout bounds.

  Returns:
    A beta_implementations server wired with the service's (de)serializers.
  """
  # The code generator emitted eight identical imports here; one suffices.
  import protos.paxos_pb2
  request_deserializers = {
    ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.FromString,
    ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.FromString,
    ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.FromString,
  }
  response_serializers = {
    ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.SerializeToString,
    ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.SerializeToString,
  }
  # All four RPCs are unary-unary, dispatched straight to the servicer methods.
  method_implementations = {
    ('dist_sgd.PaxosServer', 'accept'): face_utilities.unary_unary_inline(servicer.accept),
    ('dist_sgd.PaxosServer', 'accepted'): face_utilities.unary_unary_inline(servicer.accepted),
    ('dist_sgd.PaxosServer', 'ping'): face_utilities.unary_unary_inline(servicer.ping),
    ('dist_sgd.PaxosServer', 'prepare'): face_utilities.unary_unary_inline(servicer.prepare),
  }
  server_options = beta_implementations.server_options(request_deserializers=request_deserializers, response_serializers=response_serializers, thread_pool=pool, thread_pool_size=pool_size, default_timeout=default_timeout, maximum_timeout=maximum_timeout)
  return beta_implementations.server(method_implementations, options=server_options)
356 |
def beta_create_PaxosServer_stub(channel, host=None, metadata_transformer=None, pool=None, pool_size=None):
  """Builds a beta-API dynamic client stub for dist_sgd.PaxosServer.

  Args:
    channel: the gRPC channel to the server.
    host: optional host string to send with requests.
    metadata_transformer: optional callable rewriting invocation metadata.
    pool / pool_size: optional thread pool (or size for a default pool).

  Returns:
    A dynamic stub exposing prepare/accept/accepted/ping.
  """
  # The code generator emitted eight identical imports here; one suffices.
  import protos.paxos_pb2
  request_serializers = {
    ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.request_acceptance.SerializeToString,
    ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.consensus.SerializeToString,
    ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.SerializeToString,
    ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.proposal.SerializeToString,
  }
  response_deserializers = {
    ('dist_sgd.PaxosServer', 'accept'): protos.paxos_pb2.acquiescence.FromString,
    ('dist_sgd.PaxosServer', 'accepted'): protos.paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'ping'): protos.paxos_pb2.empty.FromString,
    ('dist_sgd.PaxosServer', 'prepare'): protos.paxos_pb2.ack.FromString,
  }
  cardinalities = {
    'accept': cardinality.Cardinality.UNARY_UNARY,
    'accepted': cardinality.Cardinality.UNARY_UNARY,
    'ping': cardinality.Cardinality.UNARY_UNARY,
    'prepare': cardinality.Cardinality.UNARY_UNARY,
  }
  stub_options = beta_implementations.stub_options(host=host, metadata_transformer=metadata_transformer, request_serializers=request_serializers, response_deserializers=response_deserializers, thread_pool=pool, thread_pool_size=pool_size)
  return beta_implementations.dynamic_stub(channel, 'dist_sgd.PaxosServer', cardinalities, options=stub_options)
# @@protoc_insertion_point(module_scope)
387 |
--------------------------------------------------------------------------------
/python-python/run_codegen.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2015, Google Inc.
3 | # All rights reserved.
4 | #
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are
7 | # met:
8 | #
9 | # * Redistributions of source code must retain the above copyright
10 | # notice, this list of conditions and the following disclaimer.
11 | # * Redistributions in binary form must reproduce the above
12 | # copyright notice, this list of conditions and the following disclaimer
13 | # in the documentation and/or other materials provided with the
14 | # distribution.
15 | # * Neither the name of Google Inc. nor the names of its
16 | # contributors may be used to endorse or promote products derived from
17 | # this software without specific prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 | # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 | # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 | # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 | # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 | # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 | # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
# Runs the protoc with gRPC plugin to generate protocol messages and gRPC stubs.
# $(...) replaces the deprecated backtick form, and the quoted variable keeps
# the command safe if the plugin path ever contains spaces.
GRPC_PLUGIN="$(command -v grpc_python_plugin)"
protoc -I . --python_out=. --grpc_out=. --plugin=protoc-gen-grpc="$GRPC_PLUGIN" ./protos/dist_sgd.proto
protoc -I . --python_out=. --grpc_out=. --plugin=protoc-gen-grpc="$GRPC_PLUGIN" ./protos/paxos.proto
34 |
--------------------------------------------------------------------------------
/python-python/server.py:
--------------------------------------------------------------------------------
1 | # ------------------------------------------------------------
2 | # Implements a parameter server. The server takes parameter updates in and
3 | # sends back the most up to date parameters. This server also keeps track of
4 | # the current training/test error.
5 | # ------------------------------------------------------------
6 |
7 | from __future__ import absolute_import
8 | from __future__ import print_function
9 | import time
10 |
11 | import dist_sgd_pb2
12 | from sets import Set
13 |
14 | import autograd.numpy as np
15 | import autograd.numpy.random as npr
16 | from autograd import grad
17 |
18 | from nnet.neural_net import *
19 | from protobuf_utils.utils import *
20 | from server_utils.utils import *
21 |
22 | import traceback
23 |
# Seconds in one day. NOTE(review): appears unused in this file's visible
# code (serve() sleeps on BATCH_TRAIN_TIMEOUT instead) — confirm before removing.
_ONE_DAY_IN_SECONDS = 60 * 60 * 24

# Minimum number of connected client processes before batches are handed out.
_REQUIRED_CHILDREN = 1

# Data files that we are training from. This is the small demo set.
images_fname = 'data/images(16).npy'
labels_fname = 'data/output_labels(16).npy'
31 |
class ParamFeeder(dist_sgd_pb2.BetaParamFeederServicer):
    """gRPC servicer implementing the parameter server.

    Hands out batch indices to training clients (SendNextBatch), applies
    streamed gradient updates to the shared weight vector (GetUpdates), and
    streams the current weights back out in chunks (SendParams).

    NOTE(review): this module is Python 2 (`except Exception, e` below).
    """
    def __init__(self, W = None, prevBatch=None):
        # W: optional pre-initialized 1-D weight vector; a fresh random
        #    initialization is drawn when omitted.
        # prevBatch: optional batch index to resume from (e.g. after restart).

        # Keeps track of all child IDs that it has seen so far
        self.child_ids = Set([])

        # Load and process Caltech data
        self.train_images, self.train_labels, self.test_images, self.test_labels = load_caltech100(images_fname, labels_fname)
        self.image_input_d = self.train_images.shape[1]

        # Network parameters: input dim, five hidden layers, 101 output classes.
        self.layer_sizes = [self.image_input_d, 800, 600, 400, 350, 250, 101]

        # Training parameters
        self.param_scale = 0.1
        self.learning_rate = 1e-5
        self.momentum = 0.9
        self.batch_size = 256
        self.num_epochs = 50
        self.L2_reg = 1.0

        # Make neural net functions; loss_grad is the autograd gradient of the loss.
        self.N_weights, self.pred_fun, self.loss_fun, self.frac_err = make_nn_funs(self.layer_sizes, self.L2_reg)
        self.loss_grad = grad(self.loss_fun)

        # Initialize weights
        if W is None:
            rs = npr.RandomState()
            self.W = rs.randn(self.N_weights) * self.param_scale
        else:
            # Passed in weights
            self.W = W
        self.param_len = self.W.shape[0]
        log_info("# of parameters:")
        log_info(self.param_len)

        # Train with sgd: precompute the index slices for each minibatch.
        self.batch_idxs = make_batches(self.train_images.shape[0], self.batch_size)

        # Set the current batch to zero unless it has been passed in
        self.epoch = 0
        if prevBatch is None:
            self.batch_num = 0
        else:
            self.batch_num = prevBatch
        self.n_batches = len(self.batch_idxs)

        # Initialize information about the clients
        self.n_childs = 0
        self.max_client_id = 0

        # Initializes starting information about training
        self.prev_test_perf = 1

        # The batches that are currently being processed
        # (maps batch index -> time it was handed out).
        self.batches_processing = {}

        # The batches that were failed to process, model training machine may have failed
        # Send these batches to a new machine
        self.batches_unprocessed = []

        log_info('Data loaded on server, waiting for clients....')
        log_info('Number of child processes: 0')

    # Logs the current performance of the model. Called once per epoch.
    # Also decays the learning rate by 10x whenever test error worsens.
    def log_info_perf(self, epoch):
        test_perf = self.frac_err(self.W, self.test_images, self.test_labels)
        train_perf = self.frac_err(self.W, self.train_images, self.train_labels)
        if test_perf > self.prev_test_perf:
            self.learning_rate = 0.1 * self.learning_rate
        self.prev_test_perf = test_perf
        log_info("Epoch {0}, TrainErr {1:5}, TestErr {2:5}, LR {3:2}".format(self.epoch, train_perf, test_perf, self.learning_rate))

    # Streams updates from the client.
    # Reassembles the chunked gradient bytes, then applies a (half-step)
    # gradient-descent update to the shared weights.
    def GetUpdates(self, request_iterator, context):
        tensor_bytes = ''
        for subtensor in request_iterator:
            tensor_bytes = tensor_bytes + subtensor.tensor_content

        # Serialize the tensor
        grad_W = convert_bytes_to_array(tensor_bytes)

        # Gradient descent
        self.W -= 0.5 * self.learning_rate * grad_W

        return dist_sgd_pb2.StatusCode(status=1)

    # Sends the next batch that the client should process
    def SendNextBatch(self, request, context):
        # Figure out what the maximum client_id is. If client_id does not exist,
        # assigns the client a new id.
        if request.client_id == 0:
            self.max_client_id += 1
            request.client_id = self.max_client_id
        else:
            self.max_client_id = max(request.client_id, self.max_client_id)

        # Does not start until a sufficient number of child processes exists
        # (data_indx == -1 tells the client to wait and retry).
        self.child_ids.add(request.client_id)
        if len(self.child_ids) != self.n_childs:
            self.n_childs = len(self.child_ids)
            log_info('Number of child processes: ' + str(len(self.child_ids)))
        if len(self.child_ids) < _REQUIRED_CHILDREN:
            return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = -1)

        # Logs information about previous batch timing
        # NOTE(review): raises KeyError if prev_data_indx was never handed out
        # (or was already deleted) — confirm clients cannot report twice.
        if request.prev_data_indx != -1:
            log_info('Time taken to process batch {0} was {1:.2f} by client {2}'.format(request.prev_data_indx, (time.time() - self.batches_processing[request.prev_data_indx]), request.client_id))
            del self.batches_processing[request.prev_data_indx]

        # log_info epoch information if we've hit the end of an epoch
        if self.batch_num == self.n_batches:
            self.batch_num, self.epoch = 0, self.epoch + 1
            self.log_info_perf(self.epoch)

        # Takes any previously failed batches first, otherwise takes next batch
        if self.batches_unprocessed != []:
            cur_batchnum = self.batches_unprocessed.pop(0)
        else:
            cur_batchnum, self.batch_num = self.batch_num, self.batch_num + 1

        # Save the time that the next batch was sent out on the server
        self.batches_processing[cur_batchnum] = time.time()

        return dist_sgd_pb2.NextBatch(client_id=request.client_id, data_indx = cur_batchnum)

    # This sends the parameters from the server to the client by converting the tensor into a
    # protobuffer and streaming it in CHUNK_SIZE-byte slices.
    def SendParams(self, request, context):
        # NOTE(review): 524228 looks like a typo for 524288 (512 KiB) — harmless
        # (any positive chunk size works), but confirm before "fixing".
        CHUNK_SIZE = 524228
        tensor_bytes = convert_array_to_bytes(self.W)
        tensor_bytes_len = len(tensor_bytes)
        tensor_chunk_count = 0
        try:
            while len(tensor_bytes):
                tensor_chunk_count += 1
                tensor_content = tensor_bytes[:CHUNK_SIZE]
                tensor_bytes = tensor_bytes[CHUNK_SIZE:]
                yield dist_sgd_pb2.SubTensor(tensor_len = tensor_bytes_len, tensor_chunk = tensor_chunk_count, tensor_content = tensor_content, data_indx= -1)
        except Exception, e:
            traceback.print_exc()

    # Function to ping the server to see if it is available
    def ping(self, request, context):
        return dist_sgd_pb2.empty()
176 |
# Main function that is called to instantiate the server and have
# it connect and send or receive parameters from clients.
def serve(hostname='[::]:50051', W=None, prev_batch=None, local_id=None):
    """Instantiates the parameter server and blocks serving clients.

    Args:
        hostname: address to bind, e.g. '[::]:50051'. BUG FIX: the original
            unconditionally overwrote this argument with '[::]:50051'; it is
            now honored, with the old value as a backward-compatible default.
        W: optional initial weight vector passed through to ParamFeeder.
        prev_batch: optional batch index to resume training from.
        local_id: unused; kept for interface compatibility with callers.

    Raises:
        KeyboardInterrupt: re-raised after the server is stopped cleanly.
    """
    # Granularity (seconds) of the main keep-alive sleep loop.
    BATCH_TRAIN_TIMEOUT = 60

    # Instantiate the servicer and bind the requested port.
    param_feeder = ParamFeeder(W, prev_batch)
    server = dist_sgd_pb2.beta_create_ParamFeeder_server(param_feeder)
    server.add_insecure_port(hostname)

    # Begin the server and idle until interrupted.
    server.start()
    try:
        while True:
            time.sleep(BATCH_TRAIN_TIMEOUT)
    except KeyboardInterrupt:
        server.stop(0)
        # Bare `raise` preserves the original traceback.
        raise
198 |
# Entry point: serve on all interfaces (IPv6 any-address), port 50051.
if __name__ == '__main__':
    serve('[::]:50051')
--------------------------------------------------------------------------------
/python-python/start.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Restart-forever wrapper for the SGD client: pull the latest code, reset the
# log, then relaunch the client whenever it exits.
# Guard the cd so a missing checkout doesn't run git/python in the wrong directory.
cd /home/candokevin/stash/distributed-sgd/python-python || exit 1
git pull
# -f: do not error when the log does not exist yet (first boot).
rm -f /home/candokevin/log.txt
while true; do
    python client.py >> /home/candokevin/log.txt
done
9 |
--------------------------------------------------------------------------------
/slides/.Rhistory:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/.Rhistory
--------------------------------------------------------------------------------
/slides/common_slides.sty:
--------------------------------------------------------------------------------
% Beamer setup: hide navigation symbols and add breathing room between items.
\setbeamertemplate{navigation symbols}{}
\let\tempone\itemize
\let\temptwo\enditemize
\renewenvironment{itemize}{\tempone\addtolength{\itemsep}{0.5\baselineskip}}{\temptwo}
% \usepackage{beamerthemeshadow}
\usepackage{ulem}
% \usepackage{movie15}
\usepackage{mathpazo}
% \usepackage{palatino}

% Duplicate loads of tikz, amssymb and algorithm have been removed (each
% package was previously requested twice; a second \usepackage is a no-op).
\usepackage{tikz}
\usepackage{hyperref}
\usepackage{natbib}
\usepackage{pgffor}
\usepackage{booktabs}
\usepackage{amssymb}
\usepackage{etoolbox}
\usepackage{subcaption}
\usepackage{url}
\usepackage{pgf}
\usepackage{latexsym}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{algorithm}
\usepackage{amsmath}
\usepackage{tabularx}
\usepackage{mathtools}
\usepackage{algpseudocode}

\usetikzlibrary{arrows,positioning,automata,spy,matrix,scopes,chains}

\setbeamersize{text margin left=6mm}
\setbeamersize{text margin right=6mm}
\renewcommand{\insertnavigation}[1]{}
\setbeamertemplate{headline}{}
\setbeamertemplate{footline}{}
% \usefonttheme{professionalfonts}
% make itemize things larger
%\setbeamerfont*{itemize/enumerate body}{size=\Large}
%\setbeamerfont*{itemize/enumerate subbody}{size=\large}
\setbeamercovered{transparent}
% NOTE(review): a bare \mode looks wrong here — beamer's \mode normally takes a
% <mode-spec> (e.g. \mode<presentation>). Left untouched because the deck
% compiles as-is; confirm the intent before changing.
\mode
%\mode
\linespread{1.25}
47 |
\usepackage{color}
\usepackage{multirow}
\usepackage{rotating}
\usepackage[all,dvips]{xy}
\usepackage{colortbl}
\usepackage{graphicx}
\usepackage{verbatim}
\usepackage{framed}
% natbib is already loaded earlier in this file; the duplicate load (a no-op)
% has been removed.
\usepackage[labelformat=empty]{caption}
\newcommand{\air}{\vspace{0.25cm}}
% \newcommand{\mair}{\vspace{-0.25cm}}

\setbeamertemplate{navigation symbols}{}%remove navigation symbols
\renewcommand{\rmdefault}{crm}
\newcommand{\lnbrack}{{\normalfont [}}
\newcommand{\rnbrack}{{\normalfont ]}\thinspace}
\newcommand{\lbbrack}{\textcolor{red}{\textbf{[}}}
\newcommand{\rbbrack}{\textcolor{red}{\textbf{]}}\thinspace}

% Custom color palette; vermillion was previously defined twice with identical
% values, so only one definition is kept.
\definecolor{orange}{RGB}{230,159,0}
\definecolor{skyblue}{RGB}{86,180,233}
\definecolor{bluegreen}{RGB}{0,158,115}
\definecolor{myyellow}{RGB}{240,228,66} % i dunno if this is the same as standard yellow
\definecolor{myblue}{RGB}{0,114,178}
\definecolor{vermillion}{RGB}{213,94,0}
\definecolor{redpurple}{RGB}{204,121,167}
\definecolor{lightgrey}{RGB}{234,234,234}

% Automatic outline slide at the start of each section.
\AtBeginSection[]
{
  \begin{frame}
  \frametitle{Contents}
  \tableofcontents[currentsection]
  \end{frame}
}
% \AtBeginSection[]{
%  \begin{frame}
%  \vfill
%  \centering
%  \begin{beamercolorbox}[sep=8pt,center,shadow=true,rounded=true]{title}
%    \usebeamerfont{title}\insertsectionhead\par%
%  \end{beamercolorbox}
%  \vfill
%  \end{frame}
% }
95 |
% Bold upright letters for matrices/vectors: \boldA ... \boldZ, \bolda ... \boldz.
\newcommand{\boldA}{\mathbf{A}}
\newcommand{\boldB}{\mathbf{B}}
\newcommand{\boldC}{\mathbf{C}}
\newcommand{\boldD}{\mathbf{D}}
\newcommand{\boldE}{\mathbf{E}}
\newcommand{\boldF}{\mathbf{F}}
\newcommand{\boldG}{\mathbf{G}}
\newcommand{\boldH}{\mathbf{H}}
\newcommand{\boldI}{\mathbf{I}}
\newcommand{\boldJ}{\mathbf{J}}
\newcommand{\boldK}{\mathbf{K}}
\newcommand{\boldL}{\mathbf{L}}
\newcommand{\boldM}{\mathbf{M}}
\newcommand{\boldN}{\mathbf{N}}
\newcommand{\boldO}{\mathbf{O}}
\newcommand{\boldP}{\mathbf{P}}
\newcommand{\boldQ}{\mathbf{Q}}
\newcommand{\boldR}{\mathbf{R}}
\newcommand{\boldS}{\mathbf{S}}
\newcommand{\boldT}{\mathbf{T}}
\newcommand{\boldU}{\mathbf{U}}
\newcommand{\boldV}{\mathbf{V}}
\newcommand{\boldW}{\mathbf{W}}
\newcommand{\boldX}{\mathbf{X}}
\newcommand{\boldY}{\mathbf{Y}}
\newcommand{\boldZ}{\mathbf{Z}}
\newcommand{\bolda}{\mathbf{a}}
\newcommand{\boldb}{\mathbf{b}}
\newcommand{\boldc}{\mathbf{c}}
\newcommand{\boldd}{\mathbf{d}}
\newcommand{\bolde}{\mathbf{e}}
\newcommand{\boldf}{\mathbf{f}}
\newcommand{\boldg}{\mathbf{g}}
\newcommand{\boldh}{\mathbf{h}}
\newcommand{\boldi}{\mathbf{i}}
\newcommand{\boldj}{\mathbf{j}}
\newcommand{\boldk}{\mathbf{k}}
\newcommand{\boldl}{\mathbf{l}}
\newcommand{\boldm}{\mathbf{m}}
\newcommand{\boldn}{\mathbf{n}}
\newcommand{\boldo}{\mathbf{o}}
\newcommand{\boldp}{\mathbf{p}}
\newcommand{\boldq}{\mathbf{q}}
\newcommand{\boldr}{\mathbf{r}}
\newcommand{\bolds}{\mathbf{s}}
\newcommand{\boldt}{\mathbf{t}}
\newcommand{\boldu}{\mathbf{u}}
\newcommand{\boldv}{\mathbf{v}}
\newcommand{\boldw}{\mathbf{w}}
\newcommand{\boldx}{\mathbf{x}}
\newcommand{\boldy}{\mathbf{y}}
\newcommand{\boldz}{\mathbf{z}}

\newcommand{\bolddelta}{\boldsymbol{\delta}}
\newcommand{\indicator}{\mathbf{1}}
% Calligraphic letters: \mcA ... \mcZ.
\newcommand{\mcA}{\mathcal{A}}
\newcommand{\mcB}{\mathcal{B}}
\newcommand{\mcC}{\mathcal{C}}
\newcommand{\mcD}{\mathcal{D}}
\newcommand{\mcE}{\mathcal{E}}
\newcommand{\mcF}{\mathcal{F}}
\newcommand{\mcG}{\mathcal{G}}
\newcommand{\mcH}{\mathcal{H}}
\newcommand{\mcI}{\mathcal{I}}
\newcommand{\mcJ}{\mathcal{J}}
\newcommand{\mcK}{\mathcal{K}}
\newcommand{\mcL}{\mathcal{L}}
\newcommand{\mcM}{\mathcal{M}}
\newcommand{\mcN}{\mathcal{N}}
\newcommand{\mcO}{\mathcal{O}}
\newcommand{\mcP}{\mathcal{P}}
\newcommand{\mcQ}{\mathcal{Q}}
\newcommand{\mcR}{\mathcal{R}}
\newcommand{\mcS}{\mathcal{S}}
\newcommand{\mcT}{\mathcal{T}}
\newcommand{\mcU}{\mathcal{U}}
\newcommand{\mcV}{\mathcal{V}}
\newcommand{\mcW}{\mathcal{W}}
\newcommand{\mcX}{\mathcal{X}}
\newcommand{\mcY}{\mathcal{Y}}
\newcommand{\mcZ}{\mathcal{Z}}

% Number sets and common linear-algebra symbols.
\newcommand{\reals}{\ensuremath{\mathbb{R}}}
\newcommand{\integers}{\ensuremath{\mathbb{Z}}}
\newcommand{\rationals}{\ensuremath{\mathbb{Q}}}
\newcommand{\naturals}{\ensuremath{\mathbb{N}}}
\newcommand{\trans}{\ensuremath{\mathsf{T}}}
\newcommand{\ident}{\mathbf{I}}
\newcommand{\bzero}{\mathbf{0}}

% Bold Greek letters (via \boldsymbol).
\newcommand{\balpha}{\boldsymbol{\alpha}}
\newcommand{\bbeta}{\boldsymbol{\beta}}
\newcommand{\boldeta}{\boldsymbol{\eta}}
\newcommand{\bkappa}{\boldsymbol{\kappa}}
\newcommand{\bgamma}{\boldsymbol{\gamma}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bphi}{\boldsymbol{\phi}}
\newcommand{\bpi}{\boldsymbol{\pi}}
\newcommand{\bpsi}{\boldsymbol{\psi}}
\newcommand{\bsigma}{\boldsymbol{\sigma}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bxi}{\boldsymbol{\xi}}
\newcommand{\bGamma}{\boldsymbol{\Gamma}}
\newcommand{\bLambda}{\boldsymbol{\Lambda}}
\newcommand{\bOmega}{\boldsymbol{\Omega}}
\newcommand{\bPhi}{\boldsymbol{\Phi}}
\newcommand{\bPi}{\boldsymbol{\Pi}}
\newcommand{\bPsi}{\boldsymbol{\Psi}}
\newcommand{\bSigma}{\boldsymbol{\Sigma}}
\newcommand{\bTheta}{\boldsymbol{\Theta}}
\newcommand{\bUpsilon}{\boldsymbol{\Upsilon}}
\newcommand{\bXi}{\boldsymbol{\Xi}}
\newcommand{\bepsilon}{\boldsymbol{\epsilon}}
209 |
% Math operators with limits set below (amsmath is loaded above).
% \DeclareMathOperator* replaces the plain-TeX \def + \operatornamewithlimits
% idiom: it typesets identically but errors on accidental redefinition.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\softmax}{softmax}
\DeclareMathOperator*{\relu}{ReLU}

\newcommand{\given}{\,|\,}
\newcommand{\distNorm}{\mathcal{N}}



% Dimension names used in network-architecture formulas.
\newcommand{\din}{{d_{\mathrm{in}}}}
\newcommand{\dhid}{{d_{\mathrm{hid}}}}
\newcommand{\dwin}{{d_{\mathrm{win}}}}
\newcommand{\dout}{{d_{\mathrm{out}}}}
\newcommand{\demb}{{d_{\mathrm{emb}}}}

\algtext*{EndWhile}% Remove "end while" text
\algtext*{EndFor}% Remove "end for" text
\algtext*{EndIf}% Remove "end if" text
\algtext*{EndProcedure}% Remove "end procedure" text
--------------------------------------------------------------------------------
/slides/img/2d_func.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/2d_func.jpg
--------------------------------------------------------------------------------
/slides/img/dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dataset.png
--------------------------------------------------------------------------------
/slides/img/deep_learning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/deep_learning.png
--------------------------------------------------------------------------------
/slides/img/dist_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_16.png
--------------------------------------------------------------------------------
/slides/img/dist_train.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/dist_train.png
--------------------------------------------------------------------------------
/slides/img/downpour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/downpour.png
--------------------------------------------------------------------------------
/slides/img/gRPC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/gRPC.png
--------------------------------------------------------------------------------
/slides/img/large_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/large_data.png
--------------------------------------------------------------------------------
/slides/img/lin_v_nonlin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/lin_v_nonlin.png
--------------------------------------------------------------------------------
/slides/img/sandblaster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/img/sandblaster.png
--------------------------------------------------------------------------------
/slides/main.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/michaelfarrell76/Distributed-SGD/6ce9fcb83dcb72f1fcf45bb2fe5091e2f9212178/slides/main.pdf
--------------------------------------------------------------------------------
/slides/main.tex:
--------------------------------------------------------------------------------
\documentclass{beamer}
% Shared packages/macros for all decks in this project.
\usepackage{./common_slides}
% textpos: absolute positioning for overlay images on slides.
\usepackage[absolute,overlay]{textpos}
\usepackage{graphicx}


\title{ Distributed Stochastic Gradient Descent }

\author{Kevin Yang and Michael Farrell}
\begin{document}
11 |
% Title slide. The stray "w" after \begin{frame} (which typeset a literal
% letter w on the slide) has been removed.
\begin{frame}
\titlepage
\end{frame}
15 |
16 | \begin{frame}{Motivation - Deep Learning}
17 |
18 | \begin{columns}[T] % align columns
19 | \begin{column}{.48\textwidth}
20 | \begin{itemize}
21 | \item Deep-Learning
22 | \begin{itemize}
23 | \item Objective: Learn a complicated, non-linear function that minimizes some loss function
24 | \end{itemize}
25 | \item Why do we need deep models?
26 | \begin{itemize}
27 | \item The class of linear functions is inadequate for many problems.
28 | \end{itemize}
29 | \end{itemize}
30 | \end{column}%
31 | \hfill%
32 | \begin{column}{.48\textwidth}
33 | \begin{figure}
34 | \includegraphics[scale = .35]{./img/deep_learning}
35 | \caption{\scalebox{.3}{http://www.rsipvision.com/exploring-deep-learning/}}
36 | \end{figure}
37 | \begin{figure}
38 | \includegraphics[scale = .17]{./img/lin_v_nonlin}
39 | \caption{\scalebox{.3}{http://sebastianraschka.com/Articles/2014{\_}naive{\_}bayes{\_}1.html}}
40 | \end{figure}
41 | \end{column}%
42 | \end{columns}
43 | \end{frame}
44 |
45 | \begin{frame}{Motivation - Deep Learning}
46 | \begin{itemize}
47 | \item How do we learn these deep models?
48 | \begin{itemize}
49 | \item Choose a random example
50 | \item Run the neural network on the example
51 | \item Adjust the parameters of the network such that our loss function is minimized more than it was before
52 | \item Repeat
53 | \end{itemize}
54 | \pause
55 | \item Difficulties?
56 | \begin{itemize}
57 | \item Local Minima
58 | \item Non-convexity
59 | \item Neural Networks can have millions or even billions of parameters
60 | \end{itemize}
61 | \end{itemize}
62 | \begin{textblock*}{5cm}(8cm,.5cm) % {block width} (coords)
63 | \includegraphics[scale = .3]{./img/2d_func}
64 | \end{textblock*}
65 | \end{frame}
66 |
67 | \begin{frame}{Motivation - SGD}
68 | \begin{itemize}
69 | \item How do we maximize our reward function?
70 | \begin{itemize}
71 | \item One common technique is Stochastic Gradient Descent
72 | \item $\mathbf w$ is the vector of parameters for the model
73 | \item $\eta$ is the learning rate
74 | \item $f(\mathbf w)$ is the loss function evaluated with the current parameters $\mathbf w$
75 | \item
76 | \begin{algorithmic}
77 | \State $\mathbf w \gets \mathbf 0$
78 | \While {$f(\mathbf w)$ is not minimized}
79 | \For {$i = 1, n$}
80 | \State $\mathbf w \gets \mathbf w - \eta\nabla f(\mathbf w)$
81 | \EndFor
82 | \EndWhile
83 |
84 | \end{algorithmic}
85 | \item As the number of training examples, $n$, and the number of parameters, $|\mathbf w|$, increases, this algorithm quickly becomes very slow...
86 | \end{itemize}
87 | \end{itemize}
88 | \end{frame}
89 |
90 | \begin{frame}{Motivation - Distributed SGD}
91 | \begin{itemize}
92 | \item Since some of these models take days/weeks/months to run, we would hope that we could use a distributed computing cluster in order to parallelize this process.
93 | \pause
94 | \item Learn from Google!
95 | \begin{itemize}
96 | \item DistBelief- 2012
97 | \begin{itemize}
98 | \item Downpour SGD
99 | \item Sandblaster L-BFGS
100 | \end{itemize}
101 | \item TensorFlow- 2015
102 | \begin{itemize}
103 | \item gRPC
104 | \end{itemize}
105 | \end{itemize}
106 | \end{itemize}
107 |
108 | \end{frame}
109 |
110 | \begin{frame}{DistBelief - Downpour SGD}
111 | \begin{itemize}
112 | \item ``An asynchronous stochastic gradient descent procedure supporting a large number of model replicas.'' \footnote{Diagram taken from Dean et al. \textit{Large Scale Distributed Deep Networks}
113 | }
114 | \end{itemize}
115 | \begin{center}\includegraphics[scale = .5]{./img/downpour}\end{center}
116 | \end{frame}
117 |
118 | \begin{frame}{DistBelief - Sandblaster L-BFGS}
119 | \begin{itemize}
120 | \item ``A framework that supports a variety of distributed batch optimization procedures, including a distributed implementation of L-BFGS'' \footnote{Diagram taken from Dean et al. \textit{Large Scale Distributed Deep Networks}}
121 | \end{itemize}
122 | \begin{center}\includegraphics[scale = .5]{./img/sandblaster}\end{center}
123 | \end{frame}
124 |
125 | \begin{frame}{TensorFlow - gRPC}
126 | \begin{itemize}
127 | \item Second Generation ML Model focused on distributing models to CPUs and GPUs
128 | \item Uses the high performance RPC framework (gRPC \footnote{Diagram taken from http://www.grpc.io/}) in order to communicate between separate processes
129 | \begin{itemize}
130 | \item Uses Protocol Buffers v3
131 | \item C-based
132 | \item Client-server stubs in 10+ languages and counting
133 | \end{itemize}
134 | \end{itemize}
135 | \begin{center}\includegraphics[scale = .2]{./img/gRPC}\end{center}
136 | \end{frame}
137 |
138 | \begin{frame}{DistBelief/TensorFlow Summary}
139 | \begin{itemize}
140 | \item TensorFlow is basically the second version of DistBelief that is approximately twice as fast and much more user-friendly.
141 | \item Results from DistBelief \footnote{Diagram taken from Dean et al. \textit{Large Scale Distributed Deep Networks}}:
142 | \end{itemize}
143 | \begin{center}\includegraphics[scale = .18]{./img/dist_train}\includegraphics[scale = .18]{./img/dist_16}\end{center}
144 | \end{frame}
145 |
146 | \begin{frame}{Our Project}
147 | \begin{itemize}
148 | \item We frequently run into scenarios where we have a model that trains incredibly slowly on our local machines. As a consequence, we hope to benefit from additional cloud computing resources and build our own Distributed SGD system based on DistBelief and TensorFlow systems.
149 | \begin{itemize}
150 | \item The Distributed SGD system will have the user give a function that returns the outputs of a model, a function that returns the gradients of a model, and the number of machines to train the model on.
151 | \item Use gRPC with Protocol Buffers to communicate between processes, similar to TensorFlow.
152 | \item Implement Downpour-SGD which seems to be the most effective model with limited resources.
153 | \end{itemize}
154 | \end{itemize}
155 | \end{frame}
156 |
157 |
158 | \begin{frame}{Our Example}
159 | \begin{itemize}
160 | \item To test our system, we're working with the Caltech 101 Computational Vision dataset \footnote{L. Fei-Fei, R. Fergus and P. Perona. \textit{Learning generative visual models
161 | from few training examples: an incremental Bayesian approach tested on
162 | 101 object categories.}}. In this dataset, there are about 20,000 pictures of objects in 101 categories. All of these images are around 300 x 200 pixels in size.
163 | \item We've implemented a convolutional neural net that tries to classify what object is represented in the image.
164 |
165 | \begin{center}\includegraphics[scale = .30]{./img/dataset}\end{center}
166 | \end{itemize}
167 | \end{frame}
168 |
169 | \begin{frame}{Computational Resources}
170 | \begin{itemize}
171 | \item We are using Google Cloud Compute Engine to set up VMs and run the code. To run classification on our image dataset, we're using small instances with 6GB of RAM with 2 cores. This has a rate of 7.8 cents per hour.
172 | \item On a machine of this size, running 10 epochs of gradient descent takes 56 minutes.
173 | \item To streamline things, we've preconfigured images of a parameter server and model training server that are already set up with relevant code, tools, and libraries.
174 | \item As a result, setting up and launching the compute instances necessary for model training takes only a couple lines.
175 | \end{itemize}
176 | \end{frame}
177 |
178 | \begin{frame}{Implementing Downpour-SGD}
179 | \begin{itemize}
180 | \item The Downpour-SGD requires the passing of parameters and parameter updates between processes. In our example, we have 74,770,901 parameters and the size of our parameters is 0.5GB.
181 | \item Bottleneck here is the network. Parameters can be $\gg$0.5GB.
182 | \item We can leverage the fact that some of these models are extremely sparse
183 | \begin{itemize}
184 | \item only send parameters updated
185 | \item only update parameters every $n_x$ times
186 | \end{itemize}
187 | \item Explore protocol buffer streams
188 | \end{itemize}
189 | \begin{center}\includegraphics[scale = .27]{./img/large_data}\end{center}
190 | \end{frame}
191 |
192 | \begin{frame}{Main Distributed System Challenges}
193 | \begin{itemize}
194 | \item Network Issues
195 | \begin{itemize}
196 | \item We have to deal with network latency and try to reduce transportation cost as much as possible in order for our models to train properly.
197 | \item We would like to experiment with a couple different RPCs to optimize the speed of our system.
198 | \end{itemize}
199 | \item Fault tolerance
200 | \begin{itemize}
201 | \item We need to make our system as resilient as possible against failures. Because all of these machines are doing a lot of computation while running gradient descent and manipulating parameters, these systems are bound to fail with relatively high frequency.
202 | \item Having methods in place to detect and remedy the failure of parameter servers and model replicas will be critical.
203 | \end{itemize}
204 |
205 | \end{itemize}
206 | \end{frame}
207 |
208 | \end{document}
209 |
--------------------------------------------------------------------------------