├── .gitignore ├── README.md ├── _config.yml ├── assets ├── paper.pdf └── poster.pdf └── space_time_benchmarks ├── Makefile ├── bin ├── download-ckpts.sh ├── format-code.sh ├── start-mps.sh └── stop-mps.sh ├── notebooks ├── 1-anomoly.ipynb ├── 2-basic-plots.ipynb ├── 3-mps-batch-plots.ipynb ├── 4-anomoly-mps.ipynb └── 5-POC-Threading-With-Torch.ipynb ├── requirements.txt └── src ├── client.py ├── experiments ├── config.py └── generate.py ├── master.py └── models ├── __init__.py ├── tf_models.py └── torch_models.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | learningsys-2018-gpu-mux/ 3 | *.log 4 | Makefile.run 5 | ckpts/ 6 | .ipynb_checkpoints/ 7 | *.png 8 | __pycache__/ 9 | *.pq 10 | *.ckpt 11 | tf-models 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Paper repo for "Dynamic Space-Time Scheduling for GPU Inference" 2 | ----------------------------------------- 3 | 4 | ## Paper and Poster 5 | [Paper](https://github.com/ucbrise/caravel/raw/master/assets/paper.pdf) 6 | 7 | [Poster](https://github.com/ucbrise/caravel/raw/master/assets/poster.pdf) 8 | 9 | ## Abstract 10 | Serving deep neural networks in latency-critical interactive settings often requires GPU acceleration. However, the small batch sizes typical of online inference result in poor GPU utilization, a performance gap that GPU resource sharing can address. 11 | 12 | In this paper, we explore several techniques that leverage both temporal and spatial multiplexing to improve GPU utilization for deep learning inference workloads. We evaluate the performance trade-offs of each approach with respect to resource efficiency, latency predictability, and isolation, compared with conventional batched inference. 13 | 14 | Our experimental analysis suggests up to a 5x potential improvement in utilization through more advanced spatial and temporal multiplexing strategies. Our preliminary prototype of a dynamic space-time scheduler demonstrates a 3.23x floating-point throughput increase over space-only multiplexing and a 7.73x increase over time-only multiplexing for convolutions, while also providing better isolation and latency predictability.
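## Running the Benchmarks

All experiments live under `space_time_benchmarks` and are driven by its Makefile: `make setup` clones `tensorflow/models`, downloads the model checkpoints, and syncs previous results from S3, while `make run-all` uses `src/experiments/generate.py` to emit a `Makefile.run` covering every (approach, model, replica count) configuration and then executes it, logging to `run-all.log`. The generated targets toggle NVIDIA MPS via `bin/start-mps.sh` and `bin/stop-mps.sh`, so a CUDA-capable host with MPS available (and AWS credentials for the results bucket) is assumed.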
15 | 16 | 17 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-dinky -------------------------------------------------------------------------------- /assets/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbrise/caravel/0588b54a77b95d91665ddc4b2686e9077fcb0dc0/assets/paper.pdf -------------------------------------------------------------------------------- /assets/poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ucbrise/caravel/0588b54a77b95d91665ddc4b2686e9077fcb0dc0/assets/poster.pdf -------------------------------------------------------------------------------- /space_time_benchmarks/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | echo "Please don't just run make" 3 | echo "Run me with make setup or make run" 4 | 5 | setup: clone-repos ckpts s3-dir 6 | 7 | clone-repos: 8 | git clone http://github.com/tensorflow/models.git tf-models 9 | 10 | ckpts: 11 | bash bin/download-ckpts.sh 12 | 13 | s3-dir: 14 | mkdir learningsys-2018-gpu-mux 15 | aws s3 sync s3://learningsys-2018-gpu-mux learningsys-2018-gpu-mux 16 | 17 | format: 18 | bash bin/format-code.sh 19 | jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace notebooks/*.ipynb 20 | 21 | sync: 22 | cd learningsys-2018-gpu-mux && bash sync.sh 23 | 24 | push: format sync 25 | git commit -a 26 | git push origin master 27 | 28 | run-all: 29 | python src/experiments/generate.py > Makefile.run 30 | make all -f Makefile.run | tee run-all.log 31 | 32 | -------------------------------------------------------------------------------- /space_time_benchmarks/bin/download-ckpts.sh: -------------------------------------------------------------------------------- 1 | mkdir ckpts 2 | cd ckpts 3 | 4 | wget http://download.tensorflow.org/models/resnet_v1_50_2016_08_28.tar.gz 5 | tar xzf resnet_v1_50_2016_08_28.tar.gz 6 | rm resnet_v1_50_2016_08_28.tar.gz 7 | 8 | wget http://download.tensorflow.org/models/resnet_v1_152_2016_08_28.tar.gz 9 | tar xzf resnet_v1_152_2016_08_28.tar.gz 10 | rm resnet_v1_152_2016_08_28.tar.gz 11 | 12 | wget https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_96.tgz 13 | tar -xvf mobilenet_v2_1.0_96.tgz 14 | rm mobilenet_v2_1.0_96.tgz 15 | 16 | wget https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz 17 | tar -xvf mobilenet_v2_1.0_224.tgz 18 | rm mobilenet_v2_1.0_224.tgz 19 | 20 | cd .. 
21 | -------------------------------------------------------------------------------- /space_time_benchmarks/bin/format-code.sh: -------------------------------------------------------------------------------- 1 | black src 2 | -------------------------------------------------------------------------------- /space_time_benchmarks/bin/start-mps.sh: -------------------------------------------------------------------------------- 1 | sudo nvidia-cuda-mps-control -d || true 2 | -------------------------------------------------------------------------------- /space_time_benchmarks/bin/stop-mps.sh: -------------------------------------------------------------------------------- 1 | sudo echo quit | sudo nvidia-cuda-mps-control || true 2 | -------------------------------------------------------------------------------- /space_time_benchmarks/notebooks/1-anomoly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "\n", 14 | "import seaborn as sns" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from glob import glob" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "sns.set(style='whitegrid', palette='muted', font='serif')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "dfs = []\n", 42 | "for fn in glob('../learningsys-2018-gpu-mux/p3-8xlarge-random-placement/**/*.pq', recursive=True):\n", 43 | " df = pd.read_parquet(fn)\n", 44 | " approach, model, proc, i = fn.split(\"/\")[4:]\n", 45 | " df['approach'] = approach\n", 46 | " df['model'] = model\n", 47 | " df['n_procs'] = int(proc)\n", 48 | " df['proc'] = int(i.split('.')[0])\n", 49 | " dfs.append(df)\n", 50 | "df = pd.concat(dfs)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# dfs = []\n", 60 | "# for fn in glob('learningsys-2018-gpu-mux/**/*.pq', recursive=True):\n", 61 | "# df = pd.read_parquet(fn)\n", 62 | "# _, _, approach, model, proc, i = fn.split(\"/\")\n", 63 | "# df['approach'] = approach\n", 64 | "# df['model'] = model\n", 65 | "# df['n_procs'] = int(proc)\n", 66 | "# df['proc'] = int(i.split('.')[0])\n", 67 | "# dfs.append(df)\n", 68 | "# df = pd.concat(dfs)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "anomoly = df.groupby(\n", 78 | " ['approach', 'model', 'n_procs', 'proc']\n", 79 | ")[['duration_ms']].mean().reset_index().groupby(\n", 80 | " ['approach', 'model', 'n_procs'])[['duration_ms']].apply(lambda s: s.max() - s.min()).reset_index()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "anomoly = anomoly[(anomoly['duration_ms']>0.1) & (anomoly['approach'] == 'mps')]" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 
102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "plt.figure(figsize=(18,6))\n", 106 | "plt.subplot(1,3,1)\n", 107 | "sns.pointplot(x='n_procs', y='duration_ms', hue='model', data=anomoly)\n", 108 | "plt.axhline(0.1, label='0.1 lower bound')\n", 109 | "plt.legend()\n", 110 | "plt.xlabel('Number of Replicas')\n", 111 | "plt.ylabel('Latency (ms)')\n", 112 | "plt.title('Max difference between processes')\n", 113 | "\n", 114 | "plt.subplot(1,3, 2)\n", 115 | "res152_5_procs = df[\n", 116 | " (df['model'] == 'mobilenet') & \n", 117 | " (df['n_procs'] == 16) & \n", 118 | " (df['approach'] == 'mps')][['duration_ms', 'proc']]\n", 119 | "grouped = res152_5_procs.groupby('proc')[['duration_ms']].rolling(20).mean().dropna().reset_index()\n", 120 | "grouped['level_1'] = grouped['level_1'].astype(int)\n", 121 | "sns.lineplot(x='level_1', y='duration_ms', hue='proc', data=grouped)\n", 122 | "plt.xlabel('Query IDs')\n", 123 | "plt.ylabel('Latency (ms)')\n", 124 | "plt.title('Example: 16 Processes')\n", 125 | "\n", 126 | "\n", 127 | "plt.subplot(1,3,3)\n", 128 | "res152_5_procs = df[\n", 129 | " (df['model'] == 'mobilenet') & \n", 130 | " (df['n_procs'] == 15) & \n", 131 | " (df['approach'] == 'mps')][['duration_ms', 'proc']]\n", 132 | "grouped = res152_5_procs.groupby('proc')[['duration_ms']].rolling(20).mean().dropna().reset_index()\n", 133 | "grouped['level_1'] = grouped['level_1'].astype(int)\n", 134 | "sns.lineplot(x='level_1', y='duration_ms', hue='proc', data=grouped)\n", 135 | "plt.xlabel('Query IDs')\n", 136 | "plt.ylabel('Latency (ms)')\n", 137 | "plt.title('Example: 15 Processes')\n", 138 | "\n", 139 | "plt.savefig('anomoly.png', dpi=300)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "grouped.groupby('proc')['duration_ms'].mean().sort_values()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "res152_5_procs" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "for _, row in anomoly.iterrows():\n", 181 | " print(f\"{row['approach']}-{row['model']}-{row['n_procs']}\")" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "len(anomoly)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [] 199 | } 200 | ], 201 | "metadata": { 202 | "kernelspec": { 203 | "display_name": "Environment (conda_tensorflow_p36)", 204 | "language": "python", 205 | "name": "conda_tensorflow_p36" 206 | }, 207 | "language_info": { 208 | "codemirror_mode": { 209 | "name": "ipython", 210 | "version": 3 211 | }, 212 | "file_extension": ".py", 213 | "mimetype": "text/x-python", 214 | "name": "python", 215 | "nbconvert_exporter": "python", 216 | "pygments_lexer": "ipython3", 217 | "version": "3.6.5" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 2 222 | } 223 | -------------------------------------------------------------------------------- 
/space_time_benchmarks/notebooks/2-basic-plots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "\n", 14 | "import seaborn as sns\n", 15 | "sns.set(style='whitegrid', palette='muted', font='serif')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from glob import glob" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "def plot_line(ax, model_name, estimator):\n", 34 | " sns.lineplot(\n", 35 | " x='n_procs', \n", 36 | " y='duration_ms', \n", 37 | " hue='approach',\n", 38 | " style='approach',\n", 39 | " markers=True,\n", 40 | " data=df[df['model'] == model_name], \n", 41 | " estimator=estimator,\n", 42 | " ci=None,\n", 43 | " ax=ax\n", 44 | " )" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "def plot_model(model_name):\n", 54 | " fig, (ax_mean, ax_p99) = plt.subplots(1,2,sharey=True, figsize=(15,7))\n", 55 | " plot_line(ax_mean, model_name, lambda s: np.mean(s))\n", 56 | " plot_line(ax_p99, model_name, lambda s: np.percentile(s, 99))\n", 57 | " ax_mean.set_xlabel(\"Number of Replica\")\n", 58 | " ax_p99.set_xlabel(\"Number of Replica\")\n", 59 | " ax_mean.set_ylabel(\"Latency (ms)\")\n", 60 | " ax_mean.set_title(\"Mean Latency\")\n", 61 | " ax_p99.set_title(\"P99 Latency\")\n", 62 | " fig.suptitle(model_name)\n", 63 | " fig.savefig(model_name+'.png')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "dfs = []\n", 73 | "for fn in glob('../learningsys-2018-gpu-mux/p3-8xlarge-random-placement/**/*.pq', recursive=True):\n", 74 | " df = pd.read_parquet(fn)\n", 75 | " approach, model, proc, i = fn.split(\"/\")[4:]\n", 76 | " df['approach'] = approach\n", 77 | " df['model'] = model\n", 78 | " df['n_procs'] = int(proc)\n", 79 | " df['proc'] = i.split('.')[0]\n", 80 | " dfs.append(df)\n", 81 | "df = pd.concat(dfs)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "mean = df[df['approach'] == 'mps'].groupby(['n_procs', 'proc'])['duration_ms'].mean().reset_index()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "mean.plot.scatter(x='n_procs', y='duration_ms', figsize=(8,6), marker='+')\n", 100 | "plt.xlabel(\"Number of Replica\")\n", 101 | "plt.ylabel(\"Mean Latency (ms)\")\n", 102 | "plt.title(\"Mean Latency per process, Mobilenet with MPS\")\n", 103 | "plt.savefig(\"scatter.png\",dpi=400)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "plot_model('mobilenet')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "df_two_on_one = df[(df['approach'] == 'mps') & (df['model'] == 'mobilenet')].copy()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 
| "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "random = df[(df['approach'] == 'mps') & (df['model'] == 'mobilenet')]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "random['placement'] = 'random'\n", 140 | "df_two_on_one['placement'] = 'two_per_core'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "new_df = pd.concat([random, df_two_on_one])" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "new_df.head(1)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "plt.figure(figsize=(8,6))\n", 168 | "sns.lineplot(\n", 169 | " x='n_procs', \n", 170 | " y='duration_ms', \n", 171 | " hue='placement',\n", 172 | " style='placement',\n", 173 | " markers=True,\n", 174 | " data=new_df, \n", 175 | " estimator=np.mean,\n", 176 | " ci=None,\n", 177 | " )\n", 178 | "plt.title(\"Pin-to-core Placement Policy\")" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "plot_model('mobilenet')" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# plot_model('mobilenet-224')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "# plot_model('res50')" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# plot_model('res152')" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Environment (conda_tensorflow_p36)", 242 | "language": "python", 243 | "name": "conda_tensorflow_p36" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.6.5" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /space_time_benchmarks/notebooks/3-mps-batch-plots.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "\n", 14 | "import seaborn as sns\n", 15 | "sns.set(style='whitegrid', palette='muted', font='serif')" 16 | ] 17 | 
}, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from glob import glob" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "dfs = []\n", 34 | "for fn in glob('learningsys-2018-gpu-mux/**/*.pq', recursive=True):\n", 35 | " df = pd.read_parquet(fn)\n", 36 | " _, _, approach, model, proc, i = fn.split(\"/\")\n", 37 | " df['approach'] = approach\n", 38 | " df['model'] = model\n", 39 | " df['n_procs'] = int(proc)\n", 40 | " df['proc'] = i.split('.')[0]\n", 41 | " dfs.append(df)\n", 42 | "df = pd.concat(dfs)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = df[df['approach'].isin(['mps', 'mps-even', 'mps-even-times-2', 'batch'])]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def plot_line(ax, model_name, estimator):\n", 61 | " sns.lineplot(\n", 62 | " x='n_procs', \n", 63 | " y='duration_ms', \n", 64 | " hue='approach',\n", 65 | " style='approach',\n", 66 | " markers=True,\n", 67 | " data=df[df['model'] == model_name], \n", 68 | " estimator=estimator,\n", 69 | " ci=None,\n", 70 | " ax=ax\n", 71 | " )" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "def plot_model(model_name):\n", 81 | " fig, (ax_mean, ax_p99) = plt.subplots(1,2,sharey=True, figsize=(10,5))\n", 82 | " plot_line(ax_mean, model_name, lambda s: np.mean(s))\n", 83 | " plot_line(ax_p99, model_name, lambda s: np.percentile(s, 99))\n", 84 | " ax_mean.set_xlabel(\"Number of Replica\")\n", 85 | " ax_p99.set_xlabel(\"Number of Replica\")\n", 86 | " ax_mean.set_ylabel(\"Latency (ms)\")\n", 87 | " ax_mean.set_title(\"Mean Latency\")\n", 88 | " ax_p99.set_title(\"P99 Latency\")\n", 89 | " fig.suptitle(model_name)\n", 90 | " fig.savefig(model_name+'.png')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "plot_model('mobilenet')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "plot_model('mobilenet-224')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "plot_model('res50')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "plot_model('res152')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Environment (conda_tensorflow_p36)", 140 | "language": "python", 141 | "name": "conda_tensorflow_p36" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 3 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython3", 153 | "version": "3.6.0" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | 
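The plots above compare the 'mps', 'mps-even', 'mps-even-times-2', and 'batch' approaches by mean and P99 latency. A tabular companion to `plot_model` (a sketch using the same columns and estimators, not part of the repo) can make the comparison easier to read off:

```python
def latency_summary(df):
    """Mean and P99 latency (ms) per (approach, model, n_procs),
    using the same estimators as plot_model above."""
    g = df.groupby(["approach", "model", "n_procs"])["duration_ms"]
    summary = g.mean().to_frame("mean_ms")
    summary["p99_ms"] = g.quantile(0.99)
    return summary.reset_index()
```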
-------------------------------------------------------------------------------- /space_time_benchmarks/notebooks/4-anomoly-mps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "%matplotlib inline\n", 13 | "\n", 14 | "import seaborn as sns" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from glob import glob" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "dfs = []\n", 33 | "for fn in glob('learningsys-2018-gpu-mux/**/*.pq', recursive=True):\n", 34 | " df = pd.read_parquet(fn)\n", 35 | " _, _, approach, model, proc, i = fn.split(\"/\")\n", 36 | " df['approach'] = approach\n", 37 | " df['model'] = model\n", 38 | " df['n_procs'] = int(proc)\n", 39 | " df['proc'] = int(i.split('.')[0])\n", 40 | " dfs.append(df)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "df = pd.concat(dfs)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "df = df[df['approach'].isin(['mps', 'mps-even', 'mps-even-times-2', 'batch'])]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "anomoly = df.groupby(\n", 68 | " ['approach', 'model', 'n_procs', 'proc']\n", 69 | ")[['duration_ms']].mean().reset_index().groupby(\n", 70 | " ['approach', 'model', 'n_procs'])[['duration_ms']].apply(lambda s: s.max() - s.min()).reset_index()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "anomoly = anomoly[anomoly['duration_ms']>0.1]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "anomoly" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "sns.set(style='whitegrid', palette='muted', font='serif')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "def plot_anomoly(approach):\n", 114 | " plt.figure(figsize=(16,6))\n", 115 | " plt.subplot(1,2,1)\n", 116 | " sns.pointplot(x='n_procs', y='duration_ms', hue='model', data=anomoly[anomoly['approach'] == approach])\n", 117 | " plt.axhline(0.1, label='0.1 lower bound')\n", 118 | " plt.legend()\n", 119 | " plt.xlabel('Number of Replicas')\n", 120 | " plt.ylabel(r'Mean Latency $\\Delta$ (ms)')\n", 121 | " plt.title('Max difference between processes')\n", 122 | "\n", 123 | " plt.subplot(1,2,2)\n", 124 | " res152_5_procs = df[(df['model'] == 'res152') & (df['n_procs'] == 5) & (df['approach'] == approach)][['duration_ms', 'proc', 'query_id']]\n", 125 | " sns.lineplot(x='query_id', y='duration_ms', hue='proc', data=res152_5_procs)\n", 126 | " plt.xlabel('Query 
IDs')\n", 127 | " plt.ylabel('Latency (ms)')\n", 128 | " plt.title(f'Example: Res152, {approach}, 5 Processes')\n", 129 | "\n", 130 | " plt.savefig(f'anomoly-{approach}.png', dpi=300)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "plot_anomoly('mps')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "plot_anomoly('mps-even')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "plot_anomoly('mps-even-times-2')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [] 173 | } 174 | ], 175 | "metadata": { 176 | "kernelspec": { 177 | "display_name": "Environment (conda_tensorflow_p36)", 178 | "language": "python", 179 | "name": "conda_tensorflow_p36" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.6.0" 192 | } 193 | }, 194 | "nbformat": 4, 195 | "nbformat_minor": 2 196 | } 197 | -------------------------------------------------------------------------------- /space_time_benchmarks/notebooks/5-POC-Threading-With-Torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%timeit\n", 10 | "model(inp)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import torch\n", 20 | "import torchvision" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import numpy as np" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "torch.backends.cudnn.enabled = False" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "def profile_in_thread():\n", 57 | " times = []\n", 58 | " stream = torch.cuda.Stream()\n", 59 | " with torch.cuda.stream(stream):\n", 60 | " model = torchvision.models.resnet50().cuda()\n", 61 | " inp = torch.randn(1,3,224,224).cuda()\n", 62 | " torch.cuda.synchronize()\n", 63 | " print(\"Model started\")\n", 64 | " for i in range(100):\n", 65 | " a = time.perf_counter()\n", 66 | " with torch.no_grad():\n", 67 | " model(inp)\n", 68 | "# torch.cuda.synchronize()\n", 69 | " b = time.perf_counter()\n", 70 | " times.append((b-a)*1000)\n", 71 | " print(i, (b-a)*1000)\n", 72 | " print(\"Perf Done\")\n", 73 | "# times = times[200:]\n", 74 | " print(np.mean(times), np.percentile(times, 99), np.std(times))" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | 
"execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "import threading" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "ts = [threading.Thread(target=profile_in_thread) for _ in range(3)]\n", 93 | "[t.start() for t in ts]\n", 94 | "results = [t.join() for t in ts]" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.6.5" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /space_time_benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | ujson 2 | black 3 | clicks 4 | pyzmq 5 | tensorflow-gpu==1.10.0 6 | redis 7 | torch 8 | torchvision 9 | python-snappy 10 | fastparquet 11 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/client.py: -------------------------------------------------------------------------------- 1 | import time 2 | import click 3 | import redis 4 | import os 5 | import pandas as pd 6 | import models 7 | import numpy as np 8 | 9 | 10 | def _block_until(key, val): 11 | """A semaphore implemented in redis""" 12 | r = redis.Redis() 13 | r.incr(key) 14 | while int(r.get(key)) != val: 15 | pass 16 | 17 | 18 | def _log(*args, **kwargs): 19 | print("[Client]", *args, **kwargs) 20 | 21 | 22 | @click.command() 23 | @click.option("--mem-frac", type=float, required=True) 24 | @click.option("--allow-growth", is_flag=True) 25 | @click.option("--num-replicas", type=int, required=True) 26 | @click.option("--model-name", type=click.Choice(models.SUPPORTED_MODELS), required=True) 27 | @click.option("--power-graph", is_flag=True) 28 | @click.option("--power-graph-count", type=int, default=1) 29 | @click.option("--batch-size", type=int, default=1) 30 | @click.option("--result-path", required=True) 31 | @click.option("--force", is_flag=True) 32 | def start_client( 33 | mem_frac, 34 | allow_growth, 35 | num_replicas, 36 | model_name, 37 | power_graph, 38 | power_graph_count, 39 | batch_size, 40 | result_path, 41 | force, 42 | ): 43 | if os.path.exists(result_path) and not force: 44 | _log(f"Path {result_path} exists. 
Skipping") 45 | return 46 | os.makedirs(os.path.split(result_path)[0], exist_ok=True) 47 | 48 | require_locks = not (power_graph or batch_size != 1) 49 | print(f"Require Locks {require_locks}") 50 | # Load Model 51 | if model_name.startswith("torch_"): 52 | sess_run, threads = models.get_model_pytorch( 53 | model_name, power_graph, power_graph_count, batch_size 54 | ) 55 | else: 56 | sess_run = models.get_model( 57 | model_name, 58 | power_graph, 59 | power_graph_count, 60 | batch_size, 61 | mem_frac, 62 | allow_growth, 63 | ) 64 | 65 | if require_locks: 66 | _block_until("connect-lock", num_replicas) 67 | _log("Model Loaded") 68 | 69 | # Model Warmup 70 | for _ in range(200): 71 | sess_run() 72 | _log("Warmup finished") 73 | if require_locks: 74 | _block_until("warmup-lock", num_replicas) 75 | 76 | # Model Evaluation 77 | durations = [] 78 | for _ in range(2000): 79 | start = time.perf_counter() 80 | sess_run() 81 | end = time.perf_counter() 82 | duration_ms = (end - start) * 1000 83 | durations.append(duration_ms) 84 | 85 | durations = durations[500:1500] 86 | 87 | if require_locks: 88 | _block_until("exit-lock", num_replicas) 89 | # Save Data 90 | df = pd.DataFrame({"duration_ms": durations}) 91 | df.to_csv(result_path) 92 | mean, p99 = df["duration_ms"].mean(), np.percentile(durations, 99) 93 | _log(f"Mean Latency: {mean}, P99: {p99}") 94 | import sys 95 | 96 | sys.exit(0) 97 | 98 | 99 | if __name__ == "__main__": 100 | start_client() 101 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/experiments/config.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/experiments/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from itertools import product 3 | import time 4 | 5 | approaches = ["mux", "mps", "batch"] 6 | # models = ["res50", "res152", "mobilenet"] 7 | models = ["torch_res50", "torch_res152", "torch_squeezenet"] 8 | replicas = { 9 | "torch_res50": list(range(1, 10)), 10 | "torch_res152": list(range(1, 5)), 11 | "torch_squeezenet": list(range(1, 15)), 12 | "res50": list(range(1, 15)), 13 | "res152": list(range(1, 7)), 14 | "mobilenet": list(range(1, 21)), 15 | } 16 | # placement_policy = {"mux": [0, 1, 2, 4], "mps": [0, 1, 2, 4], "batch": [0, 1]} 17 | placement_policy = {"mux": [1], "mps": [1], "batch": [1]} 18 | result_dir_root = "learningsys-2018-gpu-mux" 19 | force = [False, True] 20 | 21 | 22 | def generate_command(approach, model, replica, force, placement_policy): 23 | mps_req = "start-mps" if approach.startswith("mps") else "stop-mps" 24 | 25 | name = f"{approach}-{model}-{replica}-pp{placement_policy}" 26 | if force: 27 | name += "-force" 28 | 29 | cmd = f"""python src/master.py --result-dir \ 30 | {os.path.join(result_dir_root, 'result', approach, model, str(replica), str(placement_policy))} \ 31 | --num-procs {replica} --model-name {model} --placement-policy {placement_policy} \ 32 | """ 33 | if approach == "powergraph": 34 | cmd += "\t --power-graph" 35 | if approach == "batch": 36 | cmd += "\t --batch" 37 | if force: 38 | cmd += "\t --force" 39 | 40 | print( 41 | f""" 42 | {name}: 43 | \t bash bin/{mps_req}.sh 44 | \t {cmd} 45 | """ 46 | ) 47 | 48 | return name 49 | 50 | 51 | def main(): 52 | all_names = [] 53 | for approach in approaches: 54 | for model in models: 55 | for replica in 
replicas[model]: 56 | for pp in placement_policy[approach]: 57 | if "res152" in model and approach == "batch": 58 | continue 59 | # Special Case 60 | # Takes too long, b.c. unified memory? 61 | if pp == 4 and replica == 20: 62 | continue 63 | 64 | all_names.append( 65 | generate_command( 66 | approach, model, replica, force=False, placement_policy=pp 67 | ) 68 | ) 69 | 70 | print( 71 | f""" 72 | all: {' '.join(all_names)} 73 | """ 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/master.py: -------------------------------------------------------------------------------- 1 | import os 2 | import socket 3 | import time 4 | from shlex import split 5 | from subprocess import Popen 6 | import click 7 | import redis 8 | import numpy as np 9 | 10 | TOTAL_CORES = 32 11 | 12 | 13 | class ClientRun: 14 | def __init__(self, model_name, result_path, num_procs): 15 | self.model_name = model_name 16 | self.result_path = result_path 17 | self.num_proc = num_procs 18 | 19 | self.mem_frac = str(0.95 / self.num_proc) 20 | self.allow_growth = False 21 | 22 | self.power_graph = False 23 | self.batch_size = 1 24 | self.force = False 25 | 26 | self.proc = None 27 | 28 | self.core = None 29 | 30 | def run(self): 31 | cmd = ["python", "src/client.py"] 32 | cmd += ["--model-name", self.model_name] 33 | cmd += ["--result-path", self.result_path] 34 | cmd += ["--num-replicas", self.num_proc] 35 | cmd += ["--mem-frac", self.mem_frac] 36 | if self.allow_growth: 37 | cmd += ["--allow-growth"] 38 | if self.power_graph: 39 | cmd += ["--power-graph", "--power-graph-count", self.power_graph_count] 40 | if self.batch_size != 1: 41 | cmd += ["--batch-size", self.batch_size] 42 | if self.force: 43 | cmd += ["--force"] 44 | 45 | if self.core is not None: 46 | cmd = ["numactl", "-C", self.core] + cmd 47 | 48 | cmd = split(" ".join(map(str, cmd))) 49 | print(" ".join(cmd)) 50 | self.proc = Popen(cmd, env=dict(os.environ, CUDA_VISIBLE_DEVICES="0")) 51 | 52 | def wait(self): 53 | self.proc.wait() 54 | 55 | def set_tf_mem_frac(self, frac): 56 | self.mem_frac = str(frac) 57 | 58 | def set_tf_allow_growth(self): 59 | self.allow_growth = True 60 | 61 | def set_power_graph(self, n): 62 | self.power_graph = True 63 | self.power_graph_count = n 64 | 65 | def set_batch_size(self, n): 66 | self.batch_size = n 67 | 68 | def set_force(self): 69 | self.force = True 70 | 71 | def set_core(self, core): 72 | self.core = core 73 | 74 | @property 75 | def returncode(self): 76 | return self.proc.returncode 77 | 78 | def set_num_proc(self, num_proc): 79 | self.num_proc = num_proc 80 | 81 | 82 | @click.command() 83 | @click.option("--mem-frac", type=float) 84 | @click.option("--allow-growth", is_flag=True) 85 | @click.option("--result-dir", required=True) 86 | @click.option("--num-procs", "-n", type=int, default=5, required=True) 87 | @click.option("--model-name", required=True) 88 | @click.option("--power-graph", is_flag=True) 89 | @click.option("--force", is_flag=True) 90 | @click.option("--batch", is_flag=True) 91 | @click.option("--placement-policy", type=int, default=1) 92 | def master( 93 | mem_frac, 94 | allow_growth, 95 | result_dir, 96 | num_procs, 97 | model_name, 98 | power_graph, 99 | force, 100 | batch, 101 | placement_policy, 102 | ): 103 | 104 | # reset the warmup lock 105 | r = redis.Redis() 106 | r.set("warmup-lock", 0) 107 | r.set("connect-lock", 0) 108 | r.set("exit-lock", 0) 109 | 110 | clients = [ 111 | 
ClientRun(model_name, os.path.join(result_dir, f"{i}.pq"), num_procs) 112 | for i in range(num_procs) 113 | ] 114 | if mem_frac: 115 | [c.set_tf_mem_frac(mem_frac) for c in clients] 116 | 117 | if allow_growth: 118 | [c.set_tf_allow_growth() for c in clients] 119 | 120 | if batch: 121 | batch_client = clients[0] 122 | batch_client.set_batch_size(num_procs) 123 | batch_client.set_num_proc(1) 124 | clients = [batch_client] 125 | 126 | if force: 127 | [c.set_force() for c in clients] 128 | 129 | if power_graph: 130 | power_graph_client = clients[0] 131 | power_graph_client.set_power_graph(num_procs) 132 | clients = [power_graph_client] 133 | 134 | if placement_policy != 0: # not random placement 135 | if TOTAL_CORES * placement_policy < len(clients): 136 | raise Exception( 137 | f"We have {num_procs} clients but we can only fit {TOTAL_CORES * placement_policy}. Please change the placement policy" 138 | ) 139 | all_cores = np.arange(TOTAL_CORES) 140 | expanded = np.repeat(all_cores, int(placement_policy)) 141 | for core, client in zip(expanded, clients): 142 | client.set_core(core) 143 | 144 | [c.run() for c in clients] 145 | [c.wait() for c in clients] 146 | print([c.returncode for c in clients]) 147 | 148 | 149 | if __name__ == "__main__": 150 | master() 151 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/models/__init__.py: -------------------------------------------------------------------------------- 1 | SUPPORTED_MODELS = ["res50", "res152", "mobilenet", "mobilenet-224"] 2 | SUPPORTED_MODELS += ["torch_res50", "torch_res152", "torch_squeezenet"] 3 | 4 | 5 | def get_model( 6 | model_name, powergraph, power_graph_count, batch_size, mem_frac, allow_growth 7 | ): 8 | """Return a lambda closure""" 9 | 10 | from .tf_models import ( 11 | SUPPORTED_MODELS as tf_models, 12 | load_tf_power_graph, 13 | get_input, 14 | load_tf_sess, 15 | ) 16 | 17 | assert model_name in tf_models 18 | 19 | if powergraph: 20 | sess, img_tensors, predictions = load_tf_power_graph( 21 | mem_frac, allow_growth, model_name, power_graph_count 22 | ) 23 | sess_run = lambda: sess.run(predictions) 24 | else: 25 | sess, img_tensor, predictions, _graph = load_tf_sess( 26 | mem_frac, allow_growth, model_name, batch_size 27 | ) 28 | sess_run = lambda: sess.run(predictions) 29 | 30 | return sess_run 31 | 32 | 33 | def get_model_pytorch( 34 | model_name, 35 | powergraph, 36 | power_graph_count, 37 | batch_size, 38 | mem_frac=None, 39 | allow_growth=None, 40 | ): 41 | from .torch_models import load_torch_model, load_torch_power_graph 42 | import torch 43 | 44 | torch.backends.cudnn.enabled = False 45 | 46 | if powergraph and batch_size == 1: 47 | inp_qs, out_qs, ts = load_torch_power_graph(model_name, power_graph_count) 48 | 49 | def run_one_predict(): 50 | [inp_q.put("") for inp_q in inp_qs] 51 | [out_q.get() for out_q in out_qs] 52 | 53 | return run_one_predict, ts 54 | else: 55 | stream = torch.cuda.Stream() 56 | 57 | with torch.cuda.stream(stream): 58 | model, inp = load_torch_model(model_name, batch_size) 59 | 60 | def run_one_predict(): 61 | with torch.cuda.stream(stream): 62 | with torch.no_grad(): 63 | model(inp) 64 | 65 | return run_one_predict, None 66 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/models/tf_models.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import click 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 |
import tensorflow.contrib.slim as slim 8 | from tensorflow.contrib.slim.nets import resnet_v1 9 | 10 | sys.path.append("tf-models/research/slim") 11 | from nets.mobilenet import mobilenet_v2 # isort:skip 12 | 13 | 14 | SUPPORTED_MODELS = ["res50", "res152", "mobilenet", "mobilenet-224"] 15 | MODELS_TO_CKPT = { 16 | "res50": "ckpts/resnet_v1_50.ckpt", 17 | "res152": "ckpts/resnet_v1_152.ckpt", 18 | "mobilenet": "ckpts/mobilenet_v2_1.0_96.ckpt", 19 | "mobilenet-224": "ckpts/mobilenet_v2_1.0_224.ckpt", 20 | } 21 | MODELS_TO_SHAPE = { 22 | "res50": [1, 224, 224, 3], 23 | "res152": [1, 224, 224, 3], 24 | "mobilenet": [1, 96, 96, 3], 25 | "mobilenet-224": [1, 224, 224, 3], 26 | } 27 | 28 | 29 | def get_input(model_name, batch_size=1): 30 | shape = list(MODELS_TO_SHAPE[model_name]) # copy so we don't mutate the shared entry 31 | shape[0] = batch_size 32 | with tf.device("/gpu:0"): 33 | tensor = tf.random_normal(shape) 34 | return tensor 35 | 36 | 37 | def _get_endpoints(model_name, img_tensor): 38 | if model_name == "res50": 39 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 40 | _, end_points = resnet_v1.resnet_v1_50(img_tensor, 1000, is_training=False) 41 | return end_points["predictions"] 42 | 43 | elif model_name == "res152": 44 | with slim.arg_scope(resnet_v1.resnet_arg_scope()): 45 | _, end_points = resnet_v1.resnet_v1_152(img_tensor, 1000, is_training=False) 46 | return end_points["predictions"] 47 | 48 | elif model_name.startswith("mobilenet"): 49 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope(is_training=False)): 50 | _, endpoints = mobilenet_v2.mobilenet(img_tensor) 51 | return endpoints["Predictions"] 52 | 53 | 54 | def load_tf_sess(mem_frac=0.1, allow_growth=False, model_name=None, batch_size=1): 55 | graph = tf.Graph() 56 | with graph.as_default(): 57 | shape = list(MODELS_TO_SHAPE[model_name]) # copy so we don't mutate the shared entry 58 | shape[0] = batch_size 59 | img_tensor = tf.constant( 60 | np.random.randn(*shape).astype(np.float32) 61 | ) 62 | # a placeholder here would break callers, which run sess.run(predictions) with no feed_dict 63 | predictions = _get_endpoints(model_name, img_tensor) 64 | saver = tf.train.Saver() 65 | config = tf.ConfigProto() 66 | config.gpu_options.allow_growth = allow_growth 67 | config.gpu_options.per_process_gpu_memory_fraction = mem_frac 68 | config.gpu_options.allocator_type = "BFC" 69 | sess = tf.Session(config=config, graph=graph) 70 | saver.restore(sess, MODELS_TO_CKPT[model_name]) 71 | return sess, img_tensor, predictions, graph 72 | 73 | 74 | # How to merge many subgraphs into a single graph 75 | # https://stackoverflow.com/questions/42858785/connect-input-and-output-tensors-of-two-different-graphs-tensorflow 76 | 77 | 78 | def _create_subgraph(name_scope, ckpt_path, model_name): 79 | graph = tf.Graph() 80 | with graph.as_default(): 81 | with tf.name_scope(name_scope): 82 | inp = tf.constant( 83 | np.random.randn(*MODELS_TO_SHAPE[model_name]).astype(np.float32) 84 | ) 85 | predictions = _get_endpoints(model_name, inp) 86 | saver = tf.train.Saver() 87 | with tf.Session() as sess: 88 | saver.restore(sess, MODELS_TO_CKPT[model_name]) 89 | saver.save(sess, ckpt_path) 90 | return graph.as_graph_def(), inp.name, predictions.name, saver 91 | 92 | 93 | def load_tf_power_graph(mem_frac=0.1, allow_growth=False, model_name=None, num_graph=1): 94 | graph = tf.Graph() 95 | inps, outs, savers, ckpt_paths = [], [], [], [] 96 | 97 | with graph.as_default(): 98 | for i in range(num_graph): 99 | path = f"/tmp/graph_{i}.ckpt" 100 | graph_def, inp_name, out_name, saver = _create_subgraph( 101 | f"graph_{i}", path, model_name 102 | ) 103 | 104 | inp, out = tf.import_graph_def( 105 |
graph_def, return_elements=[inp_name, out_name], name="" 106 | ) 107 | 108 | inps.append(inp) 109 | outs.append(out) 110 | savers.append(saver) 111 | ckpt_paths.append(path) 112 | 113 | config = tf.ConfigProto() 114 | config.gpu_options.allow_growth = allow_growth 115 | config.gpu_options.per_process_gpu_memory_fraction = mem_frac 116 | sess = tf.Session(config=config, graph=graph) 117 | 118 | for saver, path in zip(savers, ckpt_paths): 119 | saver.restore(sess, path) 120 | 121 | return sess, inps, outs 122 | -------------------------------------------------------------------------------- /space_time_benchmarks/src/models/torch_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchvision 3 | from threading import Thread 4 | from queue import Queue 5 | 6 | torch.backends.cudnn.enabled = False 7 | 8 | SUPPORTED_MODELS = ["torch_res50", "torch_res152", "torch_squeezenet"] 9 | MODEL_SHAPES = [1, 3, 224, 224] 10 | 11 | 12 | def load_torch_model(model_name, batch_size): 13 | if model_name == "torch_res50": 14 | model = torchvision.models.resnet50().cuda() 15 | elif model_name == "torch_res152": 16 | model = torchvision.models.resnet152().cuda() 17 | elif model_name == "torch_squeezenet": 18 | model = torchvision.models.squeezenet1_1().cuda() 19 | 20 | shape = list(MODEL_SHAPES) 21 | shape[0] = batch_size 22 | inp = torch.randn(*shape).cuda() 23 | 24 | model(inp) 25 | torch.cuda.empty_cache() 26 | return model, inp 27 | 28 | 29 | def _run_inference_subgraph(inp_queue, out_queue, model_name, batch_size): 30 | with torch.cuda.stream(torch.cuda.Stream()): 31 | model, inp = load_torch_model(model_name, batch_size) 32 | 33 | with torch.no_grad(): 34 | model(inp) 35 | torch.cuda.empty_cache() 36 | 37 | while True: 38 | inp_queue.get() 39 | with torch.no_grad(): 40 | model(inp) 41 | out_queue.put("") 42 | 43 | 44 | def load_torch_power_graph(model_name, power_graph_count, batch_size=1): 45 | inp_qs = [Queue() for _ in range(power_graph_count)] 46 | out_qs = [Queue() for _ in range(power_graph_count)] 47 | ts = [ 48 | Thread( 49 | target=_run_inference_subgraph, args=(inp_q, out_q, model_name, batch_size) 50 | ) 51 | for inp_q, out_q in zip(inp_qs, out_qs) 52 | ] 53 | [t.start() for t in ts] 54 | return inp_qs, out_qs, ts 55 | --------------------------------------------------------------------------------
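For reference, a minimal driver for the queue-based power graph above (a sketch mirroring how `get_model_pytorch` in `src/models/__init__.py` consumes `load_torch_power_graph`; it assumes `src/` is on `PYTHONPATH` and a CUDA device is present):

```python
import time

from models.torch_models import load_torch_power_graph

# Two resnet50 replicas, each running a forward pass on its own
# CUDA stream in its own thread.
inp_qs, out_qs, ts = load_torch_power_graph("torch_res50", power_graph_count=2)


def predict_once():
    # One "prediction" fans a token out to every replica thread and then
    # waits for each of them to complete a forward pass.
    for q in inp_qs:
        q.put("")
    for q in out_qs:
        q.get()


start = time.perf_counter()
for _ in range(100):
    predict_once()
elapsed_ms = (time.perf_counter() - start) * 1000
print(f"{elapsed_ms / 100:.2f} ms per round of 2 concurrent predictions")
# Note: the worker threads loop forever; like src/client.py, a real
# benchmark run exits the process once measurements are saved.
```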