├── .gitattributes
├── README.md
├── LICENSE
└── dataset_tutorial.ipynb

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TensorFlow Dataset Tutorial

This repository contains the notebook used in my Medium article:

https://medium.com/@FrancescoZ/how-to-use-dataset-in-tensorflow-c758ef9e4428
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 FrancescoSaverioZuppichini

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/dataset_tutorial.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.50035296 0.92651365]\n"
     ]
    }
   ],
   "source": [
    "x = np.random.sample((100,2))\n",
    "# make a dataset from a numpy array\n",
    "dataset = tf.data.Dataset.from_tensor_slices(x)\n",
    "\n",
    "iter = dataset.make_one_shot_iterator()\n",
    "el = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    print(sess.run(el))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([0.33327842, 0.90874317]), array([0.02171065]))\n"
     ]
    }
   ],
   "source": [
    "# using two numpy arrays\n",
    "features, labels = (np.random.sample((100,2)), np.random.sample((100,1)))\n",
    "dataset = tf.data.Dataset.from_tensor_slices((features,labels))\n",
    "\n",
    "iter = dataset.make_one_shot_iterator()\n",
    "el = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    print(sess.run(el))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00786543 0.26009214]\n"
     ]
    }
   ],
   "source": [
    "# using a tensor\n",
    "dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))\n",
    "\n",
    "iter = dataset.make_initializable_iterator()\n",
    "el = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(iter.initializer)\n",
    "    print(sess.run(el))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.03433903 0.7280311 ]\n"
     ]
    }
   ],
   "source": [
    "# using a placeholder\n",
    "x = tf.placeholder(tf.float32, shape=[None,2])\n",
    "dataset = tf.data.Dataset.from_tensor_slices(x)\n",
    "\n",
    "data = np.random.sample((100,2))\n",
    "\n",
    "iter = dataset.make_initializable_iterator()\n",
    "el = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(iter.initializer, feed_dict={ x: data })\n",
    "    print(sess.run(el))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1]]\n",
      "[[2]\n",
      " [3]]\n",
      "[[3]\n",
      " [4]\n",
      " [5]]\n"
     ]
    }
   ],
   "source": [
    "# from generator\n",
    "sequence = np.array([[[1]],[[2],[3]],[[3],[4],[5]]])\n",
    "\n",
    "def generator():\n",
    "    for el in sequence:\n",
    "        yield el\n",
    "\n",
    "dataset = tf.data.Dataset.from_generator(generator,\n",
    "                                         output_types=tf.int64,\n",
    "                                         output_shapes=tf.TensorShape([None, 1]))\n",
    "\n",
    "iter = dataset.make_initializable_iterator()\n",
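    "# each sess.run(el) pulls one generated element; its length along the first axis can vary\n",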
"el = iter.get_next()\n", 157 | "\n", 158 | "with tf.Session() as sess:\n", 159 | " sess.run(iter.initializer)\n", 160 | " print(sess.run(el))\n", 161 | " print(sess.run(el))\n", 162 | " print(sess.run(el))\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[array([1., 2.], dtype=float32), array([0.], dtype=float32)]\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "# initializable iterator to switch between data\n", 180 | "EPOCHS = 10\n", 181 | "\n", 182 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 183 | "dataset = tf.data.Dataset.from_tensor_slices((x, y))\n", 184 | "\n", 185 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 186 | "test_data = (np.array([[1,2]]), np.array([[0]]))\n", 187 | "\n", 188 | "iter = dataset.make_initializable_iterator()\n", 189 | "features, labels = iter.get_next()\n", 190 | "\n", 191 | "with tf.Session() as sess:\n", 192 | "# initialise iterator with train data\n", 193 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 194 | " for _ in range(EPOCHS):\n", 195 | " sess.run([features, labels])\n", 196 | "# switch to test data\n", 197 | " sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n", 198 | " print(sess.run([features, labels]))\n", 199 | "\n", 200 | " \n", 201 | " " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "[array([0.94182994, 0.26802265]), array([0.81551463])]\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "# Reinitializable iterator to switch between Datasets\n", 219 | "EPOCHS = 10\n", 220 | "# making fake data using numpy\n", 221 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 222 | "test_data = (np.random.sample((10,2)), np.random.sample((10,1)))\n", 223 | "# create two datasets, one for training and one for test\n", 224 | "train_dataset = tf.data.Dataset.from_tensor_slices(train_data)\n", 225 | "test_dataset = tf.data.Dataset.from_tensor_slices(test_data)\n", 226 | "# create a iterator of the correct shape and type\n", 227 | "iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n", 228 | " train_dataset.output_shapes)\n", 229 | "features, labels = iter.get_next()\n", 230 | "# create the initialisation operations\n", 231 | "train_init_op = iter.make_initializer(train_dataset)\n", 232 | "test_init_op = iter.make_initializer(test_dataset)\n", 233 | "\n", 234 | "with tf.Session() as sess:\n", 235 | " sess.run(train_init_op) # switch to train dataset\n", 236 | " for _ in range(EPOCHS):\n", 237 | " sess.run([features, labels])\n", 238 | " sess.run(test_init_op) # switch to val dataset\n", 239 | " print(sess.run([features, labels]))\n", 240 | "\n", 241 | " \n", 242 | " " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "[0.8552025 0.13344285] [0.24534453]\n", 255 | "[0.23880187 0.2294315 ] [0.77315474]\n", 256 | "[0.763904 0.439595] [0.42727667]\n", 257 | "[0.6563372 0.1366187] [0.02278621]\n", 258 | "[0.71135175 0.394754 ] [0.8552778]\n", 259 | "[0.7329701 0.42924434] [0.43608633]\n", 260 | "[0.8240853 
      "[0.65556693 0.67978406] [0.8228361]\n",
      "[0.02365288 0.18461536] [0.85140544]\n",
      "[0.48037764 0.7320316 ] [0.773141]\n",
      "[0.6671238 0.8491173] [0.45188755]\n"
     ]
    }
   ],
   "source": [
    "# feedable iterator to switch between iterators\n",
    "EPOCHS = 10\n",
    "# making fake data using numpy\n",
    "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n",
    "test_data = (np.random.sample((10,2)), np.random.sample((10,1)))\n",
    "# create placeholder\n",
    "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n",
    "# create two datasets, one for training and one for test\n",
    "train_dataset = tf.data.Dataset.from_tensor_slices((x,y))\n",
    "test_dataset = tf.data.Dataset.from_tensor_slices((x,y))\n",
    "# create the iterators from the datasets\n",
    "train_iterator = train_dataset.make_initializable_iterator()\n",
    "test_iterator = test_dataset.make_initializable_iterator()\n",
    "# same as in the doc https://www.tensorflow.org/programmers_guide/datasets#creating_an_iterator\n",
    "handle = tf.placeholder(tf.string, shape=[])\n",
    "iter = tf.data.Iterator.from_string_handle(\n",
    "    handle, train_dataset.output_types, train_dataset.output_shapes)\n",
    "next_elements = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    train_handle = sess.run(train_iterator.string_handle())\n",
    "    test_handle = sess.run(test_iterator.string_handle())\n",
    "    \n",
    "    # initialise iterators. In our case we could have used one-shot iterators instead,\n",
    "    # feeding the data directly to Dataset.from_tensor_slices, but this\n",
    "    # approach is more general\n",
    "    sess.run(train_iterator.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n",
    "    sess.run(test_iterator.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n",
    "    \n",
    "    for _ in range(EPOCHS):\n",
    "        feats, labs = sess.run(next_elements, feed_dict={handle: train_handle}) # avoid overwriting the x, y placeholders\n",
    "        print(feats, labs)\n",
    "        \n",
    "    feats, labs = sess.run(next_elements, feed_dict={handle: test_handle})\n",
    "    print(feats, labs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0.70861276 0.91522017]\n",
      " [0.993154 0.74425373]\n",
      " [0.42730845 0.03037355]\n",
      " [0.54031161 0.57429001]]\n"
     ]
    }
   ],
   "source": [
    "# BATCHING\n",
    "BATCH_SIZE = 4\n",
    "x = np.random.sample((100,2))\n",
    "# make a dataset from a numpy array\n",
    "dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)\n",
    "\n",
    "iter = dataset.make_one_shot_iterator()\n",
    "el = iter.get_next()\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    print(sess.run(el))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1]\n",
      "[2]\n",
      "[3]\n",
      "[4]\n",
      "[1]\n",
      "[2]\n",
      "[3]\n",
      "[4]\n"
     ]
    }
   ],
   "source": [
    "# REPEAT\n",
    "BATCH_SIZE = 4\n",
    "x = np.array([[1],[2],[3],[4]])\n",
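    "# repeat() restarts the dataset when it is exhausted; with no count it loops indefinitely\n",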
| "# make a dataset from a numpy array\n", 363 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 364 | "dataset = dataset.repeat()\n", 365 | "\n", 366 | "iter = dataset.make_one_shot_iterator()\n", 367 | "el = iter.get_next()\n", 368 | "\n", 369 | "with tf.Session() as sess:\n", 370 | " for _ in range(8):\n", 371 | " print(sess.run(el))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# MAP\n", 381 | "x = np.array([[1],[2],[3],[4]])\n", 382 | "# make a dataset from a numpy array\n", 383 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 384 | "dataset = dataset.map(lambda x: x*2)\n", 385 | "\n", 386 | "iter = dataset.make_one_shot_iterator()\n", 387 | "el = iter.get_next()\n", 388 | "\n", 389 | "with tf.Session() as sess:\n", 390 | "# this will run forever\n", 391 | " for _ in range(len(x)):\n", 392 | " print(sess.run(el))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 12, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "[[3]\n", 405 | " [1]\n", 406 | " [2]\n", 407 | " [4]]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "# SHUFFLE\n", 413 | "BATCH_SIZE = 4\n", 414 | "x = np.array([[1],[2],[3],[4]])\n", 415 | "# make a dataset from a numpy array\n", 416 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 417 | "dataset = dataset.shuffle(buffer_size=100)\n", 418 | "dataset = dataset.batch(BATCH_SIZE)\n", 419 | "\n", 420 | "iter = dataset.make_one_shot_iterator()\n", 421 | "el = iter.get_next()\n", 422 | "\n", 423 | "with tf.Session() as sess:\n", 424 | " print(sess.run(el))" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 13, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Iter: 0, Loss: 0.1913\n", 437 | "Iter: 1, Loss: 0.1814\n", 438 | "Iter: 2, Loss: 0.1720\n", 439 | "Iter: 3, Loss: 0.1631\n", 440 | "Iter: 4, Loss: 0.1547\n", 441 | "Iter: 5, Loss: 0.1469\n", 442 | "Iter: 6, Loss: 0.1397\n", 443 | "Iter: 7, Loss: 0.1329\n", 444 | "Iter: 8, Loss: 0.1267\n", 445 | "Iter: 9, Loss: 0.1210\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "# how to pass the value to a model\n", 451 | "EPOCHS = 10\n", 452 | "BATCH_SIZE = 16\n", 453 | "# using two numpy arrays\n", 454 | "features, labels = (np.array([np.random.sample((100,2))]), \n", 455 | " np.array([np.random.sample((100,1))]))\n", 456 | "\n", 457 | "dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)\n", 458 | "\n", 459 | "iter = dataset.make_one_shot_iterator()\n", 460 | "x, y = iter.get_next()\n", 461 | "\n", 462 | "# make a simple model\n", 463 | "net = tf.layers.dense(x, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 464 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 465 | "prediction = tf.layers.dense(net, 1, activation=tf.tanh)\n", 466 | "\n", 467 | "loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_net() as label\n", 468 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 469 | "\n", 470 | "with tf.Session() as sess:\n", 471 | " sess.run(tf.global_variables_initializer())\n", 472 | " for i in range(EPOCHS):\n", 473 | " _, loss_value = sess.run([train_op, loss])\n", 474 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, loss_value))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 
479 | "execution_count": 18, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "3\n", 487 | "Training...\n", 488 | "Iter: 0, Loss: 1.4389\n", 489 | "Iter: 1, Loss: 1.4704\n", 490 | "Iter: 2, Loss: 1.4081\n", 491 | "Iter: 3, Loss: 1.2877\n", 492 | "Iter: 4, Loss: 1.1842\n", 493 | "Iter: 5, Loss: 1.1944\n", 494 | "Iter: 6, Loss: 1.1166\n", 495 | "Iter: 7, Loss: 0.9924\n", 496 | "Iter: 8, Loss: 0.8997\n", 497 | "Iter: 9, Loss: 0.8817\n", 498 | "Test Loss: 0.836423\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "# Wrapping all together -> Switch between train and test set using Initializable iterator\n", 504 | "EPOCHS = 10\n", 505 | "# create a placeholder to dynamically switch between batch sizes\n", 506 | "batch_size = tf.placeholder(tf.int64)\n", 507 | "BATCH_SIZE = 32\n", 508 | "\n", 509 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 510 | "dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()\n", 511 | "\n", 512 | "# using two numpy arrays\n", 513 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 514 | "test_data = (np.random.sample((20,2)), np.random.sample((20,1)))\n", 515 | "\n", 516 | "iter = dataset.make_initializable_iterator()\n", 517 | "features, labels = iter.get_next()\n", 518 | "# make a simple model\n", 519 | "net = tf.layers.dense(features, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 520 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 521 | "prediction = tf.layers.dense(net, 1, activation=tf.tanh)\n", 522 | "\n", 523 | "loss = tf.losses.mean_squared_error(prediction, labels) # pass the second value from iter.get_net() as label\n", 524 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 525 | "\n", 526 | "n_batches = train_data[0].shape[0] // BATCH_SIZE\n", 527 | "\n", 528 | "with tf.Session() as sess:\n", 529 | " sess.run(tf.global_variables_initializer())\n", 530 | " # initialise iterator with train data\n", 531 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1], batch_size: BATCH_SIZE})\n", 532 | " print('Training...')\n", 533 | " for i in range(EPOCHS):\n", 534 | " tot_loss = 0\n", 535 | " for _ in range(n_batches):\n", 536 | " _, loss_value = sess.run([train_op, loss])\n", 537 | " tot_loss += loss_value\n", 538 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, tot_loss / n_batches))\n", 539 | " # initialise iterator with test data\n", 540 | " sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1], batch_size: test_data[0].shape[0]})\n", 541 | " print('Test Loss: {:4f}'.format(sess.run(loss)))\n" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "Training...\n", 554 | "Iter: 0, Loss: 0.1602\n", 555 | "Iter: 1, Loss: 0.1191\n", 556 | "Iter: 2, Loss: 0.0964\n", 557 | "Iter: 3, Loss: 0.0907\n", 558 | "Iter: 4, Loss: 0.0738\n", 559 | "Iter: 5, Loss: 0.0819\n", 560 | "Iter: 6, Loss: 0.0728\n", 561 | "Iter: 7, Loss: 0.0881\n", 562 | "Iter: 8, Loss: 0.0765\n", 563 | "Iter: 9, Loss: 0.0729\n", 564 | "Test Loss: 0.091081\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "# Wrapping all together -> Switch between train and test set using Reinitializable iterator\n", 570 | "EPOCHS = 10\n", 571 | "# create a placeholder to dynamically switch between batch 
sizes\n", 572 | "batch_size = tf.placeholder(tf.int64)\n", 573 | "\n", 574 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 575 | "train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(batch_size).repeat()\n", 576 | "test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(batch_size) # always batch even if you want to one shot it\n", 577 | "# using two numpy arrays\n", 578 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 579 | "test_data = (np.random.sample((20,2)), np.random.sample((20,1)))\n", 580 | "\n", 581 | "# create a iterator of the correct shape and type\n", 582 | "iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n", 583 | " train_dataset.output_shapes)\n", 584 | "features, labels = iter.get_next()\n", 585 | "# create the initialisation operations\n", 586 | "train_init_op = iter.make_initializer(train_dataset)\n", 587 | "test_init_op = iter.make_initializer(test_dataset)\n", 588 | "\n", 589 | "# make a simple model\n", 590 | "net = tf.layers.dense(features, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 591 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 592 | "prediction = tf.layers.dense(net, 1, activation=tf.tanh)\n", 593 | "\n", 594 | "loss = tf.losses.mean_squared_error(prediction, labels) # pass the second value from iter.get_net() as label\n", 595 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 596 | "\n", 597 | "with tf.Session() as sess:\n", 598 | " sess.run(tf.global_variables_initializer())\n", 599 | " # initialise iterator with train data\n", 600 | " sess.run(train_init_op, feed_dict = {x : train_data[0], y: train_data[1], batch_size: 16})\n", 601 | " print('Training...')\n", 602 | " for i in range(EPOCHS):\n", 603 | " tot_loss = 0\n", 604 | " for _ in range(n_batches):\n", 605 | " _, loss_value = sess.run([train_op, loss])\n", 606 | " tot_loss += loss_value\n", 607 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, tot_loss / n_batches))\n", 608 | " # initialise iterator with test data\n", 609 | " sess.run(test_init_op, feed_dict = {x : test_data[0], y: test_data[1], batch_size:len(test_data[0])})\n", 610 | " print('Test Loss: {:4f}'.format(sess.run(loss)))\n" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 32, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "{'sentiment': , 'text': }\n", 623 | "[array([b\"@MENTION, i agree! i'm trying to finish my thesis..and so far it's not going anywhere\",\n", 624 | " b'@MENTION erm, sacre coeur even... james -1',\n", 625 | " b\"@MENTION now am depressed and br'3ii is sleepin since i came home\",\n", 626 | " b'just finishing what turned out to be a nice day',\n", 627 | " b\"it's over. it was great. dollhouse\",\n", 628 | " b'just got bck from cross-country practice wooo 3 miles!',\n", 629 | " b'@MENTION you mean a man who cheats on his wife habitually and is a complete hypocrite... they have plenty of those already.',\n", 630 | " b'i feel bad for che ming wang though, he cant seem to get it together... and has an era of 30+',\n", 631 | " b'@MENTION @MENTION ok.. so i tweeted about the rains and it is not raining anymore. atleast i wont have to water the plants tomorrow.',\n", 632 | " b'is gutted about katie & peter love them!',\n", 633 | " b\"finally home... long day... had a great time with vet's from every era!\",\n", 634 | " b\"@MENTION yucky!!! 
      "       b'@MENTION ever been to the antiques roadshow? my favorite show',\n",
      "       b'lovin virginia! hopefully ill pick up an accent 4 a lil while',\n",
      "       b\"@MENTION yea me too i just drink a cup of tea, but don't worry i got some snacks with me\",\n",
      "       b'got what could quite possibly be the worst paper cut ever today. corners of file folders = paper daggers. ouchie.',\n",
      "       b'sneaking in some computer time. wish you all a good day!',\n",
      "       b\"@MENTION i'm pretty bored with it too\",\n",
      "       b'too early to call a landslide victory for m14? lebanonelections',\n",
      "       b\"school has ruined me so much that i don't even know how to sleep in anymore\",\n",
      "       b\"is at work! it's been a long weekend\",\n",
      "       b'feeling a bit sick also very bored!',\n",
      "       b'misses mr. hollinger. misses callin him hubby more than anything',\n",
      "       b'616 words i loooveeee jackson rathbone <\\xc3\\xb4\\xc3\\xb8\\xcf\\x89',\n",
      "       b'sucky day.. first i havta take the bus 2 work then it breaks down nd work suckd of course',\n",
      "       b'back from skaterhockey. crazy old bears 8 @MENTION reloaded 2. 6 goals against us in the last 20 mins',\n",
      "       b'this is sad! john and kate are officially filing for divorce...as bad as the times have been this still shocks me.',\n",
      "       b'@MENTION i wish i could',\n",
      "       b'@MENTION and what do u see!?!? *raises an eyebrow in amusement*',\n",
      "       b'@MENTION helloooo promm dress..ahh i need to get onee :| whens yours ? debenhams av a salee x',\n",
      "       b'@MENTION heyy! thx 4 making \"back around+lyrics\" gosshh! its totally amazing ! amazing work u got there!',\n",
      "       b'@MENTION aside from the use of \"sex\" as a term, i like that latest post'],\n",
      "       dtype=object), array([0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,\n",
      "       0, 1, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int32)]\n"
     ]
    }
   ],
   "source": [
    "# load a csv\n",
    "CSV_PATH = './tweets.csv'\n",
    "dataset = tf.contrib.data.make_csv_dataset(CSV_PATH, batch_size=32)\n",
    "iter = dataset.make_one_shot_iterator()\n",
    "next_element = iter.get_next() # renamed to avoid shadowing the built-in next\n",
    "print(next_element) # a dict whose keys are the column names and whose values are the column tensors\n",
    "inputs, labels = next_element['text'], next_element['sentiment']\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    print(sess.run([inputs,labels]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "log_time = {}\n",
    "# copied from https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d\n",
    "def how_much(method):\n",
    "    def timed(*args, **kw):\n",
    "        ts = time.time()\n",
    "        result = method(*args, **kw)\n",
    "        te = time.time()\n",
    "        \n",
    "        if 'log_time' in kw:\n",
    "            name = kw.get('log_name', method.__name__)\n",
    "            kw['log_time'][name] = (te - ts)\n",
    "        \n",
    "        return result\n",
    "    return timed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "((5000, 32, 32), (5000, 20)) ((1000, 32, 32), (1000, 20))\n"
     ]
    }
   ],
   "source": [
    "# benchmark\n",
    "import time\n",
    "DATA_SIZE = 5000\n",
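    "# synthetic data: the four iterator styles defined below are timed on identical arrays\n",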
5000\n", 712 | "DATA_SHAPE = ((32,32),(20,))\n", 713 | "BATCH_SIZE = 64 \n", 714 | "N_BATCHES = DATA_SIZE // BATCH_SIZE\n", 715 | "EPOCHS = 10\n", 716 | "\n", 717 | "test_size = (DATA_SIZE//100)*20 \n", 718 | "\n", 719 | "train_shape = ((DATA_SIZE, *DATA_SHAPE[0]),(DATA_SIZE, *DATA_SHAPE[1]))\n", 720 | "test_shape = ((test_size, *DATA_SHAPE[0]),(test_size, *DATA_SHAPE[1]))\n", 721 | "print(train_shape, test_shape)\n", 722 | "train_data = (np.random.sample(train_shape[0]), np.random.sample(train_shape[1]))\n", 723 | "test_data = (np.random.sample(test_shape[0]), np.random.sample(test_shape[1])) " 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 4, 729 | "metadata": { 730 | "scrolled": false 731 | }, 732 | "outputs": [ 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "[None, 32, 32] [None, 20]\n", 738 | "one_shot\n", 739 | "0\n", 740 | "1\n", 741 | "2\n", 742 | "3\n", 743 | "4\n", 744 | "5\n", 745 | "6\n", 746 | "7\n", 747 | "8\n", 748 | "9\n", 749 | "initialisable\n", 750 | "0\n", 751 | "1\n", 752 | "2\n", 753 | "3\n", 754 | "4\n", 755 | "5\n", 756 | "6\n", 757 | "7\n", 758 | "8\n", 759 | "9\n", 760 | "reinitializable\n", 761 | "0\n", 762 | "1\n", 763 | "2\n", 764 | "3\n", 765 | "4\n", 766 | "5\n", 767 | "6\n", 768 | "7\n", 769 | "8\n", 770 | "9\n", 771 | "feedable\n", 772 | "0\n", 773 | "1\n", 774 | "2\n", 775 | "3\n", 776 | "4\n", 777 | "5\n", 778 | "6\n", 779 | "7\n", 780 | "8\n", 781 | "9\n" 782 | ] 783 | }, 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "[(1.5659220218658447, 'reinitializable'),\n", 788 | " (1.581655740737915, 'initialisable'),\n", 789 | " (1.7346899509429932, 'feedable'),\n", 790 | " (2.3557801246643066, 'one_shot')]" 791 | ] 792 | }, 793 | "execution_count": 4, 794 | "metadata": {}, 795 | "output_type": "execute_result" 796 | } 797 | ], 798 | "source": [ 799 | "# used to keep track of the methodds\n", 800 | "log_time = {}\n", 801 | "\n", 802 | "tf.reset_default_graph()\n", 803 | "sess = tf.InteractiveSession()\n", 804 | "\n", 805 | "input_shape = [None, *DATA_SHAPE[0]] # [None, 64, 64, 3]\n", 806 | "output_shape = [None,*DATA_SHAPE[1]] # [None, 20]\n", 807 | "print(input_shape, output_shape)\n", 808 | "\n", 809 | "x, y = tf.placeholder(tf.float32, shape=input_shape), tf.placeholder(tf.float32, shape=output_shape)\n", 810 | "\n", 811 | "@how_much\n", 812 | "def one_shot(**kwargs):\n", 813 | " print('one_shot')\n", 814 | " train_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(BATCH_SIZE).repeat()\n", 815 | " train_el = train_dataset.make_one_shot_iterator().get_next()\n", 816 | " \n", 817 | " test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(BATCH_SIZE).repeat()\n", 818 | " test_el = test_dataset.make_one_shot_iterator().get_next()\n", 819 | " for i in range(EPOCHS):\n", 820 | " print(i)\n", 821 | " for _ in range(N_BATCHES):\n", 822 | " sess.run(train_el)\n", 823 | " for _ in range(N_BATCHES):\n", 824 | " sess.run(test_el)\n", 825 | " \n", 826 | "@how_much\n", 827 | "def initialisable(**kwargs):\n", 828 | " print('initialisable')\n", 829 | " dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(BATCH_SIZE).repeat()\n", 830 | "\n", 831 | " iter = dataset.make_initializable_iterator()\n", 832 | " elements = iter.get_next()\n", 833 | " \n", 834 | " for i in range(EPOCHS):\n", 835 | " print(i)\n", 836 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 837 | " for _ in range(N_BATCHES):\n", 838 | " sess.run(elements)\n", 839 | " 
    "        for _ in range(N_BATCHES):\n",
    "            sess.run(elements)\n",
    "@how_much \n",
    "def reinitializable(**kwargs):\n",
    "    print('reinitializable')\n",
    "    # create two datasets, one for training and one for test\n",
    "    train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n",
    "    test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n",
    "    # create an iterator of the correct shape and type\n",
    "    iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n",
    "                                           train_dataset.output_shapes)\n",
    "    elements = iter.get_next()\n",
    "    # create the initialisation operations\n",
    "    train_init_op = iter.make_initializer(train_dataset)\n",
    "    test_init_op = iter.make_initializer(test_dataset)\n",
    "    \n",
    "    for i in range(EPOCHS):\n",
    "        print(i)\n",
    "        sess.run(train_init_op, feed_dict={ x: train_data[0], y: train_data[1]})\n",
    "        for _ in range(N_BATCHES):\n",
    "            sess.run(elements)\n",
    "        sess.run(test_init_op, feed_dict={ x: test_data[0], y: test_data[1]})\n",
    "        for _ in range(N_BATCHES):\n",
    "            sess.run(elements)\n",
    "@how_much \n",
    "def feedable(**kwargs):\n",
    "    print('feedable')\n",
    "    # create two datasets, one for training and one for test\n",
    "    train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n",
    "    test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n",
    "    # create the iterators from the datasets\n",
    "    train_iterator = train_dataset.make_initializable_iterator()\n",
    "    test_iterator = test_dataset.make_initializable_iterator()\n",
    "\n",
    "    handle = tf.placeholder(tf.string, shape=[])\n",
    "    iter = tf.data.Iterator.from_string_handle(\n",
    "        handle, train_dataset.output_types, train_dataset.output_shapes)\n",
    "    elements = iter.get_next()\n",
    "\n",
    "    train_handle = sess.run(train_iterator.string_handle())\n",
    "    test_handle = sess.run(test_iterator.string_handle())\n",
    "\n",
    "    sess.run(train_iterator.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n",
    "    sess.run(test_iterator.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n",
    "\n",
    "    for i in range(EPOCHS):\n",
    "        print(i)\n",
    "        for _ in range(N_BATCHES):\n",
    "            sess.run(elements, feed_dict={handle: train_handle})\n",
    "        for _ in range(N_BATCHES):\n",
    "            sess.run(elements, feed_dict={handle: test_handle})\n",
    "    \n",
    "one_shot(log_time=log_time)\n",
    "initialisable(log_time=log_time)\n",
    "reinitializable(log_time=log_time)\n",
    "feedable(log_time=log_time)\n",
    "\n",
    "sorted((value,key) for (key,value) in log_time.items())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
925 | } 926 | }, 927 | "nbformat": 4, 928 | "nbformat_minor": 2 929 | } 930 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/dataset_tutorial-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/usr/local/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 13 | " from ._conv import register_converters as _register_converters\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf\n", 19 | "import numpy as np" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "[0.50035296 0.92651365]\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "x = np.random.sample((100,2))\n", 37 | "# make a dataset from a numpy array\n", 38 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 39 | "\n", 40 | "iter = dataset.make_one_shot_iterator()\n", 41 | "el = iter.get_next()\n", 42 | "\n", 43 | "with tf.Session() as sess:\n", 44 | " print(sess.run(el))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "(array([0.33327842, 0.90874317]), array([0.02171065]))\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# using two numpy arrays\n", 62 | "features, labels = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 63 | "dataset = tf.data.Dataset.from_tensor_slices((features,labels))\n", 64 | "\n", 65 | "iter = dataset.make_one_shot_iterator()\n", 66 | "el = iter.get_next()\n", 67 | "\n", 68 | "with tf.Session() as sess:\n", 69 | " print(sess.run(el))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "[0.00786543 0.26009214]\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# using a tensor\n", 87 | "dataset = tf.data.Dataset.from_tensor_slices(tf.random_uniform([100, 2]))\n", 88 | "\n", 89 | "iter = dataset.make_initializable_iterator()\n", 90 | "el = iter.get_next()\n", 91 | "\n", 92 | "with tf.Session() as sess:\n", 93 | " sess.run(iter.initializer)\n", 94 | " print(sess.run(el))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "[0.03433903 0.7280311 ]\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# using a placeholder\n", 112 | "x = tf.placeholder(tf.float32, shape=[None,2])\n", 113 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 114 | "\n", 115 | "data = np.random.sample((100,2))\n", 116 | "\n", 117 | "iter = dataset.make_initializable_iterator()\n", 118 | "el = iter.get_next()\n", 119 | "\n", 120 | "with tf.Session() as sess:\n", 121 | " sess.run(iter.initializer, feed_dict={ x: data })\n", 122 | " print(sess.run(el))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "outputs": [ 
130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "[[1]]\n", 135 | "[[2]\n", 136 | " [3]]\n", 137 | "[[3]\n", 138 | " [4]\n", 139 | " [5]]\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "# from generator\n", 145 | "sequence = np.array([[[1]],[[2],[3]],[[3],[4],[5]]])\n", 146 | "\n", 147 | "def generator():\n", 148 | " for el in sequence:\n", 149 | " yield el\n", 150 | "\n", 151 | "dataset = tf.data.Dataset().batch(1).from_generator(generator,\n", 152 | " output_types= tf.int64, \n", 153 | " output_shapes=(tf.TensorShape([None, 1])))\n", 154 | "\n", 155 | "iter = dataset.make_initializable_iterator()\n", 156 | "el = iter.get_next()\n", 157 | "\n", 158 | "with tf.Session() as sess:\n", 159 | " sess.run(iter.initializer)\n", 160 | " print(sess.run(el))\n", 161 | " print(sess.run(el))\n", 162 | " print(sess.run(el))\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "[array([1., 2.], dtype=float32), array([0.], dtype=float32)]\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "# initializable iterator to switch between data\n", 180 | "EPOCHS = 10\n", 181 | "\n", 182 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 183 | "dataset = tf.data.Dataset.from_tensor_slices((x, y))\n", 184 | "\n", 185 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 186 | "test_data = (np.array([[1,2]]), np.array([[0]]))\n", 187 | "\n", 188 | "iter = dataset.make_initializable_iterator()\n", 189 | "features, labels = iter.get_next()\n", 190 | "\n", 191 | "with tf.Session() as sess:\n", 192 | "# initialise iterator with train data\n", 193 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 194 | " for _ in range(EPOCHS):\n", 195 | " sess.run([features, labels])\n", 196 | "# switch to test data\n", 197 | " sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n", 198 | " print(sess.run([features, labels]))\n", 199 | "\n", 200 | " \n", 201 | " " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "[array([0.94182994, 0.26802265]), array([0.81551463])]\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "# Reinitializable iterator to switch between Datasets\n", 219 | "EPOCHS = 10\n", 220 | "# making fake data using numpy\n", 221 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 222 | "test_data = (np.random.sample((10,2)), np.random.sample((10,1)))\n", 223 | "# create two datasets, one for training and one for test\n", 224 | "train_dataset = tf.data.Dataset.from_tensor_slices(train_data)\n", 225 | "test_dataset = tf.data.Dataset.from_tensor_slices(test_data)\n", 226 | "# create a iterator of the correct shape and type\n", 227 | "iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n", 228 | " train_dataset.output_shapes)\n", 229 | "features, labels = iter.get_next()\n", 230 | "# create the initialisation operations\n", 231 | "train_init_op = iter.make_initializer(train_dataset)\n", 232 | "test_init_op = iter.make_initializer(test_dataset)\n", 233 | "\n", 234 | "with tf.Session() as sess:\n", 235 | " sess.run(train_init_op) # switch to train dataset\n", 236 | " for _ in range(EPOCHS):\n", 237 | " 
sess.run([features, labels])\n", 238 | " sess.run(test_init_op) # switch to val dataset\n", 239 | " print(sess.run([features, labels]))\n", 240 | "\n", 241 | " \n", 242 | " " 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "[0.8552025 0.13344285] [0.24534453]\n", 255 | "[0.23880187 0.2294315 ] [0.77315474]\n", 256 | "[0.763904 0.439595] [0.42727667]\n", 257 | "[0.6563372 0.1366187] [0.02278621]\n", 258 | "[0.71135175 0.394754 ] [0.8552778]\n", 259 | "[0.7329701 0.42924434] [0.43608633]\n", 260 | "[0.8240853 0.7750715] [0.5140434]\n", 261 | "[0.65556693 0.67978406] [0.8228361]\n", 262 | "[0.02365288 0.18461536] [0.85140544]\n", 263 | "[0.48037764 0.7320316 ] [0.773141]\n", 264 | "[0.6671238 0.8491173] [0.45188755]\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "# feedable iterator to switch between iterators\n", 270 | "EPOCHS = 10\n", 271 | "# making fake data using numpy\n", 272 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 273 | "test_data = (np.random.sample((10,2)), np.random.sample((10,1)))\n", 274 | "# create placeholder\n", 275 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 276 | "# create two datasets, one for training and one for test\n", 277 | "train_dataset = tf.data.Dataset.from_tensor_slices((x,y))\n", 278 | "test_dataset = tf.data.Dataset.from_tensor_slices((x,y))\n", 279 | "# create the iterators from the dataset\n", 280 | "train_iterator = train_dataset.make_initializable_iterator()\n", 281 | "test_iterator = test_dataset.make_initializable_iterator()\n", 282 | "# same as in the doc https://www.tensorflow.org/programmers_guide/datasets#creating_an_iterator\n", 283 | "handle = tf.placeholder(tf.string, shape=[])\n", 284 | "iter = tf.data.Iterator.from_string_handle(\n", 285 | " handle, train_dataset.output_types, train_dataset.output_shapes)\n", 286 | "next_elements = iter.get_next()\n", 287 | "\n", 288 | "with tf.Session() as sess:\n", 289 | " train_handle = sess.run(train_iterator.string_handle())\n", 290 | " test_handle = sess.run(test_iterator.string_handle())\n", 291 | " \n", 292 | " # initialise iterators. 
In our case we could have used the 'one-shot' iterator instead,\n", 293 | " # and directly feed the data insted the Dataset.from_tensor_slices function, but this\n", 294 | " # approach is more general\n", 295 | " sess.run(train_iterator.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 296 | " sess.run(test_iterator.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n", 297 | " \n", 298 | " for _ in range(EPOCHS):\n", 299 | " x,y = sess.run(next_elements, feed_dict = {handle: train_handle})\n", 300 | " print(x, y)\n", 301 | " \n", 302 | " x,y = sess.run(next_elements, feed_dict = {handle: test_handle})\n", 303 | " print(x,y)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 10, 309 | "metadata": { 310 | "scrolled": true 311 | }, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "[[0.70861276 0.91522017]\n", 318 | " [0.993154 0.74425373]\n", 319 | " [0.42730845 0.03037355]\n", 320 | " [0.54031161 0.57429001]]\n" 321 | ] 322 | } 323 | ], 324 | "source": [ 325 | "# BATCHING\n", 326 | "BATCH_SIZE = 4\n", 327 | "x = np.random.sample((100,2))\n", 328 | "# make a dataset from a numpy array\n", 329 | "dataset = tf.data.Dataset.from_tensor_slices(x).batch(BATCH_SIZE)\n", 330 | "\n", 331 | "iter = dataset.make_one_shot_iterator()\n", 332 | "el = iter.get_next()\n", 333 | "\n", 334 | "with tf.Session() as sess:\n", 335 | " print(sess.run(el))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 4, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "[1]\n", 348 | "[2]\n", 349 | "[3]\n", 350 | "[4]\n", 351 | "[1]\n", 352 | "[2]\n", 353 | "[3]\n", 354 | "[4]\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "# REPEAT\n", 360 | "BATCH_SIZE = 4\n", 361 | "x = np.array([[1],[2],[3],[4]])\n", 362 | "# make a dataset from a numpy array\n", 363 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 364 | "dataset = dataset.repeat()\n", 365 | "\n", 366 | "iter = dataset.make_one_shot_iterator()\n", 367 | "el = iter.get_next()\n", 368 | "\n", 369 | "with tf.Session() as sess:\n", 370 | " for _ in range(8):\n", 371 | " print(sess.run(el))" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# MAP\n", 381 | "x = np.array([[1],[2],[3],[4]])\n", 382 | "# make a dataset from a numpy array\n", 383 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 384 | "dataset = dataset.map(lambda x: x*2)\n", 385 | "\n", 386 | "iter = dataset.make_one_shot_iterator()\n", 387 | "el = iter.get_next()\n", 388 | "\n", 389 | "with tf.Session() as sess:\n", 390 | "# this will run forever\n", 391 | " for _ in range(len(x)):\n", 392 | " print(sess.run(el))" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 12, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "[[3]\n", 405 | " [1]\n", 406 | " [2]\n", 407 | " [4]]\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "# SHUFFLE\n", 413 | "BATCH_SIZE = 4\n", 414 | "x = np.array([[1],[2],[3],[4]])\n", 415 | "# make a dataset from a numpy array\n", 416 | "dataset = tf.data.Dataset.from_tensor_slices(x)\n", 417 | "dataset = dataset.shuffle(buffer_size=100)\n", 418 | "dataset = dataset.batch(BATCH_SIZE)\n", 419 | "\n", 420 | "iter = dataset.make_one_shot_iterator()\n", 421 | "el = 
iter.get_next()\n", 422 | "\n", 423 | "with tf.Session() as sess:\n", 424 | " print(sess.run(el))" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 13, 430 | "metadata": {}, 431 | "outputs": [ 432 | { 433 | "name": "stdout", 434 | "output_type": "stream", 435 | "text": [ 436 | "Iter: 0, Loss: 0.1913\n", 437 | "Iter: 1, Loss: 0.1814\n", 438 | "Iter: 2, Loss: 0.1720\n", 439 | "Iter: 3, Loss: 0.1631\n", 440 | "Iter: 4, Loss: 0.1547\n", 441 | "Iter: 5, Loss: 0.1469\n", 442 | "Iter: 6, Loss: 0.1397\n", 443 | "Iter: 7, Loss: 0.1329\n", 444 | "Iter: 8, Loss: 0.1267\n", 445 | "Iter: 9, Loss: 0.1210\n" 446 | ] 447 | } 448 | ], 449 | "source": [ 450 | "# how to pass the value to a model\n", 451 | "EPOCHS = 10\n", 452 | "BATCH_SIZE = 16\n", 453 | "# using two numpy arrays\n", 454 | "features, labels = (np.array([np.random.sample((100,2))]), \n", 455 | " np.array([np.random.sample((100,1))]))\n", 456 | "\n", 457 | "dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)\n", 458 | "\n", 459 | "iter = dataset.make_one_shot_iterator()\n", 460 | "x, y = iter.get_next()\n", 461 | "\n", 462 | "# make a simple model\n", 463 | "net = tf.layers.dense(x, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 464 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 465 | "prediction = tf.layers.dense(net, 1, activation=tf.tanh)\n", 466 | "\n", 467 | "loss = tf.losses.mean_squared_error(prediction, y) # pass the second value from iter.get_net() as label\n", 468 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 469 | "\n", 470 | "with tf.Session() as sess:\n", 471 | " sess.run(tf.global_variables_initializer())\n", 472 | " for i in range(EPOCHS):\n", 473 | " _, loss_value = sess.run([train_op, loss])\n", 474 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, loss_value))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 18, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "3\n", 487 | "Training...\n", 488 | "Iter: 0, Loss: 1.4389\n", 489 | "Iter: 1, Loss: 1.4704\n", 490 | "Iter: 2, Loss: 1.4081\n", 491 | "Iter: 3, Loss: 1.2877\n", 492 | "Iter: 4, Loss: 1.1842\n", 493 | "Iter: 5, Loss: 1.1944\n", 494 | "Iter: 6, Loss: 1.1166\n", 495 | "Iter: 7, Loss: 0.9924\n", 496 | "Iter: 8, Loss: 0.8997\n", 497 | "Iter: 9, Loss: 0.8817\n", 498 | "Test Loss: 0.836423\n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "# Wrapping all together -> Switch between train and test set using Initializable iterator\n", 504 | "EPOCHS = 10\n", 505 | "# create a placeholder to dynamically switch between batch sizes\n", 506 | "batch_size = tf.placeholder(tf.int64)\n", 507 | "BATCH_SIZE = 32\n", 508 | "\n", 509 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 510 | "dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()\n", 511 | "\n", 512 | "# using two numpy arrays\n", 513 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 514 | "test_data = (np.random.sample((20,2)), np.random.sample((20,1)))\n", 515 | "\n", 516 | "iter = dataset.make_initializable_iterator()\n", 517 | "features, labels = iter.get_next()\n", 518 | "# make a simple model\n", 519 | "net = tf.layers.dense(features, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 520 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 521 | "prediction = 
tf.layers.dense(net, 1, activation=tf.tanh)\n", 522 | "\n", 523 | "loss = tf.losses.mean_squared_error(prediction, labels) # pass the second value from iter.get_net() as label\n", 524 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 525 | "\n", 526 | "n_batches = train_data[0].shape[0] // BATCH_SIZE\n", 527 | "\n", 528 | "with tf.Session() as sess:\n", 529 | " sess.run(tf.global_variables_initializer())\n", 530 | " # initialise iterator with train data\n", 531 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1], batch_size: BATCH_SIZE})\n", 532 | " print('Training...')\n", 533 | " for i in range(EPOCHS):\n", 534 | " tot_loss = 0\n", 535 | " for _ in range(n_batches):\n", 536 | " _, loss_value = sess.run([train_op, loss])\n", 537 | " tot_loss += loss_value\n", 538 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, tot_loss / n_batches))\n", 539 | " # initialise iterator with test data\n", 540 | " sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1], batch_size: test_data[0].shape[0]})\n", 541 | " print('Test Loss: {:4f}'.format(sess.run(loss)))\n" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "Training...\n", 554 | "Iter: 0, Loss: 0.1602\n", 555 | "Iter: 1, Loss: 0.1191\n", 556 | "Iter: 2, Loss: 0.0964\n", 557 | "Iter: 3, Loss: 0.0907\n", 558 | "Iter: 4, Loss: 0.0738\n", 559 | "Iter: 5, Loss: 0.0819\n", 560 | "Iter: 6, Loss: 0.0728\n", 561 | "Iter: 7, Loss: 0.0881\n", 562 | "Iter: 8, Loss: 0.0765\n", 563 | "Iter: 9, Loss: 0.0729\n", 564 | "Test Loss: 0.091081\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | "# Wrapping all together -> Switch between train and test set using Reinitializable iterator\n", 570 | "EPOCHS = 10\n", 571 | "# create a placeholder to dynamically switch between batch sizes\n", 572 | "batch_size = tf.placeholder(tf.int64)\n", 573 | "\n", 574 | "x, y = tf.placeholder(tf.float32, shape=[None,2]), tf.placeholder(tf.float32, shape=[None,1])\n", 575 | "train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(batch_size).repeat()\n", 576 | "test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(batch_size) # always batch even if you want to one shot it\n", 577 | "# using two numpy arrays\n", 578 | "train_data = (np.random.sample((100,2)), np.random.sample((100,1)))\n", 579 | "test_data = (np.random.sample((20,2)), np.random.sample((20,1)))\n", 580 | "\n", 581 | "# create a iterator of the correct shape and type\n", 582 | "iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n", 583 | " train_dataset.output_shapes)\n", 584 | "features, labels = iter.get_next()\n", 585 | "# create the initialisation operations\n", 586 | "train_init_op = iter.make_initializer(train_dataset)\n", 587 | "test_init_op = iter.make_initializer(test_dataset)\n", 588 | "\n", 589 | "# make a simple model\n", 590 | "net = tf.layers.dense(features, 8, activation=tf.tanh) # pass the first value from iter.get_next() as input\n", 591 | "net = tf.layers.dense(net, 8, activation=tf.tanh)\n", 592 | "prediction = tf.layers.dense(net, 1, activation=tf.tanh)\n", 593 | "\n", 594 | "loss = tf.losses.mean_squared_error(prediction, labels) # pass the second value from iter.get_net() as label\n", 595 | "train_op = tf.train.AdamOptimizer().minimize(loss)\n", 596 | "\n", 597 | "with tf.Session() as sess:\n", 598 | " sess.run(tf.global_variables_initializer())\n", 599 | " # 
initialise iterator with train data\n", 600 | " sess.run(train_init_op, feed_dict = {x : train_data[0], y: train_data[1], batch_size: 16})\n", 601 | " print('Training...')\n", 602 | " for i in range(EPOCHS):\n", 603 | " tot_loss = 0\n", 604 | " for _ in range(n_batches):\n", 605 | " _, loss_value = sess.run([train_op, loss])\n", 606 | " tot_loss += loss_value\n", 607 | " print(\"Iter: {}, Loss: {:.4f}\".format(i, tot_loss / n_batches))\n", 608 | " # initialise iterator with test data\n", 609 | " sess.run(test_init_op, feed_dict = {x : test_data[0], y: test_data[1], batch_size:len(test_data[0])})\n", 610 | " print('Test Loss: {:4f}'.format(sess.run(loss)))\n" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 32, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "{'sentiment': , 'text': }\n", 623 | "[array([b\"@MENTION, i agree! i'm trying to finish my thesis..and so far it's not going anywhere\",\n", 624 | " b'@MENTION erm, sacre coeur even... james -1',\n", 625 | " b\"@MENTION now am depressed and br'3ii is sleepin since i came home\",\n", 626 | " b'just finishing what turned out to be a nice day',\n", 627 | " b\"it's over. it was great. dollhouse\",\n", 628 | " b'just got bck from cross-country practice wooo 3 miles!',\n", 629 | " b'@MENTION you mean a man who cheats on his wife habitually and is a complete hypocrite... they have plenty of those already.',\n", 630 | " b'i feel bad for che ming wang though, he cant seem to get it together... and has an era of 30+',\n", 631 | " b'@MENTION @MENTION ok.. so i tweeted about the rains and it is not raining anymore. atleast i wont have to water the plants tomorrow.',\n", 632 | " b'is gutted about katie & peter love them!',\n", 633 | " b\"finally home... long day... had a great time with vet's from every era!\",\n", 634 | " b\"@MENTION yucky!!! for whatevv reason i can't eat 5 guys anymore. just makes me gag!!!\",\n", 635 | " b'@MENTION ever been to the antiques roadshow? my favorite show',\n", 636 | " b'lovin virginia! hopefully ill pick up an accent 4 a lil while',\n", 637 | " b\"@MENTION yea me too i just drink a cup of tea, but don't worry i got some snacks with me\",\n", 638 | " b'got what could quite possibly be the worst paper cut ever today. corners of file folders = paper daggers. ouchie.',\n", 639 | " b'sneaking in some computer time. wish you all a good day!',\n", 640 | " b\"@MENTION i'm pretty bored with it too\",\n", 641 | " b'too early to call a landslide victory for m14? lebanonelections',\n", 642 | " b\"school has ruined me so much that i don't even know how to sleep in anymore\",\n", 643 | " b\"is at work! it's been a long weekend\",\n", 644 | " b'feeling a bit sick also very bored!',\n", 645 | " b'misses mr. hollinger. misses callin him hubby more than anything',\n", 646 | " b'616 words i loooveeee jackson rathbone <\\xc3\\xb4\\xc3\\xb8\\xcf\\x89',\n", 647 | " b'sucky day.. first i havta take the bus 2 work then it breaks down nd work suckd of course',\n", 648 | " b'back from skaterhockey. crazy old bears 8 @MENTION reloaded 2. 6 goals against us in the last 20 mins',\n", 649 | " b'this is sad! john and kate are officially filing for divorce...as bad as the times have been this still shocks me.',\n", 650 | " b'@MENTION i wish i could',\n", 651 | " b'@MENTION and what do u see!?!? *raises an eyebrow in amusement*',\n", 652 | " b'@MENTION helloooo promm dress..ahh i need to get onee :| whens yours ? 
673 | { 674 | "cell_type": "code", 675 | "execution_count": 2, 676 | "metadata": {}, 677 | "outputs": [], 678 | "source": [ 679 | "import time\n", "log_time = {}\n", 680 | "# copied from https://medium.com/pythonhive/python-decorator-to-measure-the-execution-time-of-methods-fa04cb6bb36d\n", 681 | "def how_much(method):\n", 682 | " def timed(*args, **kw):\n", 683 | " ts = time.time()\n", 684 | " result = method(*args, **kw)\n", 685 | " te = time.time()\n", 686 | " \n", 687 | " if 'log_time' in kw:\n", 688 | " name = kw.get('log_name', method.__name__)\n", 689 | " kw['log_time'][name] = (te - ts)\n", 690 | " \n", 691 | " return result\n", 692 | " return timed" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 3, 698 | "metadata": {}, 699 | "outputs": [ 700 | { 701 | "name": "stdout", 702 | "output_type": "stream", 703 | "text": [ 704 | "((5000, 32, 32), (5000, 20)) ((1000, 32, 32), (1000, 20))\n" 705 | ] 706 | } 707 | ], 708 | "source": [ 709 | "# benchmark\n", 710 | "import time\n", 711 | "DATA_SIZE = 5000\n", 712 | "DATA_SHAPE = ((32,32),(20,))\n", 713 | "BATCH_SIZE = 64 \n", 714 | "N_BATCHES = DATA_SIZE // BATCH_SIZE\n", 715 | "EPOCHS = 10\n", 716 | "\n", 717 | "test_size = (DATA_SIZE//100)*20 # 20% of DATA_SIZE\n", 718 | "\n", 719 | "train_shape = ((DATA_SIZE, *DATA_SHAPE[0]),(DATA_SIZE, *DATA_SHAPE[1]))\n", 720 | "test_shape = ((test_size, *DATA_SHAPE[0]),(test_size, *DATA_SHAPE[1]))\n", 721 | "print(train_shape, test_shape)\n", 722 | "train_data = (np.random.sample(train_shape[0]), np.random.sample(train_shape[1]))\n", 723 | "test_data = (np.random.sample(test_shape[0]), np.random.sample(test_shape[1])) " 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 4, 729 | "metadata": { 730 | "scrolled": false 731 | }, 732 | "outputs": [ 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "[None, 32, 32] [None, 20]\n", 738 | "one_shot\n", 739 | "0\n", 740 | "1\n", 741 | "2\n", 742 | "3\n", 743 | "4\n", 744 | "5\n", 745 | "6\n", 746 | "7\n", 747 | "8\n", 748 | "9\n", 749 | "initialisable\n", 750 | "0\n", 751 | "1\n", 752 | "2\n", 753 | "3\n", 754 | "4\n", 755 | "5\n", 756 | "6\n", 757 | "7\n", 758 | "8\n", 759 | "9\n", 760 | "reinitializable\n", 761 | "0\n", 762 | "1\n", 763 | "2\n", 764 | "3\n", 765 | "4\n", 766 | "5\n", 767 | "6\n", 768 | "7\n", 769 | "8\n", 770 | "9\n", 771 | "feedable\n", 772 | "0\n", 773 | "1\n", 774 | "2\n", 775 | "3\n", 776 | "4\n", 777 | "5\n", 778 | "6\n", 779 | "7\n", 780 | "8\n", 781 | "9\n" 782 | ] 783 | }, 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "[(1.5659220218658447, 'reinitializable'),\n", 788 | "
(1.581655740737915, 'initialisable'),\n", 789 | " (1.7346899509429932, 'feedable'),\n", 790 | " (2.3557801246643066, 'one_shot')]" 791 | ] 792 | }, 793 | "execution_count": 4, 794 | "metadata": {}, 795 | "output_type": "execute_result" 796 | } 797 | ], 798 | "source": [ 799 | "# used to keep track of the methods' timings\n", 800 | "log_time = {}\n", 801 | "\n", 802 | "tf.reset_default_graph()\n", 803 | "sess = tf.InteractiveSession()\n", 804 | "\n", 805 | "input_shape = [None, *DATA_SHAPE[0]] # [None, 32, 32]\n", 806 | "output_shape = [None,*DATA_SHAPE[1]] # [None, 20]\n", 807 | "print(input_shape, output_shape)\n", 808 | "\n", 809 | "x, y = tf.placeholder(tf.float32, shape=input_shape), tf.placeholder(tf.float32, shape=output_shape)\n", 810 | "\n", 811 | "@how_much\n", 812 | "def one_shot(**kwargs):\n", 813 | " print('one_shot')\n", 814 | " train_dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(BATCH_SIZE).repeat()\n", 815 | " train_el = train_dataset.make_one_shot_iterator().get_next()\n", 816 | " \n", 817 | " test_dataset = tf.data.Dataset.from_tensor_slices(test_data).batch(BATCH_SIZE).repeat()\n", 818 | " test_el = test_dataset.make_one_shot_iterator().get_next()\n", 819 | " for i in range(EPOCHS):\n", 820 | " print(i)\n", 821 | " for _ in range(N_BATCHES):\n", 822 | " sess.run(train_el)\n", 823 | " for _ in range(N_BATCHES):\n", 824 | " sess.run(test_el)\n", 825 | " \n", 826 | "@how_much\n", 827 | "def initialisable(**kwargs):\n", 828 | " print('initialisable')\n", 829 | " dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(BATCH_SIZE).repeat()\n", 830 | "\n", 831 | " iter = dataset.make_initializable_iterator()\n", 832 | " elements = iter.get_next()\n", 833 | " \n", 834 | " for i in range(EPOCHS):\n", 835 | " print(i)\n", 836 | " sess.run(iter.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 837 | " for _ in range(N_BATCHES):\n", 838 | " sess.run(elements)\n", 839 | " sess.run(iter.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n", 840 | " for _ in range(N_BATCHES):\n", 841 | " sess.run(elements)\n", 842 | "@how_much \n", 843 | "def reinitializable(**kwargs):\n", 844 | " print('reinitializable')\n", 845 | " # create two datasets, one for training and one for test\n", 846 | " train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n", 847 | " test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n", 848 | " # create an iterator of the correct shape and type\n", 849 | " iter = tf.data.Iterator.from_structure(train_dataset.output_types,\n", 850 | " train_dataset.output_shapes)\n", 851 | " elements = iter.get_next()\n", 852 | " # create the initialisation operations\n", 853 | " train_init_op = iter.make_initializer(train_dataset)\n", 854 | " test_init_op = iter.make_initializer(test_dataset)\n", 855 | " \n", 856 | " for i in range(EPOCHS):\n", 857 | " print(i)\n", 858 | " sess.run(train_init_op, feed_dict={ x: train_data[0], y: train_data[1]})\n", 859 | " for _ in range(N_BATCHES):\n", 860 | " sess.run(elements)\n", 861 | " sess.run(test_init_op, feed_dict={ x: test_data[0], y: test_data[1]})\n", 862 | " for _ in range(N_BATCHES):\n", 863 | " sess.run(elements)\n", 864 | "@how_much \n", 865 | "def feedable(**kwargs):\n", 866 | " print('feedable')\n", 867 | " # create two datasets, one for training and one for test\n", 868 | " train_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n", 869 | " test_dataset = tf.data.Dataset.from_tensor_slices((x,y)).batch(BATCH_SIZE).repeat()\n",
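" # note: each dataset keeps its own iterator here; the string handle fed at run time\n", " # selects which iterator get_next() pulls from, so switching between train and test\n", " # does not reset either iterator's position\n",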
870 | " # create the iterators from the datasets\n", 871 | " train_iterator = train_dataset.make_initializable_iterator()\n", 872 | " test_iterator = test_dataset.make_initializable_iterator()\n", 873 | "\n", 874 | " handle = tf.placeholder(tf.string, shape=[])\n", 875 | " iter = tf.data.Iterator.from_string_handle(\n", 876 | " handle, train_dataset.output_types, train_dataset.output_shapes)\n", 877 | " elements = iter.get_next()\n", 878 | "\n", 879 | " train_handle = sess.run(train_iterator.string_handle())\n", 880 | " test_handle = sess.run(test_iterator.string_handle())\n", 881 | "\n", 882 | " sess.run(train_iterator.initializer, feed_dict={ x: train_data[0], y: train_data[1]})\n", 883 | " sess.run(test_iterator.initializer, feed_dict={ x: test_data[0], y: test_data[1]})\n", 884 | "\n", 885 | " for i in range(EPOCHS):\n", 886 | " print(i)\n", 887 | " for _ in range(N_BATCHES):\n", 888 | " sess.run(elements, feed_dict={handle: train_handle})\n", 889 | " for _ in range(N_BATCHES):\n", 890 | " sess.run(elements, feed_dict={handle: test_handle})\n", 891 | " \n", 892 | "one_shot(log_time=log_time)\n", 893 | "initialisable(log_time=log_time)\n", 894 | "reinitializable(log_time=log_time)\n", 895 | "feedable(log_time=log_time)\n", 896 | "\n", 897 | "sorted((value,key) for (key,value) in log_time.items())\n" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "metadata": {}, 904 | "outputs": [], 905 | "source": [] 906 | } 907 | ], 908 | "metadata": { 909 | "kernelspec": { 910 | "display_name": "Python 3", 911 | "language": "python", 912 | "name": "python3" 913 | }, 914 | "language_info": { 915 | "codemirror_mode": { 916 | "name": "ipython", 917 | "version": 3 918 | }, 919 | "file_extension": ".py", 920 | "mimetype": "text/x-python", 921 | "name": "python", 922 | "nbconvert_exporter": "python", 923 | "pygments_lexer": "ipython3", 924 | "version": "3.6.5" 925 | } 926 | }, 927 | "nbformat": 4, 928 | "nbformat_minor": 2 929 | } 930 | --------------------------------------------------------------------------------