├── Benchmark.ipynb
├── CMakeLists.txt
├── LICENSE.txt
├── README.md
├── bn_ops.py
├── bnmatmul_op.cc
├── bnmatmul_op.h
├── bnmatmul_op_gpu.cu
└── matrix_benchmark.png

--------------------------------------------------------------------------------
/Benchmark.ipynb:
--------------------------------------------------------------------------------
{
 "metadata": {
  "name": "",
  "signature": "sha256:0f432d21c085f17d98f35eaab10ae4220c5d5d99109e504a1ab7cb7ffd776dec"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import matplotlib\n",
      "import numpy as np\n",
      "import matplotlib.pyplot as plt\n",
      "%matplotlib inline"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import tensorflow as tf\n",
      "import bn_ops\n",
      "import time"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "sess = tf.InteractiveSession()\n",
      "\n",
      "sizes = [256, 512, 1024, 2048, 4096, 8192]\n",
      "timenative = []\n",
      "timexnor = []\n",
      "\n",
      "iteration = 50\n",
      "for size in sizes:\n",
      "    mat0 = tf.get_variable(\"mat_%d_0_%f\" % (size, time.time()), [size, size], initializer=tf.random_uniform_initializer(-1., 1.))\n",
      "    mat1 = tf.get_variable(\"mat_%d_1_%f\" % (size, time.time()), [size, size], initializer=tf.random_uniform_initializer(-1., 1.))\n",
      "    bmat0 = tf.sign(mat0)\n",
      "    bmat1 = tf.sign(mat1)\n",
      "    result1 = tf.matmul(bmat0, bmat1)\n",
      "    result2 = bn_ops.bn_matmul(bmat0, bmat1)\n",
      "    \n",
      "    mat0.initializer.run()\n",
      "    mat1.initializer.run()\n",
      "    # warm-up run: TensorFlow's first evaluation includes one-time setup cost,\n",
      "    # so do it once before timing either path\n",
      "    result1.eval()\n",
      "    result2.eval()\n",
      "    time1 = time.time()\n",
      "    for _ in range(iteration):\n",
      "        result1.eval()\n",
      "    time1_end = time.time()\n",
      "    timenative.append((time1_end - time1)/iteration)\n",
      "    \n",
      "    time1 = time.time()\n",
      "    for _ in range(iteration):\n",
      "        result2.eval()\n",
      "    time1_end = time.time()\n",
      "    timexnor.append((time1_end - time1)/iteration)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plt.plot(sizes, timenative, label='cuBlas')\n",
      "plt.plot(sizes, timexnor, label='XNOR')\n",
      "plt.ylabel(\"Time (s)\")\n",
      "plt.xlabel(\"Matrix size (single dimension)\")\n",
      "plt.legend()\n",
      "plt.show()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "display_data",
       "png": 
"iVBORw0KGgoAAAANSUhEUgAAAZEAAAEPCAYAAACDTflkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xt8zvX/x/HHNkY2hwnJcXNIElIS1TdLDitESfgqmxTq\nK98O31/Hr286ffsqUt9vJVIolEqJ0uaQOR/DRlImcyZnZnb+/P54X9uuzXBtu659rmt73m+367bP\n9Tldr+sq1+t6n0FERERERERERERERERERERERERERETKkAhgO7ADeLaA472AOGAT8DPQyelYIhDv\nOLbOo1GKiIjXCQASgFCgPLAZaJ7vnCCn7ZaO87PtAqp7MD4RESkmfw/eux0mKSQC6cAXmJKHs7NO\n28HA0XzH/TwVnIiIFJ8nk0hdYK/T832Offn1Bn4FfgRGOu23gEXABuARD8UoIiLFUM6D97ZcPG+O\n4/EX4DOgmWP/LcBBoCawENO2stzNMYqISDF4MonsB+o7Pa+PKY1cyHJHPJcDxzAJBOAI8C2meixP\nEmncuLG1c+dOd8UrIlJW7ASauONGnqzO2gA0xTSsBwL9gLn5zmlMbrvH9Y6/x4BKQGXH8yCgK7Al\n/wvs3LkTy7K86vHSSy/ZHoOvxKWYFFNZiMsbY3J897qFJ0siGcAIIAbTU+tjTNvHMMfxiUAfYBCm\n4T0J6O84Vhv4xinGGcACD8YqIiJF4MkkAqax/Md8+yY6bb/peOT3B3Cdp4ISERH38GR1VpkUHh5u\ndwgF8sa4FJNrFJPrvDEub4zJnXx9HIblqN8TEREX+fn5gZu+/z1dnSUiUmjVq1fnxIkTdofh80JC\nQjh+/LhHX0MlERHxOn5+fujfdvFd6HN0Z0lEbSIiIlJkSiIiIlJkSiIiIlJkSiIiIl5g9OjRPPjg\ng3aHUWhKIiIiJSA8PJzLLruMypUrU61aNTp27MjWrVtzjjsau32OkoiISAnw8/Pj/fff58yZMxw/\nfpzw8PA8JQ9f7Y2mJCIiUkh79+7l3nvvpVatWtSoUYPHH3/8vOqoxMRE/P39ycrKOu96f39/+vXr\nx7Zt2y74Gn379uXKK6/MKbU4nzt//nxatGhBlSpVqFevHuPGjXPvGywEJRERkULIzMykR48ehIWF\nsXv3bg4cOED//v1dqo7KLm2kpaUxY8YMOnTocMFzu3fvTkJCAkeOHOH6669n4MCBOceGDBnCpEmT\nOH36NL/88gudOnUq/hsrIiUREfFJfn7ueRTWunXrOHjwIG+99RaXXXYZgYGB3HLLLZesjrIsi5Ej\nRxISEkKVKlX44IMP+Ne//nXB86OioggKCqJ8+fK89NJLxMXFcebMGQACAwP55ZdfOH36NFWrVqVN\nmzaFfyNuoiQiIj7JstzzKKy9e/fSsGFD/P0L9/Xp5+fH//73P06cOEFKSgrz5s3jvvvuY8uW85ZK\nIjMzk+eee44mTZpQtWpVwsLC8PPz4+jRowDMnj2b+fPnExoaSnh4OGvWrCn8G3ETJRERkUKoX78+\ne/bsITMzM8/+4OBgkpOTc54fOnToove59dZbadKkCQsXLjzv2MyZM5k7dy6LFy/m1KlT7Nq1y3lB\nKdq2bcucOXM4cuQIvXv35v7773fDOysaJRERkUK46aabuPLKK3nuuedITk4mJSWFVatWcd1117Fs\n2TL27t3LqVOneOONN8671rnKa/Xq1Wzbto0WLVqcd15SUhIVKlSgevXqnD17lhdeeCHnWHp6OjNm\nzODUqVMEBARQuXJlAgICPPNmXaAkIiJSCP7+/sybN4+EhAQaNGhA/fr1+fLLL+ncuTP9+vWjVatW\n3HjjjfTs2fO8xvYRI0ZQuXJlKleuzKBBg3j99dfp1q0bYKq7ss8fNGgQDRs2pG7dulx77bV06NAh\nz72mT59OWFgYVatWZdKkScyYMaPkPoB8fHN0Sy7N4itSCmkWX/fQLL4iIuLVlERERKTIlERERKTI\nlERERKTIlERERKTIPJ1EIoDtwA7g2QKO9wLigE3Az4DzBDCXulZERGzmyS6+AcBvQGdgP7AeGAD8\n6nROEHDWsd0S+BZo4uK1oC6+IqWSuvi6h6938W0HJACJQDrwBabk4eys03YwcLQQ14qIiM08mUTq\nAnudnu9z7MuvN6aE8SMwspDXioiIjTyZRFwti84BmgM9gc8oZBFr9OjROY/Y2NjCRSgiUkhJSUmE\nhYUxc+bMnH1nzpyhQYMGzJ49m6ioKPz9/Vm/fn3O8YSEhPNm/f3+++9p164dwcHB1KhRgwceeID9\n+/fnHJ86dWrO3FhVq1alVatWfPvtt0WKOTY2Ns93pa9oD0Q7PX+eSzeQ7wQuL8S1loiUPt7+bzsm\nJsaqWbOmdeTIEcuyLGv48OFWnz59LMuyrMjISOvyyy+3unbtmnP+jh07LD8/v5znX331lVWlShXr\n888/t1JSUqxDhw5ZDz30kBUaGmqdOHHCsizLmjJlivWXv/zFsizLysrKsiZOnGhVrFjROn78uMtx\nXuhzxPUf+ZfkyZLIBqApEAoEAv2AufnOaUxuyeN6x99jLl4rImKLrl270r17d0aOHElsbCxfffUV\nH3zwAWAarSMjI4mPj2fZsmXnXWtZFk8//TSjRo2if//+VKhQgSuuuILJkycTHBzM+PHj85ybfc8H\nHniA1NRUdu7cWTJv0kWeTCIZwAggBtgGzMK0fQxzPAD6AFswXXzfBfpf4loREa8wfvx4lixZQt++\nfRk3bhy1atXKOVapUiVeeOEFXnzxxfOu++2339i7dy99+/bNs9/Pz48+ffoUuL5IZmYmU6ZMoVq1\najRr1sz9b6YYynn4/j86Hs4mOm2/6Xi4eq2ICAB+L7tnhIL1UtFqdqpVq0aLFi1Ys2YN99xzT55j\nfn5+DBs2jLFjxxIdHU2TJk1yjmWvTnjllVeed8/atWvnHAdYs2YNISEhnD17lnLlyrFw4UIqV65c\npHg9xdNJRETEI4r65e8u06dPZ/fu3XTu3Jlnn32WCRMm5DkeGBjIqFGjGDVqFF988UXO/ho1agBw\n8OBBGjZsmOeagwcPUrNmzZzn7du3Z/ny5Zw9e5YhQ4YwZswY5s71rpp9TXsiIlJIf/75J0899RST\nJ0/mww8/5Msvv2TFihU5x7PbMqKiojh58iSzZ8/OOdasWTPq1avHl19+meeeWVlZzJ49mzvuuOO8\n1wsKCmLChAksXbqUpUuXeuhdFY2SiIhIIY0YMYJ77rmHjh07Urt2bd58800eeeQR0tLS8owQL1eu\nHC+//DJjxozJ2efn58fYsWN57bXX+Pzzz0lJSeHQoUM8/PDDJCUl8eSTTxb4miEhIQwdOpT//Oc/\nHn9/haEkIiJSCHPmzGHVqlW89dZbOfuGDBlCnTp1eOWVV/IscwswYMAA6tSpk2ff/fffz2effcb4\n8eOpUaMGLVq0IDU1lZUrVxISEgJw3n0AnnjiCZYsWUJ8
fLyH36XrtDyuiHgdzZ3lHr4+d5aIiJRy\nSiIiIlJkSiIiIlJkSiIiIlJkSiIiIlJkSiIiIlJkmvZERLxOSEjIeWMkpPCyx5x4kq//V9I4EREp\ntOefh2XLYPFiqFjR7mhKnjvHiagkIiJlykcfwddfw+rVZTOBuJtKIiJSZsTEQGQkLF8OTZvaHY19\nVBIRESmk+Hh48EH45puynUDcTb2zRKTUO3AAevaEd9+FW2+1O5rSRUlEREq1pCTo0QOGDYMBA+yO\npvRRm4iIlFoZGdC7N9SubRrU1WvY0Cy+IiKXYFnwxBOQmgoTJiiBeIoa1kWkVHrnHYiNhZUroXx5\nu6MpvZRERKTU+fZbGDsWVq2CqlXtjqZ08/UCntpERCSPdeuge3f48Udo29buaLyTL7WJRADbgR3A\nswUcHwjEAfHASqCV07FEx/5NwDqPRikipcKuXaYh/eOPlUBKiidLIgHAb0BnYD+wHhgA/Op0Tgdg\nG3AKk3BGA+0dx3YBNwDHL/IaKomICAAnTsAtt8Dw4TBypN3ReDdfKYm0AxIwJYp04AugV75zVmMS\nCMBaoF6+475e3SYiJSAtDfr0gS5dlEBKmieTSF1gr9PzfY59FzIEmO/03AIWARuAR9wenYiUCpYF\nQ4dC5crw9tt2R1P2eLJ3VmHqmW4HHgJucdp3C3AQqAksxLStLM9/4ejRo3O2w8PDCQ8PL3ykIuKz\nXnsNtm6FpUshIMDuaLxTbGwssbGxHrm3J6uL2mPaOCIcz58HsoAx+c5rBXzjOC/hAvd6CUgCxuXb\nrzYRkTJs+nT45z9hzRozKl1c4yttIhuApkAoEAj0A+bmO6cBJoE8QN4EUgmo7NgOAroCWzwYq4j4\nmKVL4amn4IcflEDs5MnqrAxgBBCD6an1MaZn1jDH8YnAv4AQYIJjXzqmQb42JrlkxzgDWODBWEXE\nh/z2G9x/P8ycCS1a2B1N2ebrvZ9UnSVSxhw5Au3bw4svwkMP2R2Nb3JndZaSiIj4jHPnoFMnuOMO\n06AuRaMkkktJRKSMyMqCfv3MZIozZmhW3uLQ8rgiUuY89xwcOgSLFimBeBMlERHxehMnwpw5sHo1\nVKhgdzTizNfzuaqzREq56GiIioIVK6BJE7ujKR1UnSUiZUJcHAwaZEohSiDeScvjiohX2r8fevaE\n996Dm2+2Oxq5ECUREfE6Z86YhaUee8wMKhTvpTYREfEqGRlw991Qty5MmqSeWJ7gK3NniYgUimWZ\n9UAyM+GDD5RAfIEa1kXEa7z9tumFtWKFGVQo3k9JRES8wuzZMH68GQtSpYrd0YirfL2wqDYRkVJg\nzRrTEysmBq6/3u5oSj+1iYhIqfHHH3DPPTBlihKIL1ISERHbnDhhuvL+85/Qo4fd0UhRqDpLRGyR\nlgbdukGbNqZBXUqOpoLPpSQi4oMsCyIjISkJvvoKAgLsjqhs0dxZIuLTXn4Ztm+H2FglEF+nJCIi\nJerTT2HaNNMjq1Ilu6OR4lJ1loiUmNhYszrhkiVwzTV2R1N2qYuviPicX381CeTzz5VAShMlERHx\nuMOHTVfeN9+ETp3sjkbcSUlERDwqOdnMyvvAA6ZHlpQunk4iEcB2YAfwbAHHBwJxQDywEmhViGtF\nxMtlZcGDD8JVV5keWZIrJSOF5PRku8MoNk8mkQDgPUwyuAYYADTPd84fwG2Y5PEqMKkQ14qIl3vm\nGTh6FCZP1rTuztbvX88Nk25gRvwMu0Mptkt18S0PdMV80YcCFrAbWAbEABkXubYdkAAkOp5/AfQC\nfnU6Z7XT9lqgXiGuFREv9sEH8P33sGoVVKhgdzTeITUjlVeWvsLkTZN5N+Jd+rXoZ3dIxXaxJDIK\n6IP5ol8H/IQpuVwJ9AReB74GXrvA9XWBvU7P9wE3XeT1hgDzi3itiHiRH36AV18164JUr253NN5h\n48GNRM6JpHFIY+KGx1E7uLbdIbnFxZJIHCZBFDQQ4xNMQrnYlGmFGcBxO/AQcEthrx09enTOdnh4\nOOHh4YV4WRFxt02bICoK5s6Fxo3tjsZ+aZlpvL7sdT78+UPe7vo2f2351+xxGiUmNjaW2NhYj9y7\nsO/EHwgGTrtwbntgNKZdA+B5IAsYk++8VsA3jvMSCnmtBhuKeJF9+6BDB7O41H332R2N/eIOxRE5\nJ5L6VeszscdE6lSuY3dIQMkPNvwcqAIEAVsx7RLPuHDdBqAppi0lEOgHzM13TgNMAnmA3ATi6rUi\n4kVOnzZjQUaOVAJJz0znlaWv0OWzLjzZ/knm9p/rNQnE3VyZO+saTMljIPAj8BywEXjzEtdlACMw\nDfABwMeYBDTMcXwi8C8gBJjg2JeOaVS/0LUi4oXS0+H++00p5B//sDsae205vIXIOZHUDq7NpmGb\nqFulrt0heZQrxZlfgOuAmcD7QCxmXEeri1xTUlSdJWIzy4Lhw2HPHpg3D8qV0WldM7IyeHPlm4xf\nM54xnccw+LrBJd724aqSngp+IqarbTyma28ocModLy4ivu+tt8yMvMuXl90Esu3INiLnRBJSMYSf\nh/5Mg6oN7A6pxBQlE/lhkk+6m2MpCpVERGz01Vfw1FOwejXUq3fp80ubjKwMxq0ax9jVY3m90+s8\ncv0jXlv6cFZSJZEoYDrnDyi0MAkkENNOMsUdgYiIb1m9Gv72N1iwoGwmkO1HtxM1J4qgwCDWP7Ke\n0Gqhdodki4slkWBgPWb+qg3AQUzmqg20Ba4GPvJ0gCLifXbuhHvvhalT4brr7I6mZGVmZfLOmnd4\nY8UbvHr7qwxrOwx/v7I7l+2lijN+mAGAt2K644KZ9mQFsIrCDSj0BFVniZSwY8fg5pvhiSfg0Uft\njqZk/X7sdwZ/N5jy/uX5pNcnNAppZHdIReLO6izvr7y7OCURkRKUmgpdukC7djB2rN3RlJwsK4v/\nrv0vry17jZc6vsTf2v3Np0sfSiK5lERESohlmTVBUlPhyy/B33e/Qwsl4XgCD333EBYWU3pNoUn1\nJnaHVGxaHldEStxLL5m2kM8+KxsJJMvK4r1179F+cnvubX4vsZGxpSKBuFsZ7dUtIoUxdSpMn27G\ng1x2md3ReN6uE7t4aO5DpGaksvKhlTSr0czukLyWK78namOmHYl2PL8GM227iJQBixfDs8+a6d1r\n1bI7Gs/KsrKYsH4CN350I92bdmf54OVKIJfgSp1YNGYsyIuYqU7KA5uAaz0Yl6vUJiLiQdu2QXi4\naQMp7ass7D65myFzh3Am7QxTe02lec3Su5hqSbeJ1ABmAZmO5+lcfEVDESkFDh0ys/K+/XbpTiCW\nZfHRzx/R9qO2dGnUhZUPrSzVCcTdXGkTSQIud3reHs2dJVKqJSfD3XebxaUeeMDuaDxn76m9PDzv\nYY4lHyM2MpYWtVrYHZLPcaUk8jQwD2iEGWD4GTDSk0GJiH0yM2HgQGjeHP71L7uj8QzLsvhk0ydc\nP+l6bmtwG6u
HrFYCKSJX68TKA1c5zv8N75h8EdQmIuJ2Tz4JmzdDTAwEBtodjfvtP72fR+Y9wqGk\nQ0ztPZVWV3jDqhYlq6TbRMoBdwGdgW6YUshT7nhxEfEu770H0dHwzTelL4FYlsWncZ/SZmIb2tdr\nz9qH15bJBOJurrSJzAPOAVsw65yLSCn0/ffw73/DypUQEmJ3NO518MxBhn0/jN2ndrPgwQVcV7uM\nzRrpQa4kkbp4xyqGIuIhGzfC4MEmkYSF2R2N+1iWxcwtM3ky5kmGtx3O1/d/TWBAKSti2cyVJLIA\nU40V4+FYRMQGe/aYnlgTJ8JNN9kdjfscTjrM8B+Gs+PYDn4c+CM31LnB7pBKJVfaRFYB3wIpwBnH\n47QngxKRknHqlBkL8uSTZn2Q0sCyLGZtnUXrD1tzTY1r+Hnoz0ogHuRK63wicDewFe9rE1HvLJEi\nSk83CaRpU9Og7gOrul7SkbNHeGz+Y/zy5y9M7T2VdnXb2R2SVyrp3ll7gF/wvgQiIkVkWfDYY1C+\nPLz7bulIIF9v+5pWH7aiUbVGbBy2UQmkhLjSJrILWAL8CKQ59lnA254KSkQ8a8wY2LABli+Hcj4+\nl/fR5KOMmD+CTYc28c3939Chfge7QypTXCmJ7AJ+AgIx665XdjxcEYFZo30H8GwBx68GVmPaW57O\ndywRiMdM9rjOxdcTkUuYNQs++MD0xAoOtjua4vn2129pNaEVdSvXZfOwzUogNvBkITYAM7q9M7Af\nWA8MAH51Oqcm0BDoDZwAxjkd2wXcABy/yGuoTUSkEFauhN69YdEiaN3a7miK7vi544z8cSRr969l\nSq8p3NrgVrtD8ikl1SbynuPvvAIec124dzsgAVOiSAe+AHrlO+cIsIELT6NSCmpqRbxDQgL06WNW\nJvTlBDLvt3m0nNCSGpVqEDc8TgnEZherDY0ERpC3dJDNlZ//dYG9Ts/3AYXphW4BizBT0E8EPirE\ntSLi5NgxuOsueOUViIiwO5qiOXHuBE/EPMGKPSuYee9MOoZ2tDsk4eJJJMHxN7aI9y5uPdMtwEFM\nlddCTNvK8vwnjR49Omc7PDyc8NK88IFIEaSkmCqse+6BoUPtjqZo5u+Yz9B5Q7nn6nuIHx5PUGCQ\n3SH5lNjYWGJjYz1y74tVF+3D9MAq6BxXeme1B0ZjGtcBnsd0Ex5TwLkvYdYtKajUc7HjahMRuYis\nLDOte2YmfPEF+LvSlcaLnEo5xVMxT/FT4k98cvcn3B52u90hlQol1SYSgOmFFVzAw5XeWRuApkAo\npmdXPy7clpL/zVRyeo0goCtmAkgRKYRRo2D3bpg2zfcSyIKdC2g5oSWBAYHED49XAvFSF6vOOgS8\nXIx7Z2DaVGIwCeljTM+sYY7jE4HamF5bVTCllL8D1wC1gG+cYpyBmcNLRFz0ySemO+/q1XDZZXZH\n47rTqaf5x4J/ELMzho/v/pgujbvYHZJcxMWKM5uANiUVSBGpOkukAIsWmWqsZcugWTO7o3Hdoj8W\n8fDch+nSqAvjuo2jSoUqdodUKrmzOutiN7kcOOaOF/EgJRGRfLZuhU6d4Ouv4bbb7I7GNUlpSTyz\n8Bm+//17JvWcREQTH+1C5iNKqk3E2xOIiORz8CD06AHjx/tOAolNjKXVhFakZKQQ/2i8EoiP8fFZ\nc0Qk29mz0LMnPPywqcrydmfTzvLcouf4dvu3TOwxke5Xdbc7JCkCH+uvISIFycyEAQOgZUt48UW7\no7m05buX0/rD1pxOO82WR7cogfgwlURESoGnnjIlka+/9u5p3ZPTk3lx8Yt8ue1LJnSfwN3N7rY7\nJCkmJRERH/ff/5reWCtXQqAXLx++au8qouZEcWPdG4kfHs/llS63OyRxAyURER/23XdmbZCVK6Fa\nNbujKdi59HOMWjKKGVtm8P5d73Nv81KyDq8ASiIiPmvDBtOIPn8+hIbaHU3B1uxbQ9ScKFrXbk38\n8HhqBtW0OyRxMyURER+0ezf06gWTJ8ONN9odzflSMlIYHTuaqZun8r87/0ffFn3tDkk8RElExMec\nOgXdu8P//Z9JJN5m/f71RH0XxdU1rib+0XhqBdWyOyTxIC/ux+ESjViXMiU93awLcvXVpkHdm3pi\npWak8srSV5i8aTLvRrxLvxb9skdGi5dx54h1lUREfIRlwfDhULEivPOOdyWQjQc3EjknksYhjYkb\nHkft4Np2hyQlRElExEf8+9+weTMsXQoBAXZHY6RlpvH6steZsGECb3d7m4EtB6r0UcYoiYj4gM8/\nh0mTYM0aCA62Oxoj7lAckXMiqV+1PpuHb6ZO5Tp2hyQ28PWfDGoTkVJv+XLo0wcWLzbTmtgtPTOd\n/6z4D/9b9z/e6vIWg1oPUunDx6hNRKSM+P136NsXpk/3jgSy5fAWor6LolZQLTYO20i9KvXsDkls\npgkYRbzUkSOmK+9rr0HXrvbGkpGVwb+X/5tOn3bisbaPMf+v85VABFBJRMQrpaRA796mFPLww/bG\nsu3INiLnRBJSMYSfh/5Mg6oN7A1IvIqvV2SqTURKnawsM627nx/MnAn+NtUXZGRlMG7VOMauHsvr\nnV7nkesfUdtHKaE2EZFS7MUXYf9+MzOvXQlk+9HtRM2JIigwiPWPrCe0Wqg9gYjXU5uIiBf56COz\nJsicOWZQYUnLzMpk3Kpx3PrJrQxqPYiFDy5UApGLUklExEssWACjRpkuvTVqlPzr/37sdwZ/N5jy\n/uVZ98g6GoU0KvkgxOeoJCLiBeLj4YEHTCmkadOSfe0sK4t317zLzR/fTP8W/fkp8iclEHGZp5NI\nBLAd2AE8W8Dxq4HVQArwdCGvFSkVDhyAnj3NhIq33lqyr51wPIHwqeF8te0r1jy8hsdvehx/P/22\nFNd58v+WAOA9TDK4BhgANM93zjHgcWBsEa4V8XlJSdCjBwwbBv37l9zrZllZvLfuPdpPbs89V9/D\n0qilNKnepOQCkFLDk20i7YAEINHx/AugF/Cr0zlHHI/uRbhWxKdlZpquvNdfD88/X3Kvu+vELh6a\n+xCpGamsfGglzWo0K7kXl1LHkyWRusBep+f7HPs8fa2I17Ms+PvfzaDCCRNKZlr3LCuLCesn0G5y\nO7o37c7ywcuVQKTYPFkSKc4oQJevHT16dM52eHg44eHhxXhZkZLxzjtmSvcVK6B8ec+/3u6Tuxky\ndwhn0s6wLGoZzWuqdrgsiY2NJTY21iP39uTvn/bAaEy7BsDzQBYwpoBzXwKSgHGFvFYj1sXnfPst\njBgBq1dDAw/PIGJZFpM3TuaFn17g6Q5P84+b/0E5f/XsL+t8ZcT6BqApEAocAPphGsgLkv/NFOZa\nEZ+xbh0MHQrR0Z5PIHtP7eXheQ9zLPkYSyKXcG2taz37glImebJNJAMYAcQA24BZmIbxYY4HQG1M\n28eTwD+BPUDwRa4V8VmJiWZSxY8/hhtu8NzrWJbFlE1TuH7S9dzW4DZW
D1mtBCIe4+uzqak6S3zC\nyZNw883w6KPw+OOee539p/cz9PuhHDhzgGm9p9HqilaeezHxWe6sztKoIhEPS0uDe+81a4J4KoFY\nlsWncZ/SZmIb2tVpx7qH1ymBSIlQSUTEgywLBg82JZHZsyEgwP2vcfDMQYZ9P4zEk4lM6z2NNle2\ncf+LSKmikoiIj3jtNfjlF5gxw/0JxLIsZsTPoPWHrWl9RWs2DN2gBCIlTn39RDxkxgzTiL5mDQQF\nuffeh5MOM/yH4ew4toP5A+fTtk5b976AiItUEhHxgKVL4ckn4YcfoHZt993XsixmbZ1Fqw9b0bxG\nc34e+rMSiNhKJRERN/vtN7j/fvj8c2jRwn33PXL2CI/Nf4ytf25l3oB5tKvbzn03FykilURE3OjI\nEbjrLvjPf+COO9x339nbZtPqw1Y0qtaITcM2KYGI11BJRMRNzp2Du+82M/MOHuyeex5NPsqI+SPY\ndGgT39z/DR3qd3DPjUXcRCURETfIyoJBgyAsDF591T33nLN9Dq0mtKJu5bpsHrZZCUS8kkoiIm7w\n3HNw+DAsXFj8ad2PnzvOyB9Hsnb/Wr7s+yW3Nijh5Q5FCkElEZFimjgRvvvOzM5boULx7jXvt3m0\nnNCSyy9eFCcaAAAVfElEQVS7nM3DNiuBiNfTiHWRYoiONu0fy5dDk2KsLnsy5SRPRD/B8j3L+eTu\nT+gY2tF9QYrkoxHrIl4gLs60g8yeXbwEMn/HfFpOaElwYDBxw+OUQMSnqE1EpAj274eePeG998zs\nvEVxKuUUT8U8xU+JPzGt9zQ6hXVyb5AiJUAlEZFCOnMGuneHv/3NDCosigU7F9ByQksCAwKJHx6v\nBCI+S20iIoWQkWHGgtSrZxrUC9sT63Tqaf6x4B/E7Ixhcs/JdGncxTOBilyE2kREbGBZMHIkZGbC\n++8XPoEs/mMxrSa0wrIs4ofHK4FIqaA2EREXvf02rFhhHuXLu35dUloSzyx8hnm/z+Ojnh8R0STC\nc0GKlDCVRERcMHs2jB9vZuWtUsX162ITY2k1oRXnMs6x5dEtSiBS6qhNROQS1qwxPbEWLIA2Lq75\ndDbtLM8vfp5vfv2GiT0m0v2q7p4NUqQQ1CYiUkL++APuuQemTnU9gSzfvZzWH7bmZMpJtjy6RQlE\nSjW1iYhcwIkTpivvP/9p/l5KcnoyLy5+kVm/zGJC9wn0urqX54MUsZlKIiL5ZGXB4sVmXZA77zTj\nQS5l1d5VXPfhdfyZ/CdbHt2iBCJlhqdLIhHAO0AAMBkYU8A5/wXuBJKBKGCTY38icBrIBNIBrcIj\nHvXHH6baato0CAmBIUPgsccufs259HOMWjKKGVtm8P5d73Nv83tLJFYRb+HJJBIAvAd0BvYD64G5\nwK9O59wFNAGaAjcBE4D2jmMWEA4c92CMUsYlJcHXX8OUKbBtG/z1r2ZG3uuuu/S1a/etJXJOJK1r\ntyZ+eDw1g2p6PmARL+PJJNIOSMCUKAC+AHqRN4ncDUxzbK8FqgFXAIcd+3y995h4Icsys+5OmWKm\nb//LX+Dvf4cePSAw8NLXp2SkMDp2NFM3T+W/d/6X+1sUce4TkVLAk0mkLrDX6fk+TGnjUufUxSQR\nC1iEqc6aCHzksUilTNizx1RVTZ0KFSuaKdzfeANq13b9Huv3ryfquyiaXd6MuOFxXBF8hcfiFfEF\nnkwirg7guFBp41bgAFATWAhsB5bnP2n06NE52+Hh4YSHhxcmRinlkpNNaWPKFNi0Cfr1gy++gLZt\nCzdtSWpGKq8ue5WPNn7EO93eof+1/bP72ot4vdjYWGJjYz1yb0/+K2gPjMY0rgM8D2SRt3H9QyAW\nU9UFJlF0JLc6K9tLQBIwLt9+DTaU81gWrF5tShxffw033QRRUdCrlymBFNbGgxuJnBNJo5BGTOwx\nkdrBhSi6iHghdw429GRJZAOmwTwUU6LoBwzId85cYAQmibQHTmISSCVMw/wZIAjoCrzswVilFNi/\nHz791CQPMNVVW7ZA3bpFu19aZhqvL3udCRsm8Ha3txnYcqBKHyL5eDKJZGASRAwmIXyMaVQf5jg+\nEZiP6aGVAJwFBjuO1Qa+cYpxBrDAg7GKj0pJMb2ppk6FtWvhvvtM1VWHDoWfZRcgMyuT9QfWE5MQ\nw6xfZtEopBGbh2+mTuU6bo9dpDTw9Z9Vqs4qgywLNmwwyWLWLDMdyeDBZnqSSpUKf78DZw4QkxBD\n9M5oFv2xiDqV6xDROIK7mt5FeGi4Sh9S6rizOsvX/3UoiZQhhw7B9Omm1HHunGnnGDQIGjYs3H1S\nM1JZuXcl0QnRRCdEs+/0Pjo36kxEkwi6Nu5KvSr1PBG+iNdQEsmlJFLKpaXB99+bUsfy5aa0MXiw\nGdtRmAJCwvEEohOiidkZw9LEpVxT8xoimkTQrXE3bqx7I+X8NY2clB1KIrmUREqpzZtN4vj8c7jm\nGlPquO8+CA527fqktCSW7FqSkziS05Pp1qQbEY0j6NyoM5dXutyj8Yt4M1/pnSVSKEeOwMyZJnmc\nOGESx+rV0Ljxpa+1LIv4w/HE7IwhOiGa9QfW065uOyIaR/BNv29oWaul2jZEPMDX/1WpJOLj0tMh\nOtokjp9+Mos/RUXB7beD/yXmmD6WfIyFfywkZmcMMQkxVCpfiYgmEUQ0iSA8NJzgQBeLLSJljKqz\ncimJ+KitW00D+fTppqQxeDD07QtVq174moysDNbvX59TRbXtyDbCQ8Pp1rgb3Zp0o0n1JiUWv4gv\nUxLJpSTiQ44fN20cU6fCwYOmZ1VkJDRrduFr9p3eR0xCDDE7Y1j0xyLqV61PROMIujXpxi31b6FC\nuQolFr9IaaEkkktJxMtlZpq1yadOhZgYiIgwpY7OnSEg4PzzUzJSWLFnRU5p48CZA3Rp1CWn+60G\n/YkUn5JILiURL/Xbb6ad47PPoF49087Rv79Z7MmZZVnsOL4jZ7Df8t3LubbWtXRr3I2IJhG0rdOW\nAP8Cso2IFJmSSC4lES9y6pQZQT51KuzaBQ88YJJHixZ5zzuTeoafdv2U05MqNTM1p4qqc6POVL+s\nuh3hi5QZSiK5lERslpVlelVNmQI//GCqqaKiTLVVOUcHcsuyiDscl1NFteHABtrXa59T2mhRs4W6\n34qUICWRXEoiNtm5M3c98ho1TDvHgAFmG+Bo8lEW7lxI9M5oYhJiqFKhSs4I8fDQcIICg2yNX6Qs\nUxLJpSRSgs6cyV2PfPt2GDjQlDpatzbdb9fuW5tT2vjt2G+Eh4bnVFM1Cmlkd/gi4qAkkktJxMOy\nsmDZMlPqmDMHOnY0pY677oLD5/bmtGss3rWY0GqhOVVUN9e/mcAAFxYsF5ESpySSS0nEQxITcxd4\nCgoyiaNPvxR+S1mW05PqcNJhujbuSrfG3ejauCtXVr7S7rBFxAVKIrmURNwoORlmzzaJIy4O+vW3\n6NT3d/ZWiCZmZzQr96yk5RU
tiWhspha5/srr1f1WxAcpieRSEikmy4JVq0zimD0b2t5ymta9f+Jk\njWgW7oom08rMade4I+wOQi4LueQ9RcS7KYnkUhIpon37THXVlKlZZNTYTLPu0ZyqEcPW4xvpUK9D\nzkSGzWs0V/dbkVJGSSSXkkghpKSYxvFJ04+w9tgC6t4WzdFqC6hVOSSnQbxjaEcqlS/CGrMi4jOU\nRHIpiRQgI8M0jG/7NYsN2w+yeXcivx9OJDH5FwKvWUBmlQTuaHw73a8y1VSh1ULtDllESpCSSK4y\nnUSOH7dYveUwa7YnEr93FwlHEzmQnMhp/134V08kq/IeKlKNWoGhhIWE0aZBU3q17EyHeh0oH1De\n7vBFxCZKIrlKdRKxLItDp4+y6tddrN+RyNZ9ifxxfBeHUhM57Z9IZvBuymUFUdUKo3bFUBpVD6VV\n/TDaNg2lWa1QGlZrqKopETmPLyWRCOAdIACYDIwp4Jz/AncCyUAUsKkQ1/p0ErEsixMpJ9h1Yhe/\n7E9kw85dbDuQSOLJRP5M30VSuUSs9IoEng0lxC+MOpVCaVIzlNYNQunQPIwbmzakcgWt3iciheMr\nSSQA+A3oDOwH1gMDgF+dzrkLGOH4exPwLtDexWvBC5NIbGws4eHhOc9Pppwk8aRJDDuP72LLnkS2\nH05kz+ldHM1IJCvTH7+TYVgnQ7k8IJT6wWFcVSuUNmGh3HxNKG2uqUIlNxQm8sflDRSTaxST67wx\nLm+MyZ1JpJw7bnIB7YAEINHx/AugF3kTwd3ANMf2WqAaUBsIc+Faj7Isi7TMNJLSkjibftb8TTN/\nnfc57z+bfpZl05ZTd29DEo7sZt/ZRDIyM6iYEoZ1IpTkA2FUzgylYdVwOtYO44bGobRpXo2rr4Y6\ndcCTPWm98X9kxeQaxeQ6b4zLG2NyJ08mkbrAXqfn+zCljUudUxeo48K1LtlwYAMHzxzkdOppjief\n4vjZ05xIPs2Jc6c4de40p1JPcyb1NGfST3M2PYlzGWdJyUoiJessfvgRSDDlrCDKZQYTkBmMX2YQ\nfmnBkBaMlRpEVmowmcnBZJyrTPrZK0n9fTfbf3+QsGqhRNQJpVXT6lzdxo+rr4amTXFLqUJExFt4\nMom4Ws/k0XaZe1+ZwoHk3WSdq4KVUpXymVUol1WFwKw6VKAKgVYVLvOvQkX/ytQIrExwhSCqVAym\nSsUgqgQFEhRkvviDgnBpe+zY47z6ah9PviURkTKhPRDt9Px54Nl853wI9Hd6vh24wsVrwVR5WXro\noYceehTqkYAPKAfsBEKBQGAz0DzfOXcB8x3b7YE1hbhWRERKuTsxvawSMKUJgGGOR7b3HMfjgOsv\nca2IiIiIiIi9IjBtKDsouL3EnT4BDgNbnPZVBxYCvwMLMN2Tsz3viGs70NVp/w2Oe+zAjIkpjvrA\nEuAXYCsw0gviqojpqr0Z2Aa84QUxZQvADGSd5yUxJQLxjpjWeUlM1YCvMV3pt2F6RNodUzPMZ5T9\nOIX5f93uuJ7H/NvbAswEKnhBTH933GurYxsviMlrBWCquUKB8ni+zeQvQBvyJpE3gWcc288C/3Fs\nX+OIp7wjvgRye6Ctw4yfAdMWFFGMmGoD1zm2gzFVf829IK7sTszlMG1ct3pBTABPATOAuY7ndse0\nC/MP3JndMU0DHnJslwOqekFMzvyBg5gfUHbGFQr8gUkcALOASJtjuhbz/VQR8/24EGhsc0xerQN5\ne28953h4Uih5k0h2TzIwX+jbHdv5e5JFYzoNXEnewZL9Mb3T3GUOZoS/t8RVCTPTQAsviKkesAi4\nndySiN0x7QIuz7fPzpiqYr4Y87P7c3LWFVjuBXFVx/xoC8Ek23lAF5tjug8zPVS2f2KSh8dj8i9i\nwHa70CDFknQFpooLx9/s/1B1HPFkcx5A6bx/P+6LORRTUlrrBXH5Y37hHCa3us3umMYD/wdkOe2z\nOyYLk9g2AI94QUxhwBFgCrAR+AgIsjmm/PoDnzu27YzrODAO2AMcAE5ifvnbGdNWTI1JdcwPuLsw\nP548HpOvJhHL7gDyye57bYdgYDamDvRMvmN2xJWFqWarB9yG+fVvZ0w9gD8x9ekXGthqx+d0Cybx\n3wn8DfMFYGdM5TC9Iz9w/D3L+aV7O/8/DwR6Al8VcKyk42oMPIH58VYH82/wAZtj2o6ZpHYB8CPm\nh1xmScTkq0lkP6ZeNFt98mbPknAYUzwEUwT807GdP7Z6mNj2O7ad9+8vZgzlMQnkM0x1lrfEBaYB\n9AdMI52dMd2MmaNtF+ZXbCfM52X353TQ8fcI8C2mDtrOmPY5Husdz7/GJJNDNsbk7E7gZ8znBfZ+\nVm2BVcAxIAP4BlPFbvdn9Ykjto7ACUxjut3/n3stOwYjhnJ+w3p2neJznN9gFYipIthJ7i/gtZge\nL34Uv8HKD/gUU1XjzM64apDb++MyYBlwh80xOetIbpuInTFVAio7toOAlZj6frs/p2XAVY7t0Y54\n7I4p2xeYxutsdsbVGlN9dJnjXtMwpUm7P6tajr8NMO0a2R0jvOG/n1cqycGIn2PqPtMwbTGDMXWP\niyi469wLjri2A92c9md3nUvArKNSHLdiqo42k9v9McLmuFpi6tM3Y7qv/p9jv92fVbaO5PbOsjOm\nMMxntBnzZZT9/6/dn1NrTEkkDvPruqoXxAQm0R4lN/HiBXE9Q24X32mYWgG7Y1rmiGkzudXIdsck\nIiIiIiIiIiIiIiIiIiIiIiIiIiIiIt4vCzPqO1s5zEjieQWfnqM1ZnzPhdxA8aeerkPBU2MUxSLy\njk3I7yOKPsg1lLyDVwtrNPC0Y/tlzKDOktaToi/DcAW5q5qKSBlzBjOosKLj+Z2YwY5zL3iFEQX8\n7wLHyrklMvfpBLzvwfuHUrwk8hK5ScRXzSDvaqbig3x17iyx33ygu2N7AGZUf/a0Ce0wcwttxEzp\ncRVmeoVXgH6YhHM/5tf0Z8AKzBQuztOSvAOMcmx3A5YWEENHckfrb8SMbA4l98t5stPxP53u93+Y\nNRPiHDEU5K/Ad47tIMw8YJsd9+7r2B9L7pdgEvCa45zV5E5B0Rizrkq843j+STLBrP/wllNMQy8Q\n04uYWRqWYxZryp5MbyrQx7GdCPzb8Z43OOJbgBl97LwsdUGfQShmuoxJmJH0MeT+UBiJGQ0dh1mE\nCfL+KAgFfnIcX0TuvExTMaXLlZipNbLjBPOjY8AF3quIlGJnMFOcfIVZmGcTeRNAZcwXI5g1Tr52\nbEeSdxqF0ZhpNrIX9wl3usdlmC+y2zHTMoQVEMdczMR3YOajCqDgX/gNMV+A9TFzVE107Pd3vF7+\nGXTBfJlmLxrVB/PFmq2K4+8ScpNIFrlJdQzmCx/ge0ziBPMlnp1EnOMc6nR+BcxnEpovnhswiagi\n5vPdgVloC8z07fc6tneRmyzedlwThJnX7JBj/4U+g1AgHWjlODYLGOjY3o+Z2sP5/UeS
"<base64 PNG data omitted: the rendered benchmark plot, committed to this repo as matrix_benchmark.png>",
       "text": [
        ""
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8)

project(xnor_lstm C CXX)

include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)
if(COMPILER_SUPPORTS_CXX11)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
else()
    message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()
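
# Typical out-of-source build (illustrative; the exact commands are not
# documented in this repo):
#   mkdir build && cd build && cmake .. && make
# This produces bnmatmul_op.so, which bn_ops.py loads via tf.load_op_library().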
Please use a different C++ compiler.") 14 | endif() 15 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -msse4.2 -D GOOGLE_CUDA=1") ## Optimize 16 | 17 | # CUDA flags 18 | find_package(CUDA QUIET REQUIRED COMPONENTS cuda cudart) 19 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 -D GOOGLE_CUDA=1 -x cu -Xcompiler" ) 20 | 21 | execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_include())" 2>/dev/null 22 | OUTPUT_VARIABLE TF_INCLUDE) 23 | 24 | MESSAGE(STATUS "Found Tensorflow headers: " ${TF_INCLUDE}) 25 | include_directories(${TF_INCLUDE}) 26 | 27 | #include 28 | cuda_add_library(bnmatmul_op_gpu SHARED 29 | bnmatmul_op_gpu.cu) 30 | CUDA_ADD_CUBLAS_TO_TARGET(bnmatmul_op_gpu) 31 | target_link_libraries(bnmatmul_op_gpu 32 | ${CUDA_LIBRARIES}) 33 | 34 | add_library(bnmatmul_op SHARED 35 | bnmatmul_op.cc) 36 | SET_TARGET_PROPERTIES(bnmatmul_op PROPERTIES PREFIX "") 37 | target_link_libraries(bnmatmul_op 38 | bnmatmul_op_gpu 39 | ) 40 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Weiyi Zheng, Yina Tang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This module is a prototype for complete the implementation of the xnor kernel on CUDA. With a tensorflow interface. 2 | 3 | Heavily inspired by the [original implementation](https://github.com/MatthieuCourbariaux/BinaryNet) in Theano by Matthieu Courbariaux 4 | 5 | Major feature: 6 | 7 | 1. Supports arbitrary size matrices. 8 | 2. Comes with Tensorflow Binding 9 | 10 | ### Speed up 11 | Generated with the ipython notebook that is also in this repo. benchmark ran with CUDA 7.5, cuDNN v4 on Titan Black, Intel core i7-5820K 12 | 13 | ![Speed Up comparison with cublas](matrix_benchmark.png?raw=true "Comparison") 14 | 15 | Note: This code probably not the most optimized code, since it's my first CUDA program. 
--------------------------------------------------------------------------------
/bn_ops.py:
--------------------------------------------------------------------------------
"""Cuda op Python library."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path

import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import common_shapes
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import array_ops


#if tf.test.is_built_with_cuda():
_cuda_op_module = tf.load_op_library(os.path.join(
    tf.resource_loader.get_data_files_path(), 'bnmatmul_op.so'))

bn_matmul = _cuda_op_module.bn_matmul
ops.RegisterShape("BnMatmul")(common_shapes.matmul_shape)
@ops.RegisterGradient("BnMatmul")
def _BnMatMulGrad(op, grad):
  t_a = op.get_attr("transpose_a")
  t_b = op.get_attr("transpose_b")
  if not t_a and not t_b:
    return (math_ops.matmul(grad, op.inputs[1], transpose_b=True),
            math_ops.matmul(op.inputs[0], grad, transpose_a=True))
  elif not t_a and t_b:
    return (math_ops.matmul(grad, op.inputs[1]),
            math_ops.matmul(grad, op.inputs[0], transpose_a=True))
  elif t_a and not t_b:
    return (math_ops.matmul(op.inputs[1], grad, transpose_b=True),
            math_ops.matmul(op.inputs[0], grad))
  elif t_a and t_b:
    return (math_ops.matmul(op.inputs[1], grad, transpose_a=True,
                            transpose_b=True),
            math_ops.matmul(grad, op.inputs[0], transpose_a=True,
                            transpose_b=True))
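
# Note on the gradient above: it is a straight-through style estimator -- the
# backward pass reuses full-precision math_ops.matmul on the op's inputs
# instead of differentiating the XNOR kernel itself.
#
# Hypothetical smoke test (not part of the original module); run as
# `python bn_ops.py` after building bnmatmul_op.so:
if __name__ == "__main__":
    import numpy as np
    with tf.Session() as sess:
        a = tf.sign(tf.random_uniform([128, 128], -1., 1.))
        b = tf.sign(tf.random_uniform([128, 128], -1., 1.))
        ref, xnor = sess.run([tf.matmul(a, b), bn_matmul(a, b)])
        # on +/-1 matrices the two products should agree exactly
        print("max abs difference:", np.max(np.abs(ref - xnor)))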
45 | )doc"); 46 | 47 | REGISTER_OP("ReduceMatmul") 48 | .Input("a: T") 49 | .Input("b: T") 50 | .Output("product: T") 51 | .Attr("T: {half, float, double, int32}") 52 | .Doc("TODO"); 53 | REGISTER_OP("BnMatmul") 54 | .Input("a: T") 55 | .Input("b: T") 56 | .Output("product: T") 57 | .Attr("transpose_a: bool = false") 58 | .Attr("transpose_b: bool = false") 59 | .Attr("T: {half, float, double, int32}") 60 | .Doc("TODO"); 61 | 62 | 63 | template 64 | class DumbMatmulOp : public OpKernel { 65 | public: 66 | explicit DumbMatmulOp(OpKernelConstruction* context) : OpKernel(context) {} 67 | 68 | void Compute(OpKernelContext* ctx) override { 69 | // Grab the input tensor 70 | const Tensor& a = ctx->input(0); 71 | const Tensor& b = ctx->input(1); 72 | //auto input = input_tensor.flat(); 73 | 74 | 75 | // Check that the dimensions of the two matrices are valid. 76 | OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()), 77 | errors::InvalidArgument("In[0] is not a matrix")); 78 | OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()), 79 | errors::InvalidArgument("In[1] is not a matrix")); 80 | Eigen::array, 1> dim_pair; 81 | // XXX 82 | bool transpose_a_ = false; 83 | bool transpose_b_ = false; 84 | dim_pair[0].first = transpose_a_ ? 0 : 1; 85 | dim_pair[0].second = transpose_b_ ? 1 : 0; 86 | OP_REQUIRES(ctx, 87 | a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), 88 | errors::InvalidArgument("Matrix size-compatible: In[0]: ", 89 | a.shape().DebugString(), ", In[1]: ", 90 | b.shape().DebugString())); 91 | 92 | // Create an output tensor 93 | int a_dim_remaining = 1 - dim_pair[0].first; 94 | int b_dim_remaining = 1 - dim_pair[0].second; 95 | TensorShape out_shape( 96 | {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)}); 97 | Tensor* out = nullptr; 98 | OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); 99 | auto matA = a.matrix(); 100 | auto matB = b.matrix(); 101 | auto matOut = out->matrix(); 102 | 103 | //auto output_flat = out->template flat(); 104 | 105 | // Set all the elements of the output tensor to 0 106 | for ( int r = 0; r < out->dim_size(0); r++ ) 107 | { 108 | for ( int c = 0; c < out->dim_size(1); c++ ) 109 | { 110 | matOut(r,c) = 0; 111 | for ( int n = 0; n < a.dim_size(dim_pair[0].first); n++ ) 112 | { 113 | matOut(r,c) += matA(r, n) * matB(n, c); 114 | } 115 | } 116 | } 117 | } 118 | }; 119 | // Note that TypeConstraint("T") means that attr "T" (defined 120 | // in the Op registration above) must be "int32" to use this template 121 | // instantiation. 122 | REGISTER_KERNEL_BUILDER( 123 | Name("DumbMatmul") 124 | .Device(DEVICE_CPU) 125 | .TypeConstraint("T"), 126 | DumbMatmulOp); 127 | REGISTER_KERNEL_BUILDER( 128 | Name("DumbMatmul") 129 | .Device(DEVICE_CPU) 130 | .TypeConstraint("T"), 131 | DumbMatmulOp); 132 | REGISTER_KERNEL_BUILDER( 133 | Name("DumbMatmul") 134 | .Device(DEVICE_CPU) 135 | .TypeConstraint("T"), 136 | DumbMatmulOp); 137 | 138 | template 139 | class ReduceMatmulOp : public OpKernel { 140 | public: 141 | explicit ReduceMatmulOp(OpKernelConstruction* context) : OpKernel(context) { 142 | } 143 | 144 | void Compute(OpKernelContext* ctx) override { 145 | // Grab the input tensor 146 | const Tensor& a = ctx->input(0); 147 | const Tensor& b = ctx->input(1); 148 | //auto input = input_tensor.flat(); 149 | 150 | 151 | // Check that the dimensions of the two matrices are valid. 
#define INTWIDTH 64
template <typename Device, typename T>
class BnMatmulOp : public OpKernel {
 public:
  typedef Eigen::Matrix<uint64_t, Eigen::Dynamic, Eigen::Dynamic> MaskMatrix;

  explicit BnMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    // Grab the input tensors
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);


    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                errors::InvalidArgument("In[0] is not a matrix"));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
                errors::InvalidArgument("In[1] is not a matrix"));
    OP_REQUIRES(ctx, transpose_a_ == false, errors::InvalidArgument("transpose not supported yet"));
    OP_REQUIRES(ctx, transpose_b_ == false, errors::InvalidArgument("transpose not supported yet"));

    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
    dim_pair[0].first = transpose_a_ ? 0 : 1;
    dim_pair[0].second = transpose_b_ ? 1 : 0;
    OP_REQUIRES(ctx,
                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
                                        a.shape().DebugString(), ", In[1]: ",
                                        b.shape().DebugString()));

    // Create an output tensor
    int a_dim_remaining = 1 - dim_pair[0].first;
    int b_dim_remaining = 1 - dim_pair[0].second;
    TensorShape out_shape(
        {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
    auto matA = a.matrix<T>();
    auto matB = b.matrix<T>();
    auto matOut = out->matrix<T>();

    //auto output_flat = out->template flat<T>();
    auto d = ctx->eigen_device<CPUDevice>();
    concatenate_and_compute(d, matA, matB, matOut);

  }
 private:
  bool transpose_a_;
  bool transpose_b_;

  // Pack the sign bits of each row, 64 columns per uint64_t word.
  void concatenate_row(
      typename MatMulTypes<T>::in_type array,
      MaskMatrix &out)
  {
    int colSize = int((array.dimension(1)+INTWIDTH-1 )/ INTWIDTH);
    out.resize(array.dimension(0), colSize);
    for ( int r = 0; r < array.dimension(0); ++ r )
    {
      for ( int c=0; c< colSize; ++ c)
      {
        uint64_t rvalue=0;
        uint64_t sign;
        for ( int i=0; i< INTWIDTH; ++i ) {
          int colIdx = c*INTWIDTH + i;
          if ( colIdx > array.dimension(1)-1 ) {
            break;
          }
          sign = (array(r, colIdx)>=0);
          rvalue = rvalue | (sign << i);
        }
        out(r, c) = rvalue;
      }
    }
  }

  // Same row packing, but writing into a TensorFlow Tensor.
  void concatenate_row(
      typename MatMulTypes<T>::in_type array,
      Tensor &out)
  {
    int colSize = int((array.dimension(1)+INTWIDTH-1 )/ INTWIDTH);
    TensorShape b_shape(
        {array.dimension(0), colSize});
    out.set_shape(b_shape);
    auto out_ = out.matrix<int64>();
    for ( int r = 0; r < array.dimension(0); ++ r )
    {
      for ( int c=0; c< colSize; ++ c)
      {
        uint64_t rvalue=0;
        uint64_t sign;
        for ( int i=0; i< INTWIDTH; ++i ) {
          int colIdx = c*INTWIDTH + i;
          if ( colIdx > array.dimension(1)-1 ) {
            break;
          }
          sign = (array(r, colIdx)>=0);
          rvalue = rvalue | (sign << i);
        }
        out_(r, c) = rvalue;
      }
    }
  }

  // Pack the sign bits of each column, 64 rows per uint64_t word; the result
  // is stored transposed so each packed column is contiguous.
  void concatenate_col(
      typename MatMulTypes<T>::in_type array,
      MaskMatrix &out)
  {
    int rowSize = int((array.dimension(0)+INTWIDTH-1)/ INTWIDTH);
    out.resize(array.dimension(1), rowSize );

    for ( int c=0; c< array.dimension(1); ++ c)
    {
      for ( int r = 0; r < rowSize; ++ r )
      {
        uint64_t rvalue=0;
        uint64_t sign;
        for ( int i=0; i< INTWIDTH; ++i ) {
          int rowIdx = r*INTWIDTH + i;
          if ( rowIdx > array.dimension(0)-1 ) {
            break;
          }
          sign = (array(rowIdx, c )>=0);
          rvalue = rvalue | (sign << i);
        }
        out(c, r) = rvalue;
      }
    }
  }

  // Same column packing, but writing into a TensorFlow Tensor.
  void concatenate_col(
      typename MatMulTypes<T>::in_type array,
      Tensor &out)
  {
    int rowSize = int((array.dimension(0)+INTWIDTH-1)/ INTWIDTH);
    TensorShape b_shape(
        {array.dimension(1), rowSize});
    out.set_shape(b_shape);
    auto out_ = out.matrix<int64>();

    for ( int c=0; c< array.dimension(1); ++ c)
    {
      for ( int r = 0; r < rowSize; ++ r )
      {
        uint64_t rvalue=0;
        uint64_t sign;
        for ( int i=0; i< INTWIDTH; ++i ) {
          int rowIdx = r*INTWIDTH + i;
          if ( rowIdx > array.dimension(0)-1 ) {
            break;
          }
          sign = (array(rowIdx, c )>=0);
          rvalue = rvalue | (sign << i);
        }
        out_(c, r) = rvalue;
      }
    }
  }

  typedef std::chrono::high_resolution_clock Time;
  typedef std::chrono::milliseconds ms;
  typedef std::chrono::duration<float> fsec;

  void concatenate_and_compute(
      const CPUDevice &d,
      typename MatMulTypes<T>::in_type a,
      typename MatMulTypes<T>::in_type b,
      typename MatMulTypes<T>::out_type out)
  {
    MaskMatrix a_;
    MaskMatrix b_;
    auto t0 = Time::now();
    concatenate_row(a, a_);
    concatenate_col(b, b_);
    auto t1 = Time::now();
    ms d1 = std::chrono::duration_cast<ms>(t1-t0);

    // major time consumer
    // version 1: XOR + popcount over the packed words, unrolled 4x
    int loopsize = int(a_.cols() /4) * 4 ;
    for (int ar=0; ar < a_.rows(); ar++)
    {
      for (int br=0; br< b_.rows(); br++) {
        unsigned int Cvalue = 0;
        for (int c=0; c< loopsize; c += 4)
        {
          Cvalue +=__builtin_popcountll(a_(ar, c) ^ b_(br,c));
          Cvalue +=__builtin_popcountll(a_(ar, c+1) ^ b_(br,c+1));
          Cvalue +=__builtin_popcountll(a_(ar, c+2) ^ b_(br,c+2));
          Cvalue +=__builtin_popcountll(a_(ar, c+3) ^ b_(br,c+3));
          //unsigned int value =popcnt(a_(ar, c) ^ b_(br,c));
          //unsigned int value =__builtin_popcount(a_(ar, c) ^ b_(br,c));
          //unsigned int value =__builtin_popcountll(a_(ar, c) ^ b_(br,c));
          //Cvalue += value;
        }
        // handle the ragged tail the unrolled loop missed
        for ( int c=loopsize; c< a_.cols(); c++ )
        {
          Cvalue +=__builtin_popcountll(a_(ar, c) ^ b_(br,c));
        }
        // dot product of +/-1 vectors: n - 2 * (number of mismatching signs)
        out(ar, br) = - ( 2*(float)Cvalue - a.dimension(1) );
      }
    }
    auto t2 = Time::now();
    ms d2 = std::chrono::duration_cast<ms>(t2-t1);

  }
};
REGISTER_KERNEL_BUILDER(
    Name("BnMatmul")
        .Device(DEVICE_CPU)
        .TypeConstraint<float>("T"),
    BnMatmulOp<CPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("BnMatmul")
        .Device(DEVICE_CPU)
        .TypeConstraint<double>("T"),
    BnMatmulOp<CPUDevice, double>);
REGISTER_KERNEL_BUILDER(
    Name("BnMatmul")
        .Device(DEVICE_CPU)
        .TypeConstraint<int32>("T"),
    BnMatmulOp<CPUDevice, int32>);
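
// The GPU op below only validates shapes and forwards the raw float buffers
// to XNORGemmKernelDevice (defined in bnmatmul_op_gpu.cu), which does the
// bit packing and the tiled XNOR GEMM on the device.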

// GPU implementation
template <typename Device, typename T>
class BnMatmulGPUOp : public OpKernel {
 public:
  //typedef Eigen::Matrix<uint64_t, Eigen::Dynamic, Eigen::Dynamic> MaskMatrix;
  //typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic> MatrixXT;

  explicit BnMatmulGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    // Grab the input tensors
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);


    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                errors::InvalidArgument("In[0] is not a matrix"));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
                errors::InvalidArgument("In[1] is not a matrix"));
    // TODO: support transpose
    OP_REQUIRES(ctx, transpose_a_ == false, errors::InvalidArgument("transpose not supported yet"));
    OP_REQUIRES(ctx, transpose_b_ == false, errors::InvalidArgument("transpose not supported yet"));
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
    dim_pair[0].first = transpose_a_ ? 0 : 1;
    dim_pair[0].second = transpose_b_ ? 1 : 0;
    OP_REQUIRES(ctx,
                a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
                errors::InvalidArgument("Matrix size-compatible: In[0]: ",
                                        a.shape().DebugString(), ", In[1]: ",
                                        b.shape().DebugString()));

    // Create an output tensor
    int a_dim_remaining = 1 - dim_pair[0].first;
    int b_dim_remaining = 1 - dim_pair[0].second;
    TensorShape out_shape(
        {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
    auto matA = a.flat<T>().data();
    auto matB = b.flat<T>().data();
    auto matOut = out->flat<T>().data();

    XNORGemmKernelDevice(ctx, matA, matB, a.dim_size(a_dim_remaining), a.dim_size(dim_pair[0].first), b.dim_size(b_dim_remaining),
                         matOut);

  }
 private:
  bool transpose_a_;
  bool transpose_b_;
};

REGISTER_KERNEL_BUILDER(
    Name("BnMatmul")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
    BnMatmulGPUOp<GPUDevice, float>);

}  // namespace tensorflow
--------------------------------------------------------------------------------
/bnmatmul_op.h:
--------------------------------------------------------------------------------
#pragma once

#include <tensorflow/core/framework/op.h>
#include <tensorflow/core/framework/op_kernel.h>
#include <unsupported/Eigen/CXX11/Tensor>

namespace tensorflow {

template <typename T>
struct MatMulTypes {
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>, Eigen::Aligned>
      out_type;
  typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor>,
                           Eigen::Aligned> in_type;
};

void XNORGemmKernelDevice(OpKernelContext *ctx, const float* in0, const float* in1, const int m, const int n, const int k, float* out);
}
--------------------------------------------------------------------------------
/bnmatmul_op_gpu.cu:
--------------------------------------------------------------------------------
#if GOOGLE_CUDA
#define EIGEN_USE_GPU

//#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include <cstdio>
#include <cstdlib>
#include <cublas_v2.h>
#include "bnmatmul_op.h"

#define BLOCK_SIZE 16

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code != cudaSuccess)
  {
    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
  }
}

namespace tensorflow {
// 32 single float array -> 32 bits unsigned int
__device__ unsigned int concatenate(const float* array, int size)
{
  unsigned int rvalue=0;
  unsigned int sign;

  for (int i = 0; i < size; i++)
  {
    sign = (array[i]>=0);
    rvalue = rvalue | (sign<<i);
  }

  return rvalue;
}
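
// Worked example of the packing: for the 4 leading values
//   array = [-0.5, 2.0, -1.0, 3.0, ...]
// the signs are [0, 1, 0, 1, ...], so bit i of the result holds the sign of
// array[i] and the low nibble of rvalue is 0b1010 = 0xA. Entries past `size`
// are left as 0 bits; the XOR in xnor_gemm then sees identical padding in
// both operands, so the padding never contributes to the popcount.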

// launch like this (illustrative; XNORGemmKernelDevice below does the real launch):
//   dim3 bDim(blockSize, blockSize);
//   dim3 gDim(n / 32 / blockSize + 1, m / stride / blockSize + 1);
//   concatenate_rows_kernel<<<gDim, bDim>>>(fA, Aconc, m, n, stride);
// n is the original dimension
__global__ void concatenate_rows_kernel(const float *a, unsigned int *b, int m, int n, int stride)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int realCol = col * 32;
  if ( realCol >= n )
    return;
  int startRow = (blockIdx.y * blockDim.y + threadIdx.y) * stride;
  int totalBCol = (n+31)/32;
  int size = 32;
  if ( realCol > (n - size) )
    size = n - realCol;
  for ( int s = 0; s < stride; s++ ) {
    int realRow = startRow + s;
    if ( realRow >= m )
      return;
    int offset = realRow * totalBCol + col;
    int offsetF = realRow * n + realCol;
    b[offset] = concatenate(&a[offsetF], size);
  }
}

// m and n are the rows and cols of the transposed matrix, not the original
// to transpose, use
//   float const alpha(1.0);
//   float const beta(0.0);
//   cublasSgeam( handle, CUBLAS_OP_T, CUBLAS_OP_N, N, K, &alpha, fB, K, &beta, fB, N, BT, N );
__global__ void concatenate_cols_T_kernel(const float *a, unsigned int *b, int m, int n, int stride)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int realCol = col * 32;
  int startRow = (blockIdx.y * blockDim.y + threadIdx.y) * stride;
  if ( realCol >= n )
    return;
  int size = 32;
  if ( realCol > (n - size) )
    size = n - realCol;
  for ( int s = 0; s < stride; s++ ) {
    int realRow = startRow + s;
    if ( realRow >= m )
      return;
    //int offset = realRow * totalBCol + col ;
    int offset = col * m + realRow;
    int offsetF = realRow * n + realCol;
    //int offsetF = col * m + realRow;
    b[offset] = concatenate(&a[offsetF], size);
  }
}

// n is the original dimension
// launch this like
//   dim3 bDim(blockSize, blockSize);
//   dim3 gDim2(n / stride / blockSize + 1, k / 32 / blockSize + 1);
//   concatenate_cols_kernel<<<gDim2, bDim>>>(fB, Bconc, n, k, stride);
__global__ void concatenate_cols_kernel(const float *a, unsigned int *b, int n, int k, int stride)
{

  int startCol = (blockIdx.x * blockDim.x + threadIdx.x) * stride;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int realRow = row * 32;
  float *array = new float[32];
  for ( int s =0; s < stride; ++s )
  {
    int realCol = startCol + s;
    if ( realCol >= k || realRow >= n ) {
      delete[] array;
      return;
    }
    int offset = row * k + realCol;
    int size = 32;
    if ( realRow > (n - size) )
      size = n - realRow;
    for ( int i= 0; i < size; i++ )
    {
      array[i] = a[(realRow + i) * k + realCol];
    }
    b[offset] = concatenate(array, size);
    /*
    if ( realCol == 32 ) {
      printf("size to take %d is %d\n", row, size);
      printf("row %d %u \n", (row), b[offset]);
    }*/
    //printf("processing %d with size %d, starting %.2f, result %d\n", offset, size, a[(realRow*k + realCol)], b[offset]);
  }
  delete[] array;
}

// A is shape (m,n), B is shape (n,k) and C is shape (m,k)
// launch like this (the tiled shared memory requires a compile-time constant):
//   dim3 blockDim(16, 16);
//   dim3 gridDim(N / 16 + 1, N / 16 + 1);
//   xnor_gemm<<<gridDim, blockDim>>>(Aconc, Bconc, fC, m, n, k);
__global__ void xnor_gemm(unsigned int* A, unsigned int* B, float* C, int m, int n, int k)
{

  // Block row and column
  int blockRow = blockIdx.y;
  int blockCol = blockIdx.x;

  // Thread row and column within Csub
  int row = threadIdx.y;
  int col = threadIdx.x;
  int realRow = blockRow*BLOCK_SIZE + row;
  int realCol = blockCol*BLOCK_SIZE+ col;

  // Each thread block computes one sub-matrix Csub of C
  //float* Csub = &C[BLOCK_SIZE * k * blockRow + BLOCK_SIZE * blockCol];
  float* Csub = &C[blockDim.y * k * blockRow + blockDim.y * blockCol];

  int ndex = int((n + 32 - 1)/32);
  // Shared memory used to store Asub and Bsub respectively
  __shared__ unsigned int As[BLOCK_SIZE][BLOCK_SIZE];
  __shared__ unsigned int Bs[BLOCK_SIZE][BLOCK_SIZE];

  // Each thread computes one element of Csub
  // by accumulating results into Cvalue
  // block_size = 16 -> 256 threads, one per Csub element
  unsigned int Cvalue = 0;

  // Loop over all the sub-matrices of A and B that are
  // required to compute Csub
  // Multiply each pair of sub-matrices together
  // and accumulate the results
  for (int i = 0; i < (ndex + BLOCK_SIZE-1) / BLOCK_SIZE; ++i) {
    int ibs = BLOCK_SIZE * i;

    // Get sub-matrix Asub of A
    unsigned int* Asub = &A[BLOCK_SIZE * blockRow * ndex + BLOCK_SIZE * i];

    // Get sub-matrix Bsub of B
    unsigned int* Bsub = &B[BLOCK_SIZE * k * i + BLOCK_SIZE * blockCol];

    // Load Asub and Bsub from device memory to shared memory
    // Each thread loads one element of each sub-matrix
    if ( (ibs + col) < ndex && realRow < m ) {
      As[row][col] = Asub[row*ndex+col];
    } else {
      As[row][col] = 0;
    }

    if ( (ibs + row) < ndex && realCol < k ) {
      Bs[row][col] = Bsub[row*k+col];
    } else {
      Bs[row][col] = 0;
    }

    // Synchronize to make sure the sub-matrices are loaded
    // before starting the computation
    __syncthreads();

    // Multiply Asub and Bsub together
    // THIS IS THE MOST INTERESTING PART
    for (int j = 0; j < BLOCK_SIZE; ++j) {
      Cvalue += __popc(As[row][j]^Bs[j][col]);
    }

    // Synchronize to make sure that the preceding
    // computation is done before loading two new
    // sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write Csub to device memory
  // Each thread writes one element
  if( realCol < k && realRow < m) {
    Csub[row*k+col] = -(2*(float)Cvalue-n);
  }
}

void tobinstr(unsigned int value, int bitsCount, char* output)
{
  int i;
  output[bitsCount] = '\0';
  for (i = bitsCount - 1; i >= 0; --i, value >>= 1)
  {
    output[i] = (value & 1) + '0';
  }
}
// Debug helper: print a packed (m x n) matrix of 32-bit words.
void print2i(unsigned int *a, int m, int n)
{
  for (int i =0; i< m; i++) {
    for ( int j=0; j< n; j++) {
      printf("%u ", a[i*n + j]);
    }
    printf("\n");
  }
}
// Debug helper: print an (m x n) float matrix.
void print2f(float *a, int m, int n)
{
  for (int i =0; i< m; i++) {
    for ( int j=0; j< n; j++) {
      printf("%.2f ", a[i*n + j]);
    }
    printf("\n");
  }
}
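
// Host-side pipeline, as implemented below:
//   1. pack the sign bits of each row of in0 into 32-bit words
//      (concatenate_rows_kernel);
//   2. transpose in1 with cublasSgeam, then pack the sign bits of each of
//      its columns (concatenate_cols_T_kernel) -- faster than packing the
//      untransposed columns directly, since the reads stay contiguous;
//   3. run the tiled xnor_gemm over the two packed operands.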

void XNORGemmKernelDevice(OpKernelContext *ctx,
                          const float* in0, const float* in1,
                          const int m, const int n, const int k,
                          float* out)
{
  const GPUDevice& d = ctx->eigen_device<GPUDevice>();
  unsigned int* binary_in0 = reinterpret_cast<unsigned int*>(d.allocate(m * (n+32-1)/32 * sizeof(unsigned int)));
  unsigned int* binary_in1 = reinterpret_cast<unsigned int*>(d.allocate((n+32-1)/32 * k * sizeof(unsigned int)));
  float *in1T = reinterpret_cast<float*>(d.allocate(n * k * sizeof(float)));

  // smaller stride, larger blocksize helps
  int stride = 4;
  // 64 blows up? the benchmark program seems to run fine
  int blockSize = 32;
  dim3 bDim(blockSize,blockSize);
  dim3 gDim( n/sizeof(unsigned int) / blockSize + 1, m/ stride / blockSize + 1);
  concatenate_rows_kernel<<<gDim, bDim, 0, d.stream()>>>(in0, binary_in0, m, n, stride);
  gpuErrchk( cudaPeekAtLastError() );
  //gpuErrchk( cudaDeviceSynchronize() );
  //print2f(result2, m, n);

  // this is slower than the transpose + concatenate cols_T due to memory access latency
  //dim3 bDim2(blockSize,blockSize);
  //dim3 gDim2(k / stride / blockSize + 1, n /sizeof(unsigned int)/ blockSize +1 );
  //concatenate_cols_kernel<<<gDim2, bDim2, 0, d.stream()>>>(in1, binary_in1, n, k, stride);

  //auto stream = ctx->op_device_context()->stream();
  //stream->ThenBlasSgeam( CUBLAS_OP_T, CUBLAS_OP_N, n, k, &alpha, in1, k, &beta, in1, n, in1T, n );
  float const alpha(1.0);
  float const beta(0.0);
  // this handle slows down the execution
  cublasHandle_t handle;
  cublasCreate(&handle);
  cublasSetStream(handle, d.stream());
  cublasSgeam( handle, CUBLAS_OP_T, CUBLAS_OP_N, n, k, &alpha, in1, k, &beta, in1, n, in1T, n );
  dim3 bDim2(blockSize,blockSize);
  dim3 gDim2(n /sizeof(unsigned int)/ blockSize +1 , k / stride / blockSize + 1);
  concatenate_cols_T_kernel<<<gDim2, bDim2, 0, d.stream()>>>(in1T, binary_in1, k, n, stride);
  gpuErrchk( cudaPeekAtLastError() );
  //gpuErrchk( cudaDeviceSynchronize() );

  dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
  dim3 gridDim(k / BLOCK_SIZE + 1, m / BLOCK_SIZE + 1);
  xnor_gemm<<<gridDim, blockDim, 0, d.stream()>>>(binary_in0, binary_in1, out, m, n, k);
  d.deallocate(binary_in0);
  d.deallocate(binary_in1);
  d.deallocate(in1T);
  cublasDestroy(handle);
}

}
#endif
--------------------------------------------------------------------------------
/matrix_benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhengwy888/binary_ops/0b3afb231c89c632786847ee995e13288740e72a/matrix_benchmark.png