├── .gitignore
├── 13_hardware_acceleration_architecture_overview.ipynb
├── 14_hardware_acceleration_architecture_overview.ipynb
├── CMakeLists.txt
├── LICENSE
├── Makefile
├── README.md
├── python
    └── needle
    │   ├── __init__.py
    │   ├── autograd.py
    │   ├── backend_ndarray
    │       ├── __init__.py
    │       ├── ndarray.py
    │       └── ndarray_backend_numpy.py
    │   ├── backend_numpy.py
    │   ├── backend_selection.py
    │   ├── data
    │       ├── __init__.py
    │       ├── data_basic.py
    │       ├── data_transforms.py
    │       └── datasets
    │       │   ├── __init__.py
    │       │   ├── mnist_dataset.py
    │       │   └── ndarray_dataset.py
    │   ├── init
    │       ├── __init__.py
    │       ├── init_basic.py
    │       └── init_initializers.py
    │   ├── nn
    │       ├── __init__.py
    │       └── nn_basic.py
    │   ├── ops
    │       ├── __init__.py
    │       ├── ops_logarithmic.py
    │       ├── ops_mathematic.py
    │       └── ops_tuple.py
    │   └── optim.py
└── src
    ├── ndarray_backend_cpu.cc
    └── ndarray_backend_cuda.cu


/.gitignore:
--------------------------------------------------------------------------------
  1 | .idea/
  2 | data/
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | *~
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/13_hardware_acceleration_architecture_overview.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |   "nbformat": 4,
   3 |   "nbformat_minor": 0,
   4 |   "metadata": {
   5 |     "colab": {
   6 |       "provenance": [],
   7 |       "authorship_tag": "ABX9TyNWBRQkcr+1xpaWxfYNIxpF",
   8 |       "include_colab_link": true
   9 |     },
  10 |     "kernelspec": {
  11 |       "name": "python3",
  12 |       "display_name": "Python 3"
  13 |     },
  14 |     "language_info": {
  15 |       "name": "python"
  16 |     },
  17 |     "accelerator": "GPU"
  18 |   },
  19 |   "cells": [
  20 |     {
  21 |       "cell_type": "markdown",
  22 |       "metadata": {
  23 |         "id": "view-in-github",
  24 |         "colab_type": "text"
  25 |       },
  26 |       "source": [
  27 |         "<a href=\"https://colab.research.google.com/github/dlsyscourse/lecture13/blob/main/13_hardware_acceleration_architecture_overview.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
  28 |       ]
  29 |     },
  30 |     {
  31 |       "cell_type": "markdown",
  32 |       "metadata": {
  33 |         "id": "Mpn1ti5Urdsv"
  34 |       },
  35 |       "source": [
  36 |         "# Lecture 13: Hardware Acceleration Implementation\n",
  37 |         "\n",
  38 |         "In this lecture, we will to walk through backend scafoldings to get us hardware accelerations for needle.\n",
  39 |         "\n",
  40 |         "\n"
  41 |       ]
  42 |     },
  43 |     {
  44 |       "cell_type": "markdown",
  45 |       "metadata": {
  46 |         "id": "MkXPIjVd90z7"
  47 |       },
  48 |       "source": [
  49 |         "## Select a GPU runtime type\n",
  50 |         "In this lecture, we are going to make use of c++ and CUDA to build accelerated linear algebra libraries. In order to do so, please make sure you select a runtime type with GPU and rerun the cells if needed:\n",
  51 |         "- Click on the \"Runtime\" tab\n",
  52 |         "- Click \"Change runtime type\"\n",
  53 |         "- Select GPU\n",
  54 |         "\n",
  55 |         "After you started the right runtime, you can run the following command to check if there is a GPU available."
  56 |       ]
  57 |     },
  58 |     {
  59 |       "cell_type": "code",
  60 |       "metadata": {
  61 |         "colab": {
  62 |           "base_uri": "https://localhost:8080/"
  63 |         },
  64 |         "id": "5VM6IcuZ-kv6",
  65 |         "outputId": "58553884-f279-40b0-8889-b9b42ad2c7fd"
  66 |       },
  67 |       "source": [
  68 |         "!nvidia-smi"
  69 |       ],
  70 |       "execution_count": null,
  71 |       "outputs": [
  72 |         {
  73 |           "output_type": "stream",
  74 |           "name": "stdout",
  75 |           "text": [
  76 |             "Tue Oct  8 01:26:45 2024       \n",
  77 |             "+---------------------------------------------------------------------------------------+\n",
  78 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
  79 |             "|-----------------------------------------+----------------------+----------------------+\n",
  80 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
  81 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
  82 |             "|                                         |                      |               MIG M. |\n",
  83 |             "|=========================================+======================+======================|\n",
  84 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
  85 |             "| N/A   53C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |\n",
  86 |             "|                                         |                      |                  N/A |\n",
  87 |             "+-----------------------------------------+----------------------+----------------------+\n",
  88 |             "                                                                                         \n",
  89 |             "+---------------------------------------------------------------------------------------+\n",
  90 |             "| Processes:                                                                            |\n",
  91 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
  92 |             "|        ID   ID                                                             Usage      |\n",
  93 |             "|=======================================================================================|\n",
  94 |             "|  No running processes found                                                           |\n",
  95 |             "+---------------------------------------------------------------------------------------+\n"
  96 |           ]
  97 |         }
  98 |       ]
  99 |     },
 100 |     {
 101 |       "cell_type": "markdown",
 102 |       "metadata": {
 103 |         "id": "qXysoqn-vZuF"
 104 |       },
 105 |       "source": [
 106 |         "## Prepare the codebase\n",
 107 |         "\n",
 108 |         "To get started, we can clone the related repo from the github."
 109 |       ]
 110 |     },
 111 |     {
 112 |       "cell_type": "code",
 113 |       "metadata": {
 114 |         "id": "JjEIRTyr8ajf",
 115 |         "colab": {
 116 |           "base_uri": "https://localhost:8080/"
 117 |         },
 118 |         "outputId": "c13f17d1-a504-4560-e161-90566afc3963"
 119 |       },
 120 |       "source": [
 121 |         "# Code to set up the assignment\n",
 122 |         "from google.colab import drive\n",
 123 |         "drive.mount('/content/drive')\n",
 124 |         "%cd /content/drive/MyDrive/\n",
 125 |         "!mkdir -p 10714f24\n",
 126 |         "%cd /content/drive/MyDrive/10714f24\n",
 127 |         "# comment out the following line if you run it for the second time\n",
 128 |         "# as you already have a local copy of lecture13\n",
 129 |         "#!git clone https://github.com/dlsyscourse/lecture13\n",
 130 |         "!rm -rf /content/needle\n",
 131 |         "!ln -s /content/drive/MyDrive/10714f24/lecture13 /content/needle"
 132 |       ],
 133 |       "execution_count": null,
 134 |       "outputs": [
 135 |         {
 136 |           "output_type": "stream",
 137 |           "name": "stdout",
 138 |           "text": [
 139 |             "Mounted at /content/drive\n",
 140 |             "/content/drive/MyDrive\n",
 141 |             "/content/drive/MyDrive/10714f24\n",
 142 |             "Cloning into 'lecture14'...\n",
 143 |             "remote: Enumerating objects: 99, done.\u001b[K\n",
 144 |             "remote: Counting objects: 100% (99/99), done.\u001b[K\n",
 145 |             "remote: Compressing objects: 100% (71/71), done.\u001b[K\n",
 146 |             "remote: Total 99 (delta 31), reused 85 (delta 23), pack-reused 0 (from 0)\u001b[K\n",
 147 |             "Receiving objects: 100% (99/99), 53.49 KiB | 1.14 MiB/s, done.\n",
 148 |             "Resolving deltas: 100% (31/31), done.\n"
 149 |           ]
 150 |         }
 151 |       ]
 152 |     },
 153 |     {
 154 |       "cell_type": "code",
 155 |       "metadata": {
 156 |         "colab": {
 157 |           "base_uri": "https://localhost:8080/"
 158 |         },
 159 |         "id": "Xe3vClsD9jlq",
 160 |         "outputId": "0ff2a35c-4d81-45d7-98bf-45c22fc05279"
 161 |       },
 162 |       "source": [
 163 |         "!python3 -m pip install pybind11"
 164 |       ],
 165 |       "execution_count": null,
 166 |       "outputs": [
 167 |         {
 168 |           "output_type": "stream",
 169 |           "name": "stdout",
 170 |           "text": [
 171 |             "Collecting pybind11\n",
 172 |             "  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n",
 173 |             "Downloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n",
 174 |             "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/243.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[91m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/243.3 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
 175 |             "\u001b[?25hInstalling collected packages: pybind11\n",
 176 |             "Successfully installed pybind11-2.13.6\n"
 177 |           ]
 178 |         }
 179 |       ]
 180 |     },
 181 |     {
 182 |       "cell_type": "markdown",
 183 |       "metadata": {
 184 |         "id": "O_RrW38i_JNp"
 185 |       },
 186 |       "source": [
 187 |         "### Build the needle cuda library\n",
 188 |         "\n",
 189 |         "We leverage pybind to build a c++/cuda library for acceleration. You can type make to build the corresponding library."
 190 |       ]
 191 |     },
 192 |     {
 193 |       "cell_type": "code",
 194 |       "metadata": {
 195 |         "colab": {
 196 |           "base_uri": "https://localhost:8080/"
 197 |         },
 198 |         "id": "o0EdAcB19saK",
 199 |         "outputId": "28c177ad-c7b2-46fb-eb9f-8f65878597db"
 200 |       },
 201 |       "source": [
 202 |         "%cd /content/needle\n",
 203 |         "!make clean\n",
 204 |         "!make"
 205 |       ],
 206 |       "execution_count": null,
 207 |       "outputs": [
 208 |         {
 209 |           "output_type": "stream",
 210 |           "name": "stdout",
 211 |           "text": [
 212 |             "/content/drive/MyDrive/10714f24/lecture14\n",
 213 |             "rm -rf build python/needle/backend_ndarray/ndarray_backend*.so\n",
 214 |             "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n",
 215 |             "  Compatibility with CMake < 3.5 will be removed from a future version of\n",
 216 |             "  CMake.\n",
 217 |             "\n",
 218 |             "  Update the VERSION argument <min> value or use a ...<max> suffix to tell\n",
 219 |             "  CMake that the project does not need compatibility with older versions.\n",
 220 |             "\n",
 221 |             "\u001b[0m\n",
 222 |             "-- The C compiler identification is GNU 11.4.0\n",
 223 |             "-- The CXX compiler identification is GNU 11.4.0\n",
 224 |             "-- Detecting C compiler ABI info\n",
 225 |             "-- Detecting C compiler ABI info - done\n",
 226 |             "-- Check for working C compiler: /usr/bin/cc - skipped\n",
 227 |             "-- Detecting C compile features\n",
 228 |             "-- Detecting C compile features - done\n",
 229 |             "-- Detecting CXX compiler ABI info\n",
 230 |             "-- Detecting CXX compiler ABI info - done\n",
 231 |             "-- Check for working CXX compiler: /usr/bin/c++ - skipped\n",
 232 |             "-- Detecting CXX compile features\n",
 233 |             "-- Detecting CXX compile features - done\n",
 234 |             "-- Found Python: /usr/local/bin/python (found version \"3.10.12\") found components: Development Interpreter Development.Module Development.Embed\n",
 235 |             "-- Performing Test HAS_FLTO\n",
 236 |             "-- Performing Test HAS_FLTO - Success\n",
 237 |             "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n",
 238 |             "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n",
 239 |             "  Policy CMP0146 is not set: The FindCUDA module is removed.  Run \"cmake\n",
 240 |             "  --help-policy CMP0146\" for policy details.  Use the cmake_policy command to\n",
 241 |             "  set the policy and suppress this warning.\n",
 242 |             "\n",
 243 |             "This warning is for project developers.  Use -Wno-dev to suppress it.\n",
 244 |             "\u001b[0m\n",
 245 |             "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n",
 246 |             "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success\n",
 247 |             "-- Found Threads: TRUE\n",
 248 |             "-- Found CUDA: /usr/local/cuda (found version \"12.2\")\n",
 249 |             "-- Found cuda, building cuda backend\n",
 250 |             "Tue Oct  8 01:30:32 2024       \n",
 251 |             "+---------------------------------------------------------------------------------------+\n",
 252 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
 253 |             "|-----------------------------------------+----------------------+----------------------+\n",
 254 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
 255 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
 256 |             "|                                         |                      |               MIG M. |\n",
 257 |             "|=========================================+======================+======================|\n",
 258 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
 259 |             "| N/A   45C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |\n",
 260 |             "|                                         |                      |                  N/A |\n",
 261 |             "+-----------------------------------------+----------------------+----------------------+\n",
 262 |             "                                                                                         \n",
 263 |             "+---------------------------------------------------------------------------------------+\n",
 264 |             "| Processes:                                                                            |\n",
 265 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
 266 |             "|        ID   ID                                                             Usage      |\n",
 267 |             "|=======================================================================================|\n",
 268 |             "|  No running processes found                                                           |\n",
 269 |             "+---------------------------------------------------------------------------------------+\n",
 270 |             "-- Autodetected CUDA architecture(s):  7.5\n",
 271 |             "-- Configuring done (6.0s)\n",
 272 |             "-- Generating done (0.3s)\n",
 273 |             "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n",
 274 |             "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 275 |             "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 276 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 277 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 278 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 279 |             "[-25%] \u001b[32mBuilding CXX object CMakeFiles/ndarray_backend_cpu.dir/src/ndarray_backend_cpu.cc.o\u001b[0m\n",
 280 |             "[  0%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cpu.cpython-310-x86_64-linux-gnu.so\u001b[0m\n",
 281 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 282 |             "[  0%] Built target ndarray_backend_cpu\n",
 283 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 284 |             "[ 25%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/ndarray_backend_cuda.dir/src/ndarray_backend_cuda_generated_ndarray_backend_cuda.cu.o\u001b[0m\n",
 285 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 286 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 287 |             "[ 50%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cuda.cpython-310-x86_64-linux-gnu.so\u001b[0m\n",
 288 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 289 |             "[ 50%] Built target ndarray_backend_cuda\n",
 290 |             "make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
 291 |             "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n"
 292 |           ]
 293 |         }
 294 |       ]
 295 |     },
 296 |     {
 297 |       "cell_type": "markdown",
 298 |       "metadata": {
 299 |         "id": "DFxG3p3S1sBq"
 300 |       },
 301 |       "source": [
 302 |         "We can then run the following command to make the path to the package available in colab's environment as well as the PYTHONPATH."
 303 |       ]
 304 |     },
 305 |     {
 306 |       "cell_type": "code",
 307 |       "metadata": {
 308 |         "id": "bix8OXLuCOKt",
 309 |         "colab": {
 310 |           "base_uri": "https://localhost:8080/"
 311 |         },
 312 |         "outputId": "a250842b-c671-4ba1-e539-efa93f7fc35e"
 313 |       },
 314 |       "source": [
 315 |         "%set_env PYTHONPATH /content/needle/python:/env/python\n",
 316 |         "import sys\n",
 317 |         "sys.path.append(\"/content/needle/python\")"
 318 |       ],
 319 |       "execution_count": null,
 320 |       "outputs": [
 321 |         {
 322 |           "output_type": "stream",
 323 |           "name": "stdout",
 324 |           "text": [
 325 |             "env: PYTHONPATH=/content/needle/python:/env/python\n"
 326 |           ]
 327 |         }
 328 |       ]
 329 |     },
 330 |     {
 331 |       "cell_type": "markdown",
 332 |       "metadata": {
 333 |         "id": "BBIuE2jc1DaU"
 334 |       },
 335 |       "source": [
 336 |         "## Codebase walkthrough\n",
 337 |         "\n",
 338 |         "\n",
 339 |         "Now click the files panel on the left side. You should be able to see these files\n",
 340 |         "\n",
 341 |         "Python:\n",
 342 |         "- needle/backend_ndarray/ndarray.py\n",
 343 |         "- needle/backend_ndarray/ndarray_backend_numpy.py\n",
 344 |         "\n",
 345 |         "C++/CUDA\n",
 346 |         "- src/ndarray_backend_cpu.cc\n",
 347 |         "- src/ndarray_backend_cuda.cu\n",
 348 |         "\n",
 349 |         "The main goal of this lecture is to create an accelerated ndarray library.\n",
 350 |         "As a result, we do not need to deal with needle.Tensor for now and will focus on backend_ndarray's implementation.\n",
 351 |         "\n",
 352 |         "After we build up this array library, we can then use it to power backend array computations in needle.\n"
 353 |       ]
 354 |     },
 355 |     {
 356 |       "cell_type": "markdown",
 357 |       "metadata": {
 358 |         "id": "x1Z8wSsI6PrU"
 359 |       },
 360 |       "source": [
 361 |         "## Creating a CUDA NDArray\n",
 362 |         "\n",
 363 |         "\n",
 364 |         "\n",
 365 |         "\n"
 366 |       ]
 367 |     },
 368 |     {
 369 |       "cell_type": "code",
 370 |       "metadata": {
 371 |         "id": "N2bm_WB9uF4V",
 372 |         "colab": {
 373 |           "base_uri": "https://localhost:8080/"
 374 |         },
 375 |         "outputId": "874e2b52-6487-434b-fc94-b36c3fc57733"
 376 |       },
 377 |       "source": [
 378 |         "from needle import backend_ndarray as nd"
 379 |       ],
 380 |       "execution_count": null,
 381 |       "outputs": [
 382 |         {
 383 |           "output_type": "stream",
 384 |           "name": "stdout",
 385 |           "text": [
 386 |             "Using needle backend\n"
 387 |           ]
 388 |         }
 389 |       ]
 390 |     },
 391 |     {
 392 |       "cell_type": "markdown",
 393 |       "metadata": {
 394 |         "id": "wZGnTUsKF1x1"
 395 |       },
 396 |       "source": [
 397 |         "We can create a CUDA tensor from the data by specifying a device keyword."
 398 |       ]
 399 |     },
 400 |     {
 401 |       "cell_type": "code",
 402 |       "metadata": {
 403 |         "id": "1h5iAYFfBRED"
 404 |       },
 405 |       "source": [
 406 |         "x = nd.NDArray([1, 2, 3], device=nd.cuda())"
 407 |       ],
 408 |       "execution_count": null,
 409 |       "outputs": []
 410 |     },
 411 |     {
 412 |       "cell_type": "code",
 413 |       "metadata": {
 414 |         "id": "CulMPqJkhkpE"
 415 |       },
 416 |       "source": [
 417 |         "y = x + 1"
 418 |       ],
 419 |       "execution_count": null,
 420 |       "outputs": []
 421 |     },
 422 |     {
 423 |       "cell_type": "code",
 424 |       "metadata": {
 425 |         "id": "t4UuEs9KAkDR",
 426 |         "colab": {
 427 |           "base_uri": "https://localhost:8080/"
 428 |         },
 429 |         "outputId": "0b671189-f629-4829-bcfa-56aaeefad557"
 430 |       },
 431 |       "source": [
 432 |         "x.numpy()"
 433 |       ],
 434 |       "execution_count": null,
 435 |       "outputs": [
 436 |         {
 437 |           "output_type": "execute_result",
 438 |           "data": {
 439 |             "text/plain": [
 440 |               "array([1., 2., 3.], dtype=float32)"
 441 |             ]
 442 |           },
 443 |           "metadata": {},
 444 |           "execution_count": 18
 445 |         }
 446 |       ]
 447 |     },
 448 |     {
 449 |       "cell_type": "code",
 450 |       "metadata": {
 451 |         "id": "WBMvL6QEBtG7",
 452 |         "colab": {
 453 |           "base_uri": "https://localhost:8080/"
 454 |         },
 455 |         "outputId": "818f1fdc-69b0-4a95-f6da-36f3ba67a1ab"
 456 |       },
 457 |       "source": [
 458 |         "x.device"
 459 |       ],
 460 |       "execution_count": null,
 461 |       "outputs": [
 462 |         {
 463 |           "output_type": "execute_result",
 464 |           "data": {
 465 |             "text/plain": [
 466 |               "cuda()"
 467 |             ]
 468 |           },
 469 |           "metadata": {},
 470 |           "execution_count": 19
 471 |         }
 472 |       ]
 473 |     },
 474 |     {
 475 |       "cell_type": "code",
 476 |       "metadata": {
 477 |         "id": "qJSv7D8NGfAr"
 478 |       },
 479 |       "source": [
 480 |         "y = x + 1"
 481 |       ],
 482 |       "execution_count": null,
 483 |       "outputs": []
 484 |     },
 485 |     {
 486 |       "cell_type": "code",
 487 |       "metadata": {
 488 |         "id": "yZ7hmyBVGhGd",
 489 |         "colab": {
 490 |           "base_uri": "https://localhost:8080/"
 491 |         },
 492 |         "outputId": "60d5e07f-13f7-417e-9557-8abc87656d13"
 493 |       },
 494 |       "source": [
 495 |         "y.device"
 496 |       ],
 497 |       "execution_count": null,
 498 |       "outputs": [
 499 |         {
 500 |           "output_type": "execute_result",
 501 |           "data": {
 502 |             "text/plain": [
 503 |               "cuda()"
 504 |             ]
 505 |           },
 506 |           "metadata": {},
 507 |           "execution_count": 21
 508 |         }
 509 |       ]
 510 |     },
 511 |     {
 512 |       "cell_type": "code",
 513 |       "metadata": {
 514 |         "id": "NQVtUgK-f7_y",
 515 |         "colab": {
 516 |           "base_uri": "https://localhost:8080/"
 517 |         },
 518 |         "outputId": "ff69ffd8-720c-492d-a818-7d1f57dcd63f"
 519 |       },
 520 |       "source": [
 521 |         "y.numpy()"
 522 |       ],
 523 |       "execution_count": null,
 524 |       "outputs": [
 525 |         {
 526 |           "output_type": "execute_result",
 527 |           "data": {
 528 |             "text/plain": [
 529 |               "array([2., 3., 4.], dtype=float32)"
 530 |             ]
 531 |           },
 532 |           "metadata": {},
 533 |           "execution_count": 22
 534 |         }
 535 |       ]
 536 |     },
 537 |     {
 538 |       "cell_type": "markdown",
 539 |       "metadata": {
 540 |         "id": "SPjNJfJsf_T9"
 541 |       },
 542 |       "source": [
 543 |         "### Key Data Structures\n",
 544 |         "\n",
 545 |         "Key data structures in backend_ndarray\n",
 546 |         "\n",
 547 |         "- NDArray: the container to hold device specific ndarray\n",
 548 |         "- BackendDevice: backend device\n",
 549 |         "    - mod holds the module implementation that implements all functions\n",
 550 |         "    - checkout ndarray_backend_numpy.py for a python-side reference.\n",
 551 |         "\n"
 552 |       ]
 553 |     },
 554 |     {
 555 |       "cell_type": "markdown",
 556 |       "metadata": {
 557 |         "id": "HxKF9dcFhTy3"
 558 |       },
 559 |       "source": [
 560 |         "## Trace GPU execution\n",
 561 |         "\n",
 562 |         "Now, let us take a look at what happens when we execute the following code\n"
 563 |       ]
 564 |     },
 565 |     {
 566 |       "cell_type": "code",
 567 |       "metadata": {
 568 |         "id": "PLLzZzuthhBH"
 569 |       },
 570 |       "source": [
 571 |         "x = nd.NDArray([1, 2, 3], device=nd.cuda())\n",
 572 |         "y = x + 1"
 573 |       ],
 574 |       "execution_count": null,
 575 |       "outputs": []
 576 |     },
 577 |     {
 578 |       "cell_type": "code",
 579 |       "metadata": {
 580 |         "colab": {
 581 |           "base_uri": "https://localhost:8080/"
 582 |         },
 583 |         "id": "V9NV0JFESkIe",
 584 |         "outputId": "011bc9d0-bcd5-4830-8d71-a2c373eb2202"
 585 |       },
 586 |       "source": [
 587 |         "x.device.from_numpy"
 588 |       ],
 589 |       "execution_count": null,
 590 |       "outputs": [
 591 |         {
 592 |           "output_type": "execute_result",
 593 |           "data": {
 594 |             "text/plain": [
 595 |               "<function needle.backend_ndarray.ndarray_backend_cuda.PyCapsule.from_numpy>"
 596 |             ]
 597 |           },
 598 |           "metadata": {},
 599 |           "execution_count": 24
 600 |         }
 601 |       ]
 602 |     },
 603 |     {
 604 |       "cell_type": "code",
 605 |       "metadata": {
 606 |         "id": "H6vwR3yBRI9F"
 607 |       },
 608 |       "source": [
 609 |         "x = nd.NDArray([1, 2, 3])"
 610 |       ],
 611 |       "execution_count": null,
 612 |       "outputs": []
 613 |     },
 614 |     {
 615 |       "cell_type": "code",
 616 |       "metadata": {
 617 |         "colab": {
 618 |           "base_uri": "https://localhost:8080/",
 619 |           "height": 121
 620 |         },
 621 |         "id": "0PxoH_UzRMd3",
 622 |         "outputId": "54fd43b1-e18f-4f6d-e92a-4b04744fde90"
 623 |       },
 624 |       "source": [
 625 |         "x.device.from_numpy"
 626 |       ],
 627 |       "execution_count": null,
 628 |       "outputs": [
 629 |         {
 630 |           "output_type": "execute_result",
 631 |           "data": {
 632 |             "text/plain": [
 633 |               "<function needle.backend_ndarray.ndarray_backend_numpy.from_numpy(a, out)>"
 634 |             ],
 635 |             "text/html": [
 636 |               "<div style=\"max-width:800px; border: 1px solid var(--colab-border-color);\"><style>\n",
 637 |               "      pre.function-repr-contents {\n",
 638 |               "        overflow-x: auto;\n",
 639 |               "        padding: 8px 12px;\n",
 640 |               "        max-height: 500px;\n",
 641 |               "      }\n",
 642 |               "\n",
 643 |               "      pre.function-repr-contents.function-repr-contents-collapsed {\n",
 644 |               "        cursor: pointer;\n",
 645 |               "        max-height: 100px;\n",
 646 |               "      }\n",
 647 |               "    </style>\n",
 648 |               "    <pre style=\"white-space: initial; background:\n",
 649 |               "         var(--colab-secondary-surface-color); padding: 8px 12px;\n",
 650 |               "         border-bottom: 1px solid var(--colab-border-color);\"><b>needle.backend_ndarray.ndarray_backend_numpy.from_numpy</b><br/>def from_numpy(a, out)</pre><pre class=\"function-repr-contents function-repr-contents-collapsed\" style=\"\"><a class=\"filepath\" style=\"display:none\" href=\"#\">/content/needle/python/needle/backend_ndarray/ndarray_backend_numpy.py</a>&lt;no docstring&gt;</pre>\n",
 651 |               "      <script>\n",
 652 |               "      if (google.colab.kernel.accessAllowed && google.colab.files && google.colab.files.view) {\n",
 653 |               "        for (const element of document.querySelectorAll('.filepath')) {\n",
 654 |               "          element.style.display = 'block'\n",
 655 |               "          element.onclick = (event) => {\n",
 656 |               "            event.preventDefault();\n",
 657 |               "            event.stopPropagation();\n",
 658 |               "            google.colab.files.view(element.textContent, 24);\n",
 659 |               "          };\n",
 660 |               "        }\n",
 661 |               "      }\n",
 662 |               "      for (const element of document.querySelectorAll('.function-repr-contents')) {\n",
 663 |               "        element.onclick = (event) => {\n",
 664 |               "          event.preventDefault();\n",
 665 |               "          event.stopPropagation();\n",
 666 |               "          element.classList.toggle('function-repr-contents-collapsed');\n",
 667 |               "        };\n",
 668 |               "      }\n",
 669 |               "      </script>\n",
 670 |               "      </div>"
 671 |             ]
 672 |           },
 673 |           "metadata": {},
 674 |           "execution_count": 26
 675 |         }
 676 |       ]
 677 |     },
 678 |     {
 679 |       "cell_type": "markdown",
 680 |       "metadata": {
 681 |         "id": "xU5PFJJ-iR7J"
 682 |       },
 683 |       "source": [
 684 |         "Have the following trace:\n",
 685 |         "\n",
 686 |         "backend_ndarray/ndarray.py\n",
 687 |         "- `NDArray.__add__`\n",
 688 |         "- `NDArray.ewise_or_scalar`\n",
 689 |         "- `ndarray_backend_cpu.cc:ScalarAdd`"
 690 |       ]
 691 |     },
 692 |     {
 693 |       "cell_type": "code",
 694 |       "metadata": {
 695 |         "colab": {
 696 |           "base_uri": "https://localhost:8080/"
 697 |         },
 698 |         "id": "TxAKyM6yjr_R",
 699 |         "outputId": "ba1bfe21-77be-4cd2-f625-f2a3afbcfb57"
 700 |       },
 701 |       "source": [
 702 |         "y.numpy()"
 703 |       ],
 704 |       "execution_count": null,
 705 |       "outputs": [
 706 |         {
 707 |           "output_type": "execute_result",
 708 |           "data": {
 709 |             "text/plain": [
 710 |               "array([2., 3., 4.], dtype=float32)"
 711 |             ]
 712 |           },
 713 |           "metadata": {},
 714 |           "execution_count": 27
 715 |         }
 716 |       ]
 717 |     },
 718 |     {
 719 |       "cell_type": "markdown",
 720 |       "metadata": {
 721 |         "id": "F4vqb_a4j2O8"
 722 |       },
 723 |       "source": [
 724 |         "Have the following trace:\n",
 725 |         "\n",
 726 |         "- `NDArray.numpy`\n",
 727 |         "- `ndarray_backend_cpu.cc:to_numpy`"
 728 |       ]
 729 |     },
 730 |     {
 731 |       "cell_type": "markdown",
 732 |       "metadata": {
 733 |         "id": "tMiFJmJVlD6j"
 734 |       },
 735 |       "source": [
 736 |         "## Guidelines for Reading C++/CUDA related Files\n",
 737 |         "\n",
 738 |         "Read\n",
 739 |         "- src/ndarray_backend_cpu.cc\n",
 740 |         "- src/ndarray_backend_cuda.cu\n",
 741 |         "\n",
 742 |         "\n",
 743 |         "Optional\n",
 744 |         "- CMakeLists.txt: this is used to setup the build and likely you do not need to tweak it.\n",
 745 |         "\n",
 746 |         "\n",
 747 |         "\n",
 748 |         "\n",
 749 |         "\n"
 750 |       ]
 751 |     },
 752 |     {
 753 |       "cell_type": "markdown",
 754 |       "metadata": {
 755 |         "id": "uEpPbwQKkSkZ"
 756 |       },
 757 |       "source": [
 758 |         "## NDArray Data Structure\n",
 759 |         "\n",
 760 |         "Open up `python/needle/backend_ndarray/ndarray.py`.\n",
 761 |         "\n",
 762 |         "An NDArray contains the following fields:\n",
 763 |         "- handle: The backend handle that build a flat array which stores the data.\n",
 764 |         "- shape: The shape of the NDArray\n",
 765 |         "- strides: The strides that shows how do we access multi-dimensional elements\n",
 766 |         "- offset: The offset of the first element.\n",
 767 |         "- device: The backend device that backs the computation\n",
 768 |         "\n",
 769 |         "\n",
 770 |         "\n",
 771 |         "\n"
 772 |       ]
 773 |     },
 774 |     {
 775 |       "cell_type": "markdown",
 776 |       "metadata": {
 777 |         "id": "875DgxFFACqb"
 778 |       },
 779 |       "source": [
 780 |         "## Transformation as Strided Computation\n",
 781 |         "\n",
 782 |         "We can leverage the strides and offset to perform transform/slicing with zero copy.\n",
 783 |         "\n",
 784 |         "- Broadcast: insert strides that equals 0\n",
 785 |         "- Tranpose: swap the strides\n",
 786 |         "- Slice: change the offset and shape\n",
 787 |         "\n",
 788 |         "For most of the computations, however, we will call `array.compact()` first to get a contiguous and aligned memory before running the computation."
 789 |       ]
 790 |     },
 791 |     {
 792 |       "cell_type": "code",
 793 |       "metadata": {
 794 |         "id": "I49fcoiyWYLt"
 795 |       },
 796 |       "source": [
 797 |         "import numpy as np\n"
 798 |       ],
 799 |       "execution_count": null,
 800 |       "outputs": []
 801 |     },
 802 |     {
 803 |       "cell_type": "code",
 804 |       "metadata": {
 805 |         "id": "qGbICVsb6y98"
 806 |       },
 807 |       "source": [
 808 |         "x = nd.NDArray([1, 2, 3, 4], device=nd.cpu_numpy())"
 809 |       ],
 810 |       "execution_count": null,
 811 |       "outputs": []
 812 |     },
 813 |     {
 814 |       "cell_type": "code",
 815 |       "metadata": {
 816 |         "colab": {
 817 |           "base_uri": "https://localhost:8080/"
 818 |         },
 819 |         "id": "iofcuXso64yk",
 820 |         "outputId": "66e792a6-918e-4a01-b237-c8ee97c694a8"
 821 |       },
 822 |       "source": [
 823 |         "x.numpy()"
 824 |       ],
 825 |       "execution_count": null,
 826 |       "outputs": [
 827 |         {
 828 |           "output_type": "execute_result",
 829 |           "data": {
 830 |             "text/plain": [
 831 |               "array([1., 2., 3., 4.], dtype=float32)"
 832 |             ]
 833 |           },
 834 |           "metadata": {},
 835 |           "execution_count": 34
 836 |         }
 837 |       ]
 838 |     },
 839 |     {
 840 |       "cell_type": "markdown",
 841 |       "metadata": {
 842 |         "id": "oceIop5P7RHW"
 843 |       },
 844 |       "source": [
 845 |         "We can use strides and shape manipulation to create different views of the same array."
 846 |       ]
 847 |     },
 848 |     {
 849 |       "cell_type": "code",
 850 |       "metadata": {
 851 |         "id": "C7zCed7e7B4u"
 852 |       },
 853 |       "source": [
 854 |         "y = nd.NDArray.make(shape=(2, 2), strides=(2, 1), device=x.device, handle=x._handle, offset=0)"
 855 |       ],
 856 |       "execution_count": null,
 857 |       "outputs": []
 858 |     },
 859 |     {
 860 |       "cell_type": "code",
 861 |       "metadata": {
 862 |         "colab": {
 863 |           "base_uri": "https://localhost:8080/"
 864 |         },
 865 |         "id": "oaEPCvR17OMf",
 866 |         "outputId": "eee40587-5da5-4e01-a539-69f7788c638b"
 867 |       },
 868 |       "source": [
 869 |         "y.numpy()"
 870 |       ],
 871 |       "execution_count": null,
 872 |       "outputs": [
 873 |         {
 874 |           "output_type": "execute_result",
 875 |           "data": {
 876 |             "text/plain": [
 877 |               "array([[1., 2.],\n",
 878 |               "       [3., 4.]], dtype=float32)"
 879 |             ]
 880 |           },
 881 |           "metadata": {},
 882 |           "execution_count": 36
 883 |         }
 884 |       ]
 885 |     },
 886 |     {
 887 |       "cell_type": "code",
 888 |       "metadata": {
 889 |         "id": "5rNS5MW67XyX"
 890 |       },
 891 |       "source": [
 892 |         "z = nd.NDArray.make(shape=(2, 1), strides=(2, 1), device=x.device, handle=x._handle, offset=1)"
 893 |       ],
 894 |       "execution_count": null,
 895 |       "outputs": []
 896 |     },
 897 |     {
 898 |       "cell_type": "code",
 899 |       "metadata": {
 900 |         "colab": {
 901 |           "base_uri": "https://localhost:8080/"
 902 |         },
 903 |         "id": "HzhpVtKB7b97",
 904 |         "outputId": "fe5e6ac3-3458-4140-a5f6-fae9547f6e99"
 905 |       },
 906 |       "source": [
 907 |         "z.numpy()"
 908 |       ],
 909 |       "execution_count": null,
 910 |       "outputs": [
 911 |         {
 912 |           "output_type": "execute_result",
 913 |           "data": {
 914 |             "text/plain": [
 915 |               "array([[2.],\n",
 916 |               "       [4.]], dtype=float32)"
 917 |             ]
 918 |           },
 919 |           "metadata": {},
 920 |           "execution_count": 38
 921 |         }
 922 |       ]
 923 |     },
 924 |     {
 925 |       "cell_type": "markdown",
 926 |       "metadata": {
 927 |         "id": "5ONkZbUuj6Dx"
 928 |       },
 929 |       "source": [
 930 |         "## CUDA Acceleration\n",
 931 |         "\n",
 932 |         "Now let us open `src/ndarray_cuda_backend.cu` and take a look at current implementation of GPU ops.\n"
 933 |       ]
 934 |     },
 935 |     {
 936 |       "cell_type": "markdown",
 937 |       "metadata": {
 938 |         "id": "Og8N3iuZiZ4g"
 939 |       },
 940 |       "source": [
 941 |         "## Steps for adding a new operator implementation\n",
 942 |         "- Add an implementation in `ndarray_backend_cuda.cu`, expose via pybind\n",
 943 |         "- Call into the operator in ndarray.py\n",
 944 |         "- Write up testcases"
 945 |       ]
 946 |     },
 947 |     {
 948 |       "cell_type": "code",
 949 |       "metadata": {
 950 |         "id": "xV1I7I2lkOJG",
 951 |         "colab": {
 952 |           "base_uri": "https://localhost:8080/"
 953 |         },
 954 |         "outputId": "5701699e-a11c-4ee3-e98b-a6211908f329"
 955 |       },
 956 |       "source": [
 957 |         "!make"
 958 |       ],
 959 |       "execution_count": null,
 960 |       "outputs": [
 961 |         {
 962 |           "output_type": "stream",
 963 |           "name": "stdout",
 964 |           "text": [
 965 |             "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n",
 966 |             "  Compatibility with CMake < 3.5 will be removed from a future version of\n",
 967 |             "  CMake.\n",
 968 |             "\n",
 969 |             "  Update the VERSION argument <min> value or use a ...<max> suffix to tell\n",
 970 |             "  CMake that the project does not need compatibility with older versions.\n",
 971 |             "\n",
 972 |             "\u001b[0m\n",
 973 |             "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n",
 974 |             "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n",
 975 |             "  Policy CMP0146 is not set: The FindCUDA module is removed.  Run \"cmake\n",
 976 |             "  --help-policy CMP0146\" for policy details.  Use the cmake_policy command to\n",
 977 |             "  set the policy and suppress this warning.\n",
 978 |             "\n",
 979 |             "This warning is for project developers.  Use -Wno-dev to suppress it.\n",
 980 |             "\u001b[0m\n",
 981 |             "-- Found cuda, building cuda backend\n",
 982 |             "Tue Oct  8 01:33:00 2024       \n",
 983 |             "+---------------------------------------------------------------------------------------+\n",
 984 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
 985 |             "|-----------------------------------------+----------------------+----------------------+\n",
 986 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
 987 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
 988 |             "|                                         |                      |               MIG M. |\n",
 989 |             "|=========================================+======================+======================|\n",
 990 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
 991 |             "| N/A   55C    P0              29W /  70W |    105MiB / 15360MiB |      0%      Default |\n",
 992 |             "|                                         |                      |                  N/A |\n",
 993 |             "+-----------------------------------------+----------------------+----------------------+\n",
 994 |             "                                                                                         \n",
 995 |             "+---------------------------------------------------------------------------------------+\n",
 996 |             "| Processes:                                                                            |\n",
 997 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
 998 |             "|        ID   ID                                                             Usage      |\n",
 999 |             "|=======================================================================================|\n",
1000 |             "+---------------------------------------------------------------------------------------+\n",
1001 |             "-- Autodetected CUDA architecture(s):  7.5\n",
1002 |             "-- Configuring done (0.4s)\n",
1003 |             "-- Generating done (0.4s)\n",
1004 |             "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n",
1005 |             "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1006 |             "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1007 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1008 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1009 |             "[  0%] Built target ndarray_backend_cpu\n",
1010 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1011 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1012 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1013 |             "[ 25%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cuda.cpython-310-x86_64-linux-gnu.so\u001b[0m\n",
1014 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1015 |             "[ 50%] Built target ndarray_backend_cuda\n",
1016 |             "make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1017 |             "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n"
1018 |           ]
1019 |         }
1020 |       ]
1021 |     },
1022 |     {
1023 |       "cell_type": "markdown",
1024 |       "source": [
1025 |         "If we directly run the code block, we will see an error, because ewise mul is not yet implemented"
1026 |       ],
1027 |       "metadata": {
1028 |         "id": "OpWxZKGb_4cJ"
1029 |       }
1030 |     },
1031 |     {
1032 |       "cell_type": "code",
1033 |       "metadata": {
1034 |         "id": "YU870vVVZkzg",
1035 |         "colab": {
1036 |           "base_uri": "https://localhost:8080/",
1037 |           "height": 311
1038 |         },
1039 |         "outputId": "99ae1dbc-7ebd-4229-e3de-50eabafc8d5f"
1040 |       },
1041 |       "source": [
1042 |         "x = nd.NDArray([1,2,3], device=nd.cuda())\n",
1043 |         "x * 2"
1044 |       ],
1045 |       "execution_count": null,
1046 |       "outputs": [
1047 |         {
1048 |           "output_type": "error",
1049 |           "ename": "AttributeError",
1050 |           "evalue": "module 'needle.backend_ndarray.ndarray_backend_cuda' has no attribute 'ewise_mul'",
1051 |           "traceback": [
1052 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1053 |             "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
1054 |             "\u001b[0;32m<ipython-input-40-dabe3239c8f9>\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNDArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
1055 |             "\u001b[0;32m/content/needle/python/needle/backend_ndarray/ndarray.py\u001b[0m in \u001b[0;36m__mul__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m    419\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__mul__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    420\u001b[0m         return self.ewise_or_scalar(\n\u001b[0;32m--> 421\u001b[0;31m             \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mewise_mul\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_mul\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    422\u001b[0m         )\n\u001b[1;32m    423\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
1056 |             "\u001b[0;32m/content/needle/python/needle/backend_ndarray/ndarray.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m     26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     27\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     30\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0menabled\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1057 |             "\u001b[0;31mAttributeError\u001b[0m: module 'needle.backend_ndarray.ndarray_backend_cuda' has no attribute 'ewise_mul'"
1058 |           ]
1059 |         }
1060 |       ]
1061 |     },
1062 |     {
1063 |       "cell_type": "markdown",
1064 |       "metadata": {
1065 |         "id": "VEtbnbvr6Wt7"
1066 |       },
1067 |       "source": [
1068 |         "## Connect back to needle Tensor\n",
1069 |         "\n",
1070 |         "So far we only played with the `backend_ndarray` subpackage, which is a self-contained ndarray implementation within needle.\n",
1071 |         "\n",
1072 |         "We can connect the ndarray back to needle as a backend."
1073 |       ]
1074 |     },
1075 |     {
1076 |       "cell_type": "code",
1077 |       "metadata": {
1078 |         "id": "JeThSA8zAu_v"
1079 |       },
1080 |       "source": [
1081 |         "import needle as ndl"
1082 |       ],
1083 |       "execution_count": null,
1084 |       "outputs": []
1085 |     },
1086 |     {
1087 |       "cell_type": "code",
1088 |       "metadata": {
1089 |         "id": "dobDH96Ql8SV",
1090 |         "colab": {
1091 |           "base_uri": "https://localhost:8080/"
1092 |         },
1093 |         "outputId": "cd23f433-8c4c-4eb3-d67f-8b492a703261"
1094 |       },
1095 |       "source": [
1096 |         "x = ndl.Tensor([1,2,3], device=nd.cuda(), dtype=\"float32\")\n",
1097 |         "y = ndl.Tensor([2,3,5], device=nd.cuda(), dtype=\"float32\")\n",
1098 |         "z = x + y\n",
1099 |         "z"
1100 |       ],
1101 |       "execution_count": null,
1102 |       "outputs": [
1103 |         {
1104 |           "output_type": "execute_result",
1105 |           "data": {
1106 |             "text/plain": [
1107 |               "needle.Tensor([3. 5. 8.])"
1108 |             ]
1109 |           },
1110 |           "metadata": {},
1111 |           "execution_count": 43
1112 |         }
1113 |       ]
1114 |     },
1115 |     {
1116 |       "cell_type": "code",
1117 |       "metadata": {
1118 |         "colab": {
1119 |           "base_uri": "https://localhost:8080/"
1120 |         },
1121 |         "id": "ouXpj1v6g3z1",
1122 |         "outputId": "ed910149-2d9e-4055-b31d-52fb698e463d"
1123 |       },
1124 |       "source": [
1125 |         "z.device"
1126 |       ],
1127 |       "execution_count": null,
1128 |       "outputs": [
1129 |         {
1130 |           "output_type": "execute_result",
1131 |           "data": {
1132 |             "text/plain": [
1133 |               "cuda()"
1134 |             ]
1135 |           },
1136 |           "metadata": {},
1137 |           "execution_count": 44
1138 |         }
1139 |       ]
1140 |     },
1141 |     {
1142 |       "cell_type": "code",
1143 |       "metadata": {
1144 |         "colab": {
1145 |           "base_uri": "https://localhost:8080/",
1146 |           "height": 187
1147 |         },
1148 |         "id": "4827VUz3bwvA",
1149 |         "outputId": "94b59197-251e-4be5-8bb3-c0b704809477"
1150 |       },
1151 |       "source": [
1152 |         "type(z.cached_data)"
1153 |       ],
1154 |       "execution_count": null,
1155 |       "outputs": [
1156 |         {
1157 |           "output_type": "execute_result",
1158 |           "data": {
1159 |             "text/plain": [
1160 |               "needle.backend_ndarray.ndarray.NDArray"
1161 |             ],
1162 |             "text/html": [
1163 |               "<div style=\"max-width:800px; border: 1px solid var(--colab-border-color);\"><style>\n",
1164 |               "      pre.function-repr-contents {\n",
1165 |               "        overflow-x: auto;\n",
1166 |               "        padding: 8px 12px;\n",
1167 |               "        max-height: 500px;\n",
1168 |               "      }\n",
1169 |               "\n",
1170 |               "      pre.function-repr-contents.function-repr-contents-collapsed {\n",
1171 |               "        cursor: pointer;\n",
1172 |               "        max-height: 100px;\n",
1173 |               "      }\n",
1174 |               "    </style>\n",
1175 |               "    <pre style=\"white-space: initial; background:\n",
1176 |               "         var(--colab-secondary-surface-color); padding: 8px 12px;\n",
1177 |               "         border-bottom: 1px solid var(--colab-border-color);\"><b>needle.backend_ndarray.ndarray.NDArray</b><br/>def __init__(other, device=None)</pre><pre class=\"function-repr-contents function-repr-contents-collapsed\" style=\"\"><a class=\"filepath\" style=\"display:none\" href=\"#\">/content/needle/python/needle/backend_ndarray/ndarray.py</a>A generic ND array class that may contain multipe different backends\n",
1178 |               "i.e., a Numpy backend, a native CPU backend, or a GPU backend.\n",
1179 |               "\n",
1180 |               "This class will only contains those functions that you need to implement\n",
1181 |               "to actually get the desired functionality for the programming examples\n",
1182 |               "in the homework, and no more.\n",
1183 |               "\n",
1184 |               "For now, for simplicity the class only supports float32 types, though\n",
1185 |               "this can be extended if desired.</pre>\n",
1186 |               "      <script>\n",
1187 |               "      if (google.colab.kernel.accessAllowed && google.colab.files && google.colab.files.view) {\n",
1188 |               "        for (const element of document.querySelectorAll('.filepath')) {\n",
1189 |               "          element.style.display = 'block'\n",
1190 |               "          element.onclick = (event) => {\n",
1191 |               "            event.preventDefault();\n",
1192 |               "            event.stopPropagation();\n",
1193 |               "            google.colab.files.view(element.textContent, 88);\n",
1194 |               "          };\n",
1195 |               "        }\n",
1196 |               "      }\n",
1197 |               "      for (const element of document.querySelectorAll('.function-repr-contents')) {\n",
1198 |               "        element.onclick = (event) => {\n",
1199 |               "          event.preventDefault();\n",
1200 |               "          event.stopPropagation();\n",
1201 |               "          element.classList.toggle('function-repr-contents-collapsed');\n",
1202 |               "        };\n",
1203 |               "      }\n",
1204 |               "      </script>\n",
1205 |               "      </div>"
1206 |             ]
1207 |           },
1208 |           "metadata": {},
1209 |           "execution_count": 45
1210 |         }
1211 |       ]
1212 |     },
1213 |     {
1214 |       "cell_type": "markdown",
1215 |       "metadata": {
1216 |         "id": "74OUUH2REG18"
1217 |       },
1218 |       "source": [
1219 |         "## Write Standalone Python Test Files\n",
1220 |         "\n",
1221 |         "Now that we have additional c++/cuda libraries in needle, we will need to type make in order to rebuild the library. Additionally, because the colab environment caches the old library, it is inconvenient to use the ipython cells to debug the updated library.\n",
1222 |         "\n",
1223 |         "\n"
1224 |       ]
1225 |     },
1226 |     {
1227 |       "cell_type": "code",
1228 |       "metadata": {
1229 |         "colab": {
1230 |           "base_uri": "https://localhost:8080/"
1231 |         },
1232 |         "id": "sgLoV-_KHAM3",
1233 |         "outputId": "ae19d49d-faa3-4aa7-cd6f-ed61815cfce6"
1234 |       },
1235 |       "source": [
1236 |         "!make"
1237 |       ],
1238 |       "execution_count": null,
1239 |       "outputs": [
1240 |         {
1241 |           "output_type": "stream",
1242 |           "name": "stdout",
1243 |           "text": [
1244 |             "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n",
1245 |             "  Compatibility with CMake < 3.5 will be removed from a future version of\n",
1246 |             "  CMake.\n",
1247 |             "\n",
1248 |             "  Update the VERSION argument <min> value or use a ...<max> suffix to tell\n",
1249 |             "  CMake that the project does not need compatibility with older versions.\n",
1250 |             "\n",
1251 |             "\u001b[0m\n",
1252 |             "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n",
1253 |             "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n",
1254 |             "  Policy CMP0146 is not set: The FindCUDA module is removed.  Run \"cmake\n",
1255 |             "  --help-policy CMP0146\" for policy details.  Use the cmake_policy command to\n",
1256 |             "  set the policy and suppress this warning.\n",
1257 |             "\n",
1258 |             "This warning is for project developers.  Use -Wno-dev to suppress it.\n",
1259 |             "\u001b[0m\n",
1260 |             "-- Found cuda, building cuda backend\n",
1261 |             "Tue Oct  8 01:33:33 2024       \n",
1262 |             "+---------------------------------------------------------------------------------------+\n",
1263 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
1264 |             "|-----------------------------------------+----------------------+----------------------+\n",
1265 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
1266 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
1267 |             "|                                         |                      |               MIG M. |\n",
1268 |             "|=========================================+======================+======================|\n",
1269 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
1270 |             "| N/A   58C    P0              30W /  70W |    105MiB / 15360MiB |      0%      Default |\n",
1271 |             "|                                         |                      |                  N/A |\n",
1272 |             "+-----------------------------------------+----------------------+----------------------+\n",
1273 |             "                                                                                         \n",
1274 |             "+---------------------------------------------------------------------------------------+\n",
1275 |             "| Processes:                                                                            |\n",
1276 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
1277 |             "|        ID   ID                                                             Usage      |\n",
1278 |             "|=======================================================================================|\n",
1279 |             "+---------------------------------------------------------------------------------------+\n",
1280 |             "-- Autodetected CUDA architecture(s):  7.5\n",
1281 |             "-- Configuring done (0.3s)\n",
1282 |             "-- Generating done (0.3s)\n",
1283 |             "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n",
1284 |             "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1285 |             "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1286 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1287 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1288 |             "[  0%] Built target ndarray_backend_cpu\n",
1289 |             "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1290 |             "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1291 |             "[ 50%] Built target ndarray_backend_cuda\n",
1292 |             "make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n",
1293 |             "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n"
1294 |           ]
1295 |         }
1296 |       ]
1297 |     },
1298 |     {
1299 |       "cell_type": "markdown",
1300 |       "metadata": {
1301 |         "id": "dudnLHRqoKY2"
1302 |       },
1303 |       "source": [
1304 |         "\n",
1305 |         "We recommend writing separate python files and invoke them from the command line. Create a new file `tests/mytest.py` and write your local tests. This is also a common develop practice in big projects that involves python c++ FFI."
1306 |       ]
1307 |     },
1308 |     {
1309 |       "cell_type": "code",
1310 |       "metadata": {
1311 |         "id": "TubIHJrkn4Sk",
1312 |         "colab": {
1313 |           "base_uri": "https://localhost:8080/"
1314 |         },
1315 |         "outputId": "cf8889e2-605f-4310-d89a-e4a49bc254e4"
1316 |       },
1317 |       "source": [
1318 |         "!python tests/mytest.py"
1319 |       ],
1320 |       "execution_count": null,
1321 |       "outputs": [
1322 |         {
1323 |           "output_type": "stream",
1324 |           "name": "stdout",
1325 |           "text": [
1326 |             "python3: can't open file '/content/drive/MyDrive/10714f24/lecture14/tests/mytest.py': [Errno 2] No such file or directory\n"
1327 |           ]
1328 |         }
1329 |       ]
1330 |     },
1331 |     {
1332 |       "cell_type": "markdown",
1333 |       "metadata": {
1334 |         "id": "ei0UR-FYoY1-"
1335 |       },
1336 |       "source": [
1337 |         "After we have building the library, we could choose to fully restart the runtime (factory reset runtime) if you want to bring the updated change back to another colab. Note that you will need to save your code changes to the drive or a private github repo."
1338 |       ]
1339 |     }
1340 |   ]
1341 | }


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.2)
 2 | project(needle C CXX)
 3 | 
 4 | # find correct version of Python
 5 | execute_process(COMMAND python3-config --prefix
 6 |   OUTPUT_VARIABLE Python_ROOT_DIR)
 7 | find_package(Python COMPONENTS Development Interpreter REQUIRED)
 8 | include_directories(${Python_INCLUDE_DIRS})
 9 | 
10 | # find pybind
11 | execute_process(COMMAND python3 -m pybind11 --cmakedir
12 |   RESULT_VARIABLE __pybind_exit_code
13 |   OUTPUT_VARIABLE __pybind_path
14 |   OUTPUT_STRIP_TRAILING_WHITESPACE)
15 | find_package(pybind11 PATHS ${__pybind_path})
16 | 
17 | 
18 | if(NOT MSVC)
19 |   set(CMAKE_CXX_FLAGS "-std=c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}")
20 |   set(CMAKE_CUDA_STANDARD 14)
21 | else()
22 |   set(CMAKE_CXX_FLAGS "/std:c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}")
23 |   set(CMAKE_CUDA_STANDARD 14)
24 | endif()
25 | 
26 | include_directories(SYSTEM ${pybind11_INCLUDE_DIRS})
27 | list(APPEND LINKER_LIBS ${pybind11_LIBRARIES})
28 | 
29 | 
30 | ###################
31 | ### CPU BACKEND ###
32 | ###################
33 | add_library(ndarray_backend_cpu MODULE src/ndarray_backend_cpu.cc)
34 | target_link_libraries(ndarray_backend_cpu PUBLIC ${LINKER_LIBS})
35 | pybind11_extension(ndarray_backend_cpu)
36 | pybind11_strip(ndarray_backend_cpu)
37 | 
38 | 
39 | # directly output to ffi folder
40 | set_target_properties(ndarray_backend_cpu
41 |   PROPERTIES
42 |   LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray
43 |   CXX_VISIBILITY_PRESET "hidden"
44 | )
45 | 
46 | if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
47 |   set_property(TARGET ndarray_backend_cpu PROPERTY LINK_OPTIONS -undefined dynamic_lookup)
48 | endif()
49 | 
50 | 
51 | 
52 | ####################
53 | ### CUDA BACKEND ###
54 | ####################
55 | find_package(CUDA)
56 | if(CUDA_FOUND)
57 |   message(STATUS "Found cuda, building cuda backend")
58 | 
59 |   include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
60 |   list(APPEND LINKER_LIBS ${CUDA_CUDART_LIBRARY})
61 | 
62 |   # invoke nvidia smi to detect if we really have a GPU
63 |   execute_process(COMMAND "nvidia-smi" ERROR_QUIET  RESULT_VARIABLE NV_RET)
64 |   if(NV_RET EQUAL "0")
65 |     CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto)
66 |   else()
67 |     # set to 3.7 the flag of K80
68 |     CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.7)
69 |   endif()
70 | 
71 |   # set arch flags properly
72 |   CUDA_ADD_LIBRARY(ndarray_backend_cuda MODULE src/ndarray_backend_cuda.cu OPTIONS ${ARCH_FLAGS})
73 | 
74 |   target_link_libraries(ndarray_backend_cuda ${LINKER_LIBS})
75 |   pybind11_extension(ndarray_backend_cuda)
76 |   pybind11_strip(ndarray_backend_cuda)
77 | 
78 |   # directly output to ffi folder
79 |   set_target_properties(ndarray_backend_cuda
80 |     PROPERTIES
81 |     LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray
82 |     CXX_VISIBILITY_PRESET "hidden"
83 |     CUDA_VISIBILITY_PRESET "hidden"
84 | )
85 | 
86 | endif()
87 | 
88 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: lib, pybind, clean, format, all
 2 | 
 3 | all: lib
 4 | 
 5 | 
 6 | lib:
 7 | 	@mkdir -p build
 8 | 	@cd build; cmake ..
 9 | 	@cd build; $(MAKE)
10 | 
11 | format:
12 | 	python3 -m black .
13 | 	clang-format -i src/*.cc src/*.cu
14 | 
15 | clean:
16 | 	rm -rf build python/needle/backend_ndarray/ndarray_backend*.so
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lecture13
2 | 
3 | This repo contains the infrastructure code needed for hardware acceleration implementation
4 | It can be safely replaced by hw3 repo eventually.
5 | 


--------------------------------------------------------------------------------
/python/needle/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import ops
 2 | from .ops import *
 3 | from .autograd import Tensor, cpu, all_devices
 4 | 
 5 | from . import init
 6 | from .init import ones, zeros, zeros_like, ones_like
 7 | 
 8 | from . import data
 9 | from . import nn
10 | from . import optim
11 | from .backend_selection import *
12 | 


--------------------------------------------------------------------------------
/python/needle/autograd.py:
--------------------------------------------------------------------------------
  1 | """Core data structures."""
  2 | import needle
  3 | from .backend_numpy import Device, cpu, all_devices
  4 | from typing import List, Optional, NamedTuple, Tuple, Union
  5 | from collections import namedtuple
  6 | import numpy
  7 | 
  8 | from needle import init
  9 | 
 10 | # needle version
 11 | LAZY_MODE = False
 12 | TENSOR_COUNTER = 0
 13 | 
 14 | # NOTE: we will import numpy as the array_api
 15 | # as the backend for our computations, this line will change in later homeworks
 16 | 
 17 | import numpy as array_api
 18 | NDArray = numpy.ndarray
 19 | 
 20 | from .backend_selection import array_api, NDArray
 21 | 
 22 | 
 23 | class Op:
 24 |     """Operator definition."""
 25 | 
 26 |     def __call__(self, *args):
 27 |         raise NotImplementedError()
 28 | 
 29 |     def compute(self, *args: Tuple[NDArray]):
 30 |         """Calculate forward pass of operator.
 31 | 
 32 |         Parameters
 33 |         ----------
 34 |         input: np.ndarray
 35 |             A list of input arrays to the function
 36 | 
 37 |         Returns
 38 |         -------
 39 |         output: nd.array
 40 |             Array output of the operation
 41 | 
 42 |         """
 43 |         raise NotImplementedError()
 44 | 
 45 |     def gradient(
 46 |         self, out_grad: "Value", node: "Value"
 47 |     ) -> Union["Value", Tuple["Value"]]:
 48 |         """Compute partial adjoint for each input value for a given output adjoint.
 49 | 
 50 |         Parameters
 51 |         ----------
 52 |         out_grad: Value
 53 |             The adjoint wrt to the output value.
 54 | 
 55 |         node: Value
 56 |             The value node of forward evaluation.
 57 | 
 58 |         Returns
 59 |         -------
 60 |         input_grads: Value or Tuple[Value]
 61 |             A list containing partial gradient adjoints to be propagated to
 62 |             each of the input node.
 63 |         """
 64 |         raise NotImplementedError()
 65 | 
 66 |     def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]:
 67 |         """Convenience method to always return a tuple from gradient call"""
 68 |         output = self.gradient(out_grad, node)
 69 |         if isinstance(output, tuple):
 70 |             return output
 71 |         elif isinstance(output, list):
 72 |             return tuple(output)
 73 |         else:
 74 |             return (output,)
 75 | 
 76 | 
 77 | class TensorOp(Op):
 78 |     """Op class specialized to output tensors, will be alternate subclasses for other structures"""
 79 | 
 80 |     def __call__(self, *args):
 81 |         return Tensor.make_from_op(self, args)
 82 | 
 83 | 
 84 | class TensorTupleOp(Op):
 85 |     """Op class specialized to output TensorTuple"""
 86 | 
 87 |     def __call__(self, *args):
 88 |         return TensorTuple.make_from_op(self, args)
 89 | 
 90 | 
 91 | class Value:
 92 |     """A value in the computational graph."""
 93 | 
 94 |     # trace of computational graph
 95 |     op: Optional[Op]
 96 |     inputs: List["Value"]
 97 |     # The following fields are cached fields for
 98 |     # dynamic computation
 99 |     cached_data: NDArray
100 |     requires_grad: bool
101 | 
102 |     def realize_cached_data(self):
103 |         """Run compute to realize the cached data"""
104 |         # avoid recomputation
105 |         if self.cached_data is not None:
106 |             return self.cached_data
107 |         # note: data implicitly calls realized cached data
108 |         self.cached_data = self.op.compute(
109 |             *[x.realize_cached_data() for x in self.inputs]
110 |         )
111 |         return self.cached_data
112 | 
113 |     def is_leaf(self):
114 |         return self.op is None
115 | 
116 |     def __del__(self):
117 |         global TENSOR_COUNTER
118 |         TENSOR_COUNTER -= 1
119 | 
120 |     def _init(
121 |         self,
122 |         op: Optional[Op],
123 |         inputs: List["Tensor"],
124 |         *,
125 |         num_outputs: int = 1,
126 |         cached_data: List[object] = None,
127 |         requires_grad: Optional[bool] = None
128 |     ):
129 |         global TENSOR_COUNTER
130 |         TENSOR_COUNTER += 1
131 |         if requires_grad is None:
132 |             requires_grad = any(x.requires_grad for x in inputs)
133 |         self.op = op
134 |         self.inputs = inputs
135 |         self.num_outputs = num_outputs
136 |         self.cached_data = cached_data
137 |         self.requires_grad = requires_grad
138 | 
139 |     @classmethod
140 |     def make_const(cls, data, *, requires_grad=False):
141 |         value = cls.__new__(cls)
142 |         value._init(
143 |             None,
144 |             [],
145 |             cached_data=data,
146 |             requires_grad=requires_grad,
147 |         )
148 |         return value
149 | 
150 |     @classmethod
151 |     def make_from_op(cls, op: Op, inputs: List["Value"]):
152 |         value = cls.__new__(cls)
153 |         value._init(op, inputs)
154 | 
155 |         if not LAZY_MODE:
156 |             if not value.requires_grad:
157 |                 return value.detach()
158 |             value.realize_cached_data()
159 |         return value
160 | 
161 | 
162 | ### Not needed in HW1
163 | class TensorTuple(Value):
164 |     """Represent a tuple of tensors.
165 | 
166 |     To keep things simple, we do not support nested tuples.
167 |     """
168 | 
169 |     def __len__(self):
170 |         cdata = self.realize_cached_data()
171 |         return len(cdata)
172 | 
173 |     def __getitem__(self, index: int):
174 |         return needle.ops.tuple_get_item(self, index)
175 | 
176 |     def tuple(self):
177 |         return tuple([x for x in self])
178 | 
179 |     def __repr__(self):
180 |         return "needle.TensorTuple" + str(self.tuple())
181 | 
182 |     def __str__(self):
183 |         return self.__repr__()
184 | 
185 |     def __add__(self, other):
186 |         assert isinstance(other, TensorTuple)
187 |         assert len(self) == len(other)
188 |         return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))])
189 | 
190 |     def detach(self):
191 |         """Create a new tensor that shares the data but detaches from the graph."""
192 |         return Tuple.make_const(self.realize_cached_data())
193 | 
194 | 
195 | class Tensor(Value):
196 |     grad: "Tensor"
197 | 
198 |     def __init__(
199 |         self,
200 |         array,
201 |         *,
202 |         device: Optional[Device] = None,
203 |         dtype=None,
204 |         requires_grad=True,
205 |         **kwargs
206 |     ):
207 |         if isinstance(array, Tensor):
208 |             if device is None:
209 |                 device = array.device
210 |             if dtype is None:
211 |                 dtype = array.dtype
212 |             if device == array.device and dtype == array.dtype:
213 |                 cached_data = array.realize_cached_data()
214 |             else:
215 |                 # fall back, copy through numpy conversion
216 |                 cached_data = Tensor._array_from_numpy(
217 |                     array.numpy(), device=device, dtype=dtype
218 |                 )
219 |         else:
220 |             device = device if device else cpu()
221 |             cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype)
222 | 
223 |         self._init(
224 |             None,
225 |             [],
226 |             cached_data=cached_data,
227 |             requires_grad=requires_grad,
228 |         )
229 | 
230 |     @staticmethod
231 |     def _array_from_numpy(numpy_array, device, dtype):
232 |         if array_api is numpy:
233 |             return numpy.array(numpy_array, dtype=dtype)
234 |         return array_api.array(numpy_array, device=device, dtype=dtype)
235 | 
236 |     @staticmethod
237 |     def make_from_op(op: Op, inputs: List["Value"]):
238 |         tensor = Tensor.__new__(Tensor)
239 |         tensor._init(op, inputs)
240 |         if not LAZY_MODE:
241 |             if not tensor.requires_grad:
242 |                 return tensor.detach()
243 |             tensor.realize_cached_data()
244 |         return tensor
245 | 
246 |     @staticmethod
247 |     def make_const(data, requires_grad=False):
248 |         tensor = Tensor.__new__(Tensor)
249 |         tensor._init(
250 |             None,
251 |             [],
252 |             cached_data=data
253 |             if not isinstance(data, Tensor)
254 |             else data.realize_cached_data(),
255 |             requires_grad=requires_grad,
256 |         )
257 |         return tensor
258 | 
259 |     @property
260 |     def data(self):
261 |         return self.detach()
262 | 
263 |     @data.setter
264 |     def data(self, value):
265 |         assert isinstance(value, Tensor)
266 |         assert value.dtype == self.dtype, "%s %s" % (
267 |             value.dtype,
268 |             self.dtype,
269 |         )
270 |         self.cached_data = value.realize_cached_data()
271 | 
272 |     def detach(self):
273 |         """Create a new tensor that shares the data but detaches from the graph."""
274 |         return Tensor.make_const(self.realize_cached_data())
275 | 
276 |     @property
277 |     def shape(self):
278 |         return self.realize_cached_data().shape
279 | 
280 |     @property
281 |     def dtype(self):
282 |         return self.realize_cached_data().dtype
283 | 
284 |     @property
285 |     def device(self):
286 |         data = self.realize_cached_data()
287 |         # numpy array always sits on cpu
288 |         if array_api is numpy:
289 |             return cpu()
290 |         return data.device
291 | 
292 |     def backward(self, out_grad=None):
293 |         out_grad = (
294 |             out_grad
295 |             if out_grad
296 |             else init.ones(*self.shape, dtype=self.dtype, device=self.device)
297 |         )
298 |         compute_gradient_of_variables(self, out_grad)
299 | 
300 |     def __repr__(self):
301 |         return "needle.Tensor(" + str(self.realize_cached_data()) + ")"
302 | 
303 |     def __str__(self):
304 |         return self.realize_cached_data().__str__()
305 | 
306 |     def numpy(self):
307 |         data = self.realize_cached_data()
308 |         if array_api is numpy:
309 |             return data
310 |         return data.numpy()
311 | 
312 |     def __add__(self, other):
313 |         if isinstance(other, Tensor):
314 |             return needle.ops.EWiseAdd()(self, other)
315 |         else:
316 |             return needle.ops.AddScalar(other)(self)
317 | 
318 |     def __mul__(self, other):
319 |         if isinstance(other, Tensor):
320 |             return needle.ops.EWiseMul()(self, other)
321 |         else:
322 |             return needle.ops.MulScalar(other)(self)
323 | 
324 |     def __pow__(self, other):
325 |         if isinstance(other, Tensor):
326 |             return needle.ops.EWisePow()(self, other)
327 |         else:
328 |             return needle.ops.PowerScalar(other)(self)
329 | 
330 |     def __sub__(self, other):
331 |         if isinstance(other, Tensor):
332 |             return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other))
333 |         else:
334 |             return needle.ops.AddScalar(-other)(self)
335 | 
336 |     def __truediv__(self, other):
337 |         if isinstance(other, Tensor):
338 |             return needle.ops.EWiseDiv()(self, other)
339 |         else:
340 |             return needle.ops.DivScalar(other)(self)
341 | 
342 |     def __matmul__(self, other):
343 |         return needle.ops.MatMul()(self, other)
344 | 
345 |     def matmul(self, other):
346 |         return needle.ops.MatMul()(self, other)
347 | 
348 |     def sum(self, axes=None):
349 |         return needle.ops.Summation(axes)(self)
350 | 
351 |     def broadcast_to(self, shape):
352 |         return needle.ops.BroadcastTo(shape)(self)
353 | 
354 |     def reshape(self, shape):
355 |         return needle.ops.Reshape(shape)(self)
356 | 
357 |     def __neg__(self):
358 |         return needle.ops.Negate()(self)
359 | 
360 |     def transpose(self, axes=None):
361 |         return needle.ops.Transpose(axes)(self)
362 | 
363 |     __radd__ = __add__
364 |     __rmul__ = __mul__
365 |     __rsub__ = __sub__
366 |     __rmatmul__ = __matmul__
367 | 
368 | 
369 | def compute_gradient_of_variables(output_tensor, out_grad):
370 |     """Take gradient of output node with respect to each node in node_list.
371 | 
372 |     Store the computed result in the grad field of each Variable.
373 |     """
374 |     # a map from node to a list of gradient contributions from each output node
375 |     node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {}
376 |     # Special note on initializing gradient of
377 |     # We are really taking a derivative of the scalar reduce_sum(output_node)
378 |     # instead of the vector output_node. But this is the common case for loss function.
379 |     node_to_output_grads_list[output_tensor] = [out_grad]
380 | 
381 |     # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
382 |     reverse_topo_order = list(reversed(find_topo_sort([output_tensor])))
383 | 
384 |     ### BEGIN YOUR SOLUTION
385 |     raise NotImplementedError()
386 |     ### END YOUR SOLUTION
387 | 
388 | 
389 | def find_topo_sort(node_list: List[Value]) -> List[Value]:
390 |     """Given a list of nodes, return a topological sort list of nodes ending in them.
391 | 
392 |     A simple algorithm is to do a post-order DFS traversal on the given nodes,
393 |     going backwards based on input edges. Since a node is added to the ordering
394 |     after all its predecessors are traversed due to post-order DFS, we get a topological
395 |     sort.
396 |     """
397 |     ### BEGIN YOUR SOLUTION
398 |     raise NotImplementedError()
399 |     ### END YOUR SOLUTION
400 | 
401 | 
402 | def topo_sort_dfs(node, visited, topo_order):
403 |     """Post-order DFS"""
404 |     ### BEGIN YOUR SOLUTION
405 |     raise NotImplementedError()
406 |     ### END YOUR SOLUTION
407 | 
408 | 
409 | ##############################
410 | ####### Helper Methods #######
411 | ##############################
412 | 
413 | 
414 | def sum_node_list(node_list):
415 |     """Custom sum function in order to avoid create redundant nodes in Python sum implementation."""
416 |     from operator import add
417 |     from functools import reduce
418 | 
419 |     return reduce(add, node_list)
420 | 


--------------------------------------------------------------------------------
/python/needle/backend_ndarray/__init__.py:
--------------------------------------------------------------------------------
1 | from .ndarray import *
2 | 


--------------------------------------------------------------------------------
/python/needle/backend_ndarray/ndarray.py:
--------------------------------------------------------------------------------
  1 | import operator
  2 | import math
  3 | from functools import reduce
  4 | import numpy as np
  5 | from . import ndarray_backend_numpy
  6 | from . import ndarray_backend_cpu
  7 | 
  8 | 
  9 | # math.prod not in Python 3.7
 10 | def prod(x):
 11 |     return reduce(operator.mul, x, 1)
 12 | 
 13 | 
 14 | class BackendDevice:
 15 |     """A backend device, wrapps the implementation module."""
 16 | 
 17 |     def __init__(self, name, mod):
 18 |         self.name = name
 19 |         self.mod = mod
 20 | 
 21 |     def __eq__(self, other):
 22 |         return self.name == other.name
 23 | 
 24 |     def __repr__(self):
 25 |         return self.name + "()"
 26 | 
 27 |     def __getattr__(self, name):
 28 |         return getattr(self.mod, name)
 29 | 
 30 |     def enabled(self):
 31 |         return self.mod is not None
 32 | 
 33 |     def randn(self, *shape, dtype="float32"):
 34 |         # note: numpy doesn't support types within standard random routines, and
 35 |         # .astype("float32") does work if we're generating a singleton
 36 |         return NDArray(np.random.randn(*shape).astype(dtype), device=self)
 37 | 
 38 |     def rand(self, *shape, dtype="float32"):
 39 |         # note: numpy doesn't support types within standard random routines, and
 40 |         # .astype("float32") does work if we're generating a singleton
 41 |         return NDArray(np.random.rand(*shape).astype(dtype), device=self)
 42 | 
 43 |     def one_hot(self, n, i, dtype="float32"):
 44 |         return NDArray(np.eye(n, dtype=dtype)[i], device=self)
 45 | 
 46 |     def empty(self, shape, dtype="float32"):
 47 |         dtype = "float32" if dtype is None else dtype
 48 |         assert dtype == "float32"
 49 |         return NDArray.make(shape, device=self)
 50 | 
 51 |     def full(self, shape, fill_value, dtype="float32"):
 52 |         dtype = "float32" if dtype is None else dtype
 53 |         assert dtype == "float32"
 54 |         arr = self.empty(shape, dtype)
 55 |         arr.fill(fill_value)
 56 |         return arr
 57 | 
 58 | 
 59 | def cuda():
 60 |     """Return cuda device"""
 61 |     try:
 62 |         from . import ndarray_backend_cuda
 63 | 
 64 |         return BackendDevice("cuda", ndarray_backend_cuda)
 65 |     except ImportError:
 66 |         return BackendDevice("cuda", None)
 67 | 
 68 | 
 69 | def cpu_numpy():
 70 |     """Return numpy device"""
 71 |     return BackendDevice("cpu_numpy", ndarray_backend_numpy)
 72 | 
 73 | 
 74 | def cpu():
 75 |     """Return cpu device"""
 76 |     return BackendDevice("cpu", ndarray_backend_cpu)
 77 | 
 78 | 
 79 | def default_device():
 80 |     return cpu_numpy()
 81 | 
 82 | 
 83 | def all_devices():
 84 |     """return a list of all available devices"""
 85 |     return [cpu(), cuda(), cpu_numpy()]
 86 | 
 87 | 
 88 | class NDArray:
 89 |     """A generic ND array class that may contain multipe different backends
 90 |     i.e., a Numpy backend, a native CPU backend, or a GPU backend.
 91 | 
 92 |     This class will only contains those functions that you need to implement
 93 |     to actually get the desired functionality for the programming examples
 94 |     in the homework, and no more.
 95 | 
 96 |     For now, for simplicity the class only supports float32 types, though
 97 |     this can be extended if desired.
 98 |     """
 99 | 
100 |     def __init__(self, other, device=None):
101 |         """Create by copying another NDArray, or from numpy"""
102 |         if isinstance(other, NDArray):
103 |             # create a copy of existing NDArray
104 |             if device is None:
105 |                 device = other.device
106 |             self._init(other.to(device) + 0.0)  # this creates a copy
107 |         elif isinstance(other, np.ndarray):
108 |             # create copy from numpy array
109 |             device = device if device is not None else default_device()
110 |             array = self.make(other.shape, device=device)
111 |             array.device.from_numpy(np.ascontiguousarray(other), array._handle)
112 |             self._init(array)
113 |         else:
114 |             # see if we can create a numpy array from input
115 |             array = NDArray(np.array(other), device=device)
116 |             self._init(array)
117 | 
118 |     def _init(self, other):
119 |         self._shape = other._shape
120 |         self._strides = other._strides
121 |         self._offset = other._offset
122 |         self._device = other._device
123 |         self._handle = other._handle
124 | 
125 |     @staticmethod
126 |     def compact_strides(shape):
127 |         """Utility function to compute compact strides"""
128 |         stride = 1
129 |         res = []
130 |         for i in range(1, len(shape) + 1):
131 |             res.append(stride)
132 |             stride *= shape[-i]
133 |         return tuple(res[::-1])
134 | 
135 |     @staticmethod
136 |     def make(shape, strides=None, device=None, handle=None, offset=0):
137 |         """Create a new NDArray with the given properties.  This will allocation the
138 |         memory if handle=None, otherwise it will use the handle of an existing
139 |         array."""
140 |         array = NDArray.__new__(NDArray)
141 |         array._shape = tuple(shape)
142 |         array._strides = NDArray.compact_strides(shape) if strides is None else strides
143 |         array._offset = offset
144 |         array._device = device if device is not None else default_device()
145 |         if handle is None:
146 |             array._handle = array.device.Array(prod(shape))
147 |         else:
148 |             array._handle = handle
149 |         return array
150 | 
151 |     ### Properies and string representations
152 |     @property
153 |     def shape(self):
154 |         return self._shape
155 | 
156 |     @property
157 |     def strides(self):
158 |         return self._strides
159 | 
160 |     @property
161 |     def device(self):
162 |         return self._device
163 | 
164 |     @property
165 |     def dtype(self):
166 |         # only support float32 for now
167 |         return "float32"
168 | 
169 |     @property
170 |     def ndim(self):
171 |         """Return number of dimensions."""
172 |         return len(self._shape)
173 | 
174 |     @property
175 |     def size(self):
176 |         return prod(self._shape)
177 | 
178 |     def __repr__(self):
179 |         return "NDArray(" + self.numpy().__str__() + f", device={self.device})"
180 | 
181 |     def __str__(self):
182 |         return self.numpy().__str__()
183 | 
184 |     ### Basic array manipulation
185 |     def fill(self, value):
186 |         """Fill (in place) with a constant value."""
187 |         self._device.fill(self._handle, value)
188 | 
189 |     def to(self, device):
190 |         """Convert between devices, using to/from numpy calls as the unifying bridge."""
191 |         if device == self.device:
192 |             return self
193 |         else:
194 |             return NDArray(self.numpy(), device=device)
195 | 
196 |     def numpy(self):
197 |         """convert to a numpy array"""
198 |         return self.device.to_numpy(
199 |             self._handle, self.shape, self.strides, self._offset
200 |         )
201 | 
202 |     def is_compact(self):
203 |         """Return true if array is compact in memory and internal size equals product
204 |         of the shape dimensions"""
205 |         return (
206 |             self._strides == self.compact_strides(self._shape)
207 |             and prod(self.shape) == self._handle.size
208 |         )
209 | 
210 |     def compact(self):
211 |         """Convert a matrix to be compact"""
212 |         if self.is_compact():
213 |             return self
214 |         else:
215 |             out = NDArray.make(self.shape, device=self.device)
216 |             self.device.compact(
217 |                 self._handle, out._handle, self.shape, self.strides, self._offset
218 |             )
219 |             return out
220 | 
221 |     def as_strided(self, shape, strides):
222 |         """Restride the matrix without copying memory."""
223 |         assert len(shape) == len(strides)
224 |         return NDArray.make(
225 |             shape, strides=strides, device=self.device, handle=self._handle
226 |         )
227 | 
228 |     @property
229 |     def flat(self):
230 |         return self.reshape((self.size,))
231 | 
232 |     def reshape(self, new_shape):
233 |         """
234 |         Reshape the matrix without copying memory.  This will return a matrix
235 |         that corresponds to a reshaped array but points to the same memory as
236 |         the original array.
237 | 
238 |         Raises:
239 |             ValueError if product of current shape is not equal to the product
240 |             of the new shape, or if the matrix is not compact.
241 | 
242 |         Args:
243 |             new_shape (tuple): new shape of the array
244 | 
245 |         Returns:
246 |             NDArray : reshaped array; this will point to thep
247 |         """
248 | 
249 |         ### BEGIN YOUR SOLUTION
250 |         raise NotImplementedError()
251 |         ### END YOUR SOLUTION
252 | 
253 |     def permute(self, new_axes):
254 |         """
255 |         Permute order of the dimensions.  new_axes describes a permuation of the
256 |         existing axes, so e.g.:
257 |           - If we have an array with dimension "BHWC" then .permute((0,3,1,2))
258 |             would convert this to "BCHW" order.
259 |           - For a 2D array, .permute((1,0)) would transpose the array.
260 |         Like reshape, this operation should not copy memory, but achieves the
261 |         permuting by just adjusting the shape/strides of the array.  That is,
262 |         it returns a new array that has the dimensions permuted as desired, but
263 |         which points to the same memroy as the original array.
264 | 
265 |         Args:
266 |             new_axes (tuple): permuation order of the dimensions
267 | 
268 |         Returns:
269 |             NDarray : new NDArray object with permuted dimensions, pointing
270 |             to the same memory as the original NDArray (i.e., just shape and
271 |             strides changed).
272 |         """
273 | 
274 |         ### BEGIN YOUR SOLUTION
275 |         raise NotImplementedError()
276 |         ### END YOUR SOLUTION
277 | 
278 |     def broadcast_to(self, new_shape):
279 |         """
280 |         Broadcast an array to a new shape.  new_shape's elements must be the
281 |         same as the original shape, except for dimensions in the self where
282 |         the size = 1 (which can then be broadcast to any size).  As with the
283 |         previous calls, this will not copy memory, and just achieves
284 |         broadcasting by manipulating the strides.
285 | 
286 |         Raises:
287 |             assertion error if new_shape[i] != shape[i] for all i where
288 |             shape[i] != 1
289 | 
290 |         Args:
291 |             new_shape (tuple): shape to broadcast to
292 | 
293 |         Returns:
294 |             NDArray: the new NDArray object with the new broadcast shape; should
295 |             point to the same memory as the original array.
296 |         """
297 | 
298 |         ### BEGIN YOUR SOLUTION
299 |         raise NotImplementedError()
300 |         ### END YOUR SOLUTION
301 | 
302 |     ### Get and set elements
303 | 
304 |     def process_slice(self, sl, dim):
305 |         """Convert a slice to an explicit start/stop/step"""
306 |         start, stop, step = sl.start, sl.stop, sl.step
307 |         if start == None:
308 |             start = 0
309 |         if start < 0:
310 |             start = self.shape[dim]
311 |         if stop == None:
312 |             stop = self.shape[dim]
313 |         if stop < 0:
314 |             stop = self.shape[dim] + stop
315 |         if step == None:
316 |             step = 1
317 | 
318 |         # we're not gonna handle negative strides and that kind of thing
319 |         assert stop > start, "Start must be less than stop"
320 |         assert step > 0, "No support for  negative increments"
321 |         return slice(start, stop, step)
322 | 
323 |     def __getitem__(self, idxs):
324 |         """
325 |         The __getitem__ operator in Python allows us to access elements of our
326 |         array.  When passed notation such as a[1:5,:-1:2,4,:] etc, Python will
327 |         convert this to a tuple of slices and integers (for singletons like the
328 |         '4' in this example).  Slices can be a bit odd to work with (they have
329 |         three elements .start .stop .step), which can be None or have negative
330 |         entries, so for simplicity we wrote the code for you to convert these
331 |         to always be a tuple of slices, one of each dimension.
332 | 
333 |         For this tuple of slices, return an array that subsets the desired
334 |         elements.  As before, this can be done entirely through compute a new
335 |         shape, stride, and offset for the new "view" into the original array,
336 |         pointing to the same memory
337 | 
338 |         Raises:
339 |             AssertionError if a slice has negative size or step, or if number
340 |             of slices is not equal to the number of dimension (the stub code
341 |             already raises all these errors.
342 | 
343 |         Args:
344 |             idxs tuple: (after stub code processes), a tuple of slice elements
345 |             coresponding to the subset of the matrix to get
346 | 
347 |         Returns:
348 |             NDArray: a new NDArray object corresponding to the selected
349 |             subset of elements.  As before, this should not copy memroy but just
350 |             manipulate the shape/strides/offset of the new array, referecing
351 |             the same array as the original one.
352 |         """
353 | 
354 |         # handle singleton as tuple, everything as slices
355 |         if not isinstance(idxs, tuple):
356 |             idxs = (idxs,)
357 |         idxs = tuple(
358 |             [
359 |                 self.process_slice(s, i) if isinstance(s, slice) else slice(s, s + 1, 1)
360 |                 for i, s in enumerate(idxs)
361 |             ]
362 |         )
363 |         assert len(idxs) == self.ndim, "Need indexes equal to number of dimensions"
364 | 
365 |         ### BEGIN YOUR SOLUTION
366 |         raise NotImplementedError()
367 |         ### END YOUR SOLUTION
368 | 
369 |     def __setitem__(self, idxs, other):
370 |         """Set the values of a view into an array, using the same semantics
371 |         as __getitem__()."""
372 |         view = self.__getitem__(idxs)
373 |         if isinstance(other, NDArray):
374 |             assert prod(view.shape) == prod(other.shape)
375 |             self.device.ewise_setitem(
376 |                 other.compact()._handle,
377 |                 view._handle,
378 |                 view.shape,
379 |                 view.strides,
380 |                 view._offset,
381 |             )
382 |         else:
383 |             self.device.scalar_setitem(
384 |                 prod(view.shape),
385 |                 other,
386 |                 view._handle,
387 |                 view.shape,
388 |                 view.strides,
389 |                 view._offset,
390 |             )
391 | 
392 |     ### Collection of elementwise and scalar function: add, multiply, boolean, etc
393 | 
394 |     def ewise_or_scalar(self, other, ewise_func, scalar_func):
395 |         """Run either an elementwise or scalar version of a function,
396 |         depending on whether "other" is an NDArray or scalar
397 |         """
398 |         out = NDArray.make(self.shape, device=self.device)
399 |         if isinstance(other, NDArray):
400 |             assert self.shape == other.shape, "operation needs two equal-sized arrays"
401 |             ewise_func(self.compact()._handle, other.compact()._handle, out._handle)
402 |         else:
403 |             scalar_func(self.compact()._handle, other, out._handle)
404 |         return out
405 | 
406 |     def __add__(self, other):
407 |         return self.ewise_or_scalar(
408 |             other, self.device.ewise_add, self.device.scalar_add
409 |         )
410 | 
411 |     __radd__ = __add__
412 | 
413 |     def __sub__(self, other):
414 |         return self + (-other)
415 | 
416 |     def __rsub__(self, other):
417 |         return other + (-self)
418 | 
419 |     def __mul__(self, other):
420 |         return self.ewise_or_scalar(
421 |             other, self.device.ewise_mul, self.device.scalar_mul
422 |         )
423 | 
424 |     __rmul__ = __mul__
425 | 
426 |     def __truediv__(self, other):
427 |         return self.ewise_or_scalar(
428 |             other, self.device.ewise_div, self.device.scalar_div
429 |         )
430 | 
431 |     def __neg__(self):
432 |         return self * (-1)
433 | 
434 |     def __pow__(self, other):
435 |         out = NDArray.make(self.shape, device=self.device)
436 |         self.device.scalar_power(self.compact()._handle, other, out._handle)
437 |         return out
438 | 
439 |     def maximum(self, other):
440 |         return self.ewise_or_scalar(
441 |             other, self.device.ewise_maximum, self.device.scalar_maximum
442 |         )
443 | 
444 |     ### Binary operators all return (0.0, 1.0) floating point values, could of course be optimized
445 |     def __eq__(self, other):
446 |         return self.ewise_or_scalar(other, self.device.ewise_eq, self.device.scalar_eq)
447 | 
448 |     def __ge__(self, other):
449 |         return self.ewise_or_scalar(other, self.device.ewise_ge, self.device.scalar_ge)
450 | 
451 |     def __ne__(self, other):
452 |         return 1 - (self == other)
453 | 
454 |     def __gt__(self, other):
455 |         return (self >= other) * (self != other)
456 | 
457 |     def __lt__(self, other):
458 |         return 1 - (self >= other)
459 | 
460 |     def __le__(self, other):
461 |         return 1 - (self > other)
462 | 
463 |     ### Elementwise functions
464 | 
465 |     def log(self):
466 |         out = NDArray.make(self.shape, device=self.device)
467 |         self.device.ewise_log(self.compact()._handle, out._handle)
468 |         return out
469 | 
470 |     def exp(self):
471 |         out = NDArray.make(self.shape, device=self.device)
472 |         self.device.ewise_exp(self.compact()._handle, out._handle)
473 |         return out
474 | 
475 |     def tanh(self):
476 |         out = NDArray.make(self.shape, device=self.device)
477 |         self.device.ewise_tanh(self.compact()._handle, out._handle)
478 |         return out
479 | 
480 |     ### Matrix multiplication
481 |     def __matmul__(self, other):
482 |         """Matrix multplication of two arrays.  This requires that both arrays
483 |         be 2D (i.e., we don't handle batch matrix multiplication), and that the
484 |         sizes match up properly for matrix multiplication.
485 | 
486 |         In the case of the CPU backend, you will implement an efficient "tiled"
487 |         version of matrix multiplication for the case when all dimensions of
488 |         the array are divisible by self.device.__tile_size__.  In this case,
489 |         the code below will restride and compact the matrix into tiled form,
490 |         and then pass to the relevant CPU backend.  For the CPU version we will
491 |         just fall back to the naive CPU implementation if the array shape is not
492 |         a multiple of the tile size
493 | 
494 |         The GPU (and numpy) versions don't have any tiled version (or rather,
495 |         the GPU version will just work natively by tiling any input size).
496 |         """
497 | 
498 |         assert self.ndim == 2 and other.ndim == 2
499 |         assert self.shape[1] == other.shape[0]
500 | 
501 |         m, n, p = self.shape[0], self.shape[1], other.shape[1]
502 | 
503 |         # if the matrix is aligned, use tiled matrix multiplication
504 |         if hasattr(self.device, "matmul_tiled") and all(
505 |             d % self.device.__tile_size__ == 0 for d in (m, n, p)
506 |         ):
507 | 
508 |             def tile(a, tile):
509 |                 return a.as_strided(
510 |                     (a.shape[0] // tile, a.shape[1] // tile, tile, tile),
511 |                     (a.shape[1] * tile, tile, self.shape[1], 1),
512 |                 )
513 | 
514 |             t = self.device.__tile_size__
515 |             a = tile(self.compact(), t).compact()
516 |             b = tile(other.compact(), t).compact()
517 |             out = NDArray.make((a.shape[0], b.shape[1], t, t), device=self.device)
518 |             self.device.matmul_tiled(a._handle, b._handle, out._handle, m, n, p)
519 | 
520 |             return (
521 |                 out.permute((0, 2, 1, 3))
522 |                 .compact()
523 |                 .reshape((self.shape[0], other.shape[1]))
524 |             )
525 | 
526 |         else:
527 |             out = NDArray.make((m, p), device=self.device)
528 |             self.device.matmul(
529 |                 self.compact()._handle, other.compact()._handle, out._handle, m, n, p
530 |             )
531 |             return out
532 | 
533 |     ### Reductions, i.e., sum/max over all element or over given axis
534 |     def reduce_view_out(self, axis):
535 |         """Return a view to the array set up for reduction functions and output array."""
536 |         if axis is None:
537 |             view = self.reshape((1,) * (self.ndim - 1) + (prod(self.shape),))
538 |             out = NDArray.make((1,) * self.ndim, device=self.device)
539 |         else:
540 |             if isinstance(axis, (tuple, list)):
541 |                 assert len(axis) == 1, "Only support reduction over a single axis"
542 |                 axis = axis[0]
543 | 
544 |             view = self.permute(
545 |                 tuple([a for a in range(self.ndim) if a != axis]) + (axis,)
546 |             )
547 |             out = NDArray.make(
548 |                 tuple([1 if i == axis else s for i, s in enumerate(self.shape)]),
549 |                 device=self.device,
550 |             )
551 |         return view, out
552 | 
553 |     def sum(self, axis=None):
554 |         view, out = self.reduce_view_out(axis)
555 |         self.device.reduce_sum(view.compact()._handle, out._handle, view.shape[-1])
556 |         return out
557 | 
558 |     def max(self, axis=None):
559 |         view, out = self.reduce_view_out(axis)
560 |         self.device.reduce_max(view.compact()._handle, out._handle, view.shape[-1])
561 |         return out
562 | 
563 | 
564 | def array(a, dtype="float32", device=None):
565 |     """Convenience methods to match numpy a bit more closely."""
566 |     dtype = "float32" if dtype is None else dtype
567 |     assert dtype == "float32"
568 |     return NDArray(a, device=device)
569 | 
570 | 
571 | def empty(shape, dtype="float32", device=None):
572 |     device = device if device is not None else default_device()
573 |     return device.empty(shape, dtype)
574 | 
575 | 
576 | def full(shape, fill_value, dtype="float32", device=None):
577 |     device = device if device is not None else default_device()
578 |     return device.full(shape, fill_value, dtype)
579 | 
580 | 
581 | def broadcast_to(array, new_shape):
582 |     return array.broadcast_to(new_shape)
583 | 
584 | 
585 | def reshape(array, new_shape):
586 |     return array.reshape(new_shape)
587 | 
588 | 
589 | def maximum(a, b):
590 |     return a.maximum(b)
591 | 
592 | 
593 | def log(a):
594 |     return a.log()
595 | 
596 | 
597 | def exp(a):
598 |     return a.exp()
599 | 
600 | 
601 | def tanh(a):
602 |     return a.tanh()
603 | 
604 | 
605 | def sum(a, axis=None):
606 |     return a.sum(axis=axis)
607 | 


--------------------------------------------------------------------------------
/python/needle/backend_ndarray/ndarray_backend_numpy.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | 
  4 | __device_name__ = "numpy"
  5 | _datatype = np.float32
  6 | _datetype_size = np.dtype(_datatype).itemsize
  7 | 
  8 | 
  9 | class Array:
 10 |     def __init__(self, size):
 11 |         self.array = np.empty(size, dtype=np.float32)
 12 | 
 13 |     @property
 14 |     def size(self):
 15 |         return self.array.size
 16 | 
 17 | 
 18 | def to_numpy(a, shape, strides, offset):
 19 |     return np.lib.stride_tricks.as_strided(
 20 |         a.array[offset:], shape, tuple([s * _datetype_size for s in strides])
 21 |     )
 22 | 
 23 | 
 24 | def from_numpy(a, out):
 25 |     out.array[:] = a.flatten()
 26 | 
 27 | 
 28 | def fill(out, val):
 29 |     out.array.fill(val)
 30 | 
 31 | 
 32 | def compact(a, out, shape, strides, offset):
 33 |     out.array[:] = to_numpy(a, shape, strides, offset).flatten()
 34 | 
 35 | 
 36 | def ewise_setitem(a, out, shape, strides, offset):
 37 |     to_numpy(out, shape, strides, offset)[:] = a.array.reshape(shape)
 38 | 
 39 | 
 40 | def scalar_setitem(size, val, out, shape, strides, offset):
 41 |     to_numpy(out, shape, strides, offset)[:] = val
 42 | 
 43 | 
 44 | def ewise_add(a, b, out):
 45 |     out.array[:] = a.array + b.array
 46 | 
 47 | 
 48 | def scalar_add(a, val, out):
 49 |     out.array[:] = a.array + val
 50 | 
 51 | 
 52 | def ewise_mul(a, b, out):
 53 |     out.array[:] = a.array * b.array
 54 | 
 55 | 
 56 | def scalar_mul(a, val, out):
 57 |     out.array[:] = a.array * val
 58 | 
 59 | 
 60 | def ewise_div(a, b, out):
 61 |     out.array[:] = a.array / b.array
 62 | 
 63 | 
 64 | def scalar_div(a, val, out):
 65 |     out.array[:] = a.array / val
 66 | 
 67 | 
 68 | def scalar_power(a, val, out):
 69 |     out.array[:] = a.array**val
 70 | 
 71 | 
 72 | def ewise_maximum(a, b, out):
 73 |     out.array[:] = np.maximum(a.array, b.array)
 74 | 
 75 | 
 76 | def scalar_maximum(a, val, out):
 77 |     out.array[:] = np.maximum(a.array, val)
 78 | 
 79 | 
 80 | def ewise_eq(a, b, out):
 81 |     out.array[:] = (a.array == b.array).astype(np.float32)
 82 | 
 83 | 
 84 | def scalar_eq(a, val, out):
 85 |     out.array[:] = (a.array == val).astype(np.float32)
 86 | 
 87 | 
 88 | def ewise_ge(a, b, out):
 89 |     out.array[:] = (a.array >= b.array).astype(np.float32)
 90 | 
 91 | 
 92 | def scalar_ge(a, val, out):
 93 |     out.array[:] = (a.array >= val).astype(np.float32)
 94 | 
 95 | 
 96 | def ewise_log(a, out):
 97 |     out.array[:] = np.log(a.array)
 98 | 
 99 | 
100 | def ewise_exp(a, out):
101 |     out.array[:] = np.exp(a.array)
102 | 
103 | 
104 | def ewise_tanh(a, out):
105 |     out.array[:] = np.tanh(a.array)
106 | 
107 | 
108 | def matmul(a, b, out, m, n, p):
109 |     out.array[:] = (a.array.reshape(m, n) @ b.array.reshape(n, p)).reshape(-1)
110 | 
111 | 
112 | def reduce_max(a, out, reduce_size):
113 |     out.array[:] = a.array[:].reshape(-1, reduce_size).max(axis=1)
114 | 
115 | 
116 | def reduce_sum(a, out, reduce_size):
117 |     out.array[:] = a.array[:].reshape(-1, reduce_size).sum(axis=1)
118 | 


--------------------------------------------------------------------------------
/python/needle/backend_numpy.py:
--------------------------------------------------------------------------------
 1 | """This file defies specific implementations of devices when using numpy as NDArray backend.
 2 | """
 3 | import numpy
 4 | 
 5 | 
 6 | class Device:
 7 |     """Baseclass of all device"""
 8 | 
 9 | 
10 | class CPUDevice(Device):
11 |     """Represents data that sits in CPU"""
12 | 
13 |     def __repr__(self):
14 |         return "needle.cpu()"
15 | 
16 |     def __hash__(self):
17 |         return self.__repr__().__hash__()
18 | 
19 |     def __eq__(self, other):
20 |         return isinstance(other, CPUDevice)
21 | 
22 |     def enabled(self):
23 |         return True
24 | 
25 |     def zeros(self, *shape, dtype="float32"):
26 |         return numpy.zeros(shape, dtype=dtype)
27 | 
28 |     def ones(self, *shape, dtype="float32"):
29 |         return numpy.ones(shape, dtype=dtype)
30 | 
31 |     def randn(self, *shape):
32 |         # note: numpy doesn't support types within standard random routines, and
33 |         # .astype("float32") does work if we're generating a singleton
34 |         return numpy.random.randn(*shape)
35 | 
36 |     def rand(self, *shape):
37 |         # note: numpy doesn't support types within standard random routines, and
38 |         # .astype("float32") does work if we're generating a singleton
39 |         return numpy.random.rand(*shape)
40 | 
41 |     def one_hot(self, n, i, dtype="float32"):
42 |         return numpy.eye(n, dtype=dtype)[i]
43 | 
44 |     def empty(self, shape, dtype="float32"):
45 |         return numpy.empty(shape, dtype=dtype)
46 | 
47 |     def full(self, shape, fill_value, dtype="float32"):
48 |         return numpy.full(shape, fill_value, dtype=dtype)
49 | 
50 | 
51 | def cpu():
52 |     """Return cpu device"""
53 |     return CPUDevice()
54 | 
55 | 
56 | def default_device():
57 |     return cpu()
58 | 
59 | 
60 | def all_devices():
61 |     """return a list of all available devices"""
62 |     return [cpu()]
63 | 


--------------------------------------------------------------------------------
/python/needle/backend_selection.py:
--------------------------------------------------------------------------------
 1 | """Logic for backend selection"""
 2 | import os
 3 | 
 4 | 
 5 | BACKEND = os.environ.get("NEEDLE_BACKEND", "nd")
 6 | 
 7 | 
 8 | if BACKEND == "nd":
 9 |     print("Using needle backend")
10 |     from . import backend_ndarray as array_api
11 |     from .backend_ndarray import (
12 |         all_devices,
13 |         cuda,
14 |         cpu,
15 |         cpu_numpy,
16 |         default_device,
17 |         BackendDevice as Device,
18 |     )
19 | 
20 |     NDArray = array_api.NDArray
21 | elif BACKEND == "np":
22 |     print("Using numpy backend")
23 |     import numpy as array_api
24 |     from .backend_numpy import all_devices, cpu, default_device, Device
25 | 
26 |     NDArray = array_api.ndarray
27 | else:
28 |     raise RuntimeError("Unknown needle array backend %s" % BACKEND)
29 | 


--------------------------------------------------------------------------------
/python/needle/data/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_basic import *
2 | from .data_transforms import *
3 | from .datasets import *
4 | 


--------------------------------------------------------------------------------
/python/needle/data/data_basic.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from ..autograd import Tensor
 3 | 
 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any
 5 | 
 6 | 
 7 | 
 8 | class Dataset:
 9 |     r"""An abstract class representing a `Dataset`.
10 | 
11 |     All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
12 |     data sample for a given key. Subclasses must also overwrite
13 |     :meth:`__len__`, which is expected to return the size of the dataset.
14 |     """
15 | 
16 |     def __init__(self, transforms: Optional[List] = None):
17 |         self.transforms = transforms
18 | 
19 |     def __getitem__(self, index) -> object:
20 |         raise NotImplementedError
21 | 
22 |     def __len__(self) -> int:
23 |         raise NotImplementedError
24 |     
25 |     def apply_transforms(self, x):
26 |         if self.transforms is not None:
27 |             # apply the transforms
28 |             for tform in self.transforms:
29 |                 x = tform(x)
30 |         return x
31 | 
32 | 
33 | class DataLoader:
34 |     r"""
35 |     Data loader. Combines a dataset and a sampler, and provides an iterable over
36 |     the given dataset.
37 |     Args:
38 |         dataset (Dataset): dataset from which to load the data.
39 |         batch_size (int, optional): how many samples per batch to load
40 |             (default: ``1``).
41 |         shuffle (bool, optional): set to ``True`` to have the data reshuffled
42 |             at every epoch (default: ``False``).
43 |      """
44 |     dataset: Dataset
45 |     batch_size: Optional[int]
46 | 
47 |     def __init__(
48 |         self,
49 |         dataset: Dataset,
50 |         batch_size: Optional[int] = 1,
51 |         shuffle: bool = False,
52 |     ):
53 | 
54 |         self.dataset = dataset
55 |         self.shuffle = shuffle
56 |         self.batch_size = batch_size
57 |         if not self.shuffle:
58 |             self.ordering = np.array_split(np.arange(len(dataset)), 
59 |                                            range(batch_size, len(dataset), batch_size))
60 | 
61 |     def __iter__(self):
62 |         ### BEGIN YOUR SOLUTION
63 |         raise NotImplementedError()
64 |         ### END YOUR SOLUTION
65 |         return self
66 | 
67 |     def __next__(self):
68 |         ### BEGIN YOUR SOLUTION
69 |         raise NotImplementedError()
70 |         ### END YOUR SOLUTION
71 | 
72 | 


--------------------------------------------------------------------------------
/python/needle/data/data_transforms.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class Transform:
 4 |     def __call__(self, x):
 5 |         raise NotImplementedError
 6 | 
 7 | 
 8 | class RandomFlipHorizontal(Transform):
 9 |     def __init__(self, p = 0.5):
10 |         self.p = p
11 | 
12 |     def __call__(self, img):
13 |         """
14 |         Horizonally flip an image, specified as an H x W x C NDArray.
15 |         Args:
16 |             img: H x W x C NDArray of an image
17 |         Returns:
18 |             H x W x C ndarray corresponding to image flipped with probability self.p
19 |         Note: use the provided code to provide randomness, for easier testing
20 |         """
21 |         flip_img = np.random.rand() < self.p
22 |         ### BEGIN YOUR SOLUTION
23 |         raise NotImplementedError()
24 |         ### END YOUR SOLUTION
25 | 
26 | 
27 | class RandomCrop(Transform):
28 |     def __init__(self, padding=3):
29 |         self.padding = padding
30 | 
31 |     def __call__(self, img):
32 |         """ Zero pad and then randomly crop an image.
33 |         Args:
34 |              img: H x W x C NDArray of an image
35 |         Return 
36 |             H x W x C NAArray of cliped image
37 |         Note: generate the image shifted by shift_x, shift_y specified below
38 |         """
39 |         shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2)
40 |         ### BEGIN YOUR SOLUTION
41 |         raise NotImplementedError()
42 |         ### END YOUR SOLUTION
43 | 


--------------------------------------------------------------------------------
/python/needle/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .mnist_dataset import *
2 | from .ndarray_dataset import *
3 | 


--------------------------------------------------------------------------------
/python/needle/data/datasets/mnist_dataset.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | from ..data_basic import Dataset
 3 | import numpy as np
 4 | 
 5 | class MNISTDataset(Dataset):
 6 |     def __init__(
 7 |         self,
 8 |         image_filename: str,
 9 |         label_filename: str,
10 |         transforms: Optional[List] = None,
11 |     ):
12 |         ### BEGIN YOUR SOLUTION
13 |         raise NotImplementedError()
14 |         ### END YOUR SOLUTION
15 | 
16 |     def __getitem__(self, index) -> object:
17 |         ### BEGIN YOUR SOLUTION
18 |         raise NotImplementedError()
19 |         ### END YOUR SOLUTION
20 | 
21 |     def __len__(self) -> int:
22 |         ### BEGIN YOUR SOLUTION
23 |         raise NotImplementedError()
24 |         ### END YOUR SOLUTION


--------------------------------------------------------------------------------
/python/needle/data/datasets/ndarray_dataset.py:
--------------------------------------------------------------------------------
 1 | from ..data_basic import Dataset
 2 | 
 3 | class NDArrayDataset(Dataset):
 4 |     def __init__(self, *arrays):
 5 |         self.arrays = arrays
 6 | 
 7 |     def __len__(self) -> int:
 8 |         return self.arrays[0].shape[0]
 9 | 
10 |     def __getitem__(self, i) -> object:
11 |         return tuple([a[i] for a in self.arrays])


--------------------------------------------------------------------------------
/python/needle/init/__init__.py:
--------------------------------------------------------------------------------
1 | from .init_basic import *
2 | 
3 | from .init_initializers import *
4 | 


--------------------------------------------------------------------------------
/python/needle/init/init_basic.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import needle as ndl
 3 | 
 4 | 
 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False):
 6 |     """Generate random numbers uniform between low and high"""
 7 |     device = ndl.cpu() if device is None else device
 8 |     array = device.rand(*shape) * (high - low) + low
 9 |     return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
10 | 
11 | 
12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False):
13 |     """Generate random normal with specified mean and std deviation"""
14 |     device = ndl.cpu() if device is None else device
15 |     array = device.randn(*shape) * std + mean
16 |     return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
17 | 
18 | 
19 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False):
20 |     """Generate constant Tensor"""
21 |     device = ndl.cpu() if device is None else device
22 |     array = device.ones(*shape, dtype=dtype) * c  # note: can change dtype
23 |     return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
24 | 
25 | 
26 | def ones(*shape, device=None, dtype="float32", requires_grad=False):
27 |     """Generate all-ones Tensor"""
28 |     return constant(
29 |         *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad
30 |     )
31 | 
32 | 
33 | def zeros(*shape, device=None, dtype="float32", requires_grad=False):
34 |     """Generate all-zeros Tensor"""
35 |     return constant(
36 |         *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad
37 |     )
38 | 
39 | 
40 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False):
41 |     """Generate binary random Tensor"""
42 |     device = ndl.cpu() if device is None else device
43 |     array = device.rand(*shape) <= p
44 |     return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad)
45 | 
46 | 
47 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False):
48 |     """Generate one-hot encoding Tensor"""
49 |     device = ndl.cpu() if device is None else device
50 |     return ndl.Tensor(
51 |         device.one_hot(n, i.numpy(), dtype=dtype),
52 |         device=device,
53 |         requires_grad=requires_grad,
54 |     )
55 | 
56 | 
57 | def zeros_like(array, *, device=None, requires_grad=False):
58 |     device = device if device else array.device
59 |     return zeros(
60 |         *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad
61 |     )
62 | 
63 | 
64 | def ones_like(array, *, device=None, requires_grad=False):
65 |     device = device if device else array.device
66 |     return ones(
67 |         *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad
68 |     )
69 | 


--------------------------------------------------------------------------------
/python/needle/init/init_initializers.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from .init_basic import *
 3 | 
 4 | 
 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs):
 6 |     ### BEGIN YOUR SOLUTION
 7 |     raise NotImplementedError()
 8 |     ### END YOUR SOLUTION
 9 | 
10 | 
11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs):
12 |     ### BEGIN YOUR SOLUTION
13 |     raise NotImplementedError()
14 |     ### END YOUR SOLUTION
15 | 
16 | 
17 | def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs):
18 |     assert nonlinearity == "relu", "Only relu supported currently"
19 |     ### BEGIN YOUR SOLUTION
20 |     raise NotImplementedError()
21 |     ### END YOUR SOLUTION
22 | 
23 | 
24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs):
25 |     assert nonlinearity == "relu", "Only relu supported currently"
26 |     ### BEGIN YOUR SOLUTION
27 |     raise NotImplementedError()
28 |     ### END YOUR SOLUTION
29 | 


--------------------------------------------------------------------------------
/python/needle/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .nn_basic import *
2 | 


--------------------------------------------------------------------------------
/python/needle/nn/nn_basic.py:
--------------------------------------------------------------------------------
  1 | """The module.
  2 | """
  3 | from typing import List, Callable, Any
  4 | from needle.autograd import Tensor
  5 | from needle import ops
  6 | import needle.init as init
  7 | import numpy as np
  8 | 
  9 | 
 10 | class Parameter(Tensor):
 11 |     """A special kind of tensor that represents parameters."""
 12 | 
 13 | 
 14 | def _unpack_params(value: object) -> List[Tensor]:
 15 |     if isinstance(value, Parameter):
 16 |         return [value]
 17 |     elif isinstance(value, Module):
 18 |         return value.parameters()
 19 |     elif isinstance(value, dict):
 20 |         params = []
 21 |         for k, v in value.items():
 22 |             params += _unpack_params(v)
 23 |         return params
 24 |     elif isinstance(value, (list, tuple)):
 25 |         params = []
 26 |         for v in value:
 27 |             params += _unpack_params(v)
 28 |         return params
 29 |     else:
 30 |         return []
 31 | 
 32 | 
 33 | def _child_modules(value: object) -> List["Module"]:
 34 |     if isinstance(value, Module):
 35 |         modules = [value]
 36 |         modules.extend(_child_modules(value.__dict__))
 37 |         return modules
 38 |     if isinstance(value, dict):
 39 |         modules = []
 40 |         for k, v in value.items():
 41 |             modules += _child_modules(v)
 42 |         return modules
 43 |     elif isinstance(value, (list, tuple)):
 44 |         modules = []
 45 |         for v in value:
 46 |             modules += _child_modules(v)
 47 |         return modules
 48 |     else:
 49 |         return []
 50 | 
 51 | 
 52 | class Module:
 53 |     def __init__(self):
 54 |         self.training = True
 55 | 
 56 |     def parameters(self) -> List[Tensor]:
 57 |         """Return the list of parameters in the module."""
 58 |         return _unpack_params(self.__dict__)
 59 | 
 60 |     def _children(self) -> List["Module"]:
 61 |         return _child_modules(self.__dict__)
 62 | 
 63 |     def eval(self):
 64 |         self.training = False
 65 |         for m in self._children():
 66 |             m.training = False
 67 | 
 68 |     def train(self):
 69 |         self.training = True
 70 |         for m in self._children():
 71 |             m.training = True
 72 | 
 73 |     def __call__(self, *args, **kwargs):
 74 |         return self.forward(*args, **kwargs)
 75 | 
 76 | 
 77 | class Identity(Module):
 78 |     def forward(self, x):
 79 |         return x
 80 | 
 81 | 
 82 | class Linear(Module):
 83 |     def __init__(
 84 |         self, in_features, out_features, bias=True, device=None, dtype="float32"
 85 |     ):
 86 |         super().__init__()
 87 |         self.in_features = in_features
 88 |         self.out_features = out_features
 89 | 
 90 |         ### BEGIN YOUR SOLUTION
 91 |         raise NotImplementedError()
 92 |         ### END YOUR SOLUTION
 93 | 
 94 |     def forward(self, X: Tensor) -> Tensor:
 95 |         ### BEGIN YOUR SOLUTION
 96 |         raise NotImplementedError()
 97 |         ### END YOUR SOLUTION
 98 | 
 99 | 
100 | class Flatten(Module):
101 |     def forward(self, X):
102 |         ### BEGIN YOUR SOLUTION
103 |         raise NotImplementedError()
104 |         ### END YOUR SOLUTION
105 | 
106 | 
107 | class ReLU(Module):
108 |     def forward(self, x: Tensor) -> Tensor:
109 |         ### BEGIN YOUR SOLUTION
110 |         raise NotImplementedError()
111 |         ### END YOUR SOLUTION
112 | 
113 | 
114 | class Sequential(Module):
115 |     def __init__(self, *modules):
116 |         super().__init__()
117 |         self.modules = modules
118 | 
119 |     def forward(self, x: Tensor) -> Tensor:
120 |         ### BEGIN YOUR SOLUTION
121 |         raise NotImplementedError()
122 |         ### END YOUR SOLUTION
123 | 
124 | 
125 | class SoftmaxLoss(Module):
126 |     def forward(self, logits: Tensor, y: Tensor):
127 |         ### BEGIN YOUR SOLUTION
128 |         raise NotImplementedError()
129 |         ### END YOUR SOLUTION
130 | 
131 | 
132 | class BatchNorm1d(Module):
133 |     def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"):
134 |         super().__init__()
135 |         self.dim = dim
136 |         self.eps = eps
137 |         self.momentum = momentum
138 |         ### BEGIN YOUR SOLUTION
139 |         raise NotImplementedError()
140 |         ### END YOUR SOLUTION
141 | 
142 |     def forward(self, x: Tensor) -> Tensor:
143 |         ### BEGIN YOUR SOLUTION
144 |         raise NotImplementedError()
145 |         ### END YOUR SOLUTION
146 | 
147 | 
148 | class LayerNorm1d(Module):
149 |     def __init__(self, dim, eps=1e-5, device=None, dtype="float32"):
150 |         super().__init__()
151 |         self.dim = dim
152 |         self.eps = eps
153 |         ### BEGIN YOUR SOLUTION
154 |         raise NotImplementedError()
155 |         ### END YOUR SOLUTION
156 | 
157 |     def forward(self, x: Tensor) -> Tensor:
158 |         ### BEGIN YOUR SOLUTION
159 |         raise NotImplementedError()
160 |         ### END YOUR SOLUTION
161 | 
162 | 
163 | class Dropout(Module):
164 |     def __init__(self, p=0.5):
165 |         super().__init__()
166 |         self.p = p
167 | 
168 |     def forward(self, x: Tensor) -> Tensor:
169 |         ### BEGIN YOUR SOLUTION
170 |         raise NotImplementedError()
171 |         ### END YOUR SOLUTION
172 | 
173 | 
174 | class Residual(Module):
175 |     def __init__(self, fn: Module):
176 |         super().__init__()
177 |         self.fn = fn
178 | 
179 |     def forward(self, x: Tensor) -> Tensor:
180 |         ### BEGIN YOUR SOLUTION
181 |         raise NotImplementedError()
182 |         ### END YOUR SOLUTION
183 | 


--------------------------------------------------------------------------------
/python/needle/ops/__init__.py:
--------------------------------------------------------------------------------
1 | from .ops_mathematic import *
2 | 
3 | from .ops_logarithmic import *
4 | from .ops_tuple import *
5 | 


--------------------------------------------------------------------------------
/python/needle/ops/ops_logarithmic.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | from ..autograd import NDArray
 3 | from ..autograd import Op, Tensor, Value, TensorOp
 4 | from ..autograd import TensorTuple, TensorTupleOp
 5 | 
 6 | from .ops_mathematic import *
 7 | 
 8 | import numpy as array_api
 9 | 
10 | class LogSoftmax(TensorOp):
11 |     def compute(self, Z):
12 |         ### BEGIN YOUR SOLUTION
13 |         raise NotImplementedError()
14 |         ### END YOUR SOLUTION
15 | 
16 |     def gradient(self, out_grad, node):
17 |         ### BEGIN YOUR SOLUTION
18 |         raise NotImplementedError()
19 |         ### END YOUR SOLUTION
20 | 
21 | 
22 | def logsoftmax(a):
23 |     return LogSoftmax()(a)
24 | 
25 | 
26 | class LogSumExp(TensorOp):
27 |     def __init__(self, axes: Optional[tuple] = None):
28 |         self.axes = axes
29 | 
30 |     def compute(self, Z):
31 |         ### BEGIN YOUR SOLUTION
32 |         raise NotImplementedError()
33 |         ### END YOUR SOLUTION
34 | 
35 |     def gradient(self, out_grad, node):
36 |         ### BEGIN YOUR SOLUTION
37 |         raise NotImplementedError()
38 |         ### END YOUR SOLUTION
39 | 
40 | 
41 | def logsumexp(a, axes=None):
42 |     return LogSumExp(axes=axes)(a)
43 | 
44 | 


--------------------------------------------------------------------------------
/python/needle/ops/ops_mathematic.py:
--------------------------------------------------------------------------------
  1 | """Operator implementations."""
  2 | 
  3 | from numbers import Number
  4 | from typing import Optional, List, Tuple, Union
  5 | 
  6 | from ..autograd import NDArray
  7 | from ..autograd import Op, Tensor, Value, TensorOp
  8 | from ..autograd import TensorTuple, TensorTupleOp
  9 | import numpy
 10 | 
 11 | # NOTE: we will import numpy as the array_api
 12 | # as the backend for our computations, this line will change in later homeworks
 13 | 
 14 | import numpy as array_api
 15 | 
 16 | 
 17 | class EWiseAdd(TensorOp):
 18 |     def compute(self, a: NDArray, b: NDArray):
 19 |         return a + b
 20 | 
 21 |     def gradient(self, out_grad: Tensor, node: Tensor):
 22 |         return out_grad, out_grad
 23 | 
 24 | 
 25 | def add(a, b):
 26 |     return EWiseAdd()(a, b)
 27 | 
 28 | 
 29 | class AddScalar(TensorOp):
 30 |     def __init__(self, scalar):
 31 |         self.scalar = scalar
 32 | 
 33 |     def compute(self, a: NDArray):
 34 |         return a + self.scalar
 35 | 
 36 |     def gradient(self, out_grad: Tensor, node: Tensor):
 37 |         return out_grad
 38 | 
 39 | 
 40 | def add_scalar(a, scalar):
 41 |     return AddScalar(scalar)(a)
 42 | 
 43 | 
 44 | class EWiseMul(TensorOp):
 45 |     def compute(self, a: NDArray, b: NDArray):
 46 |         return a * b
 47 | 
 48 |     def gradient(self, out_grad: Tensor, node: Tensor):
 49 |         lhs, rhs = node.inputs
 50 |         return out_grad * rhs, out_grad * lhs
 51 | 
 52 | 
 53 | def multiply(a, b):
 54 |     return EWiseMul()(a, b)
 55 | 
 56 | 
 57 | class MulScalar(TensorOp):
 58 |     def __init__(self, scalar):
 59 |         self.scalar = scalar
 60 | 
 61 |     def compute(self, a: NDArray):
 62 |         return a * self.scalar
 63 | 
 64 |     def gradient(self, out_grad: Tensor, node: Tensor):
 65 |         return (out_grad * self.scalar,)
 66 | 
 67 | 
 68 | def mul_scalar(a, scalar):
 69 |     return MulScalar(scalar)(a)
 70 | 
 71 | 
 72 | class EWisePow(TensorOp):
 73 |     """Op to element-wise raise a tensor to a power."""
 74 | 
 75 |     def compute(self, a: NDArray, b: NDArray) -> NDArray:
 76 |         return a**b
 77 | 
 78 |     def gradient(self, out_grad, node):
 79 |         if not isinstance(node.inputs[0], NDArray) or not isinstance(
 80 |             node.inputs[1], NDArray
 81 |         ):
 82 |             raise ValueError("Both inputs must be tensors (NDArray).")
 83 | 
 84 |         a, b = node.inputs[0], node.inputs[1]
 85 |         grad_a = out_grad * b * (a ** (b - 1))
 86 |         grad_b = out_grad * (a**b) * log(a)
 87 |         return grad_a, grad_b
 88 | 
 89 | def power(a, b):
 90 |     return EWisePow()(a, b)
 91 | 
 92 | 
 93 | class PowerScalar(TensorOp):
 94 |     """Op raise a tensor to an (integer) power."""
 95 | 
 96 |     def __init__(self, scalar: int):
 97 |         self.scalar = scalar
 98 | 
 99 |     def compute(self, a: NDArray) -> NDArray:
100 |         ### BEGIN YOUR SOLUTION
101 |         raise NotImplementedError()
102 |         ### END YOUR SOLUTION
103 | 
104 |     def gradient(self, out_grad, node):
105 |         ### BEGIN YOUR SOLUTION
106 |         raise NotImplementedError()
107 |         ### END YOUR SOLUTION
108 | 
109 | 
110 | def power_scalar(a, scalar):
111 |     return PowerScalar(scalar)(a)
112 | 
113 | 
114 | class EWiseDiv(TensorOp):
115 |     """Op to element-wise divide two nodes."""
116 | 
117 |     def compute(self, a, b):
118 |         ### BEGIN YOUR SOLUTION
119 |         raise NotImplementedError()
120 |         ### END YOUR SOLUTION
121 | 
122 |     def gradient(self, out_grad, node):
123 |         ### BEGIN YOUR SOLUTION
124 |         raise NotImplementedError()
125 |         ### END YOUR SOLUTION
126 | 
127 | 
128 | def divide(a, b):
129 |     return EWiseDiv()(a, b)
130 | 
131 | 
132 | class DivScalar(TensorOp):
133 |     def __init__(self, scalar):
134 |         self.scalar = scalar
135 | 
136 |     def compute(self, a):
137 |         ### BEGIN YOUR SOLUTION
138 |         raise NotImplementedError()
139 |         ### END YOUR SOLUTION
140 | 
141 |     def gradient(self, out_grad, node):
142 |         ### BEGIN YOUR SOLUTION
143 |         raise NotImplementedError()
144 |         ### END YOUR SOLUTION
145 | 
146 | 
147 | def divide_scalar(a, scalar):
148 |     return DivScalar(scalar)(a)
149 | 
150 | 
151 | class Transpose(TensorOp):
152 |     def __init__(self, axes: Optional[tuple] = None):
153 |         self.axes = axes
154 | 
155 |     def compute(self, a):
156 |         ### BEGIN YOUR SOLUTION
157 |         raise NotImplementedError()
158 |         ### END YOUR SOLUTION
159 | 
160 |     def gradient(self, out_grad, node):
161 |         ### BEGIN YOUR SOLUTION
162 |         raise NotImplementedError()
163 |         ### END YOUR SOLUTION
164 | 
165 | 
166 | def transpose(a, axes=None):
167 |     return Transpose(axes)(a)
168 | 
169 | 
170 | class Reshape(TensorOp):
171 |     def __init__(self, shape):
172 |         self.shape = shape
173 | 
174 |     def compute(self, a):
175 |         ### BEGIN YOUR SOLUTION
176 |         raise NotImplementedError()
177 |         ### END YOUR SOLUTION
178 | 
179 |     def gradient(self, out_grad, node):
180 |         ### BEGIN YOUR SOLUTION
181 |         raise NotImplementedError()
182 |         ### END YOUR SOLUTION
183 | 
184 | 
185 | def reshape(a, shape):
186 |     return Reshape(shape)(a)
187 | 
188 | 
189 | class BroadcastTo(TensorOp):
190 |     def __init__(self, shape):
191 |         self.shape = shape
192 | 
193 |     def compute(self, a):
194 |         ### BEGIN YOUR SOLUTION
195 |         raise NotImplementedError()
196 |         ### END YOUR SOLUTION
197 | 
198 |     def gradient(self, out_grad, node):
199 |         ### BEGIN YOUR SOLUTION
200 |         raise NotImplementedError()
201 |         ### END YOUR SOLUTION
202 | 
203 | 
204 | def broadcast_to(a, shape):
205 |     return BroadcastTo(shape)(a)
206 | 
207 | 
208 | class Summation(TensorOp):
209 |     def __init__(self, axes: Optional[tuple] = None):
210 |         self.axes = axes
211 | 
212 |     def compute(self, a):
213 |         ### BEGIN YOUR SOLUTION
214 |         raise NotImplementedError()
215 |         ### END YOUR SOLUTION
216 | 
217 |     def gradient(self, out_grad, node):
218 |         ### BEGIN YOUR SOLUTION
219 |         raise NotImplementedError()
220 |         ### END YOUR SOLUTION
221 | 
222 | 
223 | def summation(a, axes=None):
224 |     return Summation(axes)(a)
225 | 
226 | 
227 | class MatMul(TensorOp):
228 |     def compute(self, a, b):
229 |         ### BEGIN YOUR SOLUTION
230 |         raise NotImplementedError()
231 |         ### END YOUR SOLUTION
232 | 
233 |     def gradient(self, out_grad, node):
234 |         ### BEGIN YOUR SOLUTION
235 |         raise NotImplementedError()
236 |         ### END YOUR SOLUTION
237 | 
238 | 
239 | def matmul(a, b):
240 |     return MatMul()(a, b)
241 | 
242 | 
243 | class Negate(TensorOp):
244 |     def compute(self, a):
245 |         ### BEGIN YOUR SOLUTION
246 |         raise NotImplementedError()
247 |         ### END YOUR SOLUTION
248 | 
249 |     def gradient(self, out_grad, node):
250 |         ### BEGIN YOUR SOLUTION
251 |         raise NotImplementedError()
252 |         ### END YOUR SOLUTION
253 | 
254 | 
255 | def negate(a):
256 |     return Negate()(a)
257 | 
258 | 
259 | class Log(TensorOp):
260 |     def compute(self, a):
261 |         ### BEGIN YOUR SOLUTION
262 |         raise NotImplementedError()
263 |         ### END YOUR SOLUTION
264 | 
265 |     def gradient(self, out_grad, node):
266 |         ### BEGIN YOUR SOLUTION
267 |         raise NotImplementedError()
268 |         ### END YOUR SOLUTION
269 | 
270 | 
271 | def log(a):
272 |     return Log()(a)
273 | 
274 | 
275 | class Exp(TensorOp):
276 |     def compute(self, a):
277 |         ### BEGIN YOUR SOLUTION
278 |         raise NotImplementedError()
279 |         ### END YOUR SOLUTION
280 | 
281 |     def gradient(self, out_grad, node):
282 |         ### BEGIN YOUR SOLUTION
283 |         raise NotImplementedError()
284 |         ### END YOUR SOLUTION
285 | 
286 | 
287 | def exp(a):
288 |     return Exp()(a)
289 | 
290 | 
291 | class ReLU(TensorOp):
292 |     def compute(self, a):
293 |         ### BEGIN YOUR SOLUTION
294 |         raise NotImplementedError()
295 |         ### END YOUR SOLUTION
296 | 
297 |     def gradient(self, out_grad, node):
298 |         ### BEGIN YOUR SOLUTION
299 |         raise NotImplementedError()
300 |         ### END YOUR SOLUTION
301 | 
302 | 
303 | def relu(a):
304 |     return ReLU()(a)
305 | 


--------------------------------------------------------------------------------
/python/needle/ops/ops_tuple.py:
--------------------------------------------------------------------------------
 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp
 2 | 
 3 | 
 4 | class MakeTensorTuple(TensorTupleOp):
 5 |     def compute(self, *args) -> tuple:
 6 |         return tuple(args)
 7 | 
 8 |     def gradient(self, out_grad, node):
 9 |         assert isinstance(out_grad, TensorTuple)
10 |         return tuple(*[out_grad[i] for i in range(len(out_grad))])
11 | 
12 | 
13 | def make_tuple(*args):
14 |     return MakeTensorTuple()(*args)
15 | 
16 | 
17 | class TupleGetItem(TensorOp):
18 |     def __init__(self, index):
19 |         self.index = index
20 | 
21 |     def __call__(self, a: TensorTuple, fold_const=True) -> Value:
22 |         assert isinstance(a, TensorTuple)
23 |         # constant folding
24 |         if fold_const and isinstance(a.op, MakeTensorTuple):
25 |             return a.inputs[self.index]
26 |         return Tensor.make_from_op(self, [a])
27 | 
28 |     def compute(self, a):
29 |         return a[self.index]
30 | 
31 |     def gradient(self, out_grad, node):
32 |         index = self.index
33 |         in_grad = []
34 |         for i, value in enumerate(node.inputs[0]):
35 |             if i != index:
36 |                 in_grad.append(init.zeros_like(value))
37 |             else:
38 |                 in_grad.append(out_grad)
39 |         return MakeTensorTuple()(*in_grad)
40 | 
41 | 
42 | def tuple_get_item(value, index):
43 |     return TupleGetItem(index)(value)
44 | 
45 | 
46 | class FusedAddScalars(TensorTupleOp):
47 |     def __init__(self, c0: float, c1: float):
48 |         self.c0 = c0
49 |         self.c1 = c1
50 | 
51 |     def compute(self, a):
52 |         return a + self.c0, a + self.c1
53 | 
54 |     def gradient(self, out_grad, node):
55 |         return out_grad[0] + out_grad[1]
56 | 
57 | 
58 | def fused_add_scalars(x, c0, c1):
59 |     return FusedAddScalars(c0, c1)(x)
60 | 


--------------------------------------------------------------------------------
/python/needle/optim.py:
--------------------------------------------------------------------------------
 1 | """Optimization module"""
 2 | import needle as ndl
 3 | import numpy as np
 4 | 
 5 | 
 6 | class Optimizer:
 7 |     def __init__(self, params):
 8 |         self.params = params
 9 | 
10 |     def step(self):
11 |         raise NotImplementedError()
12 | 
13 |     def reset_grad(self):
14 |         for p in self.params:
15 |             p.grad = None
16 | 
17 | 
18 | class SGD(Optimizer):
19 |     def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0):
20 |         super().__init__(params)
21 |         self.lr = lr
22 |         self.momentum = momentum
23 |         self.u = {}
24 |         self.weight_decay = weight_decay
25 | 
26 |     def step(self):
27 |         ### BEGIN YOUR SOLUTION
28 |         raise NotImplementedError()
29 |         ### END YOUR SOLUTION
30 | 
31 |     def clip_grad_norm(self, max_norm=0.25):
32 |         """
33 |         Clips gradient norm of parameters.
34 |         """
35 |         ### BEGIN YOUR SOLUTION
36 |         raise NotImplementedError()
37 |         ### END YOUR SOLUTION
38 | 
39 | 
40 | class Adam(Optimizer):
41 |     def __init__(
42 |         self,
43 |         params,
44 |         lr=0.01,
45 |         beta1=0.9,
46 |         beta2=0.999,
47 |         eps=1e-8,
48 |         weight_decay=0.0,
49 |     ):
50 |         super().__init__(params)
51 |         self.lr = lr
52 |         self.beta1 = beta1
53 |         self.beta2 = beta2
54 |         self.eps = eps
55 |         self.weight_decay = weight_decay
56 |         self.t = 0
57 | 
58 |         self.m = {}
59 |         self.v = {}
60 | 
61 |     def step(self):
62 |         ### BEGIN YOUR SOLUTION
63 |         raise NotImplementedError()
64 |         ### END YOUR SOLUTION
65 | 


--------------------------------------------------------------------------------
/src/ndarray_backend_cpu.cc:
--------------------------------------------------------------------------------
  1 | #include <pybind11/numpy.h>
  2 | #include <pybind11/pybind11.h>
  3 | #include <pybind11/stl.h>
  4 | 
  5 | #include <cmath>
  6 | #include <iostream>
  7 | #include <stdexcept>
  8 | 
  9 | namespace needle {
 10 | namespace cpu {
 11 | 
 12 | #define ALIGNMENT 256
 13 | #define TILE 8
 14 | typedef float scalar_t;
 15 | const size_t ELEM_SIZE = sizeof(scalar_t);
 16 | 
 17 | 
 18 | /**
 19 |  * This is a utility structure for maintaining an array aligned to ALIGNMENT boundaries in
 20 |  * memory.  This alignment should be at least TILE * ELEM_SIZE, though we make it even larger
 21 |  * here by default.
 22 |  */
 23 | struct AlignedArray {
 24 |   AlignedArray(const size_t size) {
 25 |     int ret = posix_memalign((void**)&ptr, ALIGNMENT, size * ELEM_SIZE);
 26 |     if (ret != 0) throw std::bad_alloc();
 27 |     this->size = size;
 28 |   }
 29 |   ~AlignedArray() { free(ptr); }
 30 |   size_t ptr_as_int() {return (size_t)ptr; }
 31 |   scalar_t* ptr;
 32 |   size_t size;
 33 | };
 34 | 
 35 | 
 36 | 
 37 | void Fill(AlignedArray* out, scalar_t val) {
 38 |   /**
 39 |    * Fill the values of an aligned array with val
 40 |    */
 41 |   for (int i = 0; i < out->size; i++) {
 42 |     out->ptr[i] = val;
 43 |   }
 44 | }
 45 | 
 46 | 
 47 | void Compact(const AlignedArray& a, AlignedArray* out, std::vector<int32_t> shape,
 48 |              std::vector<int32_t> strides, size_t offset) {
 49 |   /**
 50 |    * Compact an array in memory
 51 |    *
 52 |    * Args:
 53 |    *   a: non-compact representation of the array, given as input
 54 |    *   out: compact version of the array to be written
 55 |    *   shape: shapes of each dimension for a and out
 56 |    *   strides: strides of the *a* array (not out, which has compact strides)
 57 |    *   offset: offset of the *a* array (not out, which has zero offset, being compact)
 58 |    *
 59 |    * Returns:
 60 |    *  void (you need to modify out directly, rather than returning anything; this is true for all the
 61 |    *  function will implement here, so we won't repeat this note.)
 62 |    */
 63 |   /// BEGIN SOLUTION
 64 |   /// END SOLUTION
 65 | }
 66 | 
 67 | void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector<int32_t> shape,
 68 |                   std::vector<int32_t> strides, size_t offset) {
 69 |   /**
 70 |    * Set items in a (non-compact) array
 71 |    *
 72 |    * Args:
 73 |    *   a: _compact_ array whose items will be written to out
 74 |    *   out: non-compact array whose items are to be written
 75 |    *   shape: shapes of each dimension for a and out
 76 |    *   strides: strides of the *out* array (not a, which has compact strides)
 77 |    *   offset: offset of the *out* array (not a, which has zero offset, being compact)
 78 |    */
 79 |   /// BEGIN SOLUTION
 80 |   /// END SOLUTION
 81 | }
 82 | 
 83 | void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector<int32_t> shape,
 84 |                    std::vector<int32_t> strides, size_t offset) {
 85 |   /**
 86 |    * Set items is a (non-compact) array
 87 |    *
 88 |    * Args:
 89 |    *   size: number of elements to write in out array (note that this will note be the same as
 90 |    *         out.size, because out is a non-compact subset array);  it _will_ be the same as the
 91 |    *         product of items in shape, but convenient to just pass it here.
 92 |    *   val: scalar value to write to
 93 |    *   out: non-compact array whose items are to be written
 94 |    *   shape: shapes of each dimension of out
 95 |    *   strides: strides of the out array
 96 |    *   offset: offset of the out array
 97 |    */
 98 | 
 99 |   /// BEGIN SOLUTION
100 |   /// END SOLUTION
101 | }
102 | 
103 | void EwiseAdd(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) {
104 |   /**
105 |    * Set entries in out to be the sum of correspondings entires in a and b.
106 |    */
107 |   for (size_t i = 0; i < a.size; i++) {
108 |     out->ptr[i] = a.ptr[i] + b.ptr[i];
109 |   }
110 | }
111 | 
112 | void ScalarAdd(const AlignedArray& a, scalar_t val, AlignedArray* out) {
113 |   /**
114 |    * Set entries in out to be the sum of corresponding entry in a plus the scalar val.
115 |    */
116 |   for (size_t i = 0; i < a.size; i++) {
117 |     out->ptr[i] = a.ptr[i] + val;
118 |   }
119 | }
120 | 
121 | 
122 | /**
123 |  * In the code the follows, use the above template to create analogous element-wise
124 |  * and and scalar operators for the following functions.  See the numpy backend for
125 |  * examples of how they should work.
126 |  *   - EwiseMul, ScalarMul
127 |  *   - EwiseDiv, ScalarDiv
128 |  *   - ScalarPower
129 |  *   - EwiseMaximum, ScalarMaximum
130 |  *   - EwiseEq, ScalarEq
131 |  *   - EwiseGe, ScalarGe
132 |  *   - EwiseLog
133 |  *   - EwiseExp
134 |  *   - EwiseTanh
135 |  *
136 |  * If you implement all these naively, there will be a lot of repeated code, so
137 |  * you are welcome (but not required), to use macros or templates to define these
138 |  * functions (however you want to do so, as long as the functions match the proper)
139 |  * signatures above.
140 |  */
141 | 
142 | /// BEGIN SOLUTION
143 | 
144 | /// END SOLUTION
145 | 
146 | void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n,
147 |             uint32_t p) {
148 |   /**
149 |    * Multiply two (compact) matrices into an output (also compact) matrix.  For this implementation
150 |    * you can use the "naive" three-loop algorithm.
151 |    *
152 |    * Args:
153 |    *   a: compact 2D array of size m x n
154 |    *   b: compact 2D array of size n x p
155 |    *   out: compact 2D array of size m x p to write the output to
156 |    *   m: rows of a / out
157 |    *   n: columns of a / rows of b
158 |    *   p: columns of b / out
159 |    */
160 | 
161 |   /// BEGIN SOLUTION
162 |   /// END SOLUTION
163 | }
164 | 
165 | inline void AlignedDot(const float* __restrict__ a,
166 |                        const float* __restrict__ b,
167 |                        float* __restrict__ out) {
168 | 
169 |   /**
170 |    * Multiply together two TILE x TILE matrices, and _add _the result to out (it is important to add
171 |    * the result to the existing out, which you should not set to zero beforehand).  We are including
172 |    * the compiler flags here that enable the compile to properly use vector operators to implement
173 |    * this function.  Specifically, the __restrict__ keyword indicates to the compile that a, b, and
174 |    * out don't have any overlapping memory (which is necessary in order for vector operations to be
175 |    * equivalent to their non-vectorized counterparts (imagine what could happen otherwise if a, b,
176 |    * and out had overlapping memory).  Similarly the __builtin_assume_aligned keyword tells the
177 |    * compiler that the input array will be aligned to the appropriate blocks in memory, which also
178 |    * helps the compiler vectorize the code.
179 |    *
180 |    * Args:
181 |    *   a: compact 2D array of size TILE x TILE
182 |    *   b: compact 2D array of size TILE x TILE
183 |    *   out: compact 2D array of size TILE x TILE to write to
184 |    */
185 | 
186 |   a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE);
187 |   b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE);
188 |   out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE);
189 | 
190 |   /// BEGIN SOLUTION
191 | 
192 |   /// END SOLUTION
193 | }
194 | 
195 | void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m,
196 |                  uint32_t n, uint32_t p) {
197 |   /**
198 |    * Matrix multiplication on tiled representations of array.  In this setting, a, b, and out
199 |    * are all *4D* compact arrays of the appropriate size, e.g. a is an array of size
200 |    *   a[m/TILE][n/TILE][TILE][TILE]
201 |    * You should do the multiplication tile-by-tile to improve performance of the array (i.e., this
202 |    * function should call `AlignedDot()` implemented above).
203 |    *
204 |    * Note that this function will only be called when m, n, p are all multiples of TILE, so you can
205 |    * assume that this division happens without any remainder.
206 |    *
207 |    * Args:
208 |    *   a: compact 4D array of size m/TILE x n/TILE x TILE x TILE
209 |    *   b: compact 4D array of size n/TILE x p/TILE x TILE x TILE
210 |    *   out: compact 4D array of size m/TILE x p/TILE x TILE x TILE to write to
211 |    *   m: rows of a / out
212 |    *   n: columns of a / rows of b
213 |    *   p: columns of b / out
214 |    *
215 |    */
216 |   /// BEGIN SOLUTION
217 | 
218 |   /// END SOLUTION
219 | }
220 | 
221 | void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
222 |   /**
223 |    * Reduce by taking maximum over `reduce_size` contiguous blocks.
224 |    *
225 |    * Args:
226 |    *   a: compact array of size a.size = out.size * reduce_size to reduce over
227 |    *   out: compact array to write into
228 |    *   reduce_size: size of the dimension to reduce over
229 |    */
230 | 
231 |   /// BEGIN SOLUTION
232 | 
233 |   /// END SOLUTION
234 | }
235 | 
236 | void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) {
237 |   /**
238 |    * Reduce by taking sum over `reduce_size` contiguous blocks.
239 |    *
240 |    * Args:
241 |    *   a: compact array of size a.size = out.size * reduce_size to reduce over
242 |    *   out: compact array to write into
243 |    *   reduce_size: size of the dimension to reduce over
244 |    */
245 | 
246 |   /// BEGIN SOLUTION
247 | 
248 |   /// END SOLUTION
249 | }
250 | 
251 | }  // namespace cpu
252 | }  // namespace needle
253 | 
254 | PYBIND11_MODULE(ndarray_backend_cpu, m) {
255 |   namespace py = pybind11;
256 |   using namespace needle;
257 |   using namespace cpu;
258 | 
259 |   m.attr("__device_name__") = "cpu";
260 |   m.attr("__tile_size__") = TILE;
261 | 
262 |   py::class_<AlignedArray>(m, "Array")
263 |       .def(py::init<size_t>(), py::return_value_policy::take_ownership)
264 |       .def("ptr", &AlignedArray::ptr_as_int)
265 |       .def_readonly("size", &AlignedArray::size);
266 | 
267 |   // return numpy array (with copying for simplicity, otherwise garbage
268 |   // collection is a pain)
269 |   m.def("to_numpy", [](const AlignedArray& a, std::vector<size_t> shape,
270 |                        std::vector<size_t> strides, size_t offset) {
271 |     std::vector<size_t> numpy_strides = strides;
272 |     std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(),
273 |                    [](size_t& c) { return c * ELEM_SIZE; });
274 |     return py::array_t<scalar_t>(shape, numpy_strides, a.ptr + offset);
275 |   });
276 | 
277 |   // convert from numpy (with copying)
278 |   m.def("from_numpy", [](py::array_t<scalar_t> a, AlignedArray* out) {
279 |     std::memcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE);
280 |   });
281 | 
282 |   m.def("fill", Fill);
283 |   m.def("compact", Compact);
284 |   m.def("ewise_setitem", EwiseSetitem);
285 |   m.def("scalar_setitem", ScalarSetitem);
286 |   m.def("ewise_add", EwiseAdd);
287 |   m.def("scalar_add", ScalarAdd);
288 |   /*
289 |   m.def("ewise_mul", EwiseMul);
290 |   m.def("scalar_mul", ScalarMul);
291 |   m.def("ewise_div", EwiseDiv);
292 |   m.def("scalar_div", ScalarDiv);
293 |   m.def("scalar_power", ScalarPower);
294 | 
295 |   m.def("ewise_maximum", EwiseMaximum);
296 |   m.def("scalar_maximum", ScalarMaximum);
297 |   m.def("ewise_eq", EwiseEq);
298 |   m.def("scalar_eq", ScalarEq);
299 |   m.def("ewise_ge", EwiseGe);
300 |   m.def("scalar_ge", ScalarGe);
301 | 
302 |   m.def("ewise_log", EwiseLog);
303 |   m.def("ewise_exp", EwiseExp);
304 |   m.def("ewise_tanh", EwiseTanh);
305 | 
306 |   m.def("matmul", Matmul);
307 |   m.def("matmul_tiled", MatmulTiled);
308 | 
309 |   m.def("reduce_max", ReduceMax);
310 |   m.def("reduce_sum", ReduceSum);
311 |   */
312 | }
313 | 


--------------------------------------------------------------------------------
/src/ndarray_backend_cuda.cu:
--------------------------------------------------------------------------------
  1 | #include <cuda_runtime.h>
  2 | #include <pybind11/numpy.h>
  3 | #include <pybind11/pybind11.h>
  4 | #include <pybind11/stl.h>
  5 | 
  6 | #include <iostream>
  7 | #include <sstream>
  8 | 
  9 | namespace needle {
 10 | namespace cuda {
 11 | 
 12 | #define BASE_THREAD_NUM 256
 13 | 
 14 | #define TILE 4
 15 | typedef float scalar_t;
 16 | const size_t ELEM_SIZE = sizeof(scalar_t);
 17 | 
 18 | struct CudaArray {
 19 |   CudaArray(const size_t size) {
 20 |     cudaError_t err = cudaMalloc(&ptr, size * ELEM_SIZE);
 21 |     if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
 22 |     this->size = size;
 23 |   }
 24 |   ~CudaArray() { cudaFree(ptr); }
 25 |   size_t ptr_as_int() { return (size_t)ptr; }
 26 | 
 27 |   scalar_t* ptr;
 28 |   size_t size;
 29 | };
 30 | 
 31 | struct CudaDims {
 32 |   dim3 block, grid;
 33 | };
 34 | 
 35 | CudaDims CudaOneDim(size_t size) {
 36 |   /**
 37 |    * Utility function to get cuda dimensions for 1D call
 38 |    */
 39 |   CudaDims dim;
 40 |   size_t num_blocks = (size + BASE_THREAD_NUM - 1) / BASE_THREAD_NUM;
 41 |   dim.block = dim3(BASE_THREAD_NUM, 1, 1);
 42 |   dim.grid = dim3(num_blocks, 1, 1);
 43 |   return dim;
 44 | }
 45 | 
 46 | #define MAX_VEC_SIZE 8
 47 | struct CudaVec {
 48 |   uint32_t size;
 49 |   int32_t data[MAX_VEC_SIZE];
 50 | };
 51 | 
 52 | CudaVec VecToCuda(const std::vector<int32_t>& x) {
 53 |   CudaVec shape;
 54 |   if (x.size() > MAX_VEC_SIZE) throw std::runtime_error("Exceeded CUDA supported max dimesions");
 55 |   shape.size = x.size();
 56 |   for (size_t i = 0; i < x.size(); i++) {
 57 |     shape.data[i] = x[i];
 58 |   }
 59 |   return shape;
 60 | }
 61 | 
 62 | ////////////////////////////////////////////////////////////////////////////////
 63 | // Fill call
 64 | ////////////////////////////////////////////////////////////////////////////////
 65 | 
 66 | __global__ void FillKernel(scalar_t* out, scalar_t val, size_t size) {
 67 |   size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 68 |   if (gid < size) out[gid] = val;
 69 | }
 70 | 
 71 | void Fill(CudaArray* out, scalar_t val) {
 72 |   CudaDims dim = CudaOneDim(out->size);
 73 |   FillKernel<<<dim.grid, dim.block>>>(out->ptr, val, out->size);
 74 | }
 75 | 
 76 | ////////////////////////////////////////////////////////////////////////////////
 77 | // Compact and setitem cals
 78 | ////////////////////////////////////////////////////////////////////////////////
 79 | 
 80 | // Untility function to convert contiguous index i to memory location from strides
 81 | 
 82 | 
 83 | 
 84 | __global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape,
 85 |                               CudaVec strides, size_t offset) {
 86 |   /**
 87 |    * The CUDA kernel for the compact opeation.  This should effectively map a single entry in the
 88 |    * non-compact input a, to the corresponding item (at location gid) in the compact array out.
 89 |    *
 90 |    * Args:
 91 |    *   a: CUDA pointer to a array
 92 |    *   out: CUDA point to out array
 93 |    *   size: size of out array
 94 |    *   shape: vector of shapes of a and out arrays (of type CudaVec, for past passing to CUDA kernel)
 95 |    *   strides: vector of strides of out array
 96 |    *   offset: offset of out array
 97 |    */
 98 |   /// BEGIN SOLUTION
 99 |   /// END SOLUTION
100 | }
101 | 
102 | void Compact(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape,
103 |              std::vector<int32_t> strides, size_t offset) {
104 |   /**
105 |    * Compact an array in memory.  Unlike the C++ version, in CUDA this will primarily call the
106 |    * relevant CUDA kernel.  In this case, we illustrate how you should set this up (i.e., we give
107 |    * you the code for this fuction, and also the prototype for the CompactKernel() function).  For
108 |    * the functions after this, however, you'll need to define these kernels as you see fit to
109 |    * execute the underlying function.
110 |    *
111 |    * Args:
112 |    *   a: non-compact represntation of the array, given as input
113 |    *   out: compact version of the array to be written
114 |    *   shape: shapes of each dimension for a and out
115 |    *   strides: strides of the *a* array (not out, which has compact strides)
116 |    *   offset: offset of the *a* array (not out, which has zero offset, being compact)
117 |    */
118 | 
119 |   // Nothing needs to be added here
120 |   CudaDims dim = CudaOneDim(out->size);
121 |   CompactKernel<<<dim.grid, dim.block>>>(a.ptr, out->ptr, out->size, VecToCuda(shape),
122 |                                          VecToCuda(strides), offset);
123 | }
124 | 
125 | 
126 | void EwiseSetitem(const CudaArray& a, CudaArray* out, std::vector<int32_t> shape,
127 |                   std::vector<int32_t> strides, size_t offset) {
128 |   /**
129 |    * Set items in a (non-compact) array using CUDA.  Yyou will most likely want to implement a
130 |    * EwiseSetitemKernel() function, similar to those above, that will do the actual work.
131 |    *
132 |    * Args:
133 |    *   a: _compact_ array whose items will be written to out
134 |    *   out: non-compact array whose items are to be written
135 |    *   shape: shapes of each dimension for a and out
136 |    *   strides: strides of the *out* array (not a, which has compact strides)
137 |    *   offset: offset of the *out* array (not a, which has zero offset, being compact)
138 |    */
139 |   /// BEGIN SOLUTION
140 |   /// END SOLUTION
141 | }
142 | 
143 | 
144 | void ScalarSetitem(size_t size, scalar_t val, CudaArray* out, std::vector<int32_t> shape,
145 |                    std::vector<int32_t> strides, size_t offset) {
146 |   /**
147 |    * Set items is a (non-compact) array
148 |    *
149 |    * Args:
150 |    *   size: number of elements to write in out array (note that this will note be the same as
151 |    *         out.size, because out is a non-compact subset array);  it _will_ be the same as the
152 |    *         product of items in shape, but covenient to just pass it here.
153 |    *   val: scalar value to write to
154 |    *   out: non-compact array whose items are to be written
155 |    *   shape: shapes of each dimension of out
156 |    *   strides: strides of the out array
157 |    *   offset: offset of the out array
158 |    */
159 |   /// BEGIN SOLUTION
160 |   /// END SOLUTION
161 | }
162 | 
163 | ////////////////////////////////////////////////////////////////////////////////
164 | // Elementwise and scalar operations
165 | ////////////////////////////////////////////////////////////////////////////////
166 | 
167 | __global__ void EwiseAddKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) {
168 |   size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
169 |   if (gid < size) out[gid] = a[gid] + b[gid];
170 | }
171 | 
172 | void EwiseAdd(const CudaArray& a, const CudaArray& b, CudaArray* out) {
173 |   /**
174 |    * Add together two CUDA array
175 |    */
176 |   CudaDims dim = CudaOneDim(out->size);
177 |   EwiseAddKernel<<<dim.grid, dim.block>>>(a.ptr, b.ptr, out->ptr, out->size);
178 | }
179 | 
180 | __global__ void ScalarAddKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) {
181 |   size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
182 |   if (gid < size) out[gid] = a[gid] + val;
183 | }
184 | 
185 | void ScalarAdd(const CudaArray& a, scalar_t val, CudaArray* out) {
186 |   /**
187 |    * Add together a CUDA array and a scalar value.
188 |    */
189 |   CudaDims dim = CudaOneDim(out->size);
190 |   ScalarAddKernel<<<dim.grid, dim.block>>>(a.ptr, val, out->ptr, out->size);
191 | }
192 | 
193 | /**
194 |  * In the code the follows, use the above template to create analogous elementise
195 |  * and and scalar operators for the following functions.  See the numpy backend for
196 |  * examples of how they should work.
197 |  *   - EwiseMul, ScalarMul
198 |  *   - EwiseDiv, ScalarDiv
199 |  *   - ScalarPower
200 |  *   - EwiseMaximum, ScalarMaximum
201 |  *   - EwiseEq, ScalarEq
202 |  *   - EwiseGe, ScalarGe
203 |  *   - EwiseLog
204 |  *   - EwiseExp
205 |  *   - EwiseTanh
206 |  *
207 |  * If you implement all these naively, there will be a lot of repeated code, so
208 |  * you are welcome (but not required), to use macros or templates to define these
209 |  * functions (however you want to do so, as long as the functions match the proper)
210 |  * signatures above.
211 |  */
212 | 
213 | /// BEGIN SOLUTION
214 | /// END SOLUTION
215 | 
216 | ////////////////////////////////////////////////////////////////////////////////
217 | // Elementwise and scalar operations
218 | ////////////////////////////////////////////////////////////////////////////////
219 | 
220 | 
221 | void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N,
222 |             uint32_t P) {
223 |   /**
224 |    * Multiply two (compact) matrices into an output (also comapct) matrix.  You will want to look
225 |    * at the lecture and notes on GPU-based linear algebra to see how to do this.  Since ultimately
226 |    * mugrade is just evaluating correctness, you _can_ implement a version that simply parallelizes
227 |    * over (i,j) entries in the output array.  However, to really get the full benefit of this
228 |    * problem, we would encourage you to use cooperative fetching, shared memory register tiling,
229 |    * and other ideas covered in the class notes.  Note that unlike the tiled matmul function in
230 |    * the CPU backend, here you should implement a single function that works across all size
231 |    * matrices, whether or not they are a multiple of a tile size.  As with previous CUDA
232 |    * implementations, this function here will largely just set up the kernel call, and you should
233 |    * implement the logic in a separate MatmulKernel() call.
234 |    *
235 |    *
236 |    * Args:
237 |    *   a: compact 2D array of size m x n
238 |    *   b: comapct 2D array of size n x p
239 |    *   out: compact 2D array of size m x p to write the output to
240 |    *   M: rows of a / out
241 |    *   N: columns of a / rows of b
242 |    *   P: columns of b / out
243 |    */
244 | 
245 |   /// BEGIN SOLUTION
246 |   /// END SOLUTION
247 | }
248 | 
249 | ////////////////////////////////////////////////////////////////////////////////
250 | // Max and sum reductions
251 | ////////////////////////////////////////////////////////////////////////////////
252 | 
253 | void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) {
254 |   /**
255 |    * Reduce by taking maximum over `reduce_size` contiguous blocks.  Even though it is inefficient,
256 |    * for simplicity you can perform each reduction in a single CUDA thread.
257 |    *
258 |    * Args:
259 |    *   a: compact array of size a.size = out.size * reduce_size to reduce over
260 |    *   out: compact array to write into
261 |    *   redice_size: size of the dimension to reduce over
262 |    */
263 |   /// BEGIN SOLUTION
264 |   /// END SOLUTION
265 | }
266 | 
267 | void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) {
268 |   /**
269 |    * Reduce by taking summation over `reduce_size` contiguous blocks.  Again, for simplicity you
270 |    * can perform each reduction in a single CUDA thread.
271 |    *
272 |    * Args:
273 |    *   a: compact array of size a.size = out.size * reduce_size to reduce over
274 |    *   out: compact array to write into
275 |    *   redice_size: size of the dimension to reduce over
276 |    */
277 |   /// BEGIN SOLUTION
278 |   /// END SOLUTION
279 | }
280 | 
281 | }  // namespace cuda
282 | }  // namespace needle
283 | 
284 | PYBIND11_MODULE(ndarray_backend_cuda, m) {
285 |   namespace py = pybind11;
286 |   using namespace needle;
287 |   using namespace cuda;
288 | 
289 |   m.attr("__device_name__") = "cuda";
290 |   m.attr("__tile_size__") = TILE;
291 | 
292 |   py::class_<CudaArray>(m, "Array")
293 |       .def(py::init<size_t>(), py::return_value_policy::take_ownership)
294 |       .def_readonly("size", &CudaArray::size)
295 |       .def("ptr", &CudaArray::ptr_as_int);
296 | 
297 |   // return numpy array, copying from CPU
298 |   m.def("to_numpy", [](const CudaArray& a, std::vector<size_t> shape, std::vector<size_t> strides,
299 |                        size_t offset) {
300 |     std::vector<size_t> numpy_strides = strides;
301 |     std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(),
302 |                    [](size_t& c) { return c * ELEM_SIZE; });
303 | 
304 |     // copy memory to host
305 |     scalar_t* host_ptr = (scalar_t*)std::malloc(a.size * ELEM_SIZE);
306 |     if (host_ptr == 0) throw std::bad_alloc();
307 |     cudaError_t err = cudaMemcpy(host_ptr, a.ptr, a.size * ELEM_SIZE, cudaMemcpyDeviceToHost);
308 |     if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
309 | 
310 |     // return numpy array
311 |     py::capsule deallocate_buffer(host_ptr, [](void* p) { free(p); });
312 |     return py::array_t<scalar_t>(shape, numpy_strides, host_ptr + offset, deallocate_buffer);
313 |   });
314 | 
315 |   // copy numpy array to GPU
316 |   m.def("from_numpy", [](py::array_t<scalar_t> a, CudaArray* out) {
317 |     cudaError_t err =
318 |         cudaMemcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE, cudaMemcpyHostToDevice);
319 |     if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err));
320 |   });
321 | 
322 |   m.def("fill", Fill);
323 |   m.def("compact", Compact);
324 |   m.def("ewise_setitem", EwiseSetitem);
325 |   m.def("scalar_setitem", ScalarSetitem);
326 |   m.def("ewise_add", EwiseAdd);
327 |   m.def("scalar_add", ScalarAdd);
328 |   /*
329 |   m.def("ewise_mul", EwiseMul);
330 |   m.def("scalar_mul", ScalarMul);
331 |   m.def("ewise_div", EwiseDiv);
332 |   m.def("scalar_div", ScalarDiv);
333 |   m.def("scalar_power", ScalarPower);
334 | 
335 |   m.def("ewise_maximum", EwiseMaximum);
336 |   m.def("scalar_maximum", ScalarMaximum);
337 |   m.def("ewise_eq", EwiseEq);
338 |   m.def("scalar_eq", ScalarEq);
339 |   m.def("ewise_ge", EwiseGe);
340 |   m.def("scalar_ge", ScalarGe);
341 | 
342 |   m.def("ewise_log", EwiseLog);
343 |   m.def("ewise_exp", EwiseExp);
344 |   m.def("ewise_tanh", EwiseTanh);
345 | 
346 |   m.def("matmul", Matmul);
347 | 
348 |   m.def("reduce_max", ReduceMax);
349 |   m.def("reduce_sum", ReduceSum);
350 |   */
351 | }
352 | 


--------------------------------------------------------------------------------