├── .gitignore ├── 13_hardware_acceleration_architecture_overview.ipynb ├── 14_hardware_acceleration_architecture_overview.ipynb ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── python └── needle │ ├── __init__.py │ ├── autograd.py │ ├── backend_ndarray │ ├── __init__.py │ ├── ndarray.py │ └── ndarray_backend_numpy.py │ ├── backend_numpy.py │ ├── backend_selection.py │ ├── data │ ├── __init__.py │ ├── data_basic.py │ ├── data_transforms.py │ └── datasets │ │ ├── __init__.py │ │ ├── mnist_dataset.py │ │ └── ndarray_dataset.py │ ├── init │ ├── __init__.py │ ├── init_basic.py │ └── init_initializers.py │ ├── nn │ ├── __init__.py │ └── nn_basic.py │ ├── ops │ ├── __init__.py │ ├── ops_logarithmic.py │ ├── ops_mathematic.py │ └── ops_tuple.py │ └── optim.py └── src ├── ndarray_backend_cpu.cc └── ndarray_backend_cuda.cu /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | data/ 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | *~ 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /13_hardware_acceleration_architecture_overview.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNWBRQkcr+1xpaWxfYNIxpF", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "Mpn1ti5Urdsv" 34 | }, 35 | "source": [ 36 | "# Lecture 13: Hardware Acceleration Implementation\n", 37 | "\n", 38 | "In this lecture, we will to walk through backend scafoldings to get us hardware accelerations for needle.\n", 39 | "\n", 40 | "\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "MkXPIjVd90z7" 47 | }, 48 | "source": [ 49 | "## Select a GPU runtime type\n", 50 | "In this lecture, we are going to make use of c++ and CUDA to build accelerated linear algebra libraries. In order to do so, please make sure you select a runtime type with GPU and rerun the cells if needed:\n", 51 | "- Click on the \"Runtime\" tab\n", 52 | "- Click \"Change runtime type\"\n", 53 | "- Select GPU\n", 54 | "\n", 55 | "After you started the right runtime, you can run the following command to check if there is a GPU available." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "colab": { 62 | "base_uri": "https://localhost:8080/" 63 | }, 64 | "id": "5VM6IcuZ-kv6", 65 | "outputId": "58553884-f279-40b0-8889-b9b42ad2c7fd" 66 | }, 67 | "source": [ 68 | "!nvidia-smi" 69 | ], 70 | "execution_count": null, 71 | "outputs": [ 72 | { 73 | "output_type": "stream", 74 | "name": "stdout", 75 | "text": [ 76 | "Tue Oct 8 01:26:45 2024 \n", 77 | "+---------------------------------------------------------------------------------------+\n", 78 | "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", 79 | "|-----------------------------------------+----------------------+----------------------+\n", 80 | "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 81 | "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", 82 | "| | | MIG M. 
|\n", 83 | "|=========================================+======================+======================|\n", 84 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 85 | "| N/A 53C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n", 86 | "| | | N/A |\n", 87 | "+-----------------------------------------+----------------------+----------------------+\n", 88 | " \n", 89 | "+---------------------------------------------------------------------------------------+\n", 90 | "| Processes: |\n", 91 | "| GPU GI CI PID Type Process name GPU Memory |\n", 92 | "| ID ID Usage |\n", 93 | "|=======================================================================================|\n", 94 | "| No running processes found |\n", 95 | "+---------------------------------------------------------------------------------------+\n" 96 | ] 97 | } 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "id": "qXysoqn-vZuF" 104 | }, 105 | "source": [ 106 | "## Prepare the codebase\n", 107 | "\n", 108 | "To get started, we can clone the related repo from the github." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "metadata": { 114 | "id": "JjEIRTyr8ajf", 115 | "colab": { 116 | "base_uri": "https://localhost:8080/" 117 | }, 118 | "outputId": "c13f17d1-a504-4560-e161-90566afc3963" 119 | }, 120 | "source": [ 121 | "# Code to set up the assignment\n", 122 | "from google.colab import drive\n", 123 | "drive.mount('/content/drive')\n", 124 | "%cd /content/drive/MyDrive/\n", 125 | "!mkdir -p 10714f24\n", 126 | "%cd /content/drive/MyDrive/10714f24\n", 127 | "# comment out the following line if you run it for the second time\n", 128 | "# as you already have a local copy of lecture13\n", 129 | "#!git clone https://github.com/dlsyscourse/lecture13\n", 130 | "!rm -rf /content/needle\n", 131 | "!ln -s /content/drive/MyDrive/10714f24/lecture13 /content/needle" 132 | ], 133 | "execution_count": null, 134 | "outputs": [ 135 | { 136 | "output_type": "stream", 137 | "name": "stdout", 138 | "text": [ 139 | "Mounted at /content/drive\n", 140 | "/content/drive/MyDrive\n", 141 | "/content/drive/MyDrive/10714f24\n", 142 | "Cloning into 'lecture14'...\n", 143 | "remote: Enumerating objects: 99, done.\u001b[K\n", 144 | "remote: Counting objects: 100% (99/99), done.\u001b[K\n", 145 | "remote: Compressing objects: 100% (71/71), done.\u001b[K\n", 146 | "remote: Total 99 (delta 31), reused 85 (delta 23), pack-reused 0 (from 0)\u001b[K\n", 147 | "Receiving objects: 100% (99/99), 53.49 KiB | 1.14 MiB/s, done.\n", 148 | "Resolving deltas: 100% (31/31), done.\n" 149 | ] 150 | } 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "colab": { 157 | "base_uri": "https://localhost:8080/" 158 | }, 159 | "id": "Xe3vClsD9jlq", 160 | "outputId": "0ff2a35c-4d81-45d7-98bf-45c22fc05279" 161 | }, 162 | "source": [ 163 | "!python3 -m pip install pybind11" 164 | ], 165 | "execution_count": null, 166 | "outputs": [ 167 | { 168 | "output_type": "stream", 169 | "name": "stdout", 170 | "text": [ 171 | "Collecting pybind11\n", 172 | " Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)\n", 173 | "Downloading pybind11-2.13.6-py3-none-any.whl (243 kB)\n", 174 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/243.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.6/243.3 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta 
\u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m243.3/243.3 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 175 | "\u001b[?25hInstalling collected packages: pybind11\n", 176 | "Successfully installed pybind11-2.13.6\n" 177 | ] 178 | } 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "id": "O_RrW38i_JNp" 185 | }, 186 | "source": [ 187 | "### Build the needle cuda library\n", 188 | "\n", 189 | "We leverage pybind to build a c++/cuda library for acceleration. You can type make to build the corresponding library." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "metadata": { 195 | "colab": { 196 | "base_uri": "https://localhost:8080/" 197 | }, 198 | "id": "o0EdAcB19saK", 199 | "outputId": "28c177ad-c7b2-46fb-eb9f-8f65878597db" 200 | }, 201 | "source": [ 202 | "%cd /content/needle\n", 203 | "!make clean\n", 204 | "!make" 205 | ], 206 | "execution_count": null, 207 | "outputs": [ 208 | { 209 | "output_type": "stream", 210 | "name": "stdout", 211 | "text": [ 212 | "/content/drive/MyDrive/10714f24/lecture14\n", 213 | "rm -rf build python/needle/backend_ndarray/ndarray_backend*.so\n", 214 | "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n", 215 | " Compatibility with CMake < 3.5 will be removed from a future version of\n", 216 | " CMake.\n", 217 | "\n", 218 | " Update the VERSION argument value or use a ... suffix to tell\n", 219 | " CMake that the project does not need compatibility with older versions.\n", 220 | "\n", 221 | "\u001b[0m\n", 222 | "-- The C compiler identification is GNU 11.4.0\n", 223 | "-- The CXX compiler identification is GNU 11.4.0\n", 224 | "-- Detecting C compiler ABI info\n", 225 | "-- Detecting C compiler ABI info - done\n", 226 | "-- Check for working C compiler: /usr/bin/cc - skipped\n", 227 | "-- Detecting C compile features\n", 228 | "-- Detecting C compile features - done\n", 229 | "-- Detecting CXX compiler ABI info\n", 230 | "-- Detecting CXX compiler ABI info - done\n", 231 | "-- Check for working CXX compiler: /usr/bin/c++ - skipped\n", 232 | "-- Detecting CXX compile features\n", 233 | "-- Detecting CXX compile features - done\n", 234 | "-- Found Python: /usr/local/bin/python (found version \"3.10.12\") found components: Development Interpreter Development.Module Development.Embed\n", 235 | "-- Performing Test HAS_FLTO\n", 236 | "-- Performing Test HAS_FLTO - Success\n", 237 | "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n", 238 | "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n", 239 | " Policy CMP0146 is not set: The FindCUDA module is removed. Run \"cmake\n", 240 | " --help-policy CMP0146\" for policy details. Use the cmake_policy command to\n", 241 | " set the policy and suppress this warning.\n", 242 | "\n", 243 | "This warning is for project developers. 
Use -Wno-dev to suppress it.\n", 244 | "\u001b[0m\n", 245 | "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD\n", 246 | "-- Performing Test CMAKE_HAVE_LIBC_PTHREAD - Success\n", 247 | "-- Found Threads: TRUE\n", 248 | "-- Found CUDA: /usr/local/cuda (found version \"12.2\")\n", 249 | "-- Found cuda, building cuda backend\n", 250 | "Tue Oct 8 01:30:32 2024 \n", 251 | "+---------------------------------------------------------------------------------------+\n", 252 | "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", 253 | "|-----------------------------------------+----------------------+----------------------+\n", 254 | "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 255 | "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", 256 | "| | | MIG M. |\n", 257 | "|=========================================+======================+======================|\n", 258 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 259 | "| N/A 45C P8 10W / 70W | 0MiB / 15360MiB | 0% Default |\n", 260 | "| | | N/A |\n", 261 | "+-----------------------------------------+----------------------+----------------------+\n", 262 | " \n", 263 | "+---------------------------------------------------------------------------------------+\n", 264 | "| Processes: |\n", 265 | "| GPU GI CI PID Type Process name GPU Memory |\n", 266 | "| ID ID Usage |\n", 267 | "|=======================================================================================|\n", 268 | "| No running processes found |\n", 269 | "+---------------------------------------------------------------------------------------+\n", 270 | "-- Autodetected CUDA architecture(s): 7.5\n", 271 | "-- Configuring done (6.0s)\n", 272 | "-- Generating done (0.3s)\n", 273 | "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n", 274 | "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 275 | "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 276 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 277 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 278 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 279 | "[-25%] \u001b[32mBuilding CXX object CMakeFiles/ndarray_backend_cpu.dir/src/ndarray_backend_cpu.cc.o\u001b[0m\n", 280 | "[ 0%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cpu.cpython-310-x86_64-linux-gnu.so\u001b[0m\n", 281 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 282 | "[ 0%] Built target ndarray_backend_cpu\n", 283 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 284 | "[ 25%] \u001b[34m\u001b[1mBuilding NVCC (Device) object CMakeFiles/ndarray_backend_cuda.dir/src/ndarray_backend_cuda_generated_ndarray_backend_cuda.cu.o\u001b[0m\n", 285 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 286 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 287 | "[ 50%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cuda.cpython-310-x86_64-linux-gnu.so\u001b[0m\n", 288 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 289 | "[ 50%] Built target ndarray_backend_cuda\n", 290 | 
"make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 291 | "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n" 292 | ] 293 | } 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": { 299 | "id": "DFxG3p3S1sBq" 300 | }, 301 | "source": [ 302 | "We can then run the following command to make the path to the package available in colab's environment as well as the PYTHONPATH." 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "metadata": { 308 | "id": "bix8OXLuCOKt", 309 | "colab": { 310 | "base_uri": "https://localhost:8080/" 311 | }, 312 | "outputId": "a250842b-c671-4ba1-e539-efa93f7fc35e" 313 | }, 314 | "source": [ 315 | "%set_env PYTHONPATH /content/needle/python:/env/python\n", 316 | "import sys\n", 317 | "sys.path.append(\"/content/needle/python\")" 318 | ], 319 | "execution_count": null, 320 | "outputs": [ 321 | { 322 | "output_type": "stream", 323 | "name": "stdout", 324 | "text": [ 325 | "env: PYTHONPATH=/content/needle/python:/env/python\n" 326 | ] 327 | } 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "BBIuE2jc1DaU" 334 | }, 335 | "source": [ 336 | "## Codebase walkthrough\n", 337 | "\n", 338 | "\n", 339 | "Now click the files panel on the left side. You should be able to see these files\n", 340 | "\n", 341 | "Python:\n", 342 | "- needle/backend_ndarray/ndarray.py\n", 343 | "- needle/backend_ndarray/ndarray_backend_numpy.py\n", 344 | "\n", 345 | "C++/CUDA\n", 346 | "- src/ndarray_backend_cpu.cc\n", 347 | "- src/ndarray_backend_cuda.cu\n", 348 | "\n", 349 | "The main goal of this lecture is to create an accelerated ndarray library.\n", 350 | "As a result, we do not need to deal with needle.Tensor for now and will focus on backend_ndarray's implementation.\n", 351 | "\n", 352 | "After we build up this array library, we can then use it to power backend array computations in needle.\n" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "id": "x1Z8wSsI6PrU" 359 | }, 360 | "source": [ 361 | "## Creating a CUDA NDArray\n", 362 | "\n", 363 | "\n", 364 | "\n", 365 | "\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "metadata": { 371 | "id": "N2bm_WB9uF4V", 372 | "colab": { 373 | "base_uri": "https://localhost:8080/" 374 | }, 375 | "outputId": "874e2b52-6487-434b-fc94-b36c3fc57733" 376 | }, 377 | "source": [ 378 | "from needle import backend_ndarray as nd" 379 | ], 380 | "execution_count": null, 381 | "outputs": [ 382 | { 383 | "output_type": "stream", 384 | "name": "stdout", 385 | "text": [ 386 | "Using needle backend\n" 387 | ] 388 | } 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "id": "wZGnTUsKF1x1" 395 | }, 396 | "source": [ 397 | "We can create a CUDA tensor from the data by specifying a device keyword." 
398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "metadata": { 403 | "id": "1h5iAYFfBRED" 404 | }, 405 | "source": [ 406 | "x = nd.NDArray([1, 2, 3], device=nd.cuda())" 407 | ], 408 | "execution_count": null, 409 | "outputs": [] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "metadata": { 414 | "id": "CulMPqJkhkpE" 415 | }, 416 | "source": [ 417 | "y = x + 1" 418 | ], 419 | "execution_count": null, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "metadata": { 425 | "id": "t4UuEs9KAkDR", 426 | "colab": { 427 | "base_uri": "https://localhost:8080/" 428 | }, 429 | "outputId": "0b671189-f629-4829-bcfa-56aaeefad557" 430 | }, 431 | "source": [ 432 | "x.numpy()" 433 | ], 434 | "execution_count": null, 435 | "outputs": [ 436 | { 437 | "output_type": "execute_result", 438 | "data": { 439 | "text/plain": [ 440 | "array([1., 2., 3.], dtype=float32)" 441 | ] 442 | }, 443 | "metadata": {}, 444 | "execution_count": 18 445 | } 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "metadata": { 451 | "id": "WBMvL6QEBtG7", 452 | "colab": { 453 | "base_uri": "https://localhost:8080/" 454 | }, 455 | "outputId": "818f1fdc-69b0-4a95-f6da-36f3ba67a1ab" 456 | }, 457 | "source": [ 458 | "x.device" 459 | ], 460 | "execution_count": null, 461 | "outputs": [ 462 | { 463 | "output_type": "execute_result", 464 | "data": { 465 | "text/plain": [ 466 | "cuda()" 467 | ] 468 | }, 469 | "metadata": {}, 470 | "execution_count": 19 471 | } 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "metadata": { 477 | "id": "qJSv7D8NGfAr" 478 | }, 479 | "source": [ 480 | "y = x + 1" 481 | ], 482 | "execution_count": null, 483 | "outputs": [] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "metadata": { 488 | "id": "yZ7hmyBVGhGd", 489 | "colab": { 490 | "base_uri": "https://localhost:8080/" 491 | }, 492 | "outputId": "60d5e07f-13f7-417e-9557-8abc87656d13" 493 | }, 494 | "source": [ 495 | "y.device" 496 | ], 497 | "execution_count": null, 498 | "outputs": [ 499 | { 500 | "output_type": "execute_result", 501 | "data": { 502 | "text/plain": [ 503 | "cuda()" 504 | ] 505 | }, 506 | "metadata": {}, 507 | "execution_count": 21 508 | } 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "metadata": { 514 | "id": "NQVtUgK-f7_y", 515 | "colab": { 516 | "base_uri": "https://localhost:8080/" 517 | }, 518 | "outputId": "ff69ffd8-720c-492d-a818-7d1f57dcd63f" 519 | }, 520 | "source": [ 521 | "y.numpy()" 522 | ], 523 | "execution_count": null, 524 | "outputs": [ 525 | { 526 | "output_type": "execute_result", 527 | "data": { 528 | "text/plain": [ 529 | "array([2., 3., 4.], dtype=float32)" 530 | ] 531 | }, 532 | "metadata": {}, 533 | "execution_count": 22 534 | } 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": { 540 | "id": "SPjNJfJsf_T9" 541 | }, 542 | "source": [ 543 | "### Key Data Structures\n", 544 | "\n", 545 | "Key data structures in backend_ndarray\n", 546 | "\n", 547 | "- NDArray: the container to hold device specific ndarray\n", 548 | "- BackendDevice: backend device\n", 549 | " - mod holds the module implementation that implements all functions\n", 550 | " - checkout ndarray_backend_numpy.py for a python-side reference.\n", 551 | "\n" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": { 557 | "id": "HxKF9dcFhTy3" 558 | }, 559 | "source": [ 560 | "## Trace GPU execution\n", 561 | "\n", 562 | "Now, let us take a look at what happens when we execute the following code\n" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "metadata": { 568 
| "id": "PLLzZzuthhBH" 569 | }, 570 | "source": [ 571 | "x = nd.NDArray([1, 2, 3], device=nd.cuda())\n", 572 | "y = x + 1" 573 | ], 574 | "execution_count": null, 575 | "outputs": [] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "metadata": { 580 | "colab": { 581 | "base_uri": "https://localhost:8080/" 582 | }, 583 | "id": "V9NV0JFESkIe", 584 | "outputId": "011bc9d0-bcd5-4830-8d71-a2c373eb2202" 585 | }, 586 | "source": [ 587 | "x.device.from_numpy" 588 | ], 589 | "execution_count": null, 590 | "outputs": [ 591 | { 592 | "output_type": "execute_result", 593 | "data": { 594 | "text/plain": [ 595 | "" 596 | ] 597 | }, 598 | "metadata": {}, 599 | "execution_count": 24 600 | } 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "metadata": { 606 | "id": "H6vwR3yBRI9F" 607 | }, 608 | "source": [ 609 | "x = nd.NDArray([1, 2, 3])" 610 | ], 611 | "execution_count": null, 612 | "outputs": [] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "metadata": { 617 | "colab": { 618 | "base_uri": "https://localhost:8080/", 619 | "height": 121 620 | }, 621 | "id": "0PxoH_UzRMd3", 622 | "outputId": "54fd43b1-e18f-4f6d-e92a-4b04744fde90" 623 | }, 624 | "source": [ 625 | "x.device.from_numpy" 626 | ], 627 | "execution_count": null, 628 | "outputs": [ 629 | { 630 | "output_type": "execute_result", 631 | "data": { 632 | "text/plain": [ 633 | "" 634 | ], 635 | "text/html": [ 636 | "
\n", 648 | "
needle.backend_ndarray.ndarray_backend_numpy.from_numpy
def from_numpy(a, out)
/content/needle/python/needle/backend_ndarray/ndarray_backend_numpy.py  <no docstring>
\n", 651 | " \n", 670 | "
" 671 | ] 672 | }, 673 | "metadata": {}, 674 | "execution_count": 26 675 | } 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": { 681 | "id": "xU5PFJJ-iR7J" 682 | }, 683 | "source": [ 684 | "Have the following trace:\n", 685 | "\n", 686 | "backend_ndarray/ndarray.py\n", 687 | "- `NDArray.__add__`\n", 688 | "- `NDArray.ewise_or_scalar`\n", 689 | "- `ndarray_backend_cpu.cc:ScalarAdd`" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "metadata": { 695 | "colab": { 696 | "base_uri": "https://localhost:8080/" 697 | }, 698 | "id": "TxAKyM6yjr_R", 699 | "outputId": "ba1bfe21-77be-4cd2-f625-f2a3afbcfb57" 700 | }, 701 | "source": [ 702 | "y.numpy()" 703 | ], 704 | "execution_count": null, 705 | "outputs": [ 706 | { 707 | "output_type": "execute_result", 708 | "data": { 709 | "text/plain": [ 710 | "array([2., 3., 4.], dtype=float32)" 711 | ] 712 | }, 713 | "metadata": {}, 714 | "execution_count": 27 715 | } 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": { 721 | "id": "F4vqb_a4j2O8" 722 | }, 723 | "source": [ 724 | "Have the following trace:\n", 725 | "\n", 726 | "- `NDArray.numpy`\n", 727 | "- `ndarray_backend_cpu.cc:to_numpy`" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "metadata": { 733 | "id": "tMiFJmJVlD6j" 734 | }, 735 | "source": [ 736 | "## Guidelines for Reading C++/CUDA related Files\n", 737 | "\n", 738 | "Read\n", 739 | "- src/ndarray_backend_cpu.cc\n", 740 | "- src/ndarray_backend_cuda.cu\n", 741 | "\n", 742 | "\n", 743 | "Optional\n", 744 | "- CMakeLists.txt: this is used to setup the build and likely you do not need to tweak it.\n", 745 | "\n", 746 | "\n", 747 | "\n", 748 | "\n", 749 | "\n" 750 | ] 751 | }, 752 | { 753 | "cell_type": "markdown", 754 | "metadata": { 755 | "id": "uEpPbwQKkSkZ" 756 | }, 757 | "source": [ 758 | "## NDArray Data Structure\n", 759 | "\n", 760 | "Open up `python/needle/backend_ndarray/ndarray.py`.\n", 761 | "\n", 762 | "An NDArray contains the following fields:\n", 763 | "- handle: The backend handle that build a flat array which stores the data.\n", 764 | "- shape: The shape of the NDArray\n", 765 | "- strides: The strides that shows how do we access multi-dimensional elements\n", 766 | "- offset: The offset of the first element.\n", 767 | "- device: The backend device that backs the computation\n", 768 | "\n", 769 | "\n", 770 | "\n", 771 | "\n" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": { 777 | "id": "875DgxFFACqb" 778 | }, 779 | "source": [ 780 | "## Transformation as Strided Computation\n", 781 | "\n", 782 | "We can leverage the strides and offset to perform transform/slicing with zero copy.\n", 783 | "\n", 784 | "- Broadcast: insert strides that equals 0\n", 785 | "- Tranpose: swap the strides\n", 786 | "- Slice: change the offset and shape\n", 787 | "\n", 788 | "For most of the computations, however, we will call `array.compact()` first to get a contiguous and aligned memory before running the computation." 
789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "metadata": { 794 | "id": "I49fcoiyWYLt" 795 | }, 796 | "source": [ 797 | "import numpy as np\n" 798 | ], 799 | "execution_count": null, 800 | "outputs": [] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "metadata": { 805 | "id": "qGbICVsb6y98" 806 | }, 807 | "source": [ 808 | "x = nd.NDArray([1, 2, 3, 4], device=nd.cpu_numpy())" 809 | ], 810 | "execution_count": null, 811 | "outputs": [] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "metadata": { 816 | "colab": { 817 | "base_uri": "https://localhost:8080/" 818 | }, 819 | "id": "iofcuXso64yk", 820 | "outputId": "66e792a6-918e-4a01-b237-c8ee97c694a8" 821 | }, 822 | "source": [ 823 | "x.numpy()" 824 | ], 825 | "execution_count": null, 826 | "outputs": [ 827 | { 828 | "output_type": "execute_result", 829 | "data": { 830 | "text/plain": [ 831 | "array([1., 2., 3., 4.], dtype=float32)" 832 | ] 833 | }, 834 | "metadata": {}, 835 | "execution_count": 34 836 | } 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": { 842 | "id": "oceIop5P7RHW" 843 | }, 844 | "source": [ 845 | "We can use strides and shape manipulation to create different views of the same array." 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "metadata": { 851 | "id": "C7zCed7e7B4u" 852 | }, 853 | "source": [ 854 | "y = nd.NDArray.make(shape=(2, 2), strides=(2, 1), device=x.device, handle=x._handle, offset=0)" 855 | ], 856 | "execution_count": null, 857 | "outputs": [] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "metadata": { 862 | "colab": { 863 | "base_uri": "https://localhost:8080/" 864 | }, 865 | "id": "oaEPCvR17OMf", 866 | "outputId": "eee40587-5da5-4e01-a539-69f7788c638b" 867 | }, 868 | "source": [ 869 | "y.numpy()" 870 | ], 871 | "execution_count": null, 872 | "outputs": [ 873 | { 874 | "output_type": "execute_result", 875 | "data": { 876 | "text/plain": [ 877 | "array([[1., 2.],\n", 878 | " [3., 4.]], dtype=float32)" 879 | ] 880 | }, 881 | "metadata": {}, 882 | "execution_count": 36 883 | } 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "metadata": { 889 | "id": "5rNS5MW67XyX" 890 | }, 891 | "source": [ 892 | "z = nd.NDArray.make(shape=(2, 1), strides=(2, 1), device=x.device, handle=x._handle, offset=1)" 893 | ], 894 | "execution_count": null, 895 | "outputs": [] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "metadata": { 900 | "colab": { 901 | "base_uri": "https://localhost:8080/" 902 | }, 903 | "id": "HzhpVtKB7b97", 904 | "outputId": "fe5e6ac3-3458-4140-a5f6-fae9547f6e99" 905 | }, 906 | "source": [ 907 | "z.numpy()" 908 | ], 909 | "execution_count": null, 910 | "outputs": [ 911 | { 912 | "output_type": "execute_result", 913 | "data": { 914 | "text/plain": [ 915 | "array([[2.],\n", 916 | " [4.]], dtype=float32)" 917 | ] 918 | }, 919 | "metadata": {}, 920 | "execution_count": 38 921 | } 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": { 927 | "id": "5ONkZbUuj6Dx" 928 | }, 929 | "source": [ 930 | "## CUDA Acceleration\n", 931 | "\n", 932 | "Now let us open `src/ndarray_cuda_backend.cu` and take a look at current implementation of GPU ops.\n" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": { 938 | "id": "Og8N3iuZiZ4g" 939 | }, 940 | "source": [ 941 | "## Steps for adding a new operator implementation\n", 942 | "- Add an implementation in `ndarray_backend_cuda.cu`, expose via pybind\n", 943 | "- Call into the operator in ndarray.py\n", 944 | "- Write up testcases" 945 | ] 946 | }, 947 | { 948 | "cell_type": 
"code", 949 | "metadata": { 950 | "id": "xV1I7I2lkOJG", 951 | "colab": { 952 | "base_uri": "https://localhost:8080/" 953 | }, 954 | "outputId": "5701699e-a11c-4ee3-e98b-a6211908f329" 955 | }, 956 | "source": [ 957 | "!make" 958 | ], 959 | "execution_count": null, 960 | "outputs": [ 961 | { 962 | "output_type": "stream", 963 | "name": "stdout", 964 | "text": [ 965 | "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n", 966 | " Compatibility with CMake < 3.5 will be removed from a future version of\n", 967 | " CMake.\n", 968 | "\n", 969 | " Update the VERSION argument value or use a ... suffix to tell\n", 970 | " CMake that the project does not need compatibility with older versions.\n", 971 | "\n", 972 | "\u001b[0m\n", 973 | "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n", 974 | "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n", 975 | " Policy CMP0146 is not set: The FindCUDA module is removed. Run \"cmake\n", 976 | " --help-policy CMP0146\" for policy details. Use the cmake_policy command to\n", 977 | " set the policy and suppress this warning.\n", 978 | "\n", 979 | "This warning is for project developers. Use -Wno-dev to suppress it.\n", 980 | "\u001b[0m\n", 981 | "-- Found cuda, building cuda backend\n", 982 | "Tue Oct 8 01:33:00 2024 \n", 983 | "+---------------------------------------------------------------------------------------+\n", 984 | "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", 985 | "|-----------------------------------------+----------------------+----------------------+\n", 986 | "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 987 | "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", 988 | "| | | MIG M. 
|\n", 989 | "|=========================================+======================+======================|\n", 990 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 991 | "| N/A 55C P0 29W / 70W | 105MiB / 15360MiB | 0% Default |\n", 992 | "| | | N/A |\n", 993 | "+-----------------------------------------+----------------------+----------------------+\n", 994 | " \n", 995 | "+---------------------------------------------------------------------------------------+\n", 996 | "| Processes: |\n", 997 | "| GPU GI CI PID Type Process name GPU Memory |\n", 998 | "| ID ID Usage |\n", 999 | "|=======================================================================================|\n", 1000 | "+---------------------------------------------------------------------------------------+\n", 1001 | "-- Autodetected CUDA architecture(s): 7.5\n", 1002 | "-- Configuring done (0.4s)\n", 1003 | "-- Generating done (0.4s)\n", 1004 | "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n", 1005 | "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1006 | "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1007 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1008 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1009 | "[ 0%] Built target ndarray_backend_cpu\n", 1010 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1011 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1012 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1013 | "[ 25%] \u001b[32m\u001b[1mLinking CXX shared module /content/drive/MyDrive/10714f24/lecture14/python/needle/backend_ndarray/ndarray_backend_cuda.cpython-310-x86_64-linux-gnu.so\u001b[0m\n", 1014 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1015 | "[ 50%] Built target ndarray_backend_cuda\n", 1016 | "make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1017 | "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n" 1018 | ] 1019 | } 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "source": [ 1025 | "If we directly run the code block, we will see an error, because ewise mul is not yet implemented" 1026 | ], 1027 | "metadata": { 1028 | "id": "OpWxZKGb_4cJ" 1029 | } 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "metadata": { 1034 | "id": "YU870vVVZkzg", 1035 | "colab": { 1036 | "base_uri": "https://localhost:8080/", 1037 | "height": 311 1038 | }, 1039 | "outputId": "99ae1dbc-7ebd-4229-e3de-50eabafc8d5f" 1040 | }, 1041 | "source": [ 1042 | "x = nd.NDArray([1,2,3], device=nd.cuda())\n", 1043 | "x * 2" 1044 | ], 1045 | "execution_count": null, 1046 | "outputs": [ 1047 | { 1048 | "output_type": "error", 1049 | "ename": "AttributeError", 1050 | "evalue": "module 'needle.backend_ndarray.ndarray_backend_cuda' has no attribute 'ewise_mul'", 1051 | "traceback": [ 1052 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1053 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 1054 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNDArray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 1055 | "\u001b[0;32m/content/needle/python/needle/backend_ndarray/ndarray.py\u001b[0m in \u001b[0;36m__mul__\u001b[0;34m(self, other)\u001b[0m\n\u001b[1;32m 419\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__mul__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 420\u001b[0m return self.ewise_or_scalar(\n\u001b[0;32m--> 421\u001b[0;31m \u001b[0mother\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mewise_mul\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscalar_mul\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 422\u001b[0m )\n\u001b[1;32m 423\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1056 | "\u001b[0;32m/content/needle/python/needle/backend_ndarray/ndarray.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__getattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0menabled\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1057 | "\u001b[0;31mAttributeError\u001b[0m: module 'needle.backend_ndarray.ndarray_backend_cuda' has no attribute 'ewise_mul'" 1058 | ] 1059 | } 1060 | ] 1061 | }, 1062 | { 1063 | "cell_type": "markdown", 1064 | "metadata": { 1065 | "id": "VEtbnbvr6Wt7" 1066 | }, 1067 | "source": [ 1068 | "## Connect back to needle Tensor\n", 1069 | "\n", 1070 | "So far we only played with the `backend_ndarray` subpackage, which is a self-contained ndarray implementation within needle.\n", 1071 | "\n", 1072 | "We can connect the ndarray back to needle as a backend." 
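Before wiring things back into needle, it is worth seeing why the failure above surfaces as an `AttributeError` rather than a build error. As the traceback shows, `NDArray.__mul__` asks the device for `ewise_mul`, and the device's `__getattr__` simply forwards the lookup to the compiled backend module, so a kernel that was never implemented and exported via pybind is only discovered at call time. The toy classes below (hypothetical names, not part of needle) reproduce that dispatch pattern in isolation:

```python
# Minimal stand-in for the device -> compiled-module dispatch seen in the traceback.
class FakeBackendModule:
    @staticmethod
    def scalar_mul(a, val, out):
        # pretend this kernel exists in the compiled backend
        pass

class FakeDevice:
    def __init__(self, mod):
        self.mod = mod

    def __getattr__(self, name):
        # Same idea as the __getattr__ in ndarray.py: unknown attributes
        # are looked up on the backend module.
        return getattr(self.mod, name)

dev = FakeDevice(FakeBackendModule())
print(dev.scalar_mul)       # found: the module exports it
try:
    dev.ewise_mul           # not exported -> AttributeError, as in the cell above
except AttributeError as e:
    print("missing kernel:", e)
```

For `ewise_mul` specifically, the Python call site already exists (the traceback shows `__mul__` passing `self.device.ewise_mul`), so implementing the kernel in `ndarray_backend_cuda.cu` and exposing it through the pybind module definition is the missing piece.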
1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "metadata": { 1078 | "id": "JeThSA8zAu_v" 1079 | }, 1080 | "source": [ 1081 | "import needle as ndl" 1082 | ], 1083 | "execution_count": null, 1084 | "outputs": [] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "metadata": { 1089 | "id": "dobDH96Ql8SV", 1090 | "colab": { 1091 | "base_uri": "https://localhost:8080/" 1092 | }, 1093 | "outputId": "cd23f433-8c4c-4eb3-d67f-8b492a703261" 1094 | }, 1095 | "source": [ 1096 | "x = ndl.Tensor([1,2,3], device=nd.cuda(), dtype=\"float32\")\n", 1097 | "y = ndl.Tensor([2,3,5], device=nd.cuda(), dtype=\"float32\")\n", 1098 | "z = x + y\n", 1099 | "z" 1100 | ], 1101 | "execution_count": null, 1102 | "outputs": [ 1103 | { 1104 | "output_type": "execute_result", 1105 | "data": { 1106 | "text/plain": [ 1107 | "needle.Tensor([3. 5. 8.])" 1108 | ] 1109 | }, 1110 | "metadata": {}, 1111 | "execution_count": 43 1112 | } 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "metadata": { 1118 | "colab": { 1119 | "base_uri": "https://localhost:8080/" 1120 | }, 1121 | "id": "ouXpj1v6g3z1", 1122 | "outputId": "ed910149-2d9e-4055-b31d-52fb698e463d" 1123 | }, 1124 | "source": [ 1125 | "z.device" 1126 | ], 1127 | "execution_count": null, 1128 | "outputs": [ 1129 | { 1130 | "output_type": "execute_result", 1131 | "data": { 1132 | "text/plain": [ 1133 | "cuda()" 1134 | ] 1135 | }, 1136 | "metadata": {}, 1137 | "execution_count": 44 1138 | } 1139 | ] 1140 | }, 1141 | { 1142 | "cell_type": "code", 1143 | "metadata": { 1144 | "colab": { 1145 | "base_uri": "https://localhost:8080/", 1146 | "height": 187 1147 | }, 1148 | "id": "4827VUz3bwvA", 1149 | "outputId": "94b59197-251e-4be5-8bb3-c0b704809477" 1150 | }, 1151 | "source": [ 1152 | "type(z.cached_data)" 1153 | ], 1154 | "execution_count": null, 1155 | "outputs": [ 1156 | { 1157 | "output_type": "execute_result", 1158 | "data": { 1159 | "text/plain": [ 1160 | "needle.backend_ndarray.ndarray.NDArray" 1161 | ], 1162 | "text/html": [ 1163 | "
\n", 1175 | "
needle.backend_ndarray.ndarray.NDArray
def __init__(other, device=None)
/content/needle/python/needle/backend_ndarray/ndarray.py  A generic ND array class that may contain multiple different backends\n",
1178 |               "i.e., a Numpy backend, a native CPU backend, or a GPU backend.\n",
1179 |               "\n",
1180 |               "This class will only contain those functions that you need to implement\n",
1181 |               "to actually get the desired functionality for the programming examples\n",
1182 |               "in the homework, and no more.\n",
1183 |               "\n",
1184 |               "For now, for simplicity the class only supports float32 types, though\n",
1185 |               "this can be extended if desired.
\n", 1186 | " \n", 1205 | "
" 1206 | ] 1207 | }, 1208 | "metadata": {}, 1209 | "execution_count": 45 1210 | } 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "markdown", 1215 | "metadata": { 1216 | "id": "74OUUH2REG18" 1217 | }, 1218 | "source": [ 1219 | "## Write Standalone Python Test Files\n", 1220 | "\n", 1221 | "Now that we have additional c++/cuda libraries in needle, we will need to type make in order to rebuild the library. Additionally, because the colab environment caches the old library, it is inconvenient to use the ipython cells to debug the updated library.\n", 1222 | "\n", 1223 | "\n" 1224 | ] 1225 | }, 1226 | { 1227 | "cell_type": "code", 1228 | "metadata": { 1229 | "colab": { 1230 | "base_uri": "https://localhost:8080/" 1231 | }, 1232 | "id": "sgLoV-_KHAM3", 1233 | "outputId": "ae19d49d-faa3-4aa7-cd6f-ed61815cfce6" 1234 | }, 1235 | "source": [ 1236 | "!make" 1237 | ], 1238 | "execution_count": null, 1239 | "outputs": [ 1240 | { 1241 | "output_type": "stream", 1242 | "name": "stdout", 1243 | "text": [ 1244 | "\u001b[0mCMake Deprecation Warning at CMakeLists.txt:1 (cmake_minimum_required):\n", 1245 | " Compatibility with CMake < 3.5 will be removed from a future version of\n", 1246 | " CMake.\n", 1247 | "\n", 1248 | " Update the VERSION argument value or use a ... suffix to tell\n", 1249 | " CMake that the project does not need compatibility with older versions.\n", 1250 | "\n", 1251 | "\u001b[0m\n", 1252 | "-- Found pybind11: /usr/local/lib/python3.10/dist-packages/pybind11/include (found version \"2.13.6\")\n", 1253 | "\u001b[33mCMake Warning (dev) at CMakeLists.txt:55 (find_package):\n", 1254 | " Policy CMP0146 is not set: The FindCUDA module is removed. Run \"cmake\n", 1255 | " --help-policy CMP0146\" for policy details. Use the cmake_policy command to\n", 1256 | " set the policy and suppress this warning.\n", 1257 | "\n", 1258 | "This warning is for project developers. Use -Wno-dev to suppress it.\n", 1259 | "\u001b[0m\n", 1260 | "-- Found cuda, building cuda backend\n", 1261 | "Tue Oct 8 01:33:33 2024 \n", 1262 | "+---------------------------------------------------------------------------------------+\n", 1263 | "| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", 1264 | "|-----------------------------------------+----------------------+----------------------+\n", 1265 | "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 1266 | "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", 1267 | "| | | MIG M. 
|\n", 1268 | "|=========================================+======================+======================|\n", 1269 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 1270 | "| N/A 58C P0 30W / 70W | 105MiB / 15360MiB | 0% Default |\n", 1271 | "| | | N/A |\n", 1272 | "+-----------------------------------------+----------------------+----------------------+\n", 1273 | " \n", 1274 | "+---------------------------------------------------------------------------------------+\n", 1275 | "| Processes: |\n", 1276 | "| GPU GI CI PID Type Process name GPU Memory |\n", 1277 | "| ID ID Usage |\n", 1278 | "|=======================================================================================|\n", 1279 | "+---------------------------------------------------------------------------------------+\n", 1280 | "-- Autodetected CUDA architecture(s): 7.5\n", 1281 | "-- Configuring done (0.3s)\n", 1282 | "-- Generating done (0.3s)\n", 1283 | "-- Build files have been written to: /content/drive/MyDrive/10714f24/lecture14/build\n", 1284 | "make[1]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1285 | "make[2]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1286 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1287 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1288 | "[ 0%] Built target ndarray_backend_cpu\n", 1289 | "make[3]: Entering directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1290 | "make[3]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1291 | "[ 50%] Built target ndarray_backend_cuda\n", 1292 | "make[2]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n", 1293 | "make[1]: Leaving directory '/content/drive/MyDrive/10714f24/lecture14/build'\n" 1294 | ] 1295 | } 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "markdown", 1300 | "metadata": { 1301 | "id": "dudnLHRqoKY2" 1302 | }, 1303 | "source": [ 1304 | "\n", 1305 | "We recommend writing separate python files and invoke them from the command line. Create a new file `tests/mytest.py` and write your local tests. This is also a common develop practice in big projects that involves python c++ FFI." 1306 | ] 1307 | }, 1308 | { 1309 | "cell_type": "code", 1310 | "metadata": { 1311 | "id": "TubIHJrkn4Sk", 1312 | "colab": { 1313 | "base_uri": "https://localhost:8080/" 1314 | }, 1315 | "outputId": "cf8889e2-605f-4310-d89a-e4a49bc254e4" 1316 | }, 1317 | "source": [ 1318 | "!python tests/mytest.py" 1319 | ], 1320 | "execution_count": null, 1321 | "outputs": [ 1322 | { 1323 | "output_type": "stream", 1324 | "name": "stdout", 1325 | "text": [ 1326 | "python3: can't open file '/content/drive/MyDrive/10714f24/lecture14/tests/mytest.py': [Errno 2] No such file or directory\n" 1327 | ] 1328 | } 1329 | ] 1330 | }, 1331 | { 1332 | "cell_type": "markdown", 1333 | "metadata": { 1334 | "id": "ei0UR-FYoY1-" 1335 | }, 1336 | "source": [ 1337 | "After we have building the library, we could choose to fully restart the runtime (factory reset runtime) if you want to bring the updated change back to another colab. Note that you will need to save your code changes to the drive or a private github repo." 
1338 | ] 1339 | } 1340 | ] 1341 | } -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.2) 2 | project(needle C CXX) 3 | 4 | # find correct version of Python 5 | execute_process(COMMAND python3-config --prefix 6 | OUTPUT_VARIABLE Python_ROOT_DIR) 7 | find_package(Python COMPONENTS Development Interpreter REQUIRED) 8 | include_directories(${Python_INCLUDE_DIRS}) 9 | 10 | # find pybind 11 | execute_process(COMMAND python3 -m pybind11 --cmakedir 12 | RESULT_VARIABLE __pybind_exit_code 13 | OUTPUT_VARIABLE __pybind_path 14 | OUTPUT_STRIP_TRAILING_WHITESPACE) 15 | find_package(pybind11 PATHS ${__pybind_path}) 16 | 17 | 18 | if(NOT MSVC) 19 | set(CMAKE_CXX_FLAGS "-std=c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 20 | set(CMAKE_CUDA_STANDARD 14) 21 | else() 22 | set(CMAKE_CXX_FLAGS "/std:c++11 -O2 -march=native ${CMAKE_CXX_FLAGS}") 23 | set(CMAKE_CUDA_STANDARD 14) 24 | endif() 25 | 26 | include_directories(SYSTEM ${pybind11_INCLUDE_DIRS}) 27 | list(APPEND LINKER_LIBS ${pybind11_LIBRARIES}) 28 | 29 | 30 | ################### 31 | ### CPU BACKEND ### 32 | ################### 33 | add_library(ndarray_backend_cpu MODULE src/ndarray_backend_cpu.cc) 34 | target_link_libraries(ndarray_backend_cpu PUBLIC ${LINKER_LIBS}) 35 | pybind11_extension(ndarray_backend_cpu) 36 | pybind11_strip(ndarray_backend_cpu) 37 | 38 | 39 | # directly output to ffi folder 40 | set_target_properties(ndarray_backend_cpu 41 | PROPERTIES 42 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 43 | CXX_VISIBILITY_PRESET "hidden" 44 | ) 45 | 46 | if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") 47 | set_property(TARGET ndarray_backend_cpu PROPERTY LINK_OPTIONS -undefined dynamic_lookup) 48 | endif() 49 | 50 | 51 | 52 | #################### 53 | ### CUDA BACKEND ### 54 | #################### 55 | find_package(CUDA) 56 | if(CUDA_FOUND) 57 | message(STATUS "Found cuda, building cuda backend") 58 | 59 | include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) 60 | list(APPEND LINKER_LIBS ${CUDA_CUDART_LIBRARY}) 61 | 62 | # invoke nvidia smi to detect if we really have a GPU 63 | execute_process(COMMAND "nvidia-smi" ERROR_QUIET RESULT_VARIABLE NV_RET) 64 | if(NV_RET EQUAL "0") 65 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS Auto) 66 | else() 67 | # set to 3.7 the flag of K80 68 | CUDA_SELECT_NVCC_ARCH_FLAGS(ARCH_FLAGS 3.7) 69 | endif() 70 | 71 | # set arch flags properly 72 | CUDA_ADD_LIBRARY(ndarray_backend_cuda MODULE src/ndarray_backend_cuda.cu OPTIONS ${ARCH_FLAGS}) 73 | 74 | target_link_libraries(ndarray_backend_cuda ${LINKER_LIBS}) 75 | pybind11_extension(ndarray_backend_cuda) 76 | pybind11_strip(ndarray_backend_cuda) 77 | 78 | # directly output to ffi folder 79 | set_target_properties(ndarray_backend_cuda 80 | PROPERTIES 81 | LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/python/needle/backend_ndarray 82 | CXX_VISIBILITY_PRESET "hidden" 83 | CUDA_VISIBILITY_PRESET "hidden" 84 | ) 85 | 86 | endif() 87 | 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: lib, pybind, clean, format, all 2 | 3 | all: lib 4 | 5 | 6 | lib: 7 | @mkdir -p build 8 | @cd build; cmake .. 9 | @cd build; $(MAKE) 10 | 11 | format: 12 | python3 -m black . 13 | clang-format -i src/*.cc src/*.cu 14 | 15 | clean: 16 | rm -rf build python/needle/backend_ndarray/ndarray_backend*.so 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Lecture13 2 | 3 | This repo contains the infrastructure code needed for hardware acceleration implementation 4 | It can be safely replaced by hw3 repo eventually. 5 | -------------------------------------------------------------------------------- /python/needle/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ops 2 | from .ops import * 3 | from .autograd import Tensor, cpu, all_devices 4 | 5 | from . import init 6 | from .init import ones, zeros, zeros_like, ones_like 7 | 8 | from . import data 9 | from . import nn 10 | from . import optim 11 | from .backend_selection import * 12 | -------------------------------------------------------------------------------- /python/needle/autograd.py: -------------------------------------------------------------------------------- 1 | """Core data structures.""" 2 | import needle 3 | from .backend_numpy import Device, cpu, all_devices 4 | from typing import List, Optional, NamedTuple, Tuple, Union 5 | from collections import namedtuple 6 | import numpy 7 | 8 | from needle import init 9 | 10 | # needle version 11 | LAZY_MODE = False 12 | TENSOR_COUNTER = 0 13 | 14 | # NOTE: we will import numpy as the array_api 15 | # as the backend for our computations, this line will change in later homeworks 16 | 17 | import numpy as array_api 18 | NDArray = numpy.ndarray 19 | 20 | from .backend_selection import array_api, NDArray 21 | 22 | 23 | class Op: 24 | """Operator definition.""" 25 | 26 | def __call__(self, *args): 27 | raise NotImplementedError() 28 | 29 | def compute(self, *args: Tuple[NDArray]): 30 | """Calculate forward pass of operator. 
31 | 32 | Parameters 33 | ---------- 34 | input: np.ndarray 35 | A list of input arrays to the function 36 | 37 | Returns 38 | ------- 39 | output: nd.array 40 | Array output of the operation 41 | 42 | """ 43 | raise NotImplementedError() 44 | 45 | def gradient( 46 | self, out_grad: "Value", node: "Value" 47 | ) -> Union["Value", Tuple["Value"]]: 48 | """Compute partial adjoint for each input value for a given output adjoint. 49 | 50 | Parameters 51 | ---------- 52 | out_grad: Value 53 | The adjoint wrt to the output value. 54 | 55 | node: Value 56 | The value node of forward evaluation. 57 | 58 | Returns 59 | ------- 60 | input_grads: Value or Tuple[Value] 61 | A list containing partial gradient adjoints to be propagated to 62 | each of the input node. 63 | """ 64 | raise NotImplementedError() 65 | 66 | def gradient_as_tuple(self, out_grad: "Value", node: "Value") -> Tuple["Value"]: 67 | """Convenience method to always return a tuple from gradient call""" 68 | output = self.gradient(out_grad, node) 69 | if isinstance(output, tuple): 70 | return output 71 | elif isinstance(output, list): 72 | return tuple(output) 73 | else: 74 | return (output,) 75 | 76 | 77 | class TensorOp(Op): 78 | """Op class specialized to output tensors, will be alternate subclasses for other structures""" 79 | 80 | def __call__(self, *args): 81 | return Tensor.make_from_op(self, args) 82 | 83 | 84 | class TensorTupleOp(Op): 85 | """Op class specialized to output TensorTuple""" 86 | 87 | def __call__(self, *args): 88 | return TensorTuple.make_from_op(self, args) 89 | 90 | 91 | class Value: 92 | """A value in the computational graph.""" 93 | 94 | # trace of computational graph 95 | op: Optional[Op] 96 | inputs: List["Value"] 97 | # The following fields are cached fields for 98 | # dynamic computation 99 | cached_data: NDArray 100 | requires_grad: bool 101 | 102 | def realize_cached_data(self): 103 | """Run compute to realize the cached data""" 104 | # avoid recomputation 105 | if self.cached_data is not None: 106 | return self.cached_data 107 | # note: data implicitly calls realized cached data 108 | self.cached_data = self.op.compute( 109 | *[x.realize_cached_data() for x in self.inputs] 110 | ) 111 | return self.cached_data 112 | 113 | def is_leaf(self): 114 | return self.op is None 115 | 116 | def __del__(self): 117 | global TENSOR_COUNTER 118 | TENSOR_COUNTER -= 1 119 | 120 | def _init( 121 | self, 122 | op: Optional[Op], 123 | inputs: List["Tensor"], 124 | *, 125 | num_outputs: int = 1, 126 | cached_data: List[object] = None, 127 | requires_grad: Optional[bool] = None 128 | ): 129 | global TENSOR_COUNTER 130 | TENSOR_COUNTER += 1 131 | if requires_grad is None: 132 | requires_grad = any(x.requires_grad for x in inputs) 133 | self.op = op 134 | self.inputs = inputs 135 | self.num_outputs = num_outputs 136 | self.cached_data = cached_data 137 | self.requires_grad = requires_grad 138 | 139 | @classmethod 140 | def make_const(cls, data, *, requires_grad=False): 141 | value = cls.__new__(cls) 142 | value._init( 143 | None, 144 | [], 145 | cached_data=data, 146 | requires_grad=requires_grad, 147 | ) 148 | return value 149 | 150 | @classmethod 151 | def make_from_op(cls, op: Op, inputs: List["Value"]): 152 | value = cls.__new__(cls) 153 | value._init(op, inputs) 154 | 155 | if not LAZY_MODE: 156 | if not value.requires_grad: 157 | return value.detach() 158 | value.realize_cached_data() 159 | return value 160 | 161 | 162 | ### Not needed in HW1 163 | class TensorTuple(Value): 164 | """Represent a tuple of tensors. 
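As a quick illustration of the Op / TensorOp contract above, here is a minimal hypothetical operator sketch (the name ScalarShift is ours and is not part of needle itself); compare it with the built-in ops in ops_mathematic.py.

# Hypothetical illustration only -- not one of needle's own ops.
# `compute` sees raw NDArray data; `gradient` sees Tensors, so the backward
# pass itself stays inside the computational graph.
class ScalarShift(TensorOp):
    def __init__(self, offset):
        self.offset = offset

    def compute(self, a):
        return a + self.offset          # forward pass on the underlying array data

    def gradient(self, out_grad, node):
        return (out_grad,)              # d(a + c)/da = 1: the adjoint passes through

def scalar_shift(a, offset):
    return ScalarShift(offset)(a)       # __call__ builds the graph node via make_from_op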
165 | 166 | To keep things simple, we do not support nested tuples. 167 | """ 168 | 169 | def __len__(self): 170 | cdata = self.realize_cached_data() 171 | return len(cdata) 172 | 173 | def __getitem__(self, index: int): 174 | return needle.ops.tuple_get_item(self, index) 175 | 176 | def tuple(self): 177 | return tuple([x for x in self]) 178 | 179 | def __repr__(self): 180 | return "needle.TensorTuple" + str(self.tuple()) 181 | 182 | def __str__(self): 183 | return self.__repr__() 184 | 185 | def __add__(self, other): 186 | assert isinstance(other, TensorTuple) 187 | assert len(self) == len(other) 188 | return needle.ops.make_tuple(*[self[i] + other[i] for i in range(len(self))]) 189 | 190 | def detach(self): 191 | """Create a new tensor that shares the data but detaches from the graph.""" 192 | return Tuple.make_const(self.realize_cached_data()) 193 | 194 | 195 | class Tensor(Value): 196 | grad: "Tensor" 197 | 198 | def __init__( 199 | self, 200 | array, 201 | *, 202 | device: Optional[Device] = None, 203 | dtype=None, 204 | requires_grad=True, 205 | **kwargs 206 | ): 207 | if isinstance(array, Tensor): 208 | if device is None: 209 | device = array.device 210 | if dtype is None: 211 | dtype = array.dtype 212 | if device == array.device and dtype == array.dtype: 213 | cached_data = array.realize_cached_data() 214 | else: 215 | # fall back, copy through numpy conversion 216 | cached_data = Tensor._array_from_numpy( 217 | array.numpy(), device=device, dtype=dtype 218 | ) 219 | else: 220 | device = device if device else cpu() 221 | cached_data = Tensor._array_from_numpy(array, device=device, dtype=dtype) 222 | 223 | self._init( 224 | None, 225 | [], 226 | cached_data=cached_data, 227 | requires_grad=requires_grad, 228 | ) 229 | 230 | @staticmethod 231 | def _array_from_numpy(numpy_array, device, dtype): 232 | if array_api is numpy: 233 | return numpy.array(numpy_array, dtype=dtype) 234 | return array_api.array(numpy_array, device=device, dtype=dtype) 235 | 236 | @staticmethod 237 | def make_from_op(op: Op, inputs: List["Value"]): 238 | tensor = Tensor.__new__(Tensor) 239 | tensor._init(op, inputs) 240 | if not LAZY_MODE: 241 | if not tensor.requires_grad: 242 | return tensor.detach() 243 | tensor.realize_cached_data() 244 | return tensor 245 | 246 | @staticmethod 247 | def make_const(data, requires_grad=False): 248 | tensor = Tensor.__new__(Tensor) 249 | tensor._init( 250 | None, 251 | [], 252 | cached_data=data 253 | if not isinstance(data, Tensor) 254 | else data.realize_cached_data(), 255 | requires_grad=requires_grad, 256 | ) 257 | return tensor 258 | 259 | @property 260 | def data(self): 261 | return self.detach() 262 | 263 | @data.setter 264 | def data(self, value): 265 | assert isinstance(value, Tensor) 266 | assert value.dtype == self.dtype, "%s %s" % ( 267 | value.dtype, 268 | self.dtype, 269 | ) 270 | self.cached_data = value.realize_cached_data() 271 | 272 | def detach(self): 273 | """Create a new tensor that shares the data but detaches from the graph.""" 274 | return Tensor.make_const(self.realize_cached_data()) 275 | 276 | @property 277 | def shape(self): 278 | return self.realize_cached_data().shape 279 | 280 | @property 281 | def dtype(self): 282 | return self.realize_cached_data().dtype 283 | 284 | @property 285 | def device(self): 286 | data = self.realize_cached_data() 287 | # numpy array always sits on cpu 288 | if array_api is numpy: 289 | return cpu() 290 | return data.device 291 | 292 | def backward(self, out_grad=None): 293 | out_grad = ( 294 | out_grad 295 | if 
out_grad 296 | else init.ones(*self.shape, dtype=self.dtype, device=self.device) 297 | ) 298 | compute_gradient_of_variables(self, out_grad) 299 | 300 | def __repr__(self): 301 | return "needle.Tensor(" + str(self.realize_cached_data()) + ")" 302 | 303 | def __str__(self): 304 | return self.realize_cached_data().__str__() 305 | 306 | def numpy(self): 307 | data = self.realize_cached_data() 308 | if array_api is numpy: 309 | return data 310 | return data.numpy() 311 | 312 | def __add__(self, other): 313 | if isinstance(other, Tensor): 314 | return needle.ops.EWiseAdd()(self, other) 315 | else: 316 | return needle.ops.AddScalar(other)(self) 317 | 318 | def __mul__(self, other): 319 | if isinstance(other, Tensor): 320 | return needle.ops.EWiseMul()(self, other) 321 | else: 322 | return needle.ops.MulScalar(other)(self) 323 | 324 | def __pow__(self, other): 325 | if isinstance(other, Tensor): 326 | return needle.ops.EWisePow()(self, other) 327 | else: 328 | return needle.ops.PowerScalar(other)(self) 329 | 330 | def __sub__(self, other): 331 | if isinstance(other, Tensor): 332 | return needle.ops.EWiseAdd()(self, needle.ops.Negate()(other)) 333 | else: 334 | return needle.ops.AddScalar(-other)(self) 335 | 336 | def __truediv__(self, other): 337 | if isinstance(other, Tensor): 338 | return needle.ops.EWiseDiv()(self, other) 339 | else: 340 | return needle.ops.DivScalar(other)(self) 341 | 342 | def __matmul__(self, other): 343 | return needle.ops.MatMul()(self, other) 344 | 345 | def matmul(self, other): 346 | return needle.ops.MatMul()(self, other) 347 | 348 | def sum(self, axes=None): 349 | return needle.ops.Summation(axes)(self) 350 | 351 | def broadcast_to(self, shape): 352 | return needle.ops.BroadcastTo(shape)(self) 353 | 354 | def reshape(self, shape): 355 | return needle.ops.Reshape(shape)(self) 356 | 357 | def __neg__(self): 358 | return needle.ops.Negate()(self) 359 | 360 | def transpose(self, axes=None): 361 | return needle.ops.Transpose(axes)(self) 362 | 363 | __radd__ = __add__ 364 | __rmul__ = __mul__ 365 | __rsub__ = __sub__ 366 | __rmatmul__ = __matmul__ 367 | 368 | 369 | def compute_gradient_of_variables(output_tensor, out_grad): 370 | """Take gradient of output node with respect to each node in node_list. 371 | 372 | Store the computed result in the grad field of each Variable. 373 | """ 374 | # a map from node to a list of gradient contributions from each output node 375 | node_to_output_grads_list: Dict[Tensor, List[Tensor]] = {} 376 | # Special note on initializing gradient of 377 | # We are really taking a derivative of the scalar reduce_sum(output_node) 378 | # instead of the vector output_node. But this is the common case for loss function. 379 | node_to_output_grads_list[output_tensor] = [out_grad] 380 | 381 | # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt. 382 | reverse_topo_order = list(reversed(find_topo_sort([output_tensor]))) 383 | 384 | ### BEGIN YOUR SOLUTION 385 | raise NotImplementedError() 386 | ### END YOUR SOLUTION 387 | 388 | 389 | def find_topo_sort(node_list: List[Value]) -> List[Value]: 390 | """Given a list of nodes, return a topological sort list of nodes ending in them. 391 | 392 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 393 | going backwards based on input edges. Since a node is added to the ordering 394 | after all its predecessors are traversed due to post-order DFS, we get a topological 395 | sort. 
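A generic post-order DFS along the lines described above might look like the following sketch (illustrative only, not the graded reference implementation; the helper name is ours):

def _topo_sort_sketch(node_list):
    visited, order = set(), []
    def dfs(node):
        if id(node) in visited:
            return
        visited.add(id(node))
        for pred in node.inputs:    # walk backwards along input edges
            dfs(pred)
        order.append(node)          # appended only after all predecessors
    for node in node_list:
        dfs(node)
    return order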
396 | """ 397 | ### BEGIN YOUR SOLUTION 398 | raise NotImplementedError() 399 | ### END YOUR SOLUTION 400 | 401 | 402 | def topo_sort_dfs(node, visited, topo_order): 403 | """Post-order DFS""" 404 | ### BEGIN YOUR SOLUTION 405 | raise NotImplementedError() 406 | ### END YOUR SOLUTION 407 | 408 | 409 | ############################## 410 | ####### Helper Methods ####### 411 | ############################## 412 | 413 | 414 | def sum_node_list(node_list): 415 | """Custom sum function in order to avoid create redundant nodes in Python sum implementation.""" 416 | from operator import add 417 | from functools import reduce 418 | 419 | return reduce(add, node_list) 420 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/__init__.py: -------------------------------------------------------------------------------- 1 | from .ndarray import * 2 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray.py: -------------------------------------------------------------------------------- 1 | import operator 2 | import math 3 | from functools import reduce 4 | import numpy as np 5 | from . import ndarray_backend_numpy 6 | from . import ndarray_backend_cpu 7 | 8 | 9 | # math.prod not in Python 3.7 10 | def prod(x): 11 | return reduce(operator.mul, x, 1) 12 | 13 | 14 | class BackendDevice: 15 | """A backend device, wrapps the implementation module.""" 16 | 17 | def __init__(self, name, mod): 18 | self.name = name 19 | self.mod = mod 20 | 21 | def __eq__(self, other): 22 | return self.name == other.name 23 | 24 | def __repr__(self): 25 | return self.name + "()" 26 | 27 | def __getattr__(self, name): 28 | return getattr(self.mod, name) 29 | 30 | def enabled(self): 31 | return self.mod is not None 32 | 33 | def randn(self, *shape, dtype="float32"): 34 | # note: numpy doesn't support types within standard random routines, and 35 | # .astype("float32") does work if we're generating a singleton 36 | return NDArray(np.random.randn(*shape).astype(dtype), device=self) 37 | 38 | def rand(self, *shape, dtype="float32"): 39 | # note: numpy doesn't support types within standard random routines, and 40 | # .astype("float32") does work if we're generating a singleton 41 | return NDArray(np.random.rand(*shape).astype(dtype), device=self) 42 | 43 | def one_hot(self, n, i, dtype="float32"): 44 | return NDArray(np.eye(n, dtype=dtype)[i], device=self) 45 | 46 | def empty(self, shape, dtype="float32"): 47 | dtype = "float32" if dtype is None else dtype 48 | assert dtype == "float32" 49 | return NDArray.make(shape, device=self) 50 | 51 | def full(self, shape, fill_value, dtype="float32"): 52 | dtype = "float32" if dtype is None else dtype 53 | assert dtype == "float32" 54 | arr = self.empty(shape, dtype) 55 | arr.fill(fill_value) 56 | return arr 57 | 58 | 59 | def cuda(): 60 | """Return cuda device""" 61 | try: 62 | from . 
import ndarray_backend_cuda 63 | 64 | return BackendDevice("cuda", ndarray_backend_cuda) 65 | except ImportError: 66 | return BackendDevice("cuda", None) 67 | 68 | 69 | def cpu_numpy(): 70 | """Return numpy device""" 71 | return BackendDevice("cpu_numpy", ndarray_backend_numpy) 72 | 73 | 74 | def cpu(): 75 | """Return cpu device""" 76 | return BackendDevice("cpu", ndarray_backend_cpu) 77 | 78 | 79 | def default_device(): 80 | return cpu_numpy() 81 | 82 | 83 | def all_devices(): 84 | """return a list of all available devices""" 85 | return [cpu(), cuda(), cpu_numpy()] 86 | 87 | 88 | class NDArray: 89 | """A generic ND array class that may contain multipe different backends 90 | i.e., a Numpy backend, a native CPU backend, or a GPU backend. 91 | 92 | This class will only contains those functions that you need to implement 93 | to actually get the desired functionality for the programming examples 94 | in the homework, and no more. 95 | 96 | For now, for simplicity the class only supports float32 types, though 97 | this can be extended if desired. 98 | """ 99 | 100 | def __init__(self, other, device=None): 101 | """Create by copying another NDArray, or from numpy""" 102 | if isinstance(other, NDArray): 103 | # create a copy of existing NDArray 104 | if device is None: 105 | device = other.device 106 | self._init(other.to(device) + 0.0) # this creates a copy 107 | elif isinstance(other, np.ndarray): 108 | # create copy from numpy array 109 | device = device if device is not None else default_device() 110 | array = self.make(other.shape, device=device) 111 | array.device.from_numpy(np.ascontiguousarray(other), array._handle) 112 | self._init(array) 113 | else: 114 | # see if we can create a numpy array from input 115 | array = NDArray(np.array(other), device=device) 116 | self._init(array) 117 | 118 | def _init(self, other): 119 | self._shape = other._shape 120 | self._strides = other._strides 121 | self._offset = other._offset 122 | self._device = other._device 123 | self._handle = other._handle 124 | 125 | @staticmethod 126 | def compact_strides(shape): 127 | """Utility function to compute compact strides""" 128 | stride = 1 129 | res = [] 130 | for i in range(1, len(shape) + 1): 131 | res.append(stride) 132 | stride *= shape[-i] 133 | return tuple(res[::-1]) 134 | 135 | @staticmethod 136 | def make(shape, strides=None, device=None, handle=None, offset=0): 137 | """Create a new NDArray with the given properties. 
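As a point of reference for compact_strides above: strides here are counted in elements (not bytes) and follow row-major order, so for example

# shape (2, 3, 4) -> strides (12, 4, 1): element [i, j, k] sits at flat
# position i*12 + j*4 + k in the buffer.
assert NDArray.compact_strides((2, 3, 4)) == (12, 4, 1)
assert NDArray.compact_strides((5,)) == (1,)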
This will allocation the 138 | memory if handle=None, otherwise it will use the handle of an existing 139 | array.""" 140 | array = NDArray.__new__(NDArray) 141 | array._shape = tuple(shape) 142 | array._strides = NDArray.compact_strides(shape) if strides is None else strides 143 | array._offset = offset 144 | array._device = device if device is not None else default_device() 145 | if handle is None: 146 | array._handle = array.device.Array(prod(shape)) 147 | else: 148 | array._handle = handle 149 | return array 150 | 151 | ### Properies and string representations 152 | @property 153 | def shape(self): 154 | return self._shape 155 | 156 | @property 157 | def strides(self): 158 | return self._strides 159 | 160 | @property 161 | def device(self): 162 | return self._device 163 | 164 | @property 165 | def dtype(self): 166 | # only support float32 for now 167 | return "float32" 168 | 169 | @property 170 | def ndim(self): 171 | """Return number of dimensions.""" 172 | return len(self._shape) 173 | 174 | @property 175 | def size(self): 176 | return prod(self._shape) 177 | 178 | def __repr__(self): 179 | return "NDArray(" + self.numpy().__str__() + f", device={self.device})" 180 | 181 | def __str__(self): 182 | return self.numpy().__str__() 183 | 184 | ### Basic array manipulation 185 | def fill(self, value): 186 | """Fill (in place) with a constant value.""" 187 | self._device.fill(self._handle, value) 188 | 189 | def to(self, device): 190 | """Convert between devices, using to/from numpy calls as the unifying bridge.""" 191 | if device == self.device: 192 | return self 193 | else: 194 | return NDArray(self.numpy(), device=device) 195 | 196 | def numpy(self): 197 | """convert to a numpy array""" 198 | return self.device.to_numpy( 199 | self._handle, self.shape, self.strides, self._offset 200 | ) 201 | 202 | def is_compact(self): 203 | """Return true if array is compact in memory and internal size equals product 204 | of the shape dimensions""" 205 | return ( 206 | self._strides == self.compact_strides(self._shape) 207 | and prod(self.shape) == self._handle.size 208 | ) 209 | 210 | def compact(self): 211 | """Convert a matrix to be compact""" 212 | if self.is_compact(): 213 | return self 214 | else: 215 | out = NDArray.make(self.shape, device=self.device) 216 | self.device.compact( 217 | self._handle, out._handle, self.shape, self.strides, self._offset 218 | ) 219 | return out 220 | 221 | def as_strided(self, shape, strides): 222 | """Restride the matrix without copying memory.""" 223 | assert len(shape) == len(strides) 224 | return NDArray.make( 225 | shape, strides=strides, device=self.device, handle=self._handle 226 | ) 227 | 228 | @property 229 | def flat(self): 230 | return self.reshape((self.size,)) 231 | 232 | def reshape(self, new_shape): 233 | """ 234 | Reshape the matrix without copying memory. This will return a matrix 235 | that corresponds to a reshaped array but points to the same memory as 236 | the original array. 237 | 238 | Raises: 239 | ValueError if product of current shape is not equal to the product 240 | of the new shape, or if the matrix is not compact. 241 | 242 | Args: 243 | new_shape (tuple): new shape of the array 244 | 245 | Returns: 246 | NDArray : reshaped array; this will point to thep 247 | """ 248 | 249 | ### BEGIN YOUR SOLUTION 250 | raise NotImplementedError() 251 | ### END YOUR SOLUTION 252 | 253 | def permute(self, new_axes): 254 | """ 255 | Permute order of the dimensions. 
new_axes describes a permuation of the 256 | existing axes, so e.g.: 257 | - If we have an array with dimension "BHWC" then .permute((0,3,1,2)) 258 | would convert this to "BCHW" order. 259 | - For a 2D array, .permute((1,0)) would transpose the array. 260 | Like reshape, this operation should not copy memory, but achieves the 261 | permuting by just adjusting the shape/strides of the array. That is, 262 | it returns a new array that has the dimensions permuted as desired, but 263 | which points to the same memroy as the original array. 264 | 265 | Args: 266 | new_axes (tuple): permuation order of the dimensions 267 | 268 | Returns: 269 | NDarray : new NDArray object with permuted dimensions, pointing 270 | to the same memory as the original NDArray (i.e., just shape and 271 | strides changed). 272 | """ 273 | 274 | ### BEGIN YOUR SOLUTION 275 | raise NotImplementedError() 276 | ### END YOUR SOLUTION 277 | 278 | def broadcast_to(self, new_shape): 279 | """ 280 | Broadcast an array to a new shape. new_shape's elements must be the 281 | same as the original shape, except for dimensions in the self where 282 | the size = 1 (which can then be broadcast to any size). As with the 283 | previous calls, this will not copy memory, and just achieves 284 | broadcasting by manipulating the strides. 285 | 286 | Raises: 287 | assertion error if new_shape[i] != shape[i] for all i where 288 | shape[i] != 1 289 | 290 | Args: 291 | new_shape (tuple): shape to broadcast to 292 | 293 | Returns: 294 | NDArray: the new NDArray object with the new broadcast shape; should 295 | point to the same memory as the original array. 296 | """ 297 | 298 | ### BEGIN YOUR SOLUTION 299 | raise NotImplementedError() 300 | ### END YOUR SOLUTION 301 | 302 | ### Get and set elements 303 | 304 | def process_slice(self, sl, dim): 305 | """Convert a slice to an explicit start/stop/step""" 306 | start, stop, step = sl.start, sl.stop, sl.step 307 | if start == None: 308 | start = 0 309 | if start < 0: 310 | start = self.shape[dim] 311 | if stop == None: 312 | stop = self.shape[dim] 313 | if stop < 0: 314 | stop = self.shape[dim] + stop 315 | if step == None: 316 | step = 1 317 | 318 | # we're not gonna handle negative strides and that kind of thing 319 | assert stop > start, "Start must be less than stop" 320 | assert step > 0, "No support for negative increments" 321 | return slice(start, stop, step) 322 | 323 | def __getitem__(self, idxs): 324 | """ 325 | The __getitem__ operator in Python allows us to access elements of our 326 | array. When passed notation such as a[1:5,:-1:2,4,:] etc, Python will 327 | convert this to a tuple of slices and integers (for singletons like the 328 | '4' in this example). Slices can be a bit odd to work with (they have 329 | three elements .start .stop .step), which can be None or have negative 330 | entries, so for simplicity we wrote the code for you to convert these 331 | to always be a tuple of slices, one of each dimension. 332 | 333 | For this tuple of slices, return an array that subsets the desired 334 | elements. As before, this can be done entirely through compute a new 335 | shape, stride, and offset for the new "view" into the original array, 336 | pointing to the same memory 337 | 338 | Raises: 339 | AssertionError if a slice has negative size or step, or if number 340 | of slices is not equal to the number of dimension (the stub code 341 | already raises all these errors. 
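One concrete way to picture the stride manipulation that broadcast_to describes (our own illustration built on as_strided, not the graded solution): a size-1 axis can simply be given stride 0, so every index along it reads the same memory.

import numpy as np
a = NDArray(np.arange(3.0).reshape(3, 1))   # compact, strides (1, 1)
view = a.as_strided((3, 4), (1, 0))         # same buffer; stride 0 repeats axis 1
# each row of view.numpy() repeats its single value four times; no copy is made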
342 | 343 | Args: 344 | idxs tuple: (after stub code processes), a tuple of slice elements 345 | coresponding to the subset of the matrix to get 346 | 347 | Returns: 348 | NDArray: a new NDArray object corresponding to the selected 349 | subset of elements. As before, this should not copy memroy but just 350 | manipulate the shape/strides/offset of the new array, referecing 351 | the same array as the original one. 352 | """ 353 | 354 | # handle singleton as tuple, everything as slices 355 | if not isinstance(idxs, tuple): 356 | idxs = (idxs,) 357 | idxs = tuple( 358 | [ 359 | self.process_slice(s, i) if isinstance(s, slice) else slice(s, s + 1, 1) 360 | for i, s in enumerate(idxs) 361 | ] 362 | ) 363 | assert len(idxs) == self.ndim, "Need indexes equal to number of dimensions" 364 | 365 | ### BEGIN YOUR SOLUTION 366 | raise NotImplementedError() 367 | ### END YOUR SOLUTION 368 | 369 | def __setitem__(self, idxs, other): 370 | """Set the values of a view into an array, using the same semantics 371 | as __getitem__().""" 372 | view = self.__getitem__(idxs) 373 | if isinstance(other, NDArray): 374 | assert prod(view.shape) == prod(other.shape) 375 | self.device.ewise_setitem( 376 | other.compact()._handle, 377 | view._handle, 378 | view.shape, 379 | view.strides, 380 | view._offset, 381 | ) 382 | else: 383 | self.device.scalar_setitem( 384 | prod(view.shape), 385 | other, 386 | view._handle, 387 | view.shape, 388 | view.strides, 389 | view._offset, 390 | ) 391 | 392 | ### Collection of elementwise and scalar function: add, multiply, boolean, etc 393 | 394 | def ewise_or_scalar(self, other, ewise_func, scalar_func): 395 | """Run either an elementwise or scalar version of a function, 396 | depending on whether "other" is an NDArray or scalar 397 | """ 398 | out = NDArray.make(self.shape, device=self.device) 399 | if isinstance(other, NDArray): 400 | assert self.shape == other.shape, "operation needs two equal-sized arrays" 401 | ewise_func(self.compact()._handle, other.compact()._handle, out._handle) 402 | else: 403 | scalar_func(self.compact()._handle, other, out._handle) 404 | return out 405 | 406 | def __add__(self, other): 407 | return self.ewise_or_scalar( 408 | other, self.device.ewise_add, self.device.scalar_add 409 | ) 410 | 411 | __radd__ = __add__ 412 | 413 | def __sub__(self, other): 414 | return self + (-other) 415 | 416 | def __rsub__(self, other): 417 | return other + (-self) 418 | 419 | def __mul__(self, other): 420 | return self.ewise_or_scalar( 421 | other, self.device.ewise_mul, self.device.scalar_mul 422 | ) 423 | 424 | __rmul__ = __mul__ 425 | 426 | def __truediv__(self, other): 427 | return self.ewise_or_scalar( 428 | other, self.device.ewise_div, self.device.scalar_div 429 | ) 430 | 431 | def __neg__(self): 432 | return self * (-1) 433 | 434 | def __pow__(self, other): 435 | out = NDArray.make(self.shape, device=self.device) 436 | self.device.scalar_power(self.compact()._handle, other, out._handle) 437 | return out 438 | 439 | def maximum(self, other): 440 | return self.ewise_or_scalar( 441 | other, self.device.ewise_maximum, self.device.scalar_maximum 442 | ) 443 | 444 | ### Binary operators all return (0.0, 1.0) floating point values, could of course be optimized 445 | def __eq__(self, other): 446 | return self.ewise_or_scalar(other, self.device.ewise_eq, self.device.scalar_eq) 447 | 448 | def __ge__(self, other): 449 | return self.ewise_or_scalar(other, self.device.ewise_ge, self.device.scalar_ge) 450 | 451 | def __ne__(self, other): 452 | return 1 - (self == other) 
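The slicing machinery above never copies data; a sliced view just gets a recomputed shape, strides, and offset over the same handle. A small worked example of that arithmetic (our own illustration, consistent with the docstrings above rather than the graded solution):

import numpy as np
a = NDArray(np.arange(24.0).reshape(4, 6))   # compact: strides (6, 1), offset 0
# The slice a[1:4:2, 2:6:3] picks rows {1, 3} and columns {2, 5}:
#   shape   -> (2, 2)
#   strides -> (6*2, 1*3) = (12, 3)   (old stride times the slice step)
#   offset  -> 1*6 + 2*1  = 8         (starts dotted with the old strides)
view = NDArray.make((2, 2), strides=(12, 3), device=a.device,
                    handle=a._handle, offset=8)
# view.numpy() is [[ 8. 11.] [20. 23.]] -- same memory as `a`, nothing copied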
453 | 454 | def __gt__(self, other): 455 | return (self >= other) * (self != other) 456 | 457 | def __lt__(self, other): 458 | return 1 - (self >= other) 459 | 460 | def __le__(self, other): 461 | return 1 - (self > other) 462 | 463 | ### Elementwise functions 464 | 465 | def log(self): 466 | out = NDArray.make(self.shape, device=self.device) 467 | self.device.ewise_log(self.compact()._handle, out._handle) 468 | return out 469 | 470 | def exp(self): 471 | out = NDArray.make(self.shape, device=self.device) 472 | self.device.ewise_exp(self.compact()._handle, out._handle) 473 | return out 474 | 475 | def tanh(self): 476 | out = NDArray.make(self.shape, device=self.device) 477 | self.device.ewise_tanh(self.compact()._handle, out._handle) 478 | return out 479 | 480 | ### Matrix multiplication 481 | def __matmul__(self, other): 482 | """Matrix multplication of two arrays. This requires that both arrays 483 | be 2D (i.e., we don't handle batch matrix multiplication), and that the 484 | sizes match up properly for matrix multiplication. 485 | 486 | In the case of the CPU backend, you will implement an efficient "tiled" 487 | version of matrix multiplication for the case when all dimensions of 488 | the array are divisible by self.device.__tile_size__. In this case, 489 | the code below will restride and compact the matrix into tiled form, 490 | and then pass to the relevant CPU backend. For the CPU version we will 491 | just fall back to the naive CPU implementation if the array shape is not 492 | a multiple of the tile size 493 | 494 | The GPU (and numpy) versions don't have any tiled version (or rather, 495 | the GPU version will just work natively by tiling any input size). 496 | """ 497 | 498 | assert self.ndim == 2 and other.ndim == 2 499 | assert self.shape[1] == other.shape[0] 500 | 501 | m, n, p = self.shape[0], self.shape[1], other.shape[1] 502 | 503 | # if the matrix is aligned, use tiled matrix multiplication 504 | if hasattr(self.device, "matmul_tiled") and all( 505 | d % self.device.__tile_size__ == 0 for d in (m, n, p) 506 | ): 507 | 508 | def tile(a, tile): 509 | return a.as_strided( 510 | (a.shape[0] // tile, a.shape[1] // tile, tile, tile), 511 | (a.shape[1] * tile, tile, self.shape[1], 1), 512 | ) 513 | 514 | t = self.device.__tile_size__ 515 | a = tile(self.compact(), t).compact() 516 | b = tile(other.compact(), t).compact() 517 | out = NDArray.make((a.shape[0], b.shape[1], t, t), device=self.device) 518 | self.device.matmul_tiled(a._handle, b._handle, out._handle, m, n, p) 519 | 520 | return ( 521 | out.permute((0, 2, 1, 3)) 522 | .compact() 523 | .reshape((self.shape[0], other.shape[1])) 524 | ) 525 | 526 | else: 527 | out = NDArray.make((m, p), device=self.device) 528 | self.device.matmul( 529 | self.compact()._handle, other.compact()._handle, out._handle, m, n, p 530 | ) 531 | return out 532 | 533 | ### Reductions, i.e., sum/max over all element or over given axis 534 | def reduce_view_out(self, axis): 535 | """Return a view to the array set up for reduction functions and output array.""" 536 | if axis is None: 537 | view = self.reshape((1,) * (self.ndim - 1) + (prod(self.shape),)) 538 | out = NDArray.make((1,) * self.ndim, device=self.device) 539 | else: 540 | if isinstance(axis, (tuple, list)): 541 | assert len(axis) == 1, "Only support reduction over a single axis" 542 | axis = axis[0] 543 | 544 | view = self.permute( 545 | tuple([a for a in range(self.ndim) if a != axis]) + (axis,) 546 | ) 547 | out = NDArray.make( 548 | tuple([1 if i == axis else s for i, s in 
enumerate(self.shape)]), 549 | device=self.device, 550 | ) 551 | return view, out 552 | 553 | def sum(self, axis=None): 554 | view, out = self.reduce_view_out(axis) 555 | self.device.reduce_sum(view.compact()._handle, out._handle, view.shape[-1]) 556 | return out 557 | 558 | def max(self, axis=None): 559 | view, out = self.reduce_view_out(axis) 560 | self.device.reduce_max(view.compact()._handle, out._handle, view.shape[-1]) 561 | return out 562 | 563 | 564 | def array(a, dtype="float32", device=None): 565 | """Convenience methods to match numpy a bit more closely.""" 566 | dtype = "float32" if dtype is None else dtype 567 | assert dtype == "float32" 568 | return NDArray(a, device=device) 569 | 570 | 571 | def empty(shape, dtype="float32", device=None): 572 | device = device if device is not None else default_device() 573 | return device.empty(shape, dtype) 574 | 575 | 576 | def full(shape, fill_value, dtype="float32", device=None): 577 | device = device if device is not None else default_device() 578 | return device.full(shape, fill_value, dtype) 579 | 580 | 581 | def broadcast_to(array, new_shape): 582 | return array.broadcast_to(new_shape) 583 | 584 | 585 | def reshape(array, new_shape): 586 | return array.reshape(new_shape) 587 | 588 | 589 | def maximum(a, b): 590 | return a.maximum(b) 591 | 592 | 593 | def log(a): 594 | return a.log() 595 | 596 | 597 | def exp(a): 598 | return a.exp() 599 | 600 | 601 | def tanh(a): 602 | return a.tanh() 603 | 604 | 605 | def sum(a, axis=None): 606 | return a.sum(axis=axis) 607 | -------------------------------------------------------------------------------- /python/needle/backend_ndarray/ndarray_backend_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | __device_name__ = "numpy" 5 | _datatype = np.float32 6 | _datetype_size = np.dtype(_datatype).itemsize 7 | 8 | 9 | class Array: 10 | def __init__(self, size): 11 | self.array = np.empty(size, dtype=np.float32) 12 | 13 | @property 14 | def size(self): 15 | return self.array.size 16 | 17 | 18 | def to_numpy(a, shape, strides, offset): 19 | return np.lib.stride_tricks.as_strided( 20 | a.array[offset:], shape, tuple([s * _datetype_size for s in strides]) 21 | ) 22 | 23 | 24 | def from_numpy(a, out): 25 | out.array[:] = a.flatten() 26 | 27 | 28 | def fill(out, val): 29 | out.array.fill(val) 30 | 31 | 32 | def compact(a, out, shape, strides, offset): 33 | out.array[:] = to_numpy(a, shape, strides, offset).flatten() 34 | 35 | 36 | def ewise_setitem(a, out, shape, strides, offset): 37 | to_numpy(out, shape, strides, offset)[:] = a.array.reshape(shape) 38 | 39 | 40 | def scalar_setitem(size, val, out, shape, strides, offset): 41 | to_numpy(out, shape, strides, offset)[:] = val 42 | 43 | 44 | def ewise_add(a, b, out): 45 | out.array[:] = a.array + b.array 46 | 47 | 48 | def scalar_add(a, val, out): 49 | out.array[:] = a.array + val 50 | 51 | 52 | def ewise_mul(a, b, out): 53 | out.array[:] = a.array * b.array 54 | 55 | 56 | def scalar_mul(a, val, out): 57 | out.array[:] = a.array * val 58 | 59 | 60 | def ewise_div(a, b, out): 61 | out.array[:] = a.array / b.array 62 | 63 | 64 | def scalar_div(a, val, out): 65 | out.array[:] = a.array / val 66 | 67 | 68 | def scalar_power(a, val, out): 69 | out.array[:] = a.array**val 70 | 71 | 72 | def ewise_maximum(a, b, out): 73 | out.array[:] = np.maximum(a.array, b.array) 74 | 75 | 76 | def scalar_maximum(a, val, out): 77 | out.array[:] = np.maximum(a.array, val) 78 | 79 | 80 | def ewise_eq(a, b, 
out): 81 | out.array[:] = (a.array == b.array).astype(np.float32) 82 | 83 | 84 | def scalar_eq(a, val, out): 85 | out.array[:] = (a.array == val).astype(np.float32) 86 | 87 | 88 | def ewise_ge(a, b, out): 89 | out.array[:] = (a.array >= b.array).astype(np.float32) 90 | 91 | 92 | def scalar_ge(a, val, out): 93 | out.array[:] = (a.array >= val).astype(np.float32) 94 | 95 | 96 | def ewise_log(a, out): 97 | out.array[:] = np.log(a.array) 98 | 99 | 100 | def ewise_exp(a, out): 101 | out.array[:] = np.exp(a.array) 102 | 103 | 104 | def ewise_tanh(a, out): 105 | out.array[:] = np.tanh(a.array) 106 | 107 | 108 | def matmul(a, b, out, m, n, p): 109 | out.array[:] = (a.array.reshape(m, n) @ b.array.reshape(n, p)).reshape(-1) 110 | 111 | 112 | def reduce_max(a, out, reduce_size): 113 | out.array[:] = a.array[:].reshape(-1, reduce_size).max(axis=1) 114 | 115 | 116 | def reduce_sum(a, out, reduce_size): 117 | out.array[:] = a.array[:].reshape(-1, reduce_size).sum(axis=1) 118 | -------------------------------------------------------------------------------- /python/needle/backend_numpy.py: -------------------------------------------------------------------------------- 1 | """This file defies specific implementations of devices when using numpy as NDArray backend. 2 | """ 3 | import numpy 4 | 5 | 6 | class Device: 7 | """Baseclass of all device""" 8 | 9 | 10 | class CPUDevice(Device): 11 | """Represents data that sits in CPU""" 12 | 13 | def __repr__(self): 14 | return "needle.cpu()" 15 | 16 | def __hash__(self): 17 | return self.__repr__().__hash__() 18 | 19 | def __eq__(self, other): 20 | return isinstance(other, CPUDevice) 21 | 22 | def enabled(self): 23 | return True 24 | 25 | def zeros(self, *shape, dtype="float32"): 26 | return numpy.zeros(shape, dtype=dtype) 27 | 28 | def ones(self, *shape, dtype="float32"): 29 | return numpy.ones(shape, dtype=dtype) 30 | 31 | def randn(self, *shape): 32 | # note: numpy doesn't support types within standard random routines, and 33 | # .astype("float32") does work if we're generating a singleton 34 | return numpy.random.randn(*shape) 35 | 36 | def rand(self, *shape): 37 | # note: numpy doesn't support types within standard random routines, and 38 | # .astype("float32") does work if we're generating a singleton 39 | return numpy.random.rand(*shape) 40 | 41 | def one_hot(self, n, i, dtype="float32"): 42 | return numpy.eye(n, dtype=dtype)[i] 43 | 44 | def empty(self, shape, dtype="float32"): 45 | return numpy.empty(shape, dtype=dtype) 46 | 47 | def full(self, shape, fill_value, dtype="float32"): 48 | return numpy.full(shape, fill_value, dtype=dtype) 49 | 50 | 51 | def cpu(): 52 | """Return cpu device""" 53 | return CPUDevice() 54 | 55 | 56 | def default_device(): 57 | return cpu() 58 | 59 | 60 | def all_devices(): 61 | """return a list of all available devices""" 62 | return [cpu()] 63 | -------------------------------------------------------------------------------- /python/needle/backend_selection.py: -------------------------------------------------------------------------------- 1 | """Logic for backend selection""" 2 | import os 3 | 4 | 5 | BACKEND = os.environ.get("NEEDLE_BACKEND", "nd") 6 | 7 | 8 | if BACKEND == "nd": 9 | print("Using needle backend") 10 | from . 
import backend_ndarray as array_api 11 | from .backend_ndarray import ( 12 | all_devices, 13 | cuda, 14 | cpu, 15 | cpu_numpy, 16 | default_device, 17 | BackendDevice as Device, 18 | ) 19 | 20 | NDArray = array_api.NDArray 21 | elif BACKEND == "np": 22 | print("Using numpy backend") 23 | import numpy as array_api 24 | from .backend_numpy import all_devices, cpu, default_device, Device 25 | 26 | NDArray = array_api.ndarray 27 | else: 28 | raise RuntimeError("Unknown needle array backend %s" % BACKEND) 29 | -------------------------------------------------------------------------------- /python/needle/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_basic import * 2 | from .data_transforms import * 3 | from .datasets import * 4 | -------------------------------------------------------------------------------- /python/needle/data/data_basic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..autograd import Tensor 3 | 4 | from typing import Iterator, Optional, List, Sized, Union, Iterable, Any 5 | 6 | 7 | 8 | class Dataset: 9 | r"""An abstract class representing a `Dataset`. 10 | 11 | All subclasses should overwrite :meth:`__getitem__`, supporting fetching a 12 | data sample for a given key. Subclasses must also overwrite 13 | :meth:`__len__`, which is expected to return the size of the dataset. 14 | """ 15 | 16 | def __init__(self, transforms: Optional[List] = None): 17 | self.transforms = transforms 18 | 19 | def __getitem__(self, index) -> object: 20 | raise NotImplementedError 21 | 22 | def __len__(self) -> int: 23 | raise NotImplementedError 24 | 25 | def apply_transforms(self, x): 26 | if self.transforms is not None: 27 | # apply the transforms 28 | for tform in self.transforms: 29 | x = tform(x) 30 | return x 31 | 32 | 33 | class DataLoader: 34 | r""" 35 | Data loader. Combines a dataset and a sampler, and provides an iterable over 36 | the given dataset. 37 | Args: 38 | dataset (Dataset): dataset from which to load the data. 39 | batch_size (int, optional): how many samples per batch to load 40 | (default: ``1``). 41 | shuffle (bool, optional): set to ``True`` to have the data reshuffled 42 | at every epoch (default: ``False``). 43 | """ 44 | dataset: Dataset 45 | batch_size: Optional[int] 46 | 47 | def __init__( 48 | self, 49 | dataset: Dataset, 50 | batch_size: Optional[int] = 1, 51 | shuffle: bool = False, 52 | ): 53 | 54 | self.dataset = dataset 55 | self.shuffle = shuffle 56 | self.batch_size = batch_size 57 | if not self.shuffle: 58 | self.ordering = np.array_split(np.arange(len(dataset)), 59 | range(batch_size, len(dataset), batch_size)) 60 | 61 | def __iter__(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | return self 66 | 67 | def __next__(self): 68 | ### BEGIN YOUR SOLUTION 69 | raise NotImplementedError() 70 | ### END YOUR SOLUTION 71 | 72 | -------------------------------------------------------------------------------- /python/needle/data/data_transforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Transform: 4 | def __call__(self, x): 5 | raise NotImplementedError 6 | 7 | 8 | class RandomFlipHorizontal(Transform): 9 | def __init__(self, p = 0.5): 10 | self.p = p 11 | 12 | def __call__(self, img): 13 | """ 14 | Horizonally flip an image, specified as an H x W x C NDArray. 
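Treating img as a NumPy array, as the surrounding transform code does, a horizontal flip is just a reversal of the width axis; a minimal sketch of the core operation (not the graded solution, which must also respect the flip_img coin toss below):

# flip the W axis of an H x W x C image; np.flip(img, axis=1) is equivalent
flipped = img[:, ::-1, :]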
15 | Args: 16 | img: H x W x C NDArray of an image 17 | Returns: 18 | H x W x C ndarray corresponding to image flipped with probability self.p 19 | Note: use the provided code to provide randomness, for easier testing 20 | """ 21 | flip_img = np.random.rand() < self.p 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION 25 | 26 | 27 | class RandomCrop(Transform): 28 | def __init__(self, padding=3): 29 | self.padding = padding 30 | 31 | def __call__(self, img): 32 | """ Zero pad and then randomly crop an image. 33 | Args: 34 | img: H x W x C NDArray of an image 35 | Return 36 | H x W x C NAArray of cliped image 37 | Note: generate the image shifted by shift_x, shift_y specified below 38 | """ 39 | shift_x, shift_y = np.random.randint(low=-self.padding, high=self.padding+1, size=2) 40 | ### BEGIN YOUR SOLUTION 41 | raise NotImplementedError() 42 | ### END YOUR SOLUTION 43 | -------------------------------------------------------------------------------- /python/needle/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .mnist_dataset import * 2 | from .ndarray_dataset import * 3 | -------------------------------------------------------------------------------- /python/needle/data/datasets/mnist_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from ..data_basic import Dataset 3 | import numpy as np 4 | 5 | class MNISTDataset(Dataset): 6 | def __init__( 7 | self, 8 | image_filename: str, 9 | label_filename: str, 10 | transforms: Optional[List] = None, 11 | ): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def __getitem__(self, index) -> object: 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | def __len__(self) -> int: 22 | ### BEGIN YOUR SOLUTION 23 | raise NotImplementedError() 24 | ### END YOUR SOLUTION -------------------------------------------------------------------------------- /python/needle/data/datasets/ndarray_dataset.py: -------------------------------------------------------------------------------- 1 | from ..data_basic import Dataset 2 | 3 | class NDArrayDataset(Dataset): 4 | def __init__(self, *arrays): 5 | self.arrays = arrays 6 | 7 | def __len__(self) -> int: 8 | return self.arrays[0].shape[0] 9 | 10 | def __getitem__(self, i) -> object: 11 | return tuple([a[i] for a in self.arrays]) -------------------------------------------------------------------------------- /python/needle/init/__init__.py: -------------------------------------------------------------------------------- 1 | from .init_basic import * 2 | 3 | from .init_initializers import * 4 | -------------------------------------------------------------------------------- /python/needle/init/init_basic.py: -------------------------------------------------------------------------------- 1 | import math 2 | import needle as ndl 3 | 4 | 5 | def rand(*shape, low=0.0, high=1.0, device=None, dtype="float32", requires_grad=False): 6 | """Generate random numbers uniform between low and high""" 7 | device = ndl.cpu() if device is None else device 8 | array = device.rand(*shape) * (high - low) + low 9 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 10 | 11 | 12 | def randn(*shape, mean=0.0, std=1.0, device=None, dtype="float32", requires_grad=False): 13 | """Generate random normal with specified mean and std 
deviation""" 14 | device = ndl.cpu() if device is None else device 15 | array = device.randn(*shape) * std + mean 16 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 17 | 18 | 19 | def constant(*shape, c=1.0, device=None, dtype="float32", requires_grad=False): 20 | """Generate constant Tensor""" 21 | device = ndl.cpu() if device is None else device 22 | array = device.ones(*shape, dtype=dtype) * c # note: can change dtype 23 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 24 | 25 | 26 | def ones(*shape, device=None, dtype="float32", requires_grad=False): 27 | """Generate all-ones Tensor""" 28 | return constant( 29 | *shape, c=1.0, device=device, dtype=dtype, requires_grad=requires_grad 30 | ) 31 | 32 | 33 | def zeros(*shape, device=None, dtype="float32", requires_grad=False): 34 | """Generate all-zeros Tensor""" 35 | return constant( 36 | *shape, c=0.0, device=device, dtype=dtype, requires_grad=requires_grad 37 | ) 38 | 39 | 40 | def randb(*shape, p=0.5, device=None, dtype="bool", requires_grad=False): 41 | """Generate binary random Tensor""" 42 | device = ndl.cpu() if device is None else device 43 | array = device.rand(*shape) <= p 44 | return ndl.Tensor(array, device=device, dtype=dtype, requires_grad=requires_grad) 45 | 46 | 47 | def one_hot(n, i, device=None, dtype="float32", requires_grad=False): 48 | """Generate one-hot encoding Tensor""" 49 | device = ndl.cpu() if device is None else device 50 | return ndl.Tensor( 51 | device.one_hot(n, i.numpy(), dtype=dtype), 52 | device=device, 53 | requires_grad=requires_grad, 54 | ) 55 | 56 | 57 | def zeros_like(array, *, device=None, requires_grad=False): 58 | device = device if device else array.device 59 | return zeros( 60 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 61 | ) 62 | 63 | 64 | def ones_like(array, *, device=None, requires_grad=False): 65 | device = device if device else array.device 66 | return ones( 67 | *array.shape, dtype=array.dtype, device=device, requires_grad=requires_grad 68 | ) 69 | -------------------------------------------------------------------------------- /python/needle/init/init_initializers.py: -------------------------------------------------------------------------------- 1 | import math 2 | from .init_basic import * 3 | 4 | 5 | def xavier_uniform(fan_in, fan_out, gain=1.0, **kwargs): 6 | ### BEGIN YOUR SOLUTION 7 | raise NotImplementedError() 8 | ### END YOUR SOLUTION 9 | 10 | 11 | def xavier_normal(fan_in, fan_out, gain=1.0, **kwargs): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | 17 | def kaiming_uniform(fan_in, fan_out, nonlinearity="relu", **kwargs): 18 | assert nonlinearity == "relu", "Only relu supported currently" 19 | ### BEGIN YOUR SOLUTION 20 | raise NotImplementedError() 21 | ### END YOUR SOLUTION 22 | 23 | 24 | def kaiming_normal(fan_in, fan_out, nonlinearity="relu", **kwargs): 25 | assert nonlinearity == "relu", "Only relu supported currently" 26 | ### BEGIN YOUR SOLUTION 27 | raise NotImplementedError() 28 | ### END YOUR SOLUTION 29 | -------------------------------------------------------------------------------- /python/needle/nn/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_basic import * 2 | -------------------------------------------------------------------------------- /python/needle/nn/nn_basic.py: 
-------------------------------------------------------------------------------- 1 | """The module. 2 | """ 3 | from typing import List, Callable, Any 4 | from needle.autograd import Tensor 5 | from needle import ops 6 | import needle.init as init 7 | import numpy as np 8 | 9 | 10 | class Parameter(Tensor): 11 | """A special kind of tensor that represents parameters.""" 12 | 13 | 14 | def _unpack_params(value: object) -> List[Tensor]: 15 | if isinstance(value, Parameter): 16 | return [value] 17 | elif isinstance(value, Module): 18 | return value.parameters() 19 | elif isinstance(value, dict): 20 | params = [] 21 | for k, v in value.items(): 22 | params += _unpack_params(v) 23 | return params 24 | elif isinstance(value, (list, tuple)): 25 | params = [] 26 | for v in value: 27 | params += _unpack_params(v) 28 | return params 29 | else: 30 | return [] 31 | 32 | 33 | def _child_modules(value: object) -> List["Module"]: 34 | if isinstance(value, Module): 35 | modules = [value] 36 | modules.extend(_child_modules(value.__dict__)) 37 | return modules 38 | if isinstance(value, dict): 39 | modules = [] 40 | for k, v in value.items(): 41 | modules += _child_modules(v) 42 | return modules 43 | elif isinstance(value, (list, tuple)): 44 | modules = [] 45 | for v in value: 46 | modules += _child_modules(v) 47 | return modules 48 | else: 49 | return [] 50 | 51 | 52 | class Module: 53 | def __init__(self): 54 | self.training = True 55 | 56 | def parameters(self) -> List[Tensor]: 57 | """Return the list of parameters in the module.""" 58 | return _unpack_params(self.__dict__) 59 | 60 | def _children(self) -> List["Module"]: 61 | return _child_modules(self.__dict__) 62 | 63 | def eval(self): 64 | self.training = False 65 | for m in self._children(): 66 | m.training = False 67 | 68 | def train(self): 69 | self.training = True 70 | for m in self._children(): 71 | m.training = True 72 | 73 | def __call__(self, *args, **kwargs): 74 | return self.forward(*args, **kwargs) 75 | 76 | 77 | class Identity(Module): 78 | def forward(self, x): 79 | return x 80 | 81 | 82 | class Linear(Module): 83 | def __init__( 84 | self, in_features, out_features, bias=True, device=None, dtype="float32" 85 | ): 86 | super().__init__() 87 | self.in_features = in_features 88 | self.out_features = out_features 89 | 90 | ### BEGIN YOUR SOLUTION 91 | raise NotImplementedError() 92 | ### END YOUR SOLUTION 93 | 94 | def forward(self, X: Tensor) -> Tensor: 95 | ### BEGIN YOUR SOLUTION 96 | raise NotImplementedError() 97 | ### END YOUR SOLUTION 98 | 99 | 100 | class Flatten(Module): 101 | def forward(self, X): 102 | ### BEGIN YOUR SOLUTION 103 | raise NotImplementedError() 104 | ### END YOUR SOLUTION 105 | 106 | 107 | class ReLU(Module): 108 | def forward(self, x: Tensor) -> Tensor: 109 | ### BEGIN YOUR SOLUTION 110 | raise NotImplementedError() 111 | ### END YOUR SOLUTION 112 | 113 | 114 | class Sequential(Module): 115 | def __init__(self, *modules): 116 | super().__init__() 117 | self.modules = modules 118 | 119 | def forward(self, x: Tensor) -> Tensor: 120 | ### BEGIN YOUR SOLUTION 121 | raise NotImplementedError() 122 | ### END YOUR SOLUTION 123 | 124 | 125 | class SoftmaxLoss(Module): 126 | def forward(self, logits: Tensor, y: Tensor): 127 | ### BEGIN YOUR SOLUTION 128 | raise NotImplementedError() 129 | ### END YOUR SOLUTION 130 | 131 | 132 | class BatchNorm1d(Module): 133 | def __init__(self, dim, eps=1e-5, momentum=0.1, device=None, dtype="float32"): 134 | super().__init__() 135 | self.dim = dim 136 | self.eps = eps 137 | self.momentum 
= momentum 138 | ### BEGIN YOUR SOLUTION 139 | raise NotImplementedError() 140 | ### END YOUR SOLUTION 141 | 142 | def forward(self, x: Tensor) -> Tensor: 143 | ### BEGIN YOUR SOLUTION 144 | raise NotImplementedError() 145 | ### END YOUR SOLUTION 146 | 147 | 148 | class LayerNorm1d(Module): 149 | def __init__(self, dim, eps=1e-5, device=None, dtype="float32"): 150 | super().__init__() 151 | self.dim = dim 152 | self.eps = eps 153 | ### BEGIN YOUR SOLUTION 154 | raise NotImplementedError() 155 | ### END YOUR SOLUTION 156 | 157 | def forward(self, x: Tensor) -> Tensor: 158 | ### BEGIN YOUR SOLUTION 159 | raise NotImplementedError() 160 | ### END YOUR SOLUTION 161 | 162 | 163 | class Dropout(Module): 164 | def __init__(self, p=0.5): 165 | super().__init__() 166 | self.p = p 167 | 168 | def forward(self, x: Tensor) -> Tensor: 169 | ### BEGIN YOUR SOLUTION 170 | raise NotImplementedError() 171 | ### END YOUR SOLUTION 172 | 173 | 174 | class Residual(Module): 175 | def __init__(self, fn: Module): 176 | super().__init__() 177 | self.fn = fn 178 | 179 | def forward(self, x: Tensor) -> Tensor: 180 | ### BEGIN YOUR SOLUTION 181 | raise NotImplementedError() 182 | ### END YOUR SOLUTION 183 | -------------------------------------------------------------------------------- /python/needle/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .ops_mathematic import * 2 | 3 | from .ops_logarithmic import * 4 | from .ops_tuple import * 5 | -------------------------------------------------------------------------------- /python/needle/ops/ops_logarithmic.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from ..autograd import NDArray 3 | from ..autograd import Op, Tensor, Value, TensorOp 4 | from ..autograd import TensorTuple, TensorTupleOp 5 | 6 | from .ops_mathematic import * 7 | 8 | import numpy as array_api 9 | 10 | class LogSoftmax(TensorOp): 11 | def compute(self, Z): 12 | ### BEGIN YOUR SOLUTION 13 | raise NotImplementedError() 14 | ### END YOUR SOLUTION 15 | 16 | def gradient(self, out_grad, node): 17 | ### BEGIN YOUR SOLUTION 18 | raise NotImplementedError() 19 | ### END YOUR SOLUTION 20 | 21 | 22 | def logsoftmax(a): 23 | return LogSoftmax()(a) 24 | 25 | 26 | class LogSumExp(TensorOp): 27 | def __init__(self, axes: Optional[tuple] = None): 28 | self.axes = axes 29 | 30 | def compute(self, Z): 31 | ### BEGIN YOUR SOLUTION 32 | raise NotImplementedError() 33 | ### END YOUR SOLUTION 34 | 35 | def gradient(self, out_grad, node): 36 | ### BEGIN YOUR SOLUTION 37 | raise NotImplementedError() 38 | ### END YOUR SOLUTION 39 | 40 | 41 | def logsumexp(a, axes=None): 42 | return LogSumExp(axes=axes)(a) 43 | 44 | -------------------------------------------------------------------------------- /python/needle/ops/ops_mathematic.py: -------------------------------------------------------------------------------- 1 | """Operator implementations.""" 2 | 3 | from numbers import Number 4 | from typing import Optional, List, Tuple, Union 5 | 6 | from ..autograd import NDArray 7 | from ..autograd import Op, Tensor, Value, TensorOp 8 | from ..autograd import TensorTuple, TensorTupleOp 9 | import numpy 10 | 11 | # NOTE: we will import numpy as the array_api 12 | # as the backend for our computations, this line will change in later homeworks 13 | 14 | import numpy as array_api 15 | 16 | 17 | class EWiseAdd(TensorOp): 18 | def compute(self, a: NDArray, b: NDArray): 19 | return a + b 20 | 21 | def 
gradient(self, out_grad: Tensor, node: Tensor): 22 | return out_grad, out_grad 23 | 24 | 25 | def add(a, b): 26 | return EWiseAdd()(a, b) 27 | 28 | 29 | class AddScalar(TensorOp): 30 | def __init__(self, scalar): 31 | self.scalar = scalar 32 | 33 | def compute(self, a: NDArray): 34 | return a + self.scalar 35 | 36 | def gradient(self, out_grad: Tensor, node: Tensor): 37 | return out_grad 38 | 39 | 40 | def add_scalar(a, scalar): 41 | return AddScalar(scalar)(a) 42 | 43 | 44 | class EWiseMul(TensorOp): 45 | def compute(self, a: NDArray, b: NDArray): 46 | return a * b 47 | 48 | def gradient(self, out_grad: Tensor, node: Tensor): 49 | lhs, rhs = node.inputs 50 | return out_grad * rhs, out_grad * lhs 51 | 52 | 53 | def multiply(a, b): 54 | return EWiseMul()(a, b) 55 | 56 | 57 | class MulScalar(TensorOp): 58 | def __init__(self, scalar): 59 | self.scalar = scalar 60 | 61 | def compute(self, a: NDArray): 62 | return a * self.scalar 63 | 64 | def gradient(self, out_grad: Tensor, node: Tensor): 65 | return (out_grad * self.scalar,) 66 | 67 | 68 | def mul_scalar(a, scalar): 69 | return MulScalar(scalar)(a) 70 | 71 | 72 | class EWisePow(TensorOp): 73 | """Op to element-wise raise a tensor to a power.""" 74 | 75 | def compute(self, a: NDArray, b: NDArray) -> NDArray: 76 | return a**b 77 | 78 | def gradient(self, out_grad, node): 79 | if not isinstance(node.inputs[0], NDArray) or not isinstance( 80 | node.inputs[1], NDArray 81 | ): 82 | raise ValueError("Both inputs must be tensors (NDArray).") 83 | 84 | a, b = node.inputs[0], node.inputs[1] 85 | grad_a = out_grad * b * (a ** (b - 1)) 86 | grad_b = out_grad * (a**b) * log(a) 87 | return grad_a, grad_b 88 | 89 | def power(a, b): 90 | return EWisePow()(a, b) 91 | 92 | 93 | class PowerScalar(TensorOp): 94 | """Op raise a tensor to an (integer) power.""" 95 | 96 | def __init__(self, scalar: int): 97 | self.scalar = scalar 98 | 99 | def compute(self, a: NDArray) -> NDArray: 100 | ### BEGIN YOUR SOLUTION 101 | raise NotImplementedError() 102 | ### END YOUR SOLUTION 103 | 104 | def gradient(self, out_grad, node): 105 | ### BEGIN YOUR SOLUTION 106 | raise NotImplementedError() 107 | ### END YOUR SOLUTION 108 | 109 | 110 | def power_scalar(a, scalar): 111 | return PowerScalar(scalar)(a) 112 | 113 | 114 | class EWiseDiv(TensorOp): 115 | """Op to element-wise divide two nodes.""" 116 | 117 | def compute(self, a, b): 118 | ### BEGIN YOUR SOLUTION 119 | raise NotImplementedError() 120 | ### END YOUR SOLUTION 121 | 122 | def gradient(self, out_grad, node): 123 | ### BEGIN YOUR SOLUTION 124 | raise NotImplementedError() 125 | ### END YOUR SOLUTION 126 | 127 | 128 | def divide(a, b): 129 | return EWiseDiv()(a, b) 130 | 131 | 132 | class DivScalar(TensorOp): 133 | def __init__(self, scalar): 134 | self.scalar = scalar 135 | 136 | def compute(self, a): 137 | ### BEGIN YOUR SOLUTION 138 | raise NotImplementedError() 139 | ### END YOUR SOLUTION 140 | 141 | def gradient(self, out_grad, node): 142 | ### BEGIN YOUR SOLUTION 143 | raise NotImplementedError() 144 | ### END YOUR SOLUTION 145 | 146 | 147 | def divide_scalar(a, scalar): 148 | return DivScalar(scalar)(a) 149 | 150 | 151 | class Transpose(TensorOp): 152 | def __init__(self, axes: Optional[tuple] = None): 153 | self.axes = axes 154 | 155 | def compute(self, a): 156 | ### BEGIN YOUR SOLUTION 157 | raise NotImplementedError() 158 | ### END YOUR SOLUTION 159 | 160 | def gradient(self, out_grad, node): 161 | ### BEGIN YOUR SOLUTION 162 | raise NotImplementedError() 163 | ### END YOUR SOLUTION 164 | 165 | 166 | def 
transpose(a, axes=None): 167 | return Transpose(axes)(a) 168 | 169 | 170 | class Reshape(TensorOp): 171 | def __init__(self, shape): 172 | self.shape = shape 173 | 174 | def compute(self, a): 175 | ### BEGIN YOUR SOLUTION 176 | raise NotImplementedError() 177 | ### END YOUR SOLUTION 178 | 179 | def gradient(self, out_grad, node): 180 | ### BEGIN YOUR SOLUTION 181 | raise NotImplementedError() 182 | ### END YOUR SOLUTION 183 | 184 | 185 | def reshape(a, shape): 186 | return Reshape(shape)(a) 187 | 188 | 189 | class BroadcastTo(TensorOp): 190 | def __init__(self, shape): 191 | self.shape = shape 192 | 193 | def compute(self, a): 194 | ### BEGIN YOUR SOLUTION 195 | raise NotImplementedError() 196 | ### END YOUR SOLUTION 197 | 198 | def gradient(self, out_grad, node): 199 | ### BEGIN YOUR SOLUTION 200 | raise NotImplementedError() 201 | ### END YOUR SOLUTION 202 | 203 | 204 | def broadcast_to(a, shape): 205 | return BroadcastTo(shape)(a) 206 | 207 | 208 | class Summation(TensorOp): 209 | def __init__(self, axes: Optional[tuple] = None): 210 | self.axes = axes 211 | 212 | def compute(self, a): 213 | ### BEGIN YOUR SOLUTION 214 | raise NotImplementedError() 215 | ### END YOUR SOLUTION 216 | 217 | def gradient(self, out_grad, node): 218 | ### BEGIN YOUR SOLUTION 219 | raise NotImplementedError() 220 | ### END YOUR SOLUTION 221 | 222 | 223 | def summation(a, axes=None): 224 | return Summation(axes)(a) 225 | 226 | 227 | class MatMul(TensorOp): 228 | def compute(self, a, b): 229 | ### BEGIN YOUR SOLUTION 230 | raise NotImplementedError() 231 | ### END YOUR SOLUTION 232 | 233 | def gradient(self, out_grad, node): 234 | ### BEGIN YOUR SOLUTION 235 | raise NotImplementedError() 236 | ### END YOUR SOLUTION 237 | 238 | 239 | def matmul(a, b): 240 | return MatMul()(a, b) 241 | 242 | 243 | class Negate(TensorOp): 244 | def compute(self, a): 245 | ### BEGIN YOUR SOLUTION 246 | raise NotImplementedError() 247 | ### END YOUR SOLUTION 248 | 249 | def gradient(self, out_grad, node): 250 | ### BEGIN YOUR SOLUTION 251 | raise NotImplementedError() 252 | ### END YOUR SOLUTION 253 | 254 | 255 | def negate(a): 256 | return Negate()(a) 257 | 258 | 259 | class Log(TensorOp): 260 | def compute(self, a): 261 | ### BEGIN YOUR SOLUTION 262 | raise NotImplementedError() 263 | ### END YOUR SOLUTION 264 | 265 | def gradient(self, out_grad, node): 266 | ### BEGIN YOUR SOLUTION 267 | raise NotImplementedError() 268 | ### END YOUR SOLUTION 269 | 270 | 271 | def log(a): 272 | return Log()(a) 273 | 274 | 275 | class Exp(TensorOp): 276 | def compute(self, a): 277 | ### BEGIN YOUR SOLUTION 278 | raise NotImplementedError() 279 | ### END YOUR SOLUTION 280 | 281 | def gradient(self, out_grad, node): 282 | ### BEGIN YOUR SOLUTION 283 | raise NotImplementedError() 284 | ### END YOUR SOLUTION 285 | 286 | 287 | def exp(a): 288 | return Exp()(a) 289 | 290 | 291 | class ReLU(TensorOp): 292 | def compute(self, a): 293 | ### BEGIN YOUR SOLUTION 294 | raise NotImplementedError() 295 | ### END YOUR SOLUTION 296 | 297 | def gradient(self, out_grad, node): 298 | ### BEGIN YOUR SOLUTION 299 | raise NotImplementedError() 300 | ### END YOUR SOLUTION 301 | 302 | 303 | def relu(a): 304 | return ReLU()(a) 305 | -------------------------------------------------------------------------------- /python/needle/ops/ops_tuple.py: -------------------------------------------------------------------------------- 1 | from ..autograd import Op, Tensor, TensorTuple, Value, TensorOp, TensorTupleOp 2 | 3 | 4 | class MakeTensorTuple(TensorTupleOp): 5 | def 
compute(self, *args) -> tuple: 6 | return tuple(args) 7 | 8 | def gradient(self, out_grad, node): 9 | assert isinstance(out_grad, TensorTuple) 10 | return tuple([out_grad[i] for i in range(len(out_grad))]) 11 | 12 | 13 | def make_tuple(*args): 14 | return MakeTensorTuple()(*args) 15 | 16 | 17 | class TupleGetItem(TensorOp): 18 | def __init__(self, index): 19 | self.index = index 20 | 21 | def __call__(self, a: TensorTuple, fold_const=True) -> Value: 22 | assert isinstance(a, TensorTuple) 23 | # constant folding 24 | if fold_const and isinstance(a.op, MakeTensorTuple): 25 | return a.inputs[self.index] 26 | return Tensor.make_from_op(self, [a]) 27 | 28 | def compute(self, a): 29 | return a[self.index] 30 | 31 | def gradient(self, out_grad, node): 32 | index = self.index 33 | in_grad = [] 34 | for i, value in enumerate(node.inputs[0]): 35 | if i != index: 36 | in_grad.append(init.zeros_like(value))  # assumes `needle.init` is available as `init`; it is not imported at the top of this module 37 | else: 38 | in_grad.append(out_grad) 39 | return MakeTensorTuple()(*in_grad) 40 | 41 | 42 | def tuple_get_item(value, index): 43 | return TupleGetItem(index)(value) 44 | 45 | 46 | class FusedAddScalars(TensorTupleOp): 47 | def __init__(self, c0: float, c1: float): 48 | self.c0 = c0 49 | self.c1 = c1 50 | 51 | def compute(self, a): 52 | return a + self.c0, a + self.c1 53 | 54 | def gradient(self, out_grad, node): 55 | return out_grad[0] + out_grad[1] 56 | 57 | 58 | def fused_add_scalars(x, c0, c1): 59 | return FusedAddScalars(c0, c1)(x) 60 | -------------------------------------------------------------------------------- /python/needle/optim.py: -------------------------------------------------------------------------------- 1 | """Optimization module""" 2 | import needle as ndl 3 | import numpy as np 4 | 5 | 6 | class Optimizer: 7 | def __init__(self, params): 8 | self.params = params 9 | 10 | def step(self): 11 | raise NotImplementedError() 12 | 13 | def reset_grad(self): 14 | for p in self.params: 15 | p.grad = None 16 | 17 | 18 | class SGD(Optimizer): 19 | def __init__(self, params, lr=0.01, momentum=0.0, weight_decay=0.0): 20 | super().__init__(params) 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.u = {} 24 | self.weight_decay = weight_decay 25 | 26 | def step(self): 27 | ### BEGIN YOUR SOLUTION 28 | raise NotImplementedError() 29 | ### END YOUR SOLUTION 30 | 31 | def clip_grad_norm(self, max_norm=0.25): 32 | """ 33 | Clips gradient norm of parameters.
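        One possible approach (a sketch only, assuming clipping by the global L2 norm of
        all parameter gradients, so every gradient is scaled by max_norm / total_norm
        whenever the total norm exceeds max_norm; `detach()` and `numpy()` are the needle
        Tensor helpers defined in autograd.py):

            grads = [p.grad.detach().numpy() for p in self.params if p.grad is not None]
            total_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
            clip_coef = max_norm / (total_norm + 1e-6)
            if clip_coef < 1:
                for p in self.params:
                    if p.grad is not None:
                        p.grad = p.grad.detach() * clip_coef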
34 | """ 35 | ### BEGIN YOUR SOLUTION 36 | raise NotImplementedError() 37 | ### END YOUR SOLUTION 38 | 39 | 40 | class Adam(Optimizer): 41 | def __init__( 42 | self, 43 | params, 44 | lr=0.01, 45 | beta1=0.9, 46 | beta2=0.999, 47 | eps=1e-8, 48 | weight_decay=0.0, 49 | ): 50 | super().__init__(params) 51 | self.lr = lr 52 | self.beta1 = beta1 53 | self.beta2 = beta2 54 | self.eps = eps 55 | self.weight_decay = weight_decay 56 | self.t = 0 57 | 58 | self.m = {} 59 | self.v = {} 60 | 61 | def step(self): 62 | ### BEGIN YOUR SOLUTION 63 | raise NotImplementedError() 64 | ### END YOUR SOLUTION 65 | -------------------------------------------------------------------------------- /src/ndarray_backend_cpu.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cpu { 11 | 12 | #define ALIGNMENT 256 13 | #define TILE 8 14 | typedef float scalar_t; 15 | const size_t ELEM_SIZE = sizeof(scalar_t); 16 | 17 | 18 | /** 19 | * This is a utility structure for maintaining an array aligned to ALIGNMENT boundaries in 20 | * memory. This alignment should be at least TILE * ELEM_SIZE, though we make it even larger 21 | * here by default. 22 | */ 23 | struct AlignedArray { 24 | AlignedArray(const size_t size) { 25 | int ret = posix_memalign((void**)&ptr, ALIGNMENT, size * ELEM_SIZE); 26 | if (ret != 0) throw std::bad_alloc(); 27 | this->size = size; 28 | } 29 | ~AlignedArray() { free(ptr); } 30 | size_t ptr_as_int() {return (size_t)ptr; } 31 | scalar_t* ptr; 32 | size_t size; 33 | }; 34 | 35 | 36 | 37 | void Fill(AlignedArray* out, scalar_t val) { 38 | /** 39 | * Fill the values of an aligned array with val 40 | */ 41 | for (int i = 0; i < out->size; i++) { 42 | out->ptr[i] = val; 43 | } 44 | } 45 | 46 | 47 | void Compact(const AlignedArray& a, AlignedArray* out, std::vector shape, 48 | std::vector strides, size_t offset) { 49 | /** 50 | * Compact an array in memory 51 | * 52 | * Args: 53 | * a: non-compact representation of the array, given as input 54 | * out: compact version of the array to be written 55 | * shape: shapes of each dimension for a and out 56 | * strides: strides of the *a* array (not out, which has compact strides) 57 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 58 | * 59 | * Returns: 60 | * void (you need to modify out directly, rather than returning anything; this is true for all the 61 | * function will implement here, so we won't repeat this note.) 
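 *
 * One way to structure this (a sketch, not the required implementation; it assumes
 * `shape` and `strides` hold one integer per dimension): walk a multi-dimensional
 * index over `shape` like an odometer, compute the strided source location at each
 * step, and write the elements into `out` in order:
 *
 *   std::vector<size_t> idx(shape.size(), 0);
 *   for (size_t cnt = 0; cnt < out->size; cnt++) {
 *     size_t loc = offset;
 *     for (size_t d = 0; d < shape.size(); d++) loc += idx[d] * strides[d];
 *     out->ptr[cnt] = a.ptr[loc];
 *     // advance the index, rolling over from the last (fastest-varying) dimension
 *     for (int d = (int)shape.size() - 1; d >= 0; d--) {
 *       if (++idx[d] < (size_t)shape[d]) break;
 *       idx[d] = 0;
 *     }
 *   }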
62 | */ 63 | /// BEGIN SOLUTION 64 | /// END SOLUTION 65 | } 66 | 67 | void EwiseSetitem(const AlignedArray& a, AlignedArray* out, std::vector shape, 68 | std::vector strides, size_t offset) { 69 | /** 70 | * Set items in a (non-compact) array 71 | * 72 | * Args: 73 | * a: _compact_ array whose items will be written to out 74 | * out: non-compact array whose items are to be written 75 | * shape: shapes of each dimension for a and out 76 | * strides: strides of the *out* array (not a, which has compact strides) 77 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 78 | */ 79 | /// BEGIN SOLUTION 80 | /// END SOLUTION 81 | } 82 | 83 | void ScalarSetitem(const size_t size, scalar_t val, AlignedArray* out, std::vector shape, 84 | std::vector strides, size_t offset) { 85 | /** 86 | * Set items in a (non-compact) array 87 | * 88 | * Args: 89 | * size: number of elements to write in out array (note that this will not be the same as 90 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 91 | * product of items in shape, but convenient to just pass it here. 92 | * val: scalar value to write 93 | * out: non-compact array whose items are to be written 94 | * shape: shapes of each dimension of out 95 | * strides: strides of the out array 96 | * offset: offset of the out array 97 | */ 98 | 99 | /// BEGIN SOLUTION 100 | /// END SOLUTION 101 | } 102 | 103 | void EwiseAdd(const AlignedArray& a, const AlignedArray& b, AlignedArray* out) { 104 | /** 105 | * Set entries in out to be the sum of corresponding entries in a and b. 106 | */ 107 | for (size_t i = 0; i < a.size; i++) { 108 | out->ptr[i] = a.ptr[i] + b.ptr[i]; 109 | } 110 | } 111 | 112 | void ScalarAdd(const AlignedArray& a, scalar_t val, AlignedArray* out) { 113 | /** 114 | * Set entries in out to be the sum of the corresponding entry in a plus the scalar val. 115 | */ 116 | for (size_t i = 0; i < a.size; i++) { 117 | out->ptr[i] = a.ptr[i] + val; 118 | } 119 | } 120 | 121 | 122 | /** 123 | * In the code that follows, use the above template to create analogous element-wise 124 | * and scalar operators for the following functions. See the numpy backend for 125 | * examples of how they should work. 126 | * - EwiseMul, ScalarMul 127 | * - EwiseDiv, ScalarDiv 128 | * - ScalarPower 129 | * - EwiseMaximum, ScalarMaximum 130 | * - EwiseEq, ScalarEq 131 | * - EwiseGe, ScalarGe 132 | * - EwiseLog 133 | * - EwiseExp 134 | * - EwiseTanh 135 | * 136 | * If you implement all these naively, there will be a lot of repeated code, so 137 | * you are welcome (but not required) to use macros or templates to define these 138 | * functions (however you want to do so, as long as the functions match the proper 139 | * signatures above). 140 | */ 141 | 142 | /// BEGIN SOLUTION 143 | 144 | /// END SOLUTION 145 | 146 | void Matmul(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, uint32_t n, 147 | uint32_t p) { 148 | /** 149 | * Multiply two (compact) matrices into an output (also compact) matrix. For this implementation 150 | * you can use the "naive" three-loop algorithm.
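 *
 * A minimal sketch of that naive algorithm (assuming row-major, compact layouts):
 *
 *   for (uint32_t i = 0; i < m; i++)
 *     for (uint32_t j = 0; j < p; j++) {
 *       scalar_t sum = 0;
 *       for (uint32_t k = 0; k < n; k++) sum += a.ptr[i * n + k] * b.ptr[k * p + j];
 *       out->ptr[i * p + j] = sum;
 *     }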
151 | * 152 | * Args: 153 | * a: compact 2D array of size m x n 154 | * b: compact 2D array of size n x p 155 | * out: compact 2D array of size m x p to write the output to 156 | * m: rows of a / out 157 | * n: columns of a / rows of b 158 | * p: columns of b / out 159 | */ 160 | 161 | /// BEGIN SOLUTION 162 | /// END SOLUTION 163 | } 164 | 165 | inline void AlignedDot(const float* __restrict__ a, 166 | const float* __restrict__ b, 167 | float* __restrict__ out) { 168 | 169 | /** 170 | * Multiply together two TILE x TILE matrices, and _add_ the result to out (it is important to add 171 | * the result to the existing out, which you should not set to zero beforehand). We are including 172 | * the compiler flags here that enable the compiler to properly use vector operators to implement 173 | * this function. Specifically, the __restrict__ keyword indicates to the compiler that a, b, and 174 | * out don't have any overlapping memory (which is necessary in order for vector operations to be 175 | * equivalent to their non-vectorized counterparts; imagine what could happen otherwise if a, b, 176 | * and out had overlapping memory). Similarly the __builtin_assume_aligned builtin tells the 177 | * compiler that the input arrays will be aligned to the appropriate blocks in memory, which also 178 | * helps the compiler vectorize the code. 179 | * 180 | * Args: 181 | * a: compact 2D array of size TILE x TILE 182 | * b: compact 2D array of size TILE x TILE 183 | * out: compact 2D array of size TILE x TILE to write to 184 | */ 185 | 186 | a = (const float*)__builtin_assume_aligned(a, TILE * ELEM_SIZE); 187 | b = (const float*)__builtin_assume_aligned(b, TILE * ELEM_SIZE); 188 | out = (float*)__builtin_assume_aligned(out, TILE * ELEM_SIZE); 189 | 190 | /// BEGIN SOLUTION 191 | 192 | /// END SOLUTION 193 | } 194 | 195 | void MatmulTiled(const AlignedArray& a, const AlignedArray& b, AlignedArray* out, uint32_t m, 196 | uint32_t n, uint32_t p) { 197 | /** 198 | * Matrix multiplication on tiled representations of arrays. In this setting, a, b, and out 199 | * are all *4D* compact arrays of the appropriate size, e.g. a is an array of size 200 | * a[m/TILE][n/TILE][TILE][TILE] 201 | * You should do the multiplication tile-by-tile to improve the performance of the operation (i.e., this 202 | * function should call `AlignedDot()` implemented above). 203 | * 204 | * Note that this function will only be called when m, n, p are all multiples of TILE, so you can 205 | * assume that this division happens without any remainder. 206 | * 207 | * Args: 208 | * a: compact 4D array of size m/TILE x n/TILE x TILE x TILE 209 | * b: compact 4D array of size n/TILE x p/TILE x TILE x TILE 210 | * out: compact 4D array of size m/TILE x p/TILE x TILE x TILE to write to 211 | * m: rows of a / out 212 | * n: columns of a / rows of b 213 | * p: columns of b / out 214 | * 215 | */ 216 | /// BEGIN SOLUTION 217 | 218 | /// END SOLUTION 219 | } 220 | 221 | void ReduceMax(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 222 | /** 223 | * Reduce by taking maximum over `reduce_size` contiguous blocks.
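 *
 * A simple sketch (one pass per output element; std::max comes from <algorithm>):
 *
 *   for (size_t i = 0; i < out->size; i++) {
 *     scalar_t mx = a.ptr[i * reduce_size];
 *     for (size_t j = 1; j < reduce_size; j++)
 *       mx = std::max(mx, a.ptr[i * reduce_size + j]);
 *     out->ptr[i] = mx;
 *   }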
224 | * 225 | * Args: 226 | * a: compact array of size a.size = out.size * reduce_size to reduce over 227 | * out: compact array to write into 228 | * reduce_size: size of the dimension to reduce over 229 | */ 230 | 231 | /// BEGIN SOLUTION 232 | 233 | /// END SOLUTION 234 | } 235 | 236 | void ReduceSum(const AlignedArray& a, AlignedArray* out, size_t reduce_size) { 237 | /** 238 | * Reduce by taking sum over `reduce_size` contiguous blocks. 239 | * 240 | * Args: 241 | * a: compact array of size a.size = out.size * reduce_size to reduce over 242 | * out: compact array to write into 243 | * reduce_size: size of the dimension to reduce over 244 | */ 245 | 246 | /// BEGIN SOLUTION 247 | 248 | /// END SOLUTION 249 | } 250 | 251 | } // namespace cpu 252 | } // namespace needle 253 | 254 | PYBIND11_MODULE(ndarray_backend_cpu, m) { 255 | namespace py = pybind11; 256 | using namespace needle; 257 | using namespace cpu; 258 | 259 | m.attr("__device_name__") = "cpu"; 260 | m.attr("__tile_size__") = TILE; 261 | 262 | py::class_(m, "Array") 263 | .def(py::init(), py::return_value_policy::take_ownership) 264 | .def("ptr", &AlignedArray::ptr_as_int) 265 | .def_readonly("size", &AlignedArray::size); 266 | 267 | // return numpy array (with copying for simplicity, otherwise garbage 268 | // collection is a pain) 269 | m.def("to_numpy", [](const AlignedArray& a, std::vector shape, 270 | std::vector strides, size_t offset) { 271 | std::vector numpy_strides = strides; 272 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 273 | [](size_t& c) { return c * ELEM_SIZE; }); 274 | return py::array_t(shape, numpy_strides, a.ptr + offset); 275 | }); 276 | 277 | // convert from numpy (with copying) 278 | m.def("from_numpy", [](py::array_t a, AlignedArray* out) { 279 | std::memcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE); 280 | }); 281 | 282 | m.def("fill", Fill); 283 | m.def("compact", Compact); 284 | m.def("ewise_setitem", EwiseSetitem); 285 | m.def("scalar_setitem", ScalarSetitem); 286 | m.def("ewise_add", EwiseAdd); 287 | m.def("scalar_add", ScalarAdd); 288 | /* 289 | m.def("ewise_mul", EwiseMul); 290 | m.def("scalar_mul", ScalarMul); 291 | m.def("ewise_div", EwiseDiv); 292 | m.def("scalar_div", ScalarDiv); 293 | m.def("scalar_power", ScalarPower); 294 | 295 | m.def("ewise_maximum", EwiseMaximum); 296 | m.def("scalar_maximum", ScalarMaximum); 297 | m.def("ewise_eq", EwiseEq); 298 | m.def("scalar_eq", ScalarEq); 299 | m.def("ewise_ge", EwiseGe); 300 | m.def("scalar_ge", ScalarGe); 301 | 302 | m.def("ewise_log", EwiseLog); 303 | m.def("ewise_exp", EwiseExp); 304 | m.def("ewise_tanh", EwiseTanh); 305 | 306 | m.def("matmul", Matmul); 307 | m.def("matmul_tiled", MatmulTiled); 308 | 309 | m.def("reduce_max", ReduceMax); 310 | m.def("reduce_sum", ReduceSum); 311 | */ 312 | } 313 | -------------------------------------------------------------------------------- /src/ndarray_backend_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | namespace needle { 10 | namespace cuda { 11 | 12 | #define BASE_THREAD_NUM 256 13 | 14 | #define TILE 4 15 | typedef float scalar_t; 16 | const size_t ELEM_SIZE = sizeof(scalar_t); 17 | 18 | struct CudaArray { 19 | CudaArray(const size_t size) { 20 | cudaError_t err = cudaMalloc(&ptr, size * ELEM_SIZE); 21 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 22 | this->size = size; 23 | } 24 | 
~CudaArray() { cudaFree(ptr); } 25 | size_t ptr_as_int() { return (size_t)ptr; } 26 | 27 | scalar_t* ptr; 28 | size_t size; 29 | }; 30 | 31 | struct CudaDims { 32 | dim3 block, grid; 33 | }; 34 | 35 | CudaDims CudaOneDim(size_t size) { 36 | /** 37 | * Utility function to get cuda dimensions for 1D call 38 | */ 39 | CudaDims dim; 40 | size_t num_blocks = (size + BASE_THREAD_NUM - 1) / BASE_THREAD_NUM; 41 | dim.block = dim3(BASE_THREAD_NUM, 1, 1); 42 | dim.grid = dim3(num_blocks, 1, 1); 43 | return dim; 44 | } 45 | 46 | #define MAX_VEC_SIZE 8 47 | struct CudaVec { 48 | uint32_t size; 49 | int32_t data[MAX_VEC_SIZE]; 50 | }; 51 | 52 | CudaVec VecToCuda(const std::vector& x) { 53 | CudaVec shape; 54 | if (x.size() > MAX_VEC_SIZE) throw std::runtime_error("Exceeded CUDA supported max dimesions"); 55 | shape.size = x.size(); 56 | for (size_t i = 0; i < x.size(); i++) { 57 | shape.data[i] = x[i]; 58 | } 59 | return shape; 60 | } 61 | 62 | //////////////////////////////////////////////////////////////////////////////// 63 | // Fill call 64 | //////////////////////////////////////////////////////////////////////////////// 65 | 66 | __global__ void FillKernel(scalar_t* out, scalar_t val, size_t size) { 67 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 68 | if (gid < size) out[gid] = val; 69 | } 70 | 71 | void Fill(CudaArray* out, scalar_t val) { 72 | CudaDims dim = CudaOneDim(out->size); 73 | FillKernel<<>>(out->ptr, val, out->size); 74 | } 75 | 76 | //////////////////////////////////////////////////////////////////////////////// 77 | // Compact and setitem cals 78 | //////////////////////////////////////////////////////////////////////////////// 79 | 80 | // Untility function to convert contiguous index i to memory location from strides 81 | 82 | 83 | 84 | __global__ void CompactKernel(const scalar_t* a, scalar_t* out, size_t size, CudaVec shape, 85 | CudaVec strides, size_t offset) { 86 | /** 87 | * The CUDA kernel for the compact opeation. This should effectively map a single entry in the 88 | * non-compact input a, to the corresponding item (at location gid) in the compact array out. 89 | * 90 | * Args: 91 | * a: CUDA pointer to a array 92 | * out: CUDA point to out array 93 | * size: size of out array 94 | * shape: vector of shapes of a and out arrays (of type CudaVec, for past passing to CUDA kernel) 95 | * strides: vector of strides of out array 96 | * offset: offset of out array 97 | */ 98 | /// BEGIN SOLUTION 99 | /// END SOLUTION 100 | } 101 | 102 | void Compact(const CudaArray& a, CudaArray* out, std::vector shape, 103 | std::vector strides, size_t offset) { 104 | /** 105 | * Compact an array in memory. Unlike the C++ version, in CUDA this will primarily call the 106 | * relevant CUDA kernel. In this case, we illustrate how you should set this up (i.e., we give 107 | * you the code for this fuction, and also the prototype for the CompactKernel() function). For 108 | * the functions after this, however, you'll need to define these kernels as you see fit to 109 | * execute the underlying function. 
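 *
 * As a sketch (one possible kernel body, not the required one), CompactKernel() above can
 * map one thread to one output element: decompose gid into a multi-dimensional index
 * (last dimension fastest) and gather the element from the strided input:
 *
 *   size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (gid < size) {
 *     size_t loc = offset, rem = gid;
 *     for (int d = (int)shape.size - 1; d >= 0; d--) {
 *       loc += (rem % shape.data[d]) * strides.data[d];
 *       rem /= shape.data[d];
 *     }
 *     out[gid] = a[loc];
 *   }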
110 | * 111 | * Args: 112 | * a: non-compact represntation of the array, given as input 113 | * out: compact version of the array to be written 114 | * shape: shapes of each dimension for a and out 115 | * strides: strides of the *a* array (not out, which has compact strides) 116 | * offset: offset of the *a* array (not out, which has zero offset, being compact) 117 | */ 118 | 119 | // Nothing needs to be added here 120 | CudaDims dim = CudaOneDim(out->size); 121 | CompactKernel<<>>(a.ptr, out->ptr, out->size, VecToCuda(shape), 122 | VecToCuda(strides), offset); 123 | } 124 | 125 | 126 | void EwiseSetitem(const CudaArray& a, CudaArray* out, std::vector shape, 127 | std::vector strides, size_t offset) { 128 | /** 129 | * Set items in a (non-compact) array using CUDA. Yyou will most likely want to implement a 130 | * EwiseSetitemKernel() function, similar to those above, that will do the actual work. 131 | * 132 | * Args: 133 | * a: _compact_ array whose items will be written to out 134 | * out: non-compact array whose items are to be written 135 | * shape: shapes of each dimension for a and out 136 | * strides: strides of the *out* array (not a, which has compact strides) 137 | * offset: offset of the *out* array (not a, which has zero offset, being compact) 138 | */ 139 | /// BEGIN SOLUTION 140 | /// END SOLUTION 141 | } 142 | 143 | 144 | void ScalarSetitem(size_t size, scalar_t val, CudaArray* out, std::vector shape, 145 | std::vector strides, size_t offset) { 146 | /** 147 | * Set items is a (non-compact) array 148 | * 149 | * Args: 150 | * size: number of elements to write in out array (note that this will note be the same as 151 | * out.size, because out is a non-compact subset array); it _will_ be the same as the 152 | * product of items in shape, but covenient to just pass it here. 153 | * val: scalar value to write to 154 | * out: non-compact array whose items are to be written 155 | * shape: shapes of each dimension of out 156 | * strides: strides of the out array 157 | * offset: offset of the out array 158 | */ 159 | /// BEGIN SOLUTION 160 | /// END SOLUTION 161 | } 162 | 163 | //////////////////////////////////////////////////////////////////////////////// 164 | // Elementwise and scalar operations 165 | //////////////////////////////////////////////////////////////////////////////// 166 | 167 | __global__ void EwiseAddKernel(const scalar_t* a, const scalar_t* b, scalar_t* out, size_t size) { 168 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 169 | if (gid < size) out[gid] = a[gid] + b[gid]; 170 | } 171 | 172 | void EwiseAdd(const CudaArray& a, const CudaArray& b, CudaArray* out) { 173 | /** 174 | * Add together two CUDA array 175 | */ 176 | CudaDims dim = CudaOneDim(out->size); 177 | EwiseAddKernel<<>>(a.ptr, b.ptr, out->ptr, out->size); 178 | } 179 | 180 | __global__ void ScalarAddKernel(const scalar_t* a, scalar_t val, scalar_t* out, size_t size) { 181 | size_t gid = blockIdx.x * blockDim.x + threadIdx.x; 182 | if (gid < size) out[gid] = a[gid] + val; 183 | } 184 | 185 | void ScalarAdd(const CudaArray& a, scalar_t val, CudaArray* out) { 186 | /** 187 | * Add together a CUDA array and a scalar value. 188 | */ 189 | CudaDims dim = CudaOneDim(out->size); 190 | ScalarAddKernel<<>>(a.ptr, val, out->ptr, out->size); 191 | } 192 | 193 | /** 194 | * In the code the follows, use the above template to create analogous elementise 195 | * and and scalar operators for the following functions. See the numpy backend for 196 | * examples of how they should work. 
197 | * - EwiseMul, ScalarMul 198 | * - EwiseDiv, ScalarDiv 199 | * - ScalarPower 200 | * - EwiseMaximum, ScalarMaximum 201 | * - EwiseEq, ScalarEq 202 | * - EwiseGe, ScalarGe 203 | * - EwiseLog 204 | * - EwiseExp 205 | * - EwiseTanh 206 | * 207 | * If you implement all these naively, there will be a lot of repeated code, so 208 | * you are welcome (but not required), to use macros or templates to define these 209 | * functions (however you want to do so, as long as the functions match the proper) 210 | * signatures above. 211 | */ 212 | 213 | /// BEGIN SOLUTION 214 | /// END SOLUTION 215 | 216 | //////////////////////////////////////////////////////////////////////////////// 217 | // Elementwise and scalar operations 218 | //////////////////////////////////////////////////////////////////////////////// 219 | 220 | 221 | void Matmul(const CudaArray& a, const CudaArray& b, CudaArray* out, uint32_t M, uint32_t N, 222 | uint32_t P) { 223 | /** 224 | * Multiply two (compact) matrices into an output (also comapct) matrix. You will want to look 225 | * at the lecture and notes on GPU-based linear algebra to see how to do this. Since ultimately 226 | * mugrade is just evaluating correctness, you _can_ implement a version that simply parallelizes 227 | * over (i,j) entries in the output array. However, to really get the full benefit of this 228 | * problem, we would encourage you to use cooperative fetching, shared memory register tiling, 229 | * and other ideas covered in the class notes. Note that unlike the tiled matmul function in 230 | * the CPU backend, here you should implement a single function that works across all size 231 | * matrices, whether or not they are a multiple of a tile size. As with previous CUDA 232 | * implementations, this function here will largely just set up the kernel call, and you should 233 | * implement the logic in a separate MatmulKernel() call. 234 | * 235 | * 236 | * Args: 237 | * a: compact 2D array of size m x n 238 | * b: comapct 2D array of size n x p 239 | * out: compact 2D array of size m x p to write the output to 240 | * M: rows of a / out 241 | * N: columns of a / rows of b 242 | * P: columns of b / out 243 | */ 244 | 245 | /// BEGIN SOLUTION 246 | /// END SOLUTION 247 | } 248 | 249 | //////////////////////////////////////////////////////////////////////////////// 250 | // Max and sum reductions 251 | //////////////////////////////////////////////////////////////////////////////// 252 | 253 | void ReduceMax(const CudaArray& a, CudaArray* out, size_t reduce_size) { 254 | /** 255 | * Reduce by taking maximum over `reduce_size` contiguous blocks. Even though it is inefficient, 256 | * for simplicity you can perform each reduction in a single CUDA thread. 257 | * 258 | * Args: 259 | * a: compact array of size a.size = out.size * reduce_size to reduce over 260 | * out: compact array to write into 261 | * redice_size: size of the dimension to reduce over 262 | */ 263 | /// BEGIN SOLUTION 264 | /// END SOLUTION 265 | } 266 | 267 | void ReduceSum(const CudaArray& a, CudaArray* out, size_t reduce_size) { 268 | /** 269 | * Reduce by taking summation over `reduce_size` contiguous blocks. Again, for simplicity you 270 | * can perform each reduction in a single CUDA thread. 
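 *
 * A sketch of that one-thread-per-output approach (a hypothetical ReduceSumKernel body,
 * launched over out->size threads via CudaOneDim, with `size` being out->size):
 *
 *   size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
 *   if (gid < size) {
 *     scalar_t sum = 0;
 *     for (size_t j = 0; j < reduce_size; j++)
 *       sum += a[gid * reduce_size + j];
 *     out[gid] = sum;
 *   }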
271 | * 272 | * Args: 273 | * a: compact array of size a.size = out.size * reduce_size to reduce over 274 | * out: compact array to write into 275 | * redice_size: size of the dimension to reduce over 276 | */ 277 | /// BEGIN SOLUTION 278 | /// END SOLUTION 279 | } 280 | 281 | } // namespace cuda 282 | } // namespace needle 283 | 284 | PYBIND11_MODULE(ndarray_backend_cuda, m) { 285 | namespace py = pybind11; 286 | using namespace needle; 287 | using namespace cuda; 288 | 289 | m.attr("__device_name__") = "cuda"; 290 | m.attr("__tile_size__") = TILE; 291 | 292 | py::class_(m, "Array") 293 | .def(py::init(), py::return_value_policy::take_ownership) 294 | .def_readonly("size", &CudaArray::size) 295 | .def("ptr", &CudaArray::ptr_as_int); 296 | 297 | // return numpy array, copying from CPU 298 | m.def("to_numpy", [](const CudaArray& a, std::vector shape, std::vector strides, 299 | size_t offset) { 300 | std::vector numpy_strides = strides; 301 | std::transform(numpy_strides.begin(), numpy_strides.end(), numpy_strides.begin(), 302 | [](size_t& c) { return c * ELEM_SIZE; }); 303 | 304 | // copy memory to host 305 | scalar_t* host_ptr = (scalar_t*)std::malloc(a.size * ELEM_SIZE); 306 | if (host_ptr == 0) throw std::bad_alloc(); 307 | cudaError_t err = cudaMemcpy(host_ptr, a.ptr, a.size * ELEM_SIZE, cudaMemcpyDeviceToHost); 308 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 309 | 310 | // return numpy array 311 | py::capsule deallocate_buffer(host_ptr, [](void* p) { free(p); }); 312 | return py::array_t(shape, numpy_strides, host_ptr + offset, deallocate_buffer); 313 | }); 314 | 315 | // copy numpy array to GPU 316 | m.def("from_numpy", [](py::array_t a, CudaArray* out) { 317 | cudaError_t err = 318 | cudaMemcpy(out->ptr, a.request().ptr, out->size * ELEM_SIZE, cudaMemcpyHostToDevice); 319 | if (err != cudaSuccess) throw std::runtime_error(cudaGetErrorString(err)); 320 | }); 321 | 322 | m.def("fill", Fill); 323 | m.def("compact", Compact); 324 | m.def("ewise_setitem", EwiseSetitem); 325 | m.def("scalar_setitem", ScalarSetitem); 326 | m.def("ewise_add", EwiseAdd); 327 | m.def("scalar_add", ScalarAdd); 328 | /* 329 | m.def("ewise_mul", EwiseMul); 330 | m.def("scalar_mul", ScalarMul); 331 | m.def("ewise_div", EwiseDiv); 332 | m.def("scalar_div", ScalarDiv); 333 | m.def("scalar_power", ScalarPower); 334 | 335 | m.def("ewise_maximum", EwiseMaximum); 336 | m.def("scalar_maximum", ScalarMaximum); 337 | m.def("ewise_eq", EwiseEq); 338 | m.def("scalar_eq", ScalarEq); 339 | m.def("ewise_ge", EwiseGe); 340 | m.def("scalar_ge", ScalarGe); 341 | 342 | m.def("ewise_log", EwiseLog); 343 | m.def("ewise_exp", EwiseExp); 344 | m.def("ewise_tanh", EwiseTanh); 345 | 346 | m.def("matmul", Matmul); 347 | 348 | m.def("reduce_max", ReduceMax); 349 | m.def("reduce_sum", ReduceSum); 350 | */ 351 | } 352 | --------------------------------------------------------------------------------