├── .gitignore ├── LICENSE ├── README.md └── Solution ├── Exercise_01.ipynb ├── Exercise_02.ipynb ├── Exercise_03.ipynb ├── Exercise_04.ipynb ├── Exercise_05.ipynb ├── Exercise_06.ipynb ├── Exercise_07.ipynb ├── Exercise_08.ipynb └── Exercise_09.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | .vscode/* 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SuperChange001 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA_Learning 2 | This is my hobby project, for preparing the FPGA RTX interface. 3 | -------------------------------------------------------------------------------- /Solution/Exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_01.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 01\n", 26 | "> This is Hello World exampel! \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_01.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "504e7952-c8c0-4e92-cb59-cdd013efed4d" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-7rots3_w\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-7rots3_w\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=6a39638fb6e744f8984e03b1347b0b350bec4c23334d79c353a9e16c0981cae4\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-11ej3wib/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 20:52:22 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 49C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Hello World" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "0cc15126-0b1b-4c93-d9de-3e91acb770ce" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "__global__ void cuda_hello(){\n", 135 | " printf(\"Hello World from GPU!\\n\");\n", 136 | "}\n", 137 | "\n", 138 | "int main() {\n", 139 | " cuda_hello<<<1,1>>>();\n", 140 | " cudaDeviceSynchronize();\n", 141 | "\n", 142 | " return 0;\n", 143 | "}" 144 | ], 145 | "execution_count": 3, 146 | "outputs": [ 147 | { 148 | "output_type": "execute_result", 149 | "data": { 150 | "application/vnd.google.colaboratory.intrinsic+json": { 151 | "type": "string" 152 | }, 153 | "text/plain": [ 154 | "'Hello World from GPU!\\n'" 155 | ] 156 | }, 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "execution_count": 3 161 | } 162 | ] 163 | } 164 | ] 165 | } 166 | 
-------------------------------------------------------------------------------- /Solution/Exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_02.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 02\n", 26 | "> Vector add example with CPU and GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_02.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "300c7939-3fac-4eaf-bbfe-1d3641a779f4" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " 
Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-22k37xu7\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-22k37xu7\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=bc12d7017a71a934fd7d39e61241824922d949a1086f514170ffd209c2dc57b5\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-4zyegsxi/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:04:18 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 65C P8 11W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Vector Add" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "9254f1bb-2518-4300-f124-5754b0674021" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000 \n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " out[i] = a[i] + b[i];\n", 145 | " }\n", 146 | "}\n", 147 | "\n", 148 | "int main()\n", 149 | "{\n", 150 | " float *a, *b, *out;\n", 151 | " float *d_a, *d_b, *d_out; \n", 152 | "\n", 153 | " //===================步骤1===================\n", 154 | " // Allocate memory on CPU\n", 155 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 157 | " out = (float*)malloc(sizeof(float) * 
VECTOR_LENGTH);\n", 158 | "\n", 159 | " // data initializtion\n", 160 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 161 | " {\n", 162 | " a[i] = 3.0f;\n", 163 | " b[i] = 0.14f;\n", 164 | " }\n", 165 | " //===================步骤1===================\n", 166 | "\n", 167 | " //===================步骤2===================\n", 168 | " // Allocate memory on GPU\n", 169 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 170 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 171 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 172 | " //===================步骤2===================\n", 173 | "\n", 174 | " //===================步骤3===================\n", 175 | " // copy operator to GPU\n", 176 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 177 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 178 | " //===================步骤3===================\n", 179 | "\n", 180 | " //===================步骤4===================\n", 181 | " // GPU do the work, CPU waits\n", 182 | " vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 183 | " //===================步骤4===================\n", 184 | "\n", 185 | " //===================步骤5===================\n", 186 | " // Get results from the GPU\n", 187 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 188 | " cudaMemcpyDeviceToHost);\n", 189 | " \n", 190 | " // Test the result\n", 191 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 192 | " {\n", 193 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 194 | " }\n", 195 | " printf(\"out[0] is %f\\n\", out[0]);\n", 196 | " printf(\"PASSED\\n\");\n", 197 | " //===================步骤5===================\n", 198 | "\n", 199 | " //===================步骤6===================\n", 200 | " // Free the memory\n", 201 | " cudaFree(d_a);\n", 202 | " cudaFree(d_b);\n", 203 | " cudaFree(d_out);\n", 204 | " free(a);\n", 205 | " free(b);\n", 206 | " free(out);\n", 207 | " 
//===================步骤6===================\n", 208 | "}" 209 | ], 210 | "execution_count": 2, 211 | "outputs": [ 212 | { 213 | "output_type": "execute_result", 214 | "data": { 215 | "application/vnd.google.colaboratory.intrinsic+json": { 216 | "type": "string" 217 | }, 218 | "text/plain": [ 219 | "'out[0] is 3.140000\\nPASSED\\n'" 220 | ] 221 | }, 222 | "metadata": { 223 | "tags": [] 224 | }, 225 | "execution_count": 2 226 | } 227 | ] 228 | } 229 | ] 230 | } -------------------------------------------------------------------------------- /Solution/Exercise_03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_03.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 03\n", 26 | "> Vector dot product(inner product) example on GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_03.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "3e28c708-a18e-40de-a57c-6f8b1b6b08ee" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-dcn3mih6\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-dcn3mih6\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=502f57f1df304061f8b68db3c23567f7917f40794f6bdf2e09e21eef86af5570\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-mk6amdyq/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:12:57 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 41C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Vector Dot Production" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "8662228f-7eec-4158-a27b-5a9934e35909" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "#include \n", 135 | "\n", 136 | "#define VECTOR_LENGTH 10 \n", 137 | "#define MAX_ERR 1e-5\n", 138 | "\n", 139 | "__global__ void vector_dot_product(float *out, float *a, float *b, int n) \n", 140 | "{\n", 141 | " float sum=0;\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " sum = sum + a[i] * b[i];\n", 145 | " }\n", 146 | " *out = sum;\n", 147 | "}\n", 148 | "\n", 149 | "void test_vector_dot_product(void)\n", 150 | "{\n", 151 | " float *a, *b, *out;\n", 152 | " float *d_a, *d_b, *d_out; \n", 153 | "\n", 154 | " // Allocate memory on CPU\n", 155 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 157 | " out = 
(float*)malloc(sizeof(float));\n", 158 | "\n", 159 | " // data initializtion\n", 160 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 161 | " {\n", 162 | " a[i] = 3.14f;\n", 163 | " b[i] = 2.0f;\n", 164 | " }\n", 165 | "\n", 166 | " // Allocate memory on GPU\n", 167 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 168 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 169 | " cudaMalloc((void**)&d_out, sizeof(float));\n", 170 | "\n", 171 | " // copy operator to GPU\n", 172 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 173 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 174 | "\n", 175 | " // GPU do the work, CPU waits\n", 176 | " vector_dot_product<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 177 | " \n", 178 | " // Get results from the GPU\n", 179 | " cudaMemcpy(out, d_out, sizeof(float), \n", 180 | " cudaMemcpyDeviceToHost);\n", 181 | " \n", 182 | " // Test the result\n", 183 | " assert(fabs(*out - 20*3.14) < MAX_ERR);\n", 184 | " \n", 185 | " printf(\"out[0] = %f\\n\", out[0]);\n", 186 | " printf(\"PASSED\\n\");\n", 187 | "\n", 188 | " // Free the memory\n", 189 | " cudaFree(d_a);\n", 190 | " cudaFree(d_b);\n", 191 | " cudaFree(d_out);\n", 192 | " free(a);\n", 193 | " free(b);\n", 194 | " free(out);\n", 195 | "}\n", 196 | "\n", 197 | "int main()\n", 198 | "{\n", 199 | " test_vector_dot_product();\n", 200 | "}" 201 | ], 202 | "execution_count": 2, 203 | "outputs": [ 204 | { 205 | "output_type": "execute_result", 206 | "data": { 207 | "application/vnd.google.colaboratory.intrinsic+json": { 208 | "type": "string" 209 | }, 210 | "text/plain": [ 211 | "'out[0] = 62.799995\\nPASSED\\n'" 212 | ] 213 | }, 214 | "metadata": { 215 | "tags": [] 216 | }, 217 | "execution_count": 2 218 | } 219 | ] 220 | } 221 | ] 222 | } -------------------------------------------------------------------------------- /Solution/Exercise_04.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_04.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 04\n", 26 | "> Matrix summation example on GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_04.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "0de4afb8-9a85-42aa-ec39-d9ab1bc7f898" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-t778hzfn\n", 68 | " Running command git clone 
-q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-t778hzfn\n", 69 | "Requirement already satisfied (use --upgrade to upgrade): NVCCPlugin==0.0.2 from git+git://github.com/depctg/nvcc4jupyter.git in /usr/local/lib/python3.7/dist-packages\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=8873f16186676dbca7cd6b1588c46f86c101f2a8cdd29b38b813a2ca468ed8f7\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-yr5jb27e/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "The nvcc_plugin extension is already loaded. To reload it, use:\n", 76 | " %reload_ext nvcc_plugin\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:19:33 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 38C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Matrix Summation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "1e81177e-dd74-421e-ce4e-74b74936b818" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "#include \n", 135 | "\n", 136 | "#define M 10\n", 137 | "#define N 10\n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void matrix_summation(float* out, float *a, float *b, int m, int n) \n", 141 | "{\n", 142 | " int index;\n", 143 | " for(int i = 0; i < m; i++)\n", 144 | " {\n", 145 | " for(int j = 0; j < n; j++)\n", 146 | " {\n", 147 | " index = i*n+j;\n", 148 | " out[index] = a[index] + b[index];\n", 149 | " }\n", 150 | " }\n", 151 | "}\n", 152 | "\n", 153 | "int main()\n", 154 | "{\n", 155 | " float *a, *b, *out;\n", 156 | " float *d_a, *d_b, *d_out;\n", 157 | " \n", 158 | " a = (float*)malloc(sizeof(float) * (M * N));\n", 159 | " b = (float*)malloc(sizeof(float) * (M * N));\n", 160 | " 
out = (float*)malloc(sizeof(float) * (M * N));\n", 161 | "\n", 162 | " // data initializtion\n", 163 | " for(int i = 0; i < M; i++)\n", 164 | " {\n", 165 | " for(int j = 0; j < N; j++)\n", 166 | " {\n", 167 | " int index = i*N+j;\n", 168 | " a[index] = i*3.14f;\n", 169 | " b[index] = j;\n", 170 | " }\n", 171 | " }\n", 172 | " printf(\"a[12] = %f\\n\", a[12]);\n", 173 | " printf(\"b[12] = %f\\n\", b[12]);\n", 174 | "\n", 175 | " // Allocate memory on GPU\n", 176 | " cudaMalloc((void**)&d_a, sizeof(float) * (M * N));\n", 177 | " cudaMalloc((void**)&d_b, sizeof(float) * (M * N));\n", 178 | " cudaMalloc((void**)&d_out, sizeof(float) * (M * N));\n", 179 | "\n", 180 | " // copy operator to GPU\n", 181 | " cudaMemcpy(d_a, a, sizeof(float) * (M * N), cudaMemcpyHostToDevice);\n", 182 | " cudaMemcpy(d_b, b, sizeof(float) * (M * N), cudaMemcpyHostToDevice);\n", 183 | "\n", 184 | " // GPU do the work, CPU waits\n", 185 | " matrix_summation<<<1,1>>>(d_out, d_a, d_b, M, N);\n", 186 | " \n", 187 | " // Get results from the GPU\n", 188 | " cudaMemcpy(out, d_out, sizeof(float) * (M * N), \n", 189 | " cudaMemcpyDeviceToHost);\n", 190 | " \n", 191 | " // Test the result\n", 192 | " for(int i = 0; i < M; i++)\n", 193 | " {\n", 194 | " for(int j = 0; j < N; j++)\n", 195 | " {\n", 196 | " int index = i*N+j;\n", 197 | " assert(fabs(out[index] - a[index] - b[index]) < MAX_ERR);\n", 198 | " }\n", 199 | " }\n", 200 | " printf(\"out[12] = %f\\n\", out[12]);\n", 201 | " printf(\"PASSED\\n\");\n", 202 | " \n", 203 | " cudaDeviceSynchronize();\n", 204 | " // Free the memory\n", 205 | " cudaFree(d_a);\n", 206 | " cudaFree(d_b);\n", 207 | " cudaFree(d_out);\n", 208 | " free(a);\n", 209 | " free(b);\n", 210 | " free(out);\n", 211 | " \n", 212 | " return 0;\n", 213 | "}" 214 | ], 215 | "execution_count": 5, 216 | "outputs": [ 217 | { 218 | "output_type": "execute_result", 219 | "data": { 220 | "application/vnd.google.colaboratory.intrinsic+json": { 221 | "type": "string" 222 | }, 223 | 
"text/plain": [ 224 | "'a[12] = 3.140000\\nb[12] = 2.000000\\nout[12] = 5.140000\\nPASSED\\n'" 225 | ] 226 | }, 227 | "metadata": { 228 | "tags": [] 229 | }, 230 | "execution_count": 5 231 | } 232 | ] 233 | } 234 | ] 235 | } 236 | -------------------------------------------------------------------------------- /Solution/Exercise_05.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_05.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "gZABpep_V-8C" 24 | }, 25 | "source": [ 26 | "# CUDA Exercise 05\n", 27 | "> Parallelized Vector add. \n", 28 | "\n", 29 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 30 | "\n", 31 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_05.ipynb)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "P401L2N_WG6R" 38 | }, 39 | "source": [ 40 | "## Initialize the CUDA dev environment" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "OONoNFZeV63L", 50 | "outputId": "e15d11f8-6c0f-43b7-b60e-675822ac8794" 51 | }, 52 | "source": [ 53 | "# clone the code repo,\n", 54 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 55 | "%load_ext nvcc_plugin\n", 56 | "\n", 57 | "# Check the environment \n", 58 | "!lsb_release -a\n", 59 | "!nvcc --version\n", 60 | "!nvidia-smi" 61 | ], 62 | "execution_count": 1, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 68 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-2r93udvh\n", 69 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-2r93udvh\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4010fe33cb0bdc3a44bc6c4d10aea34076d9daf8c6daec21c1a1544f0ab1b3f4\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-y67t9ubh/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "Installing collected packages: NVCCPlugin\n", 76 | "Successfully installed NVCCPlugin-0.0.2\n", 77 | "Default out bin result.out\n", 78 | "No LSB modules are available.\n", 79 | "Distributor ID:\tUbuntu\n", 80 | "Description:\tUbuntu 18.04.5 LTS\n", 81 | "Release:\t18.04\n", 82 | "Codename:\tbionic\n", 83 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 84 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 85 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 86 | "Cuda compilation tools, release 11.0, V11.0.221\n", 87 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 88 | "Thu Apr 22 21:31:17 2021 \n", 89 | "+-----------------------------------------------------------------------------+\n", 90 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 91 | "|-------------------------------+----------------------+----------------------+\n", 92 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 93 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 94 | "| | | MIG M. 
|\n", 95 | "|===============================+======================+======================|\n", 96 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 97 | "| N/A 50C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 98 | "| | | N/A |\n", 99 | "+-------------------------------+----------------------+----------------------+\n", 100 | " \n", 101 | "+-----------------------------------------------------------------------------+\n", 102 | "| Processes: |\n", 103 | "| GPU GI CI PID Type Process name GPU Memory |\n", 104 | "| ID ID Usage |\n", 105 | "|=============================================================================|\n", 106 | "| No running processes found |\n", 107 | "+-----------------------------------------------------------------------------+\n" 108 | ], 109 | "name": "stdout" 110 | } 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "DDN2x4izW0rO" 117 | }, 118 | "source": [ 119 | "## Vector Add with Single Thread" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "7d95b219-bf9e-4f5f-e443-e6219f453dd0" 130 | }, 131 | "source": [ 132 | "%%writefile verctor_add_signal_thread.cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000 \n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " out[i] = a[i] + b[i];\n", 145 | " }\n", 146 | "}\n", 147 | "\n", 148 | "int main()\n", 149 | "{\n", 150 | " float *a, *b, *out;\n", 151 | " float *d_a, *d_b, *d_out; \n", 152 | "\n", 153 | " // Allocate memory on CPU\n", 154 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 155 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 
157 | "\n", 158 | " // data initializtion\n", 159 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 160 | " {\n", 161 | " a[i] = 3.0f;\n", 162 | " b[i] = 0.14f;\n", 163 | " }\n", 164 | "\n", 165 | " // Allocate memory on GPU\n", 166 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 167 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 168 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 169 | "\n", 170 | " // copy operator to GPU\n", 171 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 172 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 173 | "\n", 174 | " for(int i=0;i<100;i++)\n", 175 | " {\n", 176 | " // GPU do the work, CPU waits\n", 177 | " vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 178 | " }\n", 179 | " // Get results from the GPU\n", 180 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 181 | " cudaMemcpyDeviceToHost);\n", 182 | " \n", 183 | " // Test the result\n", 184 | " for(int i = 0; i < VECTOR_LENGTH; i++){\n", 185 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 186 | " }\n", 187 | " printf(\"out[0] = %f\\n\", out[0]);\n", 188 | " printf(\"PASSED\\n\");\n", 189 | "\n", 190 | " // Free the memory\n", 191 | " cudaFree(d_a);\n", 192 | " cudaFree(d_b);\n", 193 | " cudaFree(d_out);\n", 194 | " free(a);\n", 195 | " free(b);\n", 196 | " free(out);\n", 197 | "}" 198 | ], 199 | "execution_count": 2, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "Writing verctor_add_signal_thread.cu\n" 205 | ], 206 | "name": "stdout" 207 | } 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "TZI-nXBxefbc" 214 | }, 215 | "source": [ 216 | "## Vector Add with Multiple Threads" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "colab": { 223 | "base_uri": "https://localhost:8080/" 224 | }, 225 | "id": "cxsC_CQRen43", 226 | "outputId": 
"184cc2e3-eea7-4731-df27-2fd165c475b6" 227 | }, 228 | "source": [ 229 | "%%writefile verctor_add_multi_thread.cu\n", 230 | "\n", 231 | "#include \n", 232 | "#include \n", 233 | "#include \n", 234 | "\n", 235 | "#define VECTOR_LENGTH 10000\n", 236 | "#define MAX_ERR 1e-4\n", 237 | "\n", 238 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 239 | "{\n", 240 | " int index = threadIdx.x;\n", 241 | " int stride = blockDim.x;\n", 242 | " for(int i = index; i < n; i=i+stride)\n", 243 | " {\n", 244 | " out[i] = a[i] + b[i];\n", 245 | " }\n", 246 | "}\n", 247 | "\n", 248 | "int main(int argc, char *argv[])\n", 249 | "{\n", 250 | " float *a, *b, *out;\n", 251 | " float *d_a, *d_b, *d_out;\n", 252 | " int list_of_test_block_size[]={1,64,128,256,512,1024};\n", 253 | " int block_size = 1;\n", 254 | " \n", 255 | " if( argc == 2 ) {\n", 256 | " //printf(\"The argument supplied is %s\\n\", argv[1]);\n", 257 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 258 | " //atoi = ascii to int\n", 259 | " \n", 260 | " block_size = list_of_test_block_size[arg1];\n", 261 | " }\n", 262 | " else if( argc > 2 ) {\n", 263 | " printf(\"Too many arguments supplied.\\n\");\n", 264 | " }\n", 265 | " else {\n", 266 | " printf(\"One argument expected.\\n\");\n", 267 | " \n", 268 | " }\n", 269 | " \n", 270 | " printf(\"The Block size is %d.\\n\", block_size);\n", 271 | "\n", 272 | " // Allocate memory on CPU\n", 273 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 274 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 275 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 276 | "\n", 277 | " // data initializtion\n", 278 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 279 | " {\n", 280 | " a[i] = 3.0f;\n", 281 | " b[i] = 0.14f;\n", 282 | " }\n", 283 | "\n", 284 | " // Allocate memory on GPU\n", 285 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 286 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 287 | " 
cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 288 | "\n", 289 | "    // copy operator to GPU\n", 290 | "    cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 291 | "    cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 292 | "\n", 293 | "    for(int i=0;i<100;i++)\n", 294 | "    {\n", 295 | "        // GPU do the work, CPU waits\n", 296 | "        vector_add<<<1,block_size>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 297 | "    }\n", 298 | "    // Get results from the GPU\n", 299 | "    cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 300 | "               cudaMemcpyDeviceToHost);\n", 301 | "    \n", 302 | "    // Test the result\n", 303 | "    for(int i = 0; i < VECTOR_LENGTH; i++){\n", 304 | "        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 305 | "    }\n", 306 | "    printf(\"out[0] = %f\\n\", out[0]);\n", 307 | "    printf(\"PASSED\\n\");\n", 308 | "\n", 309 | "    // Free the memory\n", 310 | "    cudaFree(d_a);\n", 311 | "    cudaFree(d_b);\n", 312 | "    cudaFree(d_out);\n", 313 | "    free(a);\n", 314 | "    free(b);\n", 315 | "    free(out);\n", 316 | "}" 317 | ], 318 | "execution_count": 3, 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "text": [ 323 | "Writing verctor_add_multi_thread.cu\n" 324 | ], 325 | "name": "stdout" 326 | } 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "id": "d9Zw1YvsewRK" 333 | }, 334 | "source": [ 335 | "## Evaluation" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "id": "q4j_yDKhfHzv" 342 | }, 343 | "source": [ 344 | "Measuring the time cost of executing the CUDA function with a **single thread**" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "colab": { 351 | "base_uri": "https://localhost:8080/" 352 | }, 353 | "id": "uOfZmgUxezqF", 354 | "outputId": "5be83f69-6a78-4e48-f87e-83b28c36aac1" 355 | }, 356 | "source": [ 357 | "!nvcc -o verctor_add_signal_thread verctor_add_signal_thread.cu\n", 358 | "!nvprof 
./verctor_add_signal_thread" 359 | ], 360 | "execution_count": 4, 361 | "outputs": [ 362 | { 363 | "output_type": "stream", 364 | "text": [ 365 | "==166== NVPROF is profiling process 166, command: ./verctor_add_signal_thread\n", 366 | "out[0] = 3.140000\n", 367 | "PASSED\n", 368 | "==166== Profiling application: ./verctor_add_signal_thread\n", 369 | "==166== Profiling result:\n", 370 | " Type Time(%) Time Calls Avg Min Max Name\n", 371 | " GPU activities: 99.99% 118.76ms 100 1.1876ms 1.1875ms 1.1882ms vector_add(float*, float*, float*, int)\n", 372 | " 0.01% 9.6960us 2 4.8480us 4.7040us 4.9920us [CUDA memcpy HtoD]\n", 373 | " 0.00% 5.1840us 1 5.1840us 5.1840us 5.1840us [CUDA memcpy DtoH]\n", 374 | " API calls: 72.18% 312.33ms 3 104.11ms 2.8630us 312.32ms cudaMalloc\n", 375 | " 27.39% 118.53ms 3 39.510ms 27.121us 118.47ms cudaMemcpy\n", 376 | " 0.14% 603.24us 1 603.24us 603.24us 603.24us cuDeviceGetPCIBusId\n", 377 | " 0.11% 481.38us 100 4.8130us 3.4180us 35.589us cudaLaunchKernel\n", 378 | " 0.08% 356.39us 1 356.39us 356.39us 356.39us cuDeviceTotalMem\n", 379 | " 0.04% 182.81us 101 1.8100us 133ns 86.635us cuDeviceGetAttribute\n", 380 | " 0.04% 170.18us 3 56.725us 4.5810us 145.75us cudaFree\n", 381 | " 0.01% 28.980us 1 28.980us 28.980us 28.980us cuDeviceGetName\n", 382 | " 0.00% 1.5750us 2 787ns 328ns 1.2470us cuDeviceGet\n", 383 | " 0.00% 1.4200us 3 473ns 232ns 861ns cuDeviceGetCount\n", 384 | " 0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid\n" 385 | ], 386 | "name": "stdout" 387 | } 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": { 393 | "id": "ch5mhas6fIZd" 394 | }, 395 | "source": [ 396 | "Measuring the time cost of executing the CUDA fucntion with **multi-threads**" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "metadata": { 402 | "colab": { 403 | "base_uri": "https://localhost:8080/" 404 | }, 405 | "id": "0wc1X6ZCFAVo", 406 | "outputId": "948c0e1c-a491-4173-9a76-e0bfcb291db5" 407 | }, 408 | "source": [ 409 | "!nvcc -o 
verctor_add_multi_thread verctor_add_multi_thread.cu\n", 410 | "!nvprof ./verctor_add_multi_thread 0\n", 411 | "!nvprof ./verctor_add_multi_thread 1\n", 412 | "!nvprof ./verctor_add_multi_thread 2\n", 413 | "!nvprof ./verctor_add_multi_thread 3\n", 414 | "!nvprof ./verctor_add_multi_thread 4\n", 415 | "!nvprof ./verctor_add_multi_thread 5" 416 | ], 417 | "execution_count": 5, 418 | "outputs": [ 419 | { 420 | "output_type": "stream", 421 | "text": [ 422 | "The Block size is 1.\n", 423 | "==210== NVPROF is profiling process 210, command: ./verctor_add_multi_thread 0\n", 424 | "out[0] = 3.140000\n", 425 | "PASSED\n", 426 | "==210== Profiling application: ./verctor_add_multi_thread 0\n", 427 | "==210== Profiling result:\n", 428 | " Type Time(%) Time Calls Avg Min Max Name\n", 429 | " GPU activities: 99.99% 110.00ms 100 1.1000ms 1.0999ms 1.1004ms vector_add(float*, float*, float*, int)\n", 430 | " 0.01% 9.4400us 2 4.7200us 4.5760us 4.8640us [CUDA memcpy HtoD]\n", 431 | " 0.00% 5.1520us 1 5.1520us 5.1520us 5.1520us [CUDA memcpy DtoH]\n", 432 | " API calls: 67.94% 235.85ms 3 78.615ms 2.9820us 235.84ms cudaMalloc\n", 433 | " 31.64% 109.82ms 3 36.607ms 25.740us 109.77ms cudaMemcpy\n", 434 | " 0.15% 533.49us 1 533.49us 533.49us 533.49us cuDeviceTotalMem\n", 435 | " 0.13% 448.80us 100 4.4880us 3.4570us 33.193us cudaLaunchKernel\n", 436 | " 0.07% 230.29us 3 76.761us 5.3020us 199.20us cudaFree\n", 437 | " 0.06% 193.50us 101 1.9150us 184ns 79.100us cuDeviceGetAttribute\n", 438 | " 0.01% 33.101us 1 33.101us 33.101us 33.101us cuDeviceGetName\n", 439 | " 0.00% 5.6790us 1 5.6790us 5.6790us 5.6790us cuDeviceGetPCIBusId\n", 440 | " 0.00% 2.1510us 3 717ns 201ns 1.4490us cuDeviceGetCount\n", 441 | " 0.00% 1.7380us 2 869ns 272ns 1.4660us cuDeviceGet\n", 442 | " 0.00% 427ns 1 427ns 427ns 427ns cuDeviceGetUuid\n", 443 | "The Block size is 64.\n", 444 | "==221== NVPROF is profiling process 221, command: ./verctor_add_multi_thread 1\n", 445 | "out[0] = 3.140000\n", 446 | "PASSED\n", 447 | 
"==221== Profiling application: ./verctor_add_multi_thread 1\n", 448 | "==221== Profiling result:\n", 449 | " Type Time(%) Time Calls Avg Min Max Name\n", 450 | " GPU activities: 99.66% 3.4722ms 100 34.722us 34.624us 34.945us vector_add(float*, float*, float*, int)\n", 451 | " 0.20% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 452 | " 0.13% 4.5760us 1 4.5760us 4.5760us 4.5760us [CUDA memcpy DtoH]\n", 453 | " API calls: 97.83% 193.30ms 3 64.432ms 3.3410us 193.29ms cudaMalloc\n", 454 | " 1.61% 3.1743ms 3 1.0581ms 24.151us 3.1191ms cudaMemcpy\n", 455 | " 0.24% 475.21us 100 4.7520us 3.4350us 29.099us cudaLaunchKernel\n", 456 | " 0.17% 341.94us 1 341.94us 341.94us 341.94us cuDeviceTotalMem\n", 457 | " 0.07% 145.35us 101 1.4390us 137ns 61.921us cuDeviceGetAttribute\n", 458 | " 0.06% 123.14us 3 41.048us 4.7850us 107.26us cudaFree\n", 459 | " 0.01% 26.334us 1 26.334us 26.334us 26.334us cuDeviceGetName\n", 460 | " 0.00% 5.1070us 1 5.1070us 5.1070us 5.1070us cuDeviceGetPCIBusId\n", 461 | " 0.00% 1.7040us 2 852ns 311ns 1.3930us cuDeviceGet\n", 462 | " 0.00% 1.5310us 3 510ns 195ns 746ns cuDeviceGetCount\n", 463 | " 0.00% 288ns 1 288ns 288ns 288ns cuDeviceGetUuid\n", 464 | "The Block size is 128.\n", 465 | "==232== NVPROF is profiling process 232, command: ./verctor_add_multi_thread 2\n", 466 | "out[0] = 3.140000\n", 467 | "PASSED\n", 468 | "==232== Profiling application: ./verctor_add_multi_thread 2\n", 469 | "==232== Profiling result:\n", 470 | " Type Time(%) Time Calls Avg Min Max Name\n", 471 | " GPU activities: 99.37% 1.8371ms 100 18.371us 18.176us 18.880us vector_add(float*, float*, float*, int)\n", 472 | " 0.39% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 473 | " 0.25% 4.5760us 1 4.5760us 4.5760us 4.5760us [CUDA memcpy DtoH]\n", 474 | " API calls: 98.53% 178.54ms 3 59.513ms 3.2840us 178.53ms cudaMalloc\n", 475 | " 0.85% 1.5462ms 3 515.39us 24.255us 1.4941ms cudaMemcpy\n", 476 | " 0.26% 462.22us 100 4.6220us 3.5560us 25.245us 
cudaLaunchKernel\n", 477 | " 0.19% 342.18us 1 342.18us 342.18us 342.18us cuDeviceTotalMem\n", 478 | " 0.08% 150.30us 101 1.4880us 137ns 61.771us cuDeviceGetAttribute\n", 479 | " 0.07% 124.56us 3 41.519us 4.7580us 104.88us cudaFree\n", 480 | " 0.01% 27.142us 1 27.142us 27.142us 27.142us cuDeviceGetName\n", 481 | " 0.00% 6.0030us 1 6.0030us 6.0030us 6.0030us cuDeviceGetPCIBusId\n", 482 | " 0.00% 1.7140us 3 571ns 232ns 991ns cuDeviceGetCount\n", 483 | " 0.00% 1.1770us 2 588ns 302ns 875ns cuDeviceGet\n", 484 | " 0.00% 292ns 1 292ns 292ns 292ns cuDeviceGetUuid\n", 485 | "The Block size is 256.\n", 486 | "==243== NVPROF is profiling process 243, command: ./verctor_add_multi_thread 3\n", 487 | "out[0] = 3.140000\n", 488 | "PASSED\n", 489 | "==243== Profiling application: ./verctor_add_multi_thread 3\n", 490 | "==243== Profiling result:\n", 491 | " Type Time(%) Time Calls Avg Min Max Name\n", 492 | " GPU activities: 98.87% 1.0244ms 100 10.244us 9.9200us 11.136us vector_add(float*, float*, float*, int)\n", 493 | " 0.69% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 494 | " 0.44% 4.6080us 1 4.6080us 4.6080us 4.6080us [CUDA memcpy DtoH]\n", 495 | " API calls: 98.98% 180.43ms 3 60.143ms 3.3970us 180.42ms cudaMalloc\n", 496 | " 0.38% 692.31us 3 230.77us 23.738us 639.01us cudaMemcpy\n", 497 | " 0.27% 500.12us 100 5.0010us 3.6400us 26.479us cudaLaunchKernel\n", 498 | " 0.20% 367.77us 1 367.77us 367.77us 367.77us cuDeviceTotalMem\n", 499 | " 0.08% 145.39us 101 1.4390us 146ns 60.433us cuDeviceGetAttribute\n", 500 | " 0.07% 121.86us 3 40.621us 4.3540us 106.28us cudaFree\n", 501 | " 0.02% 32.412us 1 32.412us 32.412us 32.412us cuDeviceGetName\n", 502 | " 0.00% 4.7100us 1 4.7100us 4.7100us 4.7100us cuDeviceGetPCIBusId\n", 503 | " 0.00% 1.4430us 3 481ns 196ns 847ns cuDeviceGetCount\n", 504 | " 0.00% 1.1370us 2 568ns 297ns 840ns cuDeviceGet\n", 505 | " 0.00% 288ns 1 288ns 288ns 288ns cuDeviceGetUuid\n", 506 | "The Block size is 512.\n", 507 | "==256== NVPROF is profiling 
process 256, command: ./verctor_add_multi_thread 4\n", 508 | "out[0] = 3.140000\n", 509 | "PASSED\n", 510 | "==256== Profiling application: ./verctor_add_multi_thread 4\n", 511 | "==256== Profiling result:\n", 512 | " Type Time(%) Time Calls Avg Min Max Name\n", 513 | " GPU activities: 98.65% 1.0801ms 100 10.801us 10.592us 11.296us vector_add(float*, float*, float*, int)\n", 514 | " 0.88% 9.6640us 2 4.8320us 4.7040us 4.9600us [CUDA memcpy HtoD]\n", 515 | " 0.46% 5.0880us 1 5.0880us 5.0880us 5.0880us [CUDA memcpy DtoH]\n", 516 | " API calls: 98.92% 184.62ms 3 61.541ms 2.6110us 184.62ms cudaMalloc\n", 517 | " 0.47% 879.32us 3 293.11us 26.589us 797.05us cudaMemcpy\n", 518 | " 0.24% 454.74us 100 4.5470us 3.5170us 30.816us cudaLaunchKernel\n", 519 | " 0.20% 373.11us 1 373.11us 373.11us 373.11us cuDeviceTotalMem\n", 520 | " 0.08% 155.62us 101 1.5400us 139ns 67.367us cuDeviceGetAttribute\n", 521 | " 0.06% 119.57us 3 39.858us 4.2690us 104.01us cudaFree\n", 522 | " 0.02% 29.107us 1 29.107us 29.107us 29.107us cuDeviceGetName\n", 523 | " 0.00% 5.2130us 1 5.2130us 5.2130us 5.2130us cuDeviceGetPCIBusId\n", 524 | " 0.00% 1.4490us 3 483ns 196ns 1.0070us cuDeviceGetCount\n", 525 | " 0.00% 1.1590us 2 579ns 203ns 956ns cuDeviceGet\n", 526 | " 0.00% 317ns 1 317ns 317ns 317ns cuDeviceGetUuid\n", 527 | "The Block size is 1024.\n", 528 | "==267== NVPROF is profiling process 267, command: ./verctor_add_multi_thread 5\n", 529 | "out[0] = 3.140000\n", 530 | "PASSED\n", 531 | "==267== Profiling application: ./verctor_add_multi_thread 5\n", 532 | "==267== Profiling result:\n", 533 | " Type Time(%) Time Calls Avg Min Max Name\n", 534 | " GPU activities: 97.97% 715.27us 100 7.1520us 7.0080us 8.0320us vector_add(float*, float*, float*, int)\n", 535 | " 1.33% 9.6960us 2 4.8480us 4.7040us 4.9920us [CUDA memcpy HtoD]\n", 536 | " 0.70% 5.0880us 1 5.0880us 5.0880us 5.0880us [CUDA memcpy DtoH]\n", 537 | " API calls: 99.13% 182.23ms 3 60.744ms 2.6040us 182.22ms cudaMalloc\n", 538 | " 0.29% 528.43us 
100 5.2840us 3.5320us 28.747us cudaLaunchKernel\n", 539 | " 0.22% 404.82us 3 134.94us 25.338us 351.20us cudaMemcpy\n", 540 | " 0.20% 358.63us 1 358.63us 358.63us 358.63us cuDeviceTotalMem\n", 541 | " 0.08% 146.85us 101 1.4540us 139ns 62.110us cuDeviceGetAttribute\n", 542 | " 0.07% 122.11us 3 40.701us 4.8210us 105.45us cudaFree\n", 543 | " 0.02% 29.790us 1 29.790us 29.790us 29.790us cuDeviceGetName\n", 544 | " 0.00% 5.5170us 1 5.5170us 5.5170us 5.5170us cuDeviceGetPCIBusId\n", 545 | " 0.00% 1.2460us 3 415ns 184ns 737ns cuDeviceGetCount\n", 546 | " 0.00% 1.1670us 2 583ns 223ns 944ns cuDeviceGet\n", 547 | " 0.00% 267ns 1 267ns 267ns 267ns cuDeviceGetUuid\n" 548 | ], 549 | "name": "stdout" 550 | } 551 | ] 552 | } 553 | ] 554 | } 555 | -------------------------------------------------------------------------------- /Solution/Exercise_06.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_06.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "gZABpep_V-8C" 24 | }, 25 | "source": [ 26 | "# CUDA Exercise 06\n", 27 | "> Another approach of parallelized Vector add. \n", 28 | "\n", 29 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 30 | "\n", 31 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_06.ipynb)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "P401L2N_WG6R" 38 | }, 39 | "source": [ 40 | "## Initialize the CUDA dev environment" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "OONoNFZeV63L", 50 | "outputId": "7ea7eb64-bce9-4b3e-fb37-26c22c542977" 51 | }, 52 | "source": [ 53 | "# clone the code repo,\n", 54 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 55 | "%load_ext nvcc_plugin\n", 56 | "\n", 57 | "# Check the environment \n", 58 | "!lsb_release -a\n", 59 | "!nvcc --version\n", 60 | "!nvidia-smi" 61 | ], 62 | "execution_count": 1, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 68 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-0h_on20m\n", 69 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-0h_on20m\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=c3bbe482d2b7bd608c155ee0855393664aee1a212eb71f17e7c1d5c7be4d469c\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-s37pn594/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "Installing collected packages: NVCCPlugin\n", 76 | "Successfully installed NVCCPlugin-0.0.2\n", 77 | "Default out bin result.out\n", 78 | "No LSB modules are available.\n", 79 | "Distributor ID:\tUbuntu\n", 80 | "Description:\tUbuntu 18.04.5 LTS\n", 81 | "Release:\t18.04\n", 82 | "Codename:\tbionic\n", 83 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 84 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 85 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 86 | "Cuda compilation tools, release 11.0, V11.0.221\n", 87 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 88 | "Thu Apr 22 21:38:17 2021 \n", 89 | "+-----------------------------------------------------------------------------+\n", 90 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 91 | "|-------------------------------+----------------------+----------------------+\n", 92 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 93 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 94 | "| | | MIG M. 
|\n", 95 | "|===============================+======================+======================|\n", 96 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 97 | "| N/A 64C P8 11W / 70W | 0MiB / 15109MiB | 0% Default |\n", 98 | "| | | N/A |\n", 99 | "+-------------------------------+----------------------+----------------------+\n", 100 | " \n", 101 | "+-----------------------------------------------------------------------------+\n", 102 | "| Processes: |\n", 103 | "| GPU GI CI PID Type Process name GPU Memory |\n", 104 | "| ID ID Usage |\n", 105 | "|=============================================================================|\n", 106 | "| No running processes found |\n", 107 | "+-----------------------------------------------------------------------------+\n" 108 | ], 109 | "name": "stdout" 110 | } 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "Bxx_JzKwgnh1" 117 | }, 118 | "source": [ 119 | "## Vector Add with Multiple Threads across Blocks" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "c9574221-69cc-4343-bfa6-44db49fdadc5" 130 | }, 131 | "source": [ 132 | "%%writefile verctor_add_multi_blocks_thread.cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000\n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 143 | " \n", 144 | " if(tid 2 ) {\n", 165 | " printf(\"Too many arguments supplied.\\n\");\n", 166 | " }\n", 167 | " else {\n", 168 | " printf(\"One argument expected.\\n\");\n", 169 | " \n", 170 | " }\n", 171 | "\n", 172 | " printf(\"The Block size is %d.\\n\", block_size);\n", 173 | "\n", 174 | "\n", 175 | " // Allocate memory on CPU\n", 176 | " a = (float*)malloc(sizeof(float) * 
VECTOR_LENGTH);\n", 177 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 178 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 179 | "\n", 180 | " // data initializtion\n", 181 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 182 | " {\n", 183 | " a[i] = 3.0f;\n", 184 | " b[i] = 0.14f;\n", 185 | " }\n", 186 | "\n", 187 | " // Allocate memory on GPU\n", 188 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 189 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 190 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 191 | "\n", 192 | " // copy operator to GPU\n", 193 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 194 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 195 | "\n", 196 | " for(int i=0;i<100;i++)\n", 197 | " {\n", 198 | " // GPU do the work, CPU waits\n", 199 | " // Executing kernel \n", 200 | " int grid_size = ((VECTOR_LENGTH + block_size) / block_size);\n", 201 | " vector_add<<>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 202 | " }\n", 203 | " // Get results from the GPU\n", 204 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 205 | " cudaMemcpyDeviceToHost);\n", 206 | "\n", 207 | " // Test the result\n", 208 | " for(int i = 0; i < VECTOR_LENGTH; i++){\n", 209 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 210 | " }\n", 211 | " printf(\"out[0] = %f\\n\", out[0]);\n", 212 | " printf(\"PASSED\\n\");\n", 213 | "\n", 214 | " // Free the memory\n", 215 | " cudaFree(d_a);\n", 216 | " cudaFree(d_b);\n", 217 | " cudaFree(d_out);\n", 218 | " free(a);\n", 219 | " free(b);\n", 220 | " free(out);\n", 221 | " }" 222 | ], 223 | "execution_count": 2, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "text": [ 228 | "Writing verctor_add_multi_blocks_thread.cu\n" 229 | ], 230 | "name": "stdout" 231 | } 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "d9Zw1YvsewRK" 238 | }, 
239 | "source": [ 240 | "## Evaluation" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "id": "ch5mhas6fIZd" 247 | }, 248 | "source": [ 249 | "Measuring the time cost of executing the CUDA function" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "metadata": { 255 | "colab": { 256 | "base_uri": "https://localhost:8080/" 257 | }, 258 | "id": "0wc1X6ZCFAVo", 259 | "outputId": "be4bb169-e77b-4060-daa1-588a5d832419" 260 | }, 261 | "source": [ 262 | "!nvcc -o verctor_add_multi_blocks_thread verctor_add_multi_blocks_thread.cu\n", 263 | "!nvprof ./verctor_add_multi_blocks_thread 0\n", 264 | "!nvprof ./verctor_add_multi_blocks_thread 1\n", 265 | "!nvprof ./verctor_add_multi_blocks_thread 2\n", 266 | "!nvprof ./verctor_add_multi_blocks_thread 3" 267 | ], 268 | "execution_count": 3, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "text": [ 273 | "The Block size is 1.\n", 274 | "==165== NVPROF is profiling process 165, command: ./verctor_add_multi_blocks_thread 0\n", 275 | "out[0] = 3.140000\n", 276 | "PASSED\n", 277 | "==165== Profiling application: ./verctor_add_multi_blocks_thread 0\n", 278 | "==165== Profiling result:\n", 279 | "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n", 280 | " GPU activities:   99.53%  3.0884ms       100  30.884us  30.785us  31.265us  vector_add(float*, float*, float*, int)\n", 281 | "                    0.31%  9.5690us         2  4.7840us  4.6400us  4.9290us  [CUDA memcpy HtoD]\n", 282 | "                    0.16%  5.1200us         1  5.1200us  5.1200us  5.1200us  [CUDA memcpy DtoH]\n", 283 | "      API calls:   98.68%  329.13ms         3  109.71ms  3.1850us  329.12ms  cudaMalloc\n", 284 | "                    0.86%  2.8551ms         3  951.69us  25.687us  2.7982ms  cudaMemcpy\n", 285 | "                    0.14%  471.47us       100  4.7140us  3.2380us  32.273us  cudaLaunchKernel\n", 286 | "                    0.12%  398.11us         1  398.11us  398.11us  398.11us  cuDeviceGetPCIBusId\n", 287 | "                    0.11%  352.50us         1  352.50us  352.50us  352.50us  cuDeviceTotalMem\n", 288 | "                    0.04%  148.32us       101  1.4680us     137ns  63.197us  cuDeviceGetAttribute\n", 289 | "                    0.04%  125.70us         3  41.900us  4.3440us  
110.18us cudaFree\n", 290 | " 0.01% 48.719us 1 48.719us 48.719us 48.719us cuDeviceGetName\n", 291 | " 0.00% 1.6970us 3 565ns 234ns 1.0660us cuDeviceGetCount\n", 292 | " 0.00% 1.4190us 2 709ns 236ns 1.1830us cuDeviceGet\n", 293 | " 0.00% 304ns 1 304ns 304ns 304ns cuDeviceGetUuid\n", 294 | "The Block size is 64.\n", 295 | "==176== NVPROF is profiling process 176, command: ./verctor_add_multi_blocks_thread 1\n", 296 | "out[0] = 3.140000\n", 297 | "PASSED\n", 298 | "==176== Profiling application: ./verctor_add_multi_blocks_thread 1\n", 299 | "==176== Profiling result:\n", 300 | " Type Time(%) Time Calls Avg Min Max Name\n", 301 | " GPU activities: 94.94% 273.86us 100 2.7380us 2.6880us 3.0720us vector_add(float*, float*, float*, int)\n", 302 | " 3.32% 9.5680us 2 4.7840us 4.6400us 4.9280us [CUDA memcpy HtoD]\n", 303 | " 1.74% 5.0240us 1 5.0240us 5.0240us 5.0240us [CUDA memcpy DtoH]\n", 304 | " API calls: 99.28% 177.51ms 3 59.169ms 3.0770us 177.50ms cudaMalloc\n", 305 | " 0.27% 486.52us 100 4.8650us 3.4350us 28.910us cudaLaunchKernel\n", 306 | " 0.20% 366.26us 1 366.26us 366.26us 366.26us cuDeviceTotalMem\n", 307 | " 0.09% 162.01us 101 1.6040us 137ns 65.314us cuDeviceGetAttribute\n", 308 | " 0.08% 137.57us 3 45.855us 4.3990us 122.37us cudaFree\n", 309 | " 0.06% 102.92us 3 34.307us 24.556us 47.219us cudaMemcpy\n", 310 | " 0.01% 24.120us 1 24.120us 24.120us 24.120us cuDeviceGetName\n", 311 | " 0.00% 4.4430us 1 4.4430us 4.4430us 4.4430us cuDeviceGetPCIBusId\n", 312 | " 0.00% 1.8130us 3 604ns 218ns 1.1550us cuDeviceGetCount\n", 313 | " 0.00% 1.1100us 2 555ns 198ns 912ns cuDeviceGet\n", 314 | " 0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid\n", 315 | "The Block size is 128.\n", 316 | "==187== NVPROF is profiling process 187, command: ./verctor_add_multi_blocks_thread 2\n", 317 | "out[0] = 3.140000\n", 318 | "PASSED\n", 319 | "==187== Profiling application: ./verctor_add_multi_blocks_thread 2\n", 320 | "==187== Profiling result:\n", 321 | " Type Time(%) Time Calls Avg Min Max 
Name\n", 322 | " GPU activities: 94.45% 251.05us 100 2.5100us 2.4640us 2.8800us vector_add(float*, float*, float*, int)\n", 323 | " 3.65% 9.6960us 2 4.8480us 4.6720us 5.0240us [CUDA memcpy HtoD]\n", 324 | " 1.90% 5.0560us 1 5.0560us 5.0560us 5.0560us [CUDA memcpy DtoH]\n", 325 | " API calls: 99.34% 180.00ms 3 60.002ms 2.7890us 180.00ms cudaMalloc\n", 326 | " 0.24% 429.38us 100 4.2930us 3.3800us 26.703us cudaLaunchKernel\n", 327 | " 0.20% 357.20us 1 357.20us 357.20us 357.20us cuDeviceTotalMem\n", 328 | " 0.08% 140.11us 101 1.3870us 141ns 59.152us cuDeviceGetAttribute\n", 329 | " 0.07% 121.41us 3 40.470us 24.773us 70.702us cudaMemcpy\n", 330 | " 0.06% 115.47us 3 38.490us 4.5820us 98.706us cudaFree\n", 331 | " 0.02% 29.831us 1 29.831us 29.831us 29.831us cuDeviceGetName\n", 332 | " 0.00% 6.1680us 1 6.1680us 6.1680us 6.1680us cuDeviceGetPCIBusId\n", 333 | " 0.00% 1.7640us 2 882ns 275ns 1.4890us cuDeviceGet\n", 334 | " 0.00% 1.4760us 3 492ns 208ns 888ns cuDeviceGetCount\n", 335 | " 0.00% 256ns 1 256ns 256ns 256ns cuDeviceGetUuid\n", 336 | "The Block size is 256.\n", 337 | "==198== NVPROF is profiling process 198, command: ./verctor_add_multi_blocks_thread 3\n", 338 | "out[0] = 3.140000\n", 339 | "PASSED\n", 340 | "==198== Profiling application: ./verctor_add_multi_blocks_thread 3\n", 341 | "==198== Profiling result:\n", 342 | " Type Time(%) Time Calls Avg Min Max Name\n", 343 | " GPU activities: 94.40% 246.92us 100 2.4690us 2.4320us 2.8160us vector_add(float*, float*, float*, int)\n", 344 | " 3.67% 9.6000us 2 4.8000us 4.6400us 4.9600us [CUDA memcpy HtoD]\n", 345 | " 1.93% 5.0560us 1 5.0560us 5.0560us 5.0560us [CUDA memcpy DtoH]\n", 346 | " API calls: 99.34% 178.61ms 3 59.537ms 3.3080us 178.60ms cudaMalloc\n", 347 | " 0.24% 439.19us 100 4.3910us 3.3750us 32.020us cudaLaunchKernel\n", 348 | " 0.20% 353.33us 1 353.33us 353.33us 353.33us cuDeviceTotalMem\n", 349 | " 0.08% 138.06us 101 1.3660us 134ns 58.617us cuDeviceGetAttribute\n", 350 | " 0.06% 114.41us 3 38.136us 4.2640us 
100.60us cudaFree\n", 351 | " 0.06% 104.06us 3 34.685us 26.218us 44.970us cudaMemcpy\n", 352 | " 0.02% 37.664us 1 37.664us 37.664us 37.664us cuDeviceGetName\n", 353 | " 0.00% 4.8210us 1 4.8210us 4.8210us 4.8210us cuDeviceGetPCIBusId\n", 354 | " 0.00% 1.7230us 3 574ns 212ns 1.1040us cuDeviceGetCount\n", 355 | " 0.00% 1.3420us 2 671ns 301ns 1.0410us cuDeviceGet\n", 356 | " 0.00% 286ns 1 286ns 286ns 286ns cuDeviceGetUuid\n" 357 | ], 358 | "name": "stdout" 359 | } 360 | ] 361 | } 362 | ] 363 | } 364 | -------------------------------------------------------------------------------- /Solution/Exercise_07.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_07.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "h-JwSwNW9QmT" 24 | }, 25 | "source": [ 26 | "\n", 27 | "# CUDA Exercise 07\n", 28 | "> You should try to implement your own solution for vector dot product, and try to parallelize the computation.\n", 29 | "\n", 30 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 31 | "\n", 32 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_07.ipynb)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "cOEai4hb95Ip" 39 | }, 40 | "source": [ 41 | "## Initialize the CUDA dev environment" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "bqmwwI7H5nDx" 48 | }, 49 | "source": [ 50 | "# clone the code repo,\n", 51 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 52 | "%load_ext nvcc_plugin" 53 | ], 54 | "execution_count": null, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "P2Zeyyo4_gNH" 61 | }, 62 | "source": [ 63 | "## Check the environment " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "N6PT4QpR6oxt" 70 | }, 71 | "source": [ 72 | "!lsb_release -a\n", 73 | "!nvcc --version\n", 74 | "!nvidia-smi" 75 | ], 76 | "execution_count": null, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "TF6KTYqE_n7H" 83 | }, 84 | "source": [ 85 | "## Naive approach of vector dot product" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "Ev5_BW1z80S3" 92 | }, 93 | "source": [ 94 | "%%writefile exercise01.cu\n", 95 | "#include \n", 96 | "#include \n", 97 | "\n", 98 | "#define MAX_ERR 0.1\n", 99 | "#define MULTI_TIMES_RUN 1\n", 100 | "\n", 101 | "__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length) \n", 102 | "{\n", 103 | " extern __shared__ float temp[];\n", 104 | " \n", 105 | " int index = threadIdx.x; // index offset of this thread\n", 106 | " int stride = blockDim.x; // stride step of each iteration\n", 107 | "\n", 108 | " // so if threadIdx.x=0, and blockDim.x=10,\n", 109 | " // then this 
thread is responsible for calculating temp[0], temp[10], temp[20]\n", 110 | " // similarly, the following thread will calculate temp[1], temp[11], temp[21]\n", 111 | " for(int i = index; i < vertor_length; i += stride)\n", 112 | " {\n", 113 | " temp[i] = vector_a[i] * vector_b[i];\n", 114 | " }\n", 115 | " \n", 116 | " __syncthreads(); // synchronize all threads\n", 117 | " \n", 118 | " // The accumulation only needs to happen at thread_0\n", 119 | " if (threadIdx.x == 0)\n", 120 | " {\n", 121 | " float sum = 0;\n", 122 | " for (int i = 0; i < vertor_length; i++)\n", 123 | " {\n", 124 | " sum += temp[i];\n", 125 | " }\n", 126 | " *result=sum;\n", 127 | " }\n", 128 | "}\n", 129 | "\n", 130 | "int main(int argc, char *argv[])\n", 131 | "{\n", 132 | " float *vector_a, *vector_b, *result;\n", 133 | " float *d_vector_a, *d_vector_b, *d_result;\n", 134 | " int list_of_thread_num[]={1,64,128,256,512,1024};\n", 135 | " int list_of_vector_length[]={100,200,1000,2000,10000};\n", 136 | " int thread_num = 1;\n", 137 | " int vector_length = 1000;\n", 138 | " \n", 139 | " if( argc == 3 ) {\n", 140 | " //printf(\"The argument supplied is %s\\n\", argv[1]);\n", 141 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 142 | " //atoi = ascii to int\n", 143 | " int arg2 = atoi(argv[2]); \n", 144 | " \n", 145 | " vector_length = list_of_vector_length[arg1];\n", 146 | " thread_num = list_of_thread_num[arg2];\n", 147 | " }\n", 148 | " else if( argc > 2 ) {\n", 149 | " printf(\"Too many arguments supplied.\\n\");\n", 150 | " }\n", 151 | " else {\n", 152 | " printf(\"Two arguments expected.\\n\");\n", 153 | " \n", 154 | " }\n", 155 | "\n", 156 | " // Allocate memory on CPU\n", 157 | " vector_a = (float*)malloc(sizeof(float) * vector_length);\n", 158 | " vector_b = (float*)malloc(sizeof(float) * vector_length);\n", 159 | " result = (float*)malloc(sizeof(float));\n", 160 | "\n", 161 | " // data initialization\n", 162 | " for(int i = 0; i < vector_length; i++)\n", 163 | " {\n", 164
| " vector_a[i] = 0.1f;\n", 165 | " vector_b[i] = 0.9f;\n", 166 | " }\n", 167 | "\n", 168 | " // Allocate memory on GPU\n", 169 | " cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);\n", 170 | " cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);\n", 171 | " cudaMalloc((void**)&d_result, sizeof(float));\n", 172 | "\n", 173 | " // copy operator to GPU\n", 174 | " cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 175 | " cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 176 | "\n", 177 | " // GPU do the work, CPU waits\n", 178 | "#if MULTI_TIMES_RUN\n", 179 | " for(int i=0; i< 10; i++)\n", 180 | " {\n", 181 | "#endif\n", 182 | " vector_dot_product<<<1,thread_num,sizeof(float) * vector_length>>>(d_result, d_vector_a, d_vector_b, vector_length);\n", 183 | "#if MULTI_TIMES_RUN\n", 184 | " }\n", 185 | " #endif\n", 186 | " \n", 187 | " // Get results from the GPU\n", 188 | " cudaMemcpy(result, d_result, sizeof(float), \n", 189 | " cudaMemcpyDeviceToHost);\n", 190 | " \n", 191 | " // Test the result\n", 192 | " //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);\n", 193 | " \n", 194 | " // you only need them for checking if the math is correct\n", 195 | " printf(\"result[0] = %f\\n\", result[0]);\n", 196 | " // printf(\"PASSED\\n\");\n", 197 | "\n", 198 | " // Free the memory\n", 199 | " cudaFree(d_vector_a);\n", 200 | " cudaFree(d_vector_b);\n", 201 | " cudaFree(d_result);\n", 202 | " free(vector_a);\n", 203 | " free(vector_b);\n", 204 | " free(result);\n", 205 | "\n", 206 | "}" 207 | ], 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "id": "Unl0xR2C_27V" 215 | }, 216 | "source": [ 217 | "## Optimized approach of vector dot product" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "ba05ukJC8AKq" 224 | }, 225 | "source": [ 226 | "%%writefile
exercise01.cu\n", 227 | "#include \n", 228 | "#include \n", 229 | "\n", 230 | "#define MAX_ERR 0.1\n", 231 | "#define MULTI_TIMES_RUN 1\n", 232 | "\n", 233 | "__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length) \n", 234 | "{\n", 235 | " extern __shared__ float temp[];\n", 236 | " \n", 237 | " int index = threadIdx.x; // index offset of this thread\n", 238 | " int stride = blockDim.x; // stride step of each iteration\n", 239 | "\n", 240 | " temp[threadIdx.x] = 0;\n", 241 | " for(int i = index; i < vertor_length; i += stride)\n", 242 | " {\n", 243 | " temp[threadIdx.x] = temp[threadIdx.x] + vector_a[i] * vector_b[i];\n", 244 | " }\n", 245 | " \n", 246 | " __syncthreads(); // synchronize all threads\n", 247 | " \n", 248 | " // The accumulation only needs to happen at thread_0\n", 249 | " if (threadIdx.x == 0)\n", 250 | " {\n", 251 | " float sum = 0;\n", 252 | " int thread_num = (vertor_length+blockDim.x)/blockDim.x;\n", 253 | " for (int i = 0; i < thread_num; i++)\n", 254 | " {\n", 255 | " sum += temp[i];\n", 256 | " }\n", 257 | " *result=sum;\n", 258 | " }\n", 259 | "}\n", 260 | "\n", 261 | "int main(int argc, char *argv[])\n", 262 | "{\n", 263 | " float *vector_a, *vector_b, *result;\n", 264 | " float *d_vector_a, *d_vector_b, *d_result;\n", 265 | " int list_of_thread_num[]={1,64,128,256,512,1024};\n", 266 | " int list_of_vector_length[]={100,200,1000,2000,10000};\n", 267 | " int thread_num = 1;\n", 268 | " int vector_length = 1000;\n", 269 | " \n", 270 | " if( argc == 3 ) {\n", 271 | " //printf(\"The arguments supplied are %s, %s\\n\", argv[1], argv[2]);\n", 272 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 273 | " //atoi = ascii to int\n", 274 | " int arg2 = atoi(argv[2]); \n", 275 | " \n", 276 | " vector_length = list_of_vector_length[arg1];\n", 277 | " thread_num = list_of_thread_num[arg2];\n", 278 | " }\n", 279 | " else if( argc > 2 ) {\n", 280 | " printf(\"Too many arguments 
supplied.\\n\");\n", 281 | " }\n", 282 | " else {\n", 283 | " printf(\"Two argument expected.\\n\");\n", 284 | " return 0;\n", 285 | " }\n", 286 | "\n", 287 | " // Allocate memory on CPU\n", 288 | " vector_a = (float*)malloc(sizeof(float) * vector_length);\n", 289 | " vector_b = (float*)malloc(sizeof(float) * vector_length);\n", 290 | " result = (float*)malloc(sizeof(float));\n", 291 | "\n", 292 | " // data initializtion\n", 293 | " for(int i = 0; i < vector_length; i++)\n", 294 | " {\n", 295 | " vector_a[i] = 0.1f;\n", 296 | " vector_b[i] = 0.9f;\n", 297 | " }\n", 298 | "\n", 299 | " // Allocate memory on GPU\n", 300 | " cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);\n", 301 | " cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);\n", 302 | " cudaMalloc((void**)&d_result, sizeof(float));\n", 303 | "\n", 304 | " // copy operator to GPU\n", 305 | " cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 306 | " cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 307 | "\n", 308 | " // GPU do the work, CPU waits\n", 309 | "#if MULTI_TIMES_RUN\n", 310 | " for(int i=0; i< 10; i++)\n", 311 | " {\n", 312 | "#endif\n", 313 | " vector_dot_product<<<1,thread_num,sizeof(float) * thread_num>>>(d_result, d_vector_a, d_vector_b, vector_length);\n", 314 | "#if MULTI_TIMES_RUN\n", 315 | " }\n", 316 | " #endif\n", 317 | " \n", 318 | " // Get results from the GPU\n", 319 | " cudaMemcpy(result, d_result, sizeof(float), \n", 320 | " cudaMemcpyDeviceToHost);\n", 321 | " \n", 322 | " // Test the result\n", 323 | " //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);\n", 324 | " \n", 325 | " // you only need them for checking if the math is correct\n", 326 | " printf(\"result[0] = %f\\n\", result[0]);\n", 327 | " // printf(\"PASSED\\n\");\n", 328 | "\n", 329 | " // Free the memory\n", 330 | " cudaFree(d_vector_a);\n", 331 | " cudaFree(d_vector_b);\n", 332 | " 
cudaFree(d_result);\n", 333 | " free(vector_a);\n", 334 | " free(vector_b);\n", 335 | " free(result);\n", 336 | "}" 337 | ], 338 | "execution_count": null, 339 | "outputs": [] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": { 344 | "id": "_BsEJesxACRz" 345 | }, 346 | "source": [ 347 | "## Evaluation to collect enough information for the benchmark" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "metadata": { 353 | "id": "CjisNLsazjUT" 354 | }, 355 | "source": [ 356 | "!nvcc -o exercise01 exercise01.cu\n", 357 | "!nvprof ./exercise01 0 0\n", 358 | "!nvprof ./exercise01 1 0\n", 359 | "!nvprof ./exercise01 2 0\n", 360 | "!nvprof ./exercise01 3 0\n", 361 | "!nvprof ./exercise01 4 0" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "J20hMfub0Tr2" 370 | }, 371 | "source": [ 372 | "!nvcc -o exercise01 exercise01.cu\n", 373 | "!nvprof ./exercise01 4 0\n", 374 | "!nvprof ./exercise01 4 1\n", 375 | "!nvprof ./exercise01 4 2\n", 376 | "!nvprof ./exercise01 4 3\n", 377 | "!nvprof ./exercise01 4 4" 378 | ], 379 | "execution_count": null, 380 | "outputs": [] 381 | } 382 | ] 383 | } 384 | -------------------------------------------------------------------------------- /Solution/Exercise_08.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_08.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "h-JwSwNW9QmT" 24 | }, 25 | "source": [ 26 | "\n", 27 | "# CUDA Exercise 08\n", 28 | "> You should try to implement your own solution for matrix vector multiplication, and
try to parallelize the computation.\n", 29 | "\n", 30 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 31 | "\n", 32 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_08.ipynb)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "cOEai4hb95Ip" 39 | }, 40 | "source": [ 41 | "## Initialize the CUDA dev environment" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "bqmwwI7H5nDx", 48 | "colab": { 49 | "base_uri": "https://localhost:8080/" 50 | }, 51 | "outputId": "df4692fa-1acf-4689-ce13-f59b43ead1f6" 52 | }, 53 | "source": [ 54 | "# clone the code repo,\n", 55 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 56 | "%load_ext nvcc_plugin" 57 | ], 58 | "execution_count": 1, 59 | "outputs": [ 60 | { 61 | "output_type": "stream", 62 | "text": [ 63 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 64 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-6ri04v_g\n", 65 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-6ri04v_g\n", 66 | "Building wheels for collected packages: NVCCPlugin\n", 67 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 68 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4d14ae8e1b5d4553791c7785ff742a5ca7908444bfa86c9a7f151acbb55ff62c\n", 69 | " Stored in directory: /tmp/pip-ephem-wheel-cache-83ylvme0/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 70 | "Successfully built NVCCPlugin\n", 71 | "Installing collected packages: NVCCPlugin\n", 72 | "Successfully installed NVCCPlugin-0.0.2\n", 73 | "Default out bin result.out\n" 74 | ], 75 | "name": "stdout" 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "P2Zeyyo4_gNH" 83 | }, 84 | "source": [ 85 | "## Check the environment " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "N6PT4QpR6oxt", 92 | "colab": { 93 | "base_uri": "https://localhost:8080/" 94 | }, 95 | "outputId": "5a8244ea-c4d9-44fe-ba52-f106a709938f" 96 | }, 97 | "source": [ 98 | "!lsb_release -a\n", 99 | "!nvcc --version\n", 100 | "!nvidia-smi" 101 | ], 102 | "execution_count": 2, 103 | "outputs": [ 104 | { 105 | "output_type": "stream", 106 | "text": [ 107 | "No LSB modules are available.\n", 108 | "Distributor ID:\tUbuntu\n", 109 | "Description:\tUbuntu 18.04.5 LTS\n", 110 | "Release:\t18.04\n", 111 | "Codename:\tbionic\n", 112 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 113 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 114 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 115 | "Cuda compilation tools, release 11.0, V11.0.221\n", 116 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 117 | "Sun Apr 25 20:46:45 2021 \n", 118 | "+-----------------------------------------------------------------------------+\n", 119 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 120 | "|-------------------------------+----------------------+----------------------+\n", 121 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
ECC |\n", 122 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 123 | "| | | MIG M. |\n", 124 | "|===============================+======================+======================|\n", 125 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 126 | "| N/A 49C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n", 127 | "| | | N/A |\n", 128 | "+-------------------------------+----------------------+----------------------+\n", 129 | " \n", 130 | "+-----------------------------------------------------------------------------+\n", 131 | "| Processes: |\n", 132 | "| GPU GI CI PID Type Process name GPU Memory |\n", 133 | "| ID ID Usage |\n", 134 | "|=============================================================================|\n", 135 | "| No running processes found |\n", 136 | "+-----------------------------------------------------------------------------+\n" 137 | ], 138 | "name": "stdout" 139 | } 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "TF6KTYqE_n7H" 146 | }, 147 | "source": [ 148 | "## Naive approach of matrix vector multiplication\n", 149 | "Try to optimize it, you can do much better!" 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "Ev5_BW1z80S3", 156 | "colab": { 157 | "base_uri": "https://localhost:8080/" 158 | }, 159 | "outputId": "afc6adc9-ccd6-46c1-9f1e-3b67a7e02e9f" 160 | }, 161 | "source": [ 162 | "%%writefile matrix_vector_multiplication.cu\n", 163 | "#include \n", 164 | "#include \n", 165 | "\n", 166 | "#define M 100\n", 167 | "#define N 100\n", 168 | "#define MAX_ERR 1e-4\n", 169 | "\n", 170 | "__global__ void matrix_vector_multiplication(float* vector_result, float *matrix_a, float *vector_b, int m_row, int n_col) \n", 171 | "{\n", 172 | " extern __shared__ float temp[];\n", 173 | " \n", 174 | " // blockIdx.x => which row\n", 175 | " // blockDim.x => row length\n", 176 | " // threadIdx.x => which element in this row\n", 177 | " \n", 178 | " // Unique tid which can index each single element in the matrix\n", 179 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 180 | "\n", 181 | " // the condiction logic make sure, we only do the calculation in the matrix space\n", 182 | " int size_of_the_matrix = m_row*n_col;\n", 183 | " if(tid>>(d_vector_result, d_martix_a, d_martix_b, M, N);\n", 238 | " \n", 239 | " // Get results from the GPU\n", 240 | " cudaMemcpy(vector_result, d_vector_result, sizeof(float) * M, cudaMemcpyDeviceToHost);\n", 241 | " \n", 242 | " // Test the result\n", 243 | " for(int i = 0; i < M; i++)\n", 244 | " {\n", 245 | " float temp_sum =0;\n", 246 | " for(int j = 0; j < N; j++)\n", 247 | " {\n", 248 | " int index = i*N+j;\n", 249 | " temp_sum = temp_sum + martix_a[index]*martix_b[j]; \n", 250 | " }\n", 251 | " //printf(\"out[%d]: %f, %f\\n\", i, temp_sum, vector_result[i]);\n", 252 | " \n", 253 | " assert(fabs(vector_result[i] - temp_sum) < MAX_ERR);\n", 254 | " }\n", 255 | " printf(\"PASSED\\n\");\n", 256 | "\n", 257 | " // Free the memory\n", 258 | " cudaFree(d_martix_a);\n", 259 | " cudaFree(d_martix_b);\n", 260 | " cudaFree(d_vector_result);\n", 261 | " free(martix_a);\n", 262 
| " free(martix_b);\n", 263 | " free(vector_result);\n", 264 | " \n", 265 | " return 0;\n", 266 | "}" 267 | ], 268 | "execution_count": 4, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "text": [ 273 | "Overwriting matrix_vector_multiplication.cu\n" 274 | ], 275 | "name": "stdout" 276 | } 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "id": "_BsEJesxACRz" 283 | }, 284 | "source": [ 285 | "## Evaluation to collect enough information for the benchmark" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "CjisNLsazjUT", 292 | "colab": { 293 | "base_uri": "https://localhost:8080/" 294 | }, 295 | "outputId": "8f37c2cd-23aa-42c1-ff7f-0e1ac2572987" 296 | }, 297 | "source": [ 298 | "!nvcc -o matrix_vector_multiplication matrix_vector_multiplication.cu\n", 299 | "!nvprof ./matrix_vector_multiplication 0 0\n", 300 | "!nvprof ./matrix_vector_multiplication 1 0\n", 301 | "!nvprof ./matrix_vector_multiplication 2 0\n", 302 | "!nvprof ./matrix_vector_multiplication 3 0\n", 303 | "!nvprof ./matrix_vector_multiplication 4 0" 304 | ], 305 | "execution_count": 5, 306 | "outputs": [ 307 | { 308 | "output_type": "stream", 309 | "text": [ 310 | "==166== NVPROF is profiling process 166, command: ./matrix_vector_multiplication 0 0\n", 311 | "PASSED\n", 312 | "==166== Profiling application: ./matrix_vector_multiplication 0 0\n", 313 | "==166== Profiling result:\n", 314 | " Type Time(%) Time Calls Avg Min Max Name\n", 315 | " GPU activities: 53.11% 9.5670us 1 9.5670us 9.5670us 9.5670us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 316 | " 35.17% 6.3360us 2 3.1680us 1.4080us 4.9280us [CUDA memcpy HtoD]\n", 317 | " 11.72% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memcpy DtoH]\n", 318 | " API calls: 98.77% 317.58ms 3 105.86ms 3.9820us 317.57ms cudaMalloc\n", 319 | " 0.98% 3.1590ms 1 3.1590ms 3.1590ms 3.1590ms cuDeviceGetPCIBusId\n", 320 | " 0.11% 360.55us 1 360.55us 360.55us 360.55us 
cuDeviceTotalMem\n", 321 | " 0.05% 146.39us 101 1.4490us 140ns 62.080us cuDeviceGetAttribute\n", 322 | " 0.04% 138.31us 3 46.102us 4.1920us 125.39us cudaFree\n", 323 | " 0.02% 66.555us 3 22.185us 12.925us 32.811us cudaMemcpy\n", 324 | " 0.01% 34.868us 1 34.868us 34.868us 34.868us cudaLaunchKernel\n", 325 | " 0.01% 30.340us 1 30.340us 30.340us 30.340us cuDeviceGetName\n", 326 | " 0.00% 1.6340us 2 817ns 288ns 1.3460us cuDeviceGet\n", 327 | " 0.00% 1.3770us 3 459ns 237ns 803ns cuDeviceGetCount\n", 328 | " 0.00% 276ns 1 276ns 276ns 276ns cuDeviceGetUuid\n", 329 | "==177== NVPROF is profiling process 177, command: ./matrix_vector_multiplication 1 0\n", 330 | "PASSED\n", 331 | "==177== Profiling application: ./matrix_vector_multiplication 1 0\n", 332 | "==177== Profiling result:\n", 333 | " Type Time(%) Time Calls Avg Min Max Name\n", 334 | " GPU activities: 52.54% 9.5990us 1 9.5990us 9.5990us 9.5990us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 335 | " 36.08% 6.5920us 2 3.2960us 1.6320us 4.9600us [CUDA memcpy HtoD]\n", 336 | " 11.38% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n", 337 | " API calls: 99.60% 180.65ms 3 60.217ms 3.5120us 180.64ms cudaMalloc\n", 338 | " 0.19% 346.94us 1 346.94us 346.94us 346.94us cuDeviceTotalMem\n", 339 | " 0.08% 144.39us 101 1.4290us 139ns 57.541us cuDeviceGetAttribute\n", 340 | " 0.06% 108.45us 3 36.150us 4.3680us 94.815us cudaFree\n", 341 | " 0.04% 65.119us 3 21.706us 12.926us 31.528us cudaMemcpy\n", 342 | " 0.02% 30.745us 1 30.745us 30.745us 30.745us cudaLaunchKernel\n", 343 | " 0.01% 25.721us 1 25.721us 25.721us 25.721us cuDeviceGetName\n", 344 | " 0.00% 7.3610us 1 7.3610us 7.3610us 7.3610us cuDeviceGetPCIBusId\n", 345 | " 0.00% 1.7910us 3 597ns 247ns 894ns cuDeviceGetCount\n", 346 | " 0.00% 1.3040us 2 652ns 326ns 978ns cuDeviceGet\n", 347 | " 0.00% 259ns 1 259ns 259ns 259ns cuDeviceGetUuid\n", 348 | "==188== NVPROF is profiling process 188, command: ./matrix_vector_multiplication 2 0\n", 349 | 
"PASSED\n", 350 | "==188== Profiling application: ./matrix_vector_multiplication 2 0\n", 351 | "==188== Profiling result:\n", 352 | " Type Time(%) Time Calls Avg Min Max Name\n", 353 | " GPU activities: 53.20% 9.5680us 1 9.5680us 9.5680us 9.5680us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 354 | " 35.05% 6.3040us 2 3.1520us 1.4080us 4.8960us [CUDA memcpy HtoD]\n", 355 | " 11.74% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memcpy DtoH]\n", 356 | " API calls: 99.59% 179.94ms 3 59.980ms 3.5560us 179.93ms cudaMalloc\n", 357 | " 0.20% 360.13us 1 360.13us 360.13us 360.13us cuDeviceTotalMem\n", 358 | " 0.08% 138.49us 101 1.3710us 140ns 56.464us cuDeviceGetAttribute\n", 359 | " 0.06% 105.28us 3 35.094us 4.6640us 91.544us cudaFree\n", 360 | " 0.04% 73.811us 3 24.603us 13.530us 32.582us cudaMemcpy\n", 361 | " 0.02% 34.857us 1 34.857us 34.857us 34.857us cuDeviceGetName\n", 362 | " 0.01% 25.780us 1 25.780us 25.780us 25.780us cudaLaunchKernel\n", 363 | " 0.00% 5.4220us 1 5.4220us 5.4220us 5.4220us cuDeviceGetPCIBusId\n", 364 | " 0.00% 1.3330us 3 444ns 198ns 723ns cuDeviceGetCount\n", 365 | " 0.00% 1.1720us 2 586ns 299ns 873ns cuDeviceGet\n", 366 | " 0.00% 254ns 1 254ns 254ns 254ns cuDeviceGetUuid\n", 367 | "==199== NVPROF is profiling process 199, command: ./matrix_vector_multiplication 3 0\n", 368 | "PASSED\n", 369 | "==199== Profiling application: ./matrix_vector_multiplication 3 0\n", 370 | "==199== Profiling result:\n", 371 | " Type Time(%) Time Calls Avg Min Max Name\n", 372 | " GPU activities: 53.21% 9.5680us 1 9.5680us 9.5680us 9.5680us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 373 | " 35.23% 6.3350us 2 3.1670us 1.4070us 4.9280us [CUDA memcpy HtoD]\n", 374 | " 11.57% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n", 375 | " API calls: 99.58% 178.32ms 3 59.438ms 3.5200us 178.31ms cudaMalloc\n", 376 | " 0.20% 351.78us 1 351.78us 351.78us 351.78us cuDeviceTotalMem\n", 377 | " 0.08% 140.43us 3 46.810us 4.5390us 
127.32us cudaFree\n", 378 | " 0.08% 134.77us 101 1.3340us 137ns 57.331us cuDeviceGetAttribute\n", 379 | " 0.03% 62.196us 3 20.732us 14.455us 27.225us cudaMemcpy\n", 380 | " 0.02% 29.932us 1 29.932us 29.932us 29.932us cuDeviceGetName\n", 381 | " 0.01% 24.688us 1 24.688us 24.688us 24.688us cudaLaunchKernel\n", 382 | " 0.00% 4.7770us 1 4.7770us 4.7770us 4.7770us cuDeviceGetPCIBusId\n", 383 | " 0.00% 1.3490us 3 449ns 200ns 842ns cuDeviceGetCount\n", 384 | " 0.00% 1.1530us 2 576ns 254ns 899ns cuDeviceGet\n", 385 | " 0.00% 253ns 1 253ns 253ns 253ns cuDeviceGetUuid\n", 386 | "==210== NVPROF is profiling process 210, command: ./matrix_vector_multiplication 4 0\n", 387 | "PASSED\n", 388 | "==210== Profiling application: ./matrix_vector_multiplication 4 0\n", 389 | "==210== Profiling result:\n", 390 | " Type Time(%) Time Calls Avg Min Max Name\n", 391 | " GPU activities: 53.37% 9.6320us 1 9.6320us 9.6320us 9.6320us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 392 | " 34.93% 6.3040us 2 3.1520us 1.3760us 4.9280us [CUDA memcpy HtoD]\n", 393 | " 11.70% 2.1110us 1 2.1110us 2.1110us 2.1110us [CUDA memcpy DtoH]\n", 394 | " API calls: 99.58% 177.20ms 3 59.067ms 3.6720us 177.19ms cudaMalloc\n", 395 | " 0.21% 366.81us 1 366.81us 366.81us 366.81us cuDeviceTotalMem\n", 396 | " 0.08% 136.60us 101 1.3520us 138ns 57.487us cuDeviceGetAttribute\n", 397 | " 0.06% 110.76us 3 36.921us 4.7200us 96.591us cudaFree\n", 398 | " 0.03% 61.611us 3 20.537us 13.440us 27.097us cudaMemcpy\n", 399 | " 0.02% 29.652us 1 29.652us 29.652us 29.652us cuDeviceGetName\n", 400 | " 0.02% 28.677us 1 28.677us 28.677us 28.677us cudaLaunchKernel\n", 401 | " 0.00% 6.0860us 1 6.0860us 6.0860us 6.0860us cuDeviceGetPCIBusId\n", 402 | " 0.00% 1.2890us 3 429ns 247ns 732ns cuDeviceGetCount\n", 403 | " 0.00% 1.1320us 2 566ns 280ns 852ns cuDeviceGet\n", 404 | " 0.00% 257ns 1 257ns 257ns 257ns cuDeviceGetUuid\n" 405 | ], 406 | "name": "stdout" 407 | } 408 | ] 409 | } 410 | ] 411 | } 
-------------------------------------------------------------------------------- /Solution/Exercise_09.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_09.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "_BsEJesxACRz" 10 | ] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "h-JwSwNW9QmT" 26 | }, 27 | "source": [ 28 | "\n", 29 | "# CUDA Exercise 09\n", 30 | "> You should try to implement your own solution for matrix multiplication, and try to parallelize the computation.\n", 31 | "\n", 32 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 33 | "\n", 34 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_09.ipynb)\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "cOEai4hb95Ip" 41 | }, 42 | "source": [ 43 | "## Initialize the CUDA dev environment" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "bqmwwI7H5nDx", 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "outputId": "d963df04-926f-400a-d0e4-2878c4a03198" 54 | }, 55 | "source": [ 56 | "# clone the code repo,\n", 57 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 58 | "%load_ext nvcc_plugin" 59 | ], 60 | "execution_count": 1, 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "text": [ 65 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 66 | " Cloning 
git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-9uosm_fy\n", 67 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-9uosm_fy\n", 68 | "Building wheels for collected packages: NVCCPlugin\n", 69 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 70 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=b2c7f0347c89a0d2f434e28ded0da15c6996ef06e1885e654b7568adf563eff6\n", 71 | " Stored in directory: /tmp/pip-ephem-wheel-cache-kkvx15za/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 72 | "Successfully built NVCCPlugin\n", 73 | "Installing collected packages: NVCCPlugin\n", 74 | "Successfully installed NVCCPlugin-0.0.2\n", 75 | "Default out bin result.out\n" 76 | ], 77 | "name": "stdout" 78 | } 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "id": "P2Zeyyo4_gNH" 85 | }, 86 | "source": [ 87 | "## Check the environment " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "N6PT4QpR6oxt", 94 | "colab": { 95 | "base_uri": "https://localhost:8080/" 96 | }, 97 | "outputId": "0d20d71f-ff18-4f85-a6e5-8c90e1f97a8a" 98 | }, 99 | "source": [ 100 | "!lsb_release -a\n", 101 | "!nvcc --version\n", 102 | "!nvidia-smi" 103 | ], 104 | "execution_count": 2, 105 | "outputs": [ 106 | { 107 | "output_type": "stream", 108 | "text": [ 109 | "No LSB modules are available.\n", 110 | "Distributor ID:\tUbuntu\n", 111 | "Description:\tUbuntu 18.04.5 LTS\n", 112 | "Release:\t18.04\n", 113 | "Codename:\tbionic\n", 114 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 115 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 116 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 117 | "Cuda compilation tools, release 11.0, V11.0.221\n", 118 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 119 | "Mon Apr 26 21:01:30 2021 \n", 120 | "+-----------------------------------------------------------------------------+\n", 121 | 
"| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 122 | "|-------------------------------+----------------------+----------------------+\n", 123 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 124 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 125 | "| | | MIG M. |\n", 126 | "|===============================+======================+======================|\n", 127 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 128 | "| N/A 40C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 129 | "| | | N/A |\n", 130 | "+-------------------------------+----------------------+----------------------+\n", 131 | " \n", 132 | "+-----------------------------------------------------------------------------+\n", 133 | "| Processes: |\n", 134 | "| GPU GI CI PID Type Process name GPU Memory |\n", 135 | "| ID ID Usage |\n", 136 | "|=============================================================================|\n", 137 | "| No running processes found |\n", 138 | "+-----------------------------------------------------------------------------+\n" 139 | ], 140 | "name": "stdout" 141 | } 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "TF6KTYqE_n7H" 148 | }, 149 | "source": [ 150 | "## Matrix Multiplication - Implimentation 01" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "id": "Ev5_BW1z80S3", 157 | "colab": { 158 | "base_uri": "https://localhost:8080/" 159 | }, 160 | "outputId": "286f06f4-0014-49b2-ac34-21640ade8975" 161 | }, 162 | "source": [ 163 | "%%writefile matrix_mul_01.cu\n", 164 | "// %%cu\n", 165 | "#include \n", 166 | "\n", 167 | "__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){\n", 168 | " int matrix_c_element = 0;\n", 169 | " for (int i = 0; i < matrix_a_column; i++){\n", 170 | " matrix_c_element += 
matrix_a[(threadIdx.x/matrix_b_column)*matrix_a_column+i] * matrix_b[threadIdx.x%matrix_b_column+i*matrix_b_column];\n", 171 | " }\n", 172 | " matrix_c[threadIdx.x]= matrix_c_element;\n", 173 | "}\n", 174 | "\n", 175 | "int main(int argc, char *argv[]){\n", 176 | " \n", 177 | " //===========================================================================\n", 178 | " // Below, there are three example case, which you should only uncomment one\n", 179 | " // of them, to run the test.\n", 180 | " /* Example 1\n", 181 | " int matrix_a[16] = {5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};\n", 182 | " int matrix_a_row = 4;\n", 183 | " int matrix_a_column = 4;\n", 184 | " int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};\n", 185 | " int matrix_b_row = 4;\n", 186 | " int matrix_b_column = 4;\n", 187 | " */\n", 188 | "\n", 189 | " /* Example 2\n", 190 | " int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};\n", 191 | " int matrix_a_row = 4;\n", 192 | " int matrix_a_column = 3;\n", 193 | " int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};\n", 194 | " int matrix_b_row = 3;\n", 195 | " int matrix_b_column = 5;\n", 196 | " */\n", 197 | "\n", 198 | " // random initialization of larger matrixes\n", 199 | " // matrix_a_row * matrix_b_column <= 1024\n", 200 | " int matrix_a_row = 50;\n", 201 | " int matrix_a_column = 30;\n", 202 | " int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));\n", 203 | " for(int i = 0; i < matrix_a_row; i++){\n", 204 | " for(int j = 0; j < matrix_a_column; j++)\n", 205 | " {\n", 206 | " int index = i * matrix_a_column+j;\n", 207 | " matrix_a[index] = 1;\n", 208 | " }\n", 209 | " }\n", 210 | " int matrix_b_row = 30;\n", 211 | " int matrix_b_column = 20;\n", 212 | " int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));\n", 213 | " for(int i = 0; i < matrix_b_row; i++){\n", 214 | " for(int j = 0; j < matrix_b_column; j++)\n", 215 | " {\n", 216 | " int index = i * matrix_b_column+j;\n", 217 | 
" matrix_b[index] = 2;\n", 218 | " }\n", 219 | " }\n", 220 | "\n", 221 | " //===========================================================================\n", 222 | "\n", 223 | " int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));\n", 224 | " int *d_matrix_a, *d_matrix_b, *d_matrix_c;\n", 225 | " \n", 226 | " cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));\n", 227 | " cudaMalloc((void**)&d_matrix_b,sizeof(int) * (matrix_b_row * matrix_b_column));\n", 228 | " cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));\n", 229 | "\n", 230 | " cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);\n", 231 | " cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);\n", 232 | "\n", 233 | " // implement 100 times for getting average execution time\n", 234 | " for(int i=0; i<100;i++){\n", 235 | " matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 236 | " }\n", 237 | "\n", 238 | " cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);\n", 239 | "\n", 240 | " // print matrix_c to check correction\n", 241 | " for(int i = 0; i < matrix_a_row; i++){\n", 242 | " for(int j = 0; j < matrix_b_column; j++){\n", 243 | " int index = i * matrix_b_column +j;\n", 244 | " printf(\"%d, \",matrix_c[index]);\n", 245 | " }\n", 246 | " printf(\"\\n\");\n", 247 | " }\n", 248 | " cudaDeviceSynchronize();\n", 249 | "\n", 250 | " cudaFree(d_matrix_c);\n", 251 | " cudaFree(d_matrix_b);\n", 252 | " cudaFree(d_matrix_a);\n", 253 | "\n", 254 | " return 0;\n", 255 | "}" 256 | ], 257 | "execution_count": 3, 258 | "outputs": [ 259 | { 260 | "output_type": "stream", 261 | "text": [ 262 | "Writing matrix_mul_01.cu\n" 263 | ], 264 | "name": "stdout" 265 | } 266 | ] 267 | }, 268 | { 269 | 
"cell_type": "markdown", 270 | "metadata": { 271 | "id": "_BsEJesxACRz" 272 | }, 273 | "source": [ 274 | "## Evaluation to collect enough information for the benchmark" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "CjisNLsazjUT", 281 | "colab": { 282 | "base_uri": "https://localhost:8080/" 283 | }, 284 | "outputId": "ab265330-1331-44b7-a3ae-15f5334c006a" 285 | }, 286 | "source": [ 287 | "!nvcc -o matrix_mul_01 matrix_mul_01.cu\n", 288 | "!nvprof ./matrix_mul_01\n" 289 | ], 290 | "execution_count": 4, 291 | "outputs": [ 292 | { 293 | "output_type": "stream", 294 | "text": [ 295 | "==165== NVPROF is profiling process 165, command: ./matrix_mul_01\n", 296 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 297 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 298 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 299 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 300 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 301 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 302 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 303 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 304 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 305 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 306 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 307 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 308 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 309 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 310 | "60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 311 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 312 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 313 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 314 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 315 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 316 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 317 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 318 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 319 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 320 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 321 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 322 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 323 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 324 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 325 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 326 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 327 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 328 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 329 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 330 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 331 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, \n", 332 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 333 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 334 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 335 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 336 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 337 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 338 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 339 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 340 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 341 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 342 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 343 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 344 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 345 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 346 | "==165== Profiling application: ./matrix_mul_01\n", 347 | "==165== Profiling result:\n", 348 | " Type Time(%) Time Calls Avg Min Max Name\n", 349 | " GPU activities: 99.40% 1.1060ms 100 11.060us 10.944us 11.360us matrix_mul(int*, int*, int*, int, int, int)\n", 350 | " 0.35% 3.9360us 2 1.9680us 1.6320us 2.3040us [CUDA memcpy HtoD]\n", 351 | " 0.25% 2.7840us 1 2.7840us 2.7840us 2.7840us [CUDA memcpy DtoH]\n", 352 | " API calls: 99.30% 339.36ms 3 113.12ms 3.2070us 339.35ms cudaMalloc\n", 353 | " 0.23% 779.42us 3 259.81us 10.308us 744.60us cudaMemcpy\n", 354 | " 0.14% 492.00us 100 4.9200us 3.7570us 35.725us cudaLaunchKernel\n", 355 | " 0.12% 400.87us 1 400.87us 400.87us 400.87us 
cuDeviceGetPCIBusId\n", 356 | " 0.11% 369.10us 1 369.10us 369.10us 369.10us cuDeviceTotalMem\n", 357 | " 0.06% 193.36us 101 1.9140us 144ns 76.246us cuDeviceGetAttribute\n", 358 | " 0.04% 131.28us 3 43.758us 4.4320us 115.95us cudaFree\n", 359 | " 0.01% 30.707us 1 30.707us 30.707us 30.707us cuDeviceGetName\n", 360 | " 0.00% 7.8160us 1 7.8160us 7.8160us 7.8160us cudaDeviceSynchronize\n", 361 | " 0.00% 1.9930us 3 664ns 216ns 1.3860us cuDeviceGetCount\n", 362 | " 0.00% 1.7150us 2 857ns 202ns 1.5130us cuDeviceGet\n", 363 | " 0.00% 295ns 1 295ns 295ns 295ns cuDeviceGetUuid\n" 364 | ], 365 | "name": "stdout" 366 | } 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "4LefKVzj4VUV" 373 | }, 374 | "source": [ 375 | "## Matrix Multiplication - Implimentation 02" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "id": "YZvzZt8d4UpL", 382 | "colab": { 383 | "base_uri": "https://localhost:8080/" 384 | }, 385 | "outputId": "77b64349-167d-4632-87a7-f1f8055b7afd" 386 | }, 387 | "source": [ 388 | "%%writefile matrix_mul_02.cu\n", 389 | "//%%cu\n", 390 | "#include \n", 391 | "\n", 392 | "__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){\n", 393 | " int matrix_c_element = 0;\n", 394 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 395 | " for (int i = 0; i < matrix_a_column; i++){\n", 396 | " matrix_c_element += matrix_a[(tid/matrix_b_column)*matrix_a_column+i] * matrix_b[tid%matrix_b_column+i*matrix_b_column];\n", 397 | " }\n", 398 | " matrix_c[tid]= matrix_c_element;\n", 399 | "}\n", 400 | "\n", 401 | "int main(int argc, char *argv[]){\n", 402 | " \n", 403 | " //===========================================================================\n", 404 | " // Below, there are three example case, which you should only uncomment one\n", 405 | " // of them, to run the test.\n", 406 | "\n", 407 | " /* Example 1\n", 408 | " int matrix_a[16] = 
{5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};\n", 409 | " int matrix_a_row = 4;\n", 410 | " int matrix_a_column = 4;\n", 411 | " int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};\n", 412 | " int matrix_b_row = 4;\n", 413 | " int matrix_b_column = 4;\n", 414 | " */\n", 415 | " \n", 416 | " /* Example 2\n", 417 | " int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};\n", 418 | " int matrix_a_row = 4;\n", 419 | " int matrix_a_column = 3;\n", 420 | " int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};\n", 421 | " int matrix_b_row = 3;\n", 422 | " int matrix_b_column = 5;\n", 423 | " */\n", 424 | " \n", 425 | " \n", 426 | " // random initialization of larger matrixes\n", 427 | " // matrix_a_row as number of blocks\n", 428 | " // matrix_b_column as number of threads per block\n", 429 | " int matrix_a_row = 50;\n", 430 | " int matrix_a_column = 30;\n", 431 | " int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));\n", 432 | " for(int i = 0; i < matrix_a_row; i++){\n", 433 | " for(int j = 0; j < matrix_a_column; j++)\n", 434 | " {\n", 435 | " int index = i * matrix_a_column+j;\n", 436 | " matrix_a[index] = 1;\n", 437 | " }\n", 438 | " }\n", 439 | " int matrix_b_row = 30;\n", 440 | " int matrix_b_column = 20;\n", 441 | " int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));\n", 442 | " for(int i = 0; i < matrix_b_row; i++){\n", 443 | " for(int j = 0; j < matrix_b_column; j++)\n", 444 | " {\n", 445 | " int index = i * matrix_b_column+j;\n", 446 | " matrix_b[index] = 2;\n", 447 | " }\n", 448 | " }\n", 449 | " //===========================================================================\n", 450 | "\n", 451 | "\n", 452 | " int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));\n", 453 | " int *d_matrix_a, *d_matrix_b, *d_matrix_c;\n", 454 | " \n", 455 | " cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));\n", 456 | " cudaMalloc((void**)&d_matrix_b,sizeof(int) 
* (matrix_b_row * matrix_b_column));\n", 457 | " cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));\n", 458 | "\n", 459 | " cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);\n", 460 | " cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);\n", 461 | "\n", 462 | " // implement 100 times for getting average execution time\n", 463 | " for(int i=0; i<100;i++){\n", 464 | " matrix_mul<<>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 465 | " \n", 466 | " //for comparison with 01.cu\n", 467 | " //matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 468 | " }\n", 469 | "\n", 470 | " cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);\n", 471 | "\n", 472 | " // print matrix_c to check correction\n", 473 | " for(int i = 0; i < matrix_a_row; i++){\n", 474 | " for(int j = 0; j < matrix_b_column; j++){\n", 475 | " int index = i * matrix_b_column +j;\n", 476 | " printf(\"%d, \",matrix_c[index]);\n", 477 | " }\n", 478 | " printf(\"\\n\");\n", 479 | " }\n", 480 | " cudaDeviceSynchronize();\n", 481 | "\n", 482 | " cudaFree(d_matrix_c);\n", 483 | " cudaFree(d_matrix_b);\n", 484 | " cudaFree(d_matrix_a);\n", 485 | "\n", 486 | " return 0;\n", 487 | "}" 488 | ], 489 | "execution_count": 5, 490 | "outputs": [ 491 | { 492 | "output_type": "stream", 493 | "text": [ 494 | "Writing matrix_mul_02.cu\n" 495 | ], 496 | "name": "stdout" 497 | } 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "id": "AKNir-yF_F_8" 504 | }, 505 | "source": [ 506 | "## Evaluation to collect enough information for the benchmark" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "metadata": { 512 | "id": "s61EVRmqQ0RF", 513 | "colab": { 514 | "base_uri": 
"https://localhost:8080/" 515 | }, 516 | "outputId": "8a411a83-bcd3-4549-fd3b-a87119bf81c3" 517 | }, 518 | "source": [ 519 | "!nvcc -o matrix_mul_02 matrix_mul_02.cu\n", 520 | "!nvprof ./matrix_mul_02" 521 | ], 522 | "execution_count": 6, 523 | "outputs": [ 524 | { 525 | "output_type": "stream", 526 | "text": [ 527 | "==209== NVPROF is profiling process 209, command: ./matrix_mul_02\n", 528 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 529 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 530 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 531 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 532 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 533 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 534 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 535 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 536 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 537 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 538 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 539 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 540 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 541 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 542 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 543 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 544 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 545 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 546 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 547 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 548 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 549 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 550 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 551 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 552 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 553 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 554 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 555 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 556 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 557 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 558 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 559 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 560 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 561 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 562 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 563 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 564 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 565 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 566 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 567 | "60, 60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 568 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 569 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 570 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 571 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 572 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 573 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 574 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 575 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 576 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 577 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 578 | "==209== Profiling application: ./matrix_mul_02\n", 579 | "==209== Profiling result:\n", 580 | " Type Time(%) Time Calls Avg Min Max Name\n", 581 | " GPU activities: 98.67% 526.42us 100 5.2640us 5.1830us 5.6000us matrix_mul(int*, int*, int*, int, int, int)\n", 582 | " 0.78% 4.1600us 2 2.0800us 1.6320us 2.5280us [CUDA memcpy HtoD]\n", 583 | " 0.55% 2.9120us 1 2.9120us 2.9120us 2.9120us [CUDA memcpy DtoH]\n", 584 | " API calls: 99.44% 256.98ms 3 85.659ms 3.3400us 256.97ms cudaMalloc\n", 585 | " 0.21% 536.21us 100 5.3620us 3.9960us 34.504us cudaLaunchKernel\n", 586 | " 0.15% 376.96us 1 376.96us 376.96us 376.96us cuDeviceTotalMem\n", 587 | " 0.07% 184.83us 3 61.610us 10.349us 156.72us cudaMemcpy\n", 588 | " 0.06% 157.59us 3 52.528us 3.3410us 145.02us cudaFree\n", 589 | " 0.06% 154.20us 101 1.5260us 143ns 69.320us cuDeviceGetAttribute\n", 590 | " 0.01% 28.938us 1 28.938us 28.938us 28.938us cuDeviceGetName\n", 591 | " 0.00% 7.2200us 1 7.2200us 7.2200us 7.2200us cuDeviceGetPCIBusId\n", 592 | 
" 0.00% 6.6720us 1 6.6720us 6.6720us 6.6720us cudaDeviceSynchronize\n", 593 | " 0.00% 1.8660us 3 622ns 220ns 1.2690us cuDeviceGetCount\n", 594 | " 0.00% 1.6280us 2 814ns 338ns 1.2900us cuDeviceGet\n", 595 | " 0.00% 294ns 1 294ns 294ns 294ns cuDeviceGetUuid\n" 596 | ], 597 | "name": "stdout" 598 | } 599 | ] 600 | } 601 | ] 602 | } --------------------------------------------------------------------------------