├── .gitignore ├── LICENSE ├── README.md └── Solution ├── Exercise_01.ipynb ├── Exercise_02.ipynb ├── Exercise_03.ipynb ├── Exercise_04.ipynb ├── Exercise_05.ipynb ├── Exercise_06.ipynb ├── Exercise_07.ipynb ├── Exercise_08.ipynb └── Exercise_09.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | .vscode/* 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SuperChange001 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA_Learning 2 | This is my hobby project, for preparing the FPGA RTX interface. 3 | -------------------------------------------------------------------------------- /Solution/Exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_01.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 01\n", 26 | "> This is Hello World exampel! \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_01.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "504e7952-c8c0-4e92-cb59-cdd013efed4d" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-7rots3_w\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-7rots3_w\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=6a39638fb6e744f8984e03b1347b0b350bec4c23334d79c353a9e16c0981cae4\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-11ej3wib/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 20:52:22 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 49C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Hello World" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "0cc15126-0b1b-4c93-d9de-3e91acb770ce" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "__global__ void cuda_hello(){\n", 135 | " printf(\"Hello World from GPU!\\n\");\n", 136 | "}\n", 137 | "\n", 138 | "int main() {\n", 139 | " cuda_hello<<<1,1>>>();\n", 140 | " cudaDeviceSynchronize();\n", 141 | "\n", 142 | " return 0;\n", 143 | "}" 144 | ], 145 | "execution_count": 3, 146 | "outputs": [ 147 | { 148 | "output_type": "execute_result", 149 | "data": { 150 | "application/vnd.google.colaboratory.intrinsic+json": { 151 | "type": "string" 152 | }, 153 | "text/plain": [ 154 | "'Hello World from GPU!\\n'" 155 | ] 156 | }, 157 | "metadata": { 158 | "tags": [] 159 | }, 160 | "execution_count": 3 161 | } 162 | ] 163 | } 164 | ] 165 | } 166 | 
-------------------------------------------------------------------------------- /Solution/Exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_02.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 02\n", 26 | "> Vector add example with CPU and GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_02.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "300c7939-3fac-4eaf-bbfe-1d3641a779f4" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " 
Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-22k37xu7\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-22k37xu7\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=bc12d7017a71a934fd7d39e61241824922d949a1086f514170ffd209c2dc57b5\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-4zyegsxi/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:04:18 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 65C P8 11W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Vector Add" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "9254f1bb-2518-4300-f124-5754b0674021" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000 \n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " out[i] = a[i] + b[i];\n", 145 | " }\n", 146 | "}\n", 147 | "\n", 148 | "int main()\n", 149 | "{\n", 150 | " float *a, *b, *out;\n", 151 | " float *d_a, *d_b, *d_out; \n", 152 | "\n", 153 | " //===================步骤1===================\n", 154 | " // Allocate memory on CPU\n", 155 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 157 | " out = (float*)malloc(sizeof(float) * 
VECTOR_LENGTH);\n", 158 | "\n", 159 | " // data initializtion\n", 160 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 161 | " {\n", 162 | " a[i] = 3.0f;\n", 163 | " b[i] = 0.14f;\n", 164 | " }\n", 165 | " //===================步骤1===================\n", 166 | "\n", 167 | " //===================步骤2===================\n", 168 | " // Allocate memory on GPU\n", 169 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 170 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 171 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 172 | " //===================步骤2===================\n", 173 | "\n", 174 | " //===================步骤3===================\n", 175 | " // copy operator to GPU\n", 176 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 177 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 178 | " //===================步骤3===================\n", 179 | "\n", 180 | " //===================步骤4===================\n", 181 | " // GPU do the work, CPU waits\n", 182 | " vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 183 | " //===================步骤4===================\n", 184 | "\n", 185 | " //===================步骤5===================\n", 186 | " // Get results from the GPU\n", 187 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 188 | " cudaMemcpyDeviceToHost);\n", 189 | " \n", 190 | " // Test the result\n", 191 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 192 | " {\n", 193 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 194 | " }\n", 195 | " printf(\"out[0] is %f\\n\", out[0]);\n", 196 | " printf(\"PASSED\\n\");\n", 197 | " //===================步骤5===================\n", 198 | "\n", 199 | " //===================步骤6===================\n", 200 | " // Free the memory\n", 201 | " cudaFree(d_a);\n", 202 | " cudaFree(d_b);\n", 203 | " cudaFree(d_out);\n", 204 | " free(a);\n", 205 | " free(b);\n", 206 | " free(out);\n", 207 | " 
//===================步骤6===================\n", 208 | "}" 209 | ], 210 | "execution_count": 2, 211 | "outputs": [ 212 | { 213 | "output_type": "execute_result", 214 | "data": { 215 | "application/vnd.google.colaboratory.intrinsic+json": { 216 | "type": "string" 217 | }, 218 | "text/plain": [ 219 | "'out[0] is 3.140000\\nPASSED\\n'" 220 | ] 221 | }, 222 | "metadata": { 223 | "tags": [] 224 | }, 225 | "execution_count": 2 226 | } 227 | ] 228 | } 229 | ] 230 | } -------------------------------------------------------------------------------- /Solution/Exercise_03.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_03.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 03\n", 26 | "> Vector dot product(inner product) example on GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_03.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "3e28c708-a18e-40de-a57c-6f8b1b6b08ee" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 1, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-dcn3mih6\n", 68 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-dcn3mih6\n", 69 | "Building wheels for collected packages: NVCCPlugin\n", 70 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 71 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=502f57f1df304061f8b68db3c23567f7917f40794f6bdf2e09e21eef86af5570\n", 72 | " Stored in directory: /tmp/pip-ephem-wheel-cache-mk6amdyq/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 73 | "Successfully built NVCCPlugin\n", 74 | "Installing collected packages: NVCCPlugin\n", 75 | "Successfully installed NVCCPlugin-0.0.2\n", 76 | "Default out bin result.out\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:12:57 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 41C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Vector Dot Production" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "8662228f-7eec-4158-a27b-5a9934e35909" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "#include \n", 135 | "\n", 136 | "#define VECTOR_LENGTH 10 \n", 137 | "#define MAX_ERR 1e-5\n", 138 | "\n", 139 | "__global__ void vector_dot_product(float *out, float *a, float *b, int n) \n", 140 | "{\n", 141 | " float sum=0;\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " sum = sum + a[i] * b[i];\n", 145 | " }\n", 146 | " *out = sum;\n", 147 | "}\n", 148 | "\n", 149 | "void test_vector_dot_product(void)\n", 150 | "{\n", 151 | " float *a, *b, *out;\n", 152 | " float *d_a, *d_b, *d_out; \n", 153 | "\n", 154 | " // Allocate memory on CPU\n", 155 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 157 | " out = 
(float*)malloc(sizeof(float));\n", 158 | "\n", 159 | " // data initializtion\n", 160 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 161 | " {\n", 162 | " a[i] = 3.14f;\n", 163 | " b[i] = 2.0f;\n", 164 | " }\n", 165 | "\n", 166 | " // Allocate memory on GPU\n", 167 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 168 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 169 | " cudaMalloc((void**)&d_out, sizeof(float));\n", 170 | "\n", 171 | " // copy operator to GPU\n", 172 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 173 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 174 | "\n", 175 | " // GPU do the work, CPU waits\n", 176 | " vector_dot_product<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 177 | " \n", 178 | " // Get results from the GPU\n", 179 | " cudaMemcpy(out, d_out, sizeof(float), \n", 180 | " cudaMemcpyDeviceToHost);\n", 181 | " \n", 182 | " // Test the result\n", 183 | " assert(fabs(*out - 20*3.14) < MAX_ERR);\n", 184 | " \n", 185 | " printf(\"out[0] = %f\\n\", out[0]);\n", 186 | " printf(\"PASSED\\n\");\n", 187 | "\n", 188 | " // Free the memory\n", 189 | " cudaFree(d_a);\n", 190 | " cudaFree(d_b);\n", 191 | " cudaFree(d_out);\n", 192 | " free(a);\n", 193 | " free(b);\n", 194 | " free(out);\n", 195 | "}\n", 196 | "\n", 197 | "int main()\n", 198 | "{\n", 199 | " test_vector_dot_product();\n", 200 | "}" 201 | ], 202 | "execution_count": 2, 203 | "outputs": [ 204 | { 205 | "output_type": "execute_result", 206 | "data": { 207 | "application/vnd.google.colaboratory.intrinsic+json": { 208 | "type": "string" 209 | }, 210 | "text/plain": [ 211 | "'out[0] = 62.799995\\nPASSED\\n'" 212 | ] 213 | }, 214 | "metadata": { 215 | "tags": [] 216 | }, 217 | "execution_count": 2 218 | } 219 | ] 220 | } 221 | ] 222 | } -------------------------------------------------------------------------------- /Solution/Exercise_04.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_04.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | }, 16 | "accelerator": "GPU" 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "gZABpep_V-8C" 23 | }, 24 | "source": [ 25 | "# CUDA Exercise 04\n", 26 | "> Matrix summation example on GPU, only applied with single thread. \n", 27 | "\n", 28 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 29 | "\n", 30 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_04.ipynb)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": { 36 | "id": "P401L2N_WG6R" 37 | }, 38 | "source": [ 39 | "## Initialize the CUDA dev environment" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "colab": { 46 | "base_uri": "https://localhost:8080/" 47 | }, 48 | "id": "OONoNFZeV63L", 49 | "outputId": "0de4afb8-9a85-42aa-ec39-d9ab1bc7f898" 50 | }, 51 | "source": [ 52 | "# clone the code repo,\n", 53 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 54 | "%load_ext nvcc_plugin\n", 55 | "\n", 56 | "# Check the environment \n", 57 | "!lsb_release -a\n", 58 | "!nvcc --version\n", 59 | "!nvidia-smi" 60 | ], 61 | "execution_count": 4, 62 | "outputs": [ 63 | { 64 | "output_type": "stream", 65 | "text": [ 66 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 67 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-t778hzfn\n", 68 | " Running command git clone 
-q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-t778hzfn\n", 69 | "Requirement already satisfied (use --upgrade to upgrade): NVCCPlugin==0.0.2 from git+git://github.com/depctg/nvcc4jupyter.git in /usr/local/lib/python3.7/dist-packages\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=8873f16186676dbca7cd6b1588c46f86c101f2a8cdd29b38b813a2ca468ed8f7\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-yr5jb27e/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "The nvcc_plugin extension is already loaded. To reload it, use:\n", 76 | " %reload_ext nvcc_plugin\n", 77 | "No LSB modules are available.\n", 78 | "Distributor ID:\tUbuntu\n", 79 | "Description:\tUbuntu 18.04.5 LTS\n", 80 | "Release:\t18.04\n", 81 | "Codename:\tbionic\n", 82 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 83 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 84 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 85 | "Cuda compilation tools, release 11.0, V11.0.221\n", 86 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 87 | "Thu Apr 22 21:19:33 2021 \n", 88 | "+-----------------------------------------------------------------------------+\n", 89 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 90 | "|-------------------------------+----------------------+----------------------+\n", 91 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 92 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 93 | "| | | MIG M. 
|\n", 94 | "|===============================+======================+======================|\n", 95 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 96 | "| N/A 38C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 97 | "| | | N/A |\n", 98 | "+-------------------------------+----------------------+----------------------+\n", 99 | " \n", 100 | "+-----------------------------------------------------------------------------+\n", 101 | "| Processes: |\n", 102 | "| GPU GI CI PID Type Process name GPU Memory |\n", 103 | "| ID ID Usage |\n", 104 | "|=============================================================================|\n", 105 | "| No running processes found |\n", 106 | "+-----------------------------------------------------------------------------+\n" 107 | ], 108 | "name": "stdout" 109 | } 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "id": "DDN2x4izW0rO" 116 | }, 117 | "source": [ 118 | "## Matrix Summation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "metadata": { 124 | "colab": { 125 | "base_uri": "https://localhost:8080/", 126 | "height": 35 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "1e81177e-dd74-421e-ce4e-74b74936b818" 130 | }, 131 | "source": [ 132 | "%%cu\n", 133 | "#include \n", 134 | "#include \n", 135 | "\n", 136 | "#define M 10\n", 137 | "#define N 10\n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void matrix_summation(float* out, float *a, float *b, int m, int n) \n", 141 | "{\n", 142 | " int index;\n", 143 | " for(int i = 0; i < m; i++)\n", 144 | " {\n", 145 | " for(int j = 0; j < n; j++)\n", 146 | " {\n", 147 | " index = i*n+j;\n", 148 | " out[index] = a[index] + b[index];\n", 149 | " }\n", 150 | " }\n", 151 | "}\n", 152 | "\n", 153 | "int main()\n", 154 | "{\n", 155 | " float *a, *b, *out;\n", 156 | " float *d_a, *d_b, *d_out;\n", 157 | " \n", 158 | " a = (float*)malloc(sizeof(float) * (M * N));\n", 159 | " b = (float*)malloc(sizeof(float) * (M * N));\n", 160 | " 
out = (float*)malloc(sizeof(float) * (M * N));\n", 161 | "\n", 162 | " // data initializtion\n", 163 | " for(int i = 0; i < M; i++)\n", 164 | " {\n", 165 | " for(int j = 0; j < N; j++)\n", 166 | " {\n", 167 | " int index = i*N+j;\n", 168 | " a[index] = i*3.14f;\n", 169 | " b[index] = j;\n", 170 | " }\n", 171 | " }\n", 172 | " printf(\"a[12] = %f\\n\", a[12]);\n", 173 | " printf(\"b[12] = %f\\n\", b[12]);\n", 174 | "\n", 175 | " // Allocate memory on GPU\n", 176 | " cudaMalloc((void**)&d_a, sizeof(float) * (M * N));\n", 177 | " cudaMalloc((void**)&d_b, sizeof(float) * (M * N));\n", 178 | " cudaMalloc((void**)&d_out, sizeof(float) * (M * N));\n", 179 | "\n", 180 | " // copy operator to GPU\n", 181 | " cudaMemcpy(d_a, a, sizeof(float) * (M * N), cudaMemcpyHostToDevice);\n", 182 | " cudaMemcpy(d_b, b, sizeof(float) * (M * N), cudaMemcpyHostToDevice);\n", 183 | "\n", 184 | " // GPU do the work, CPU waits\n", 185 | " matrix_summation<<<1,1>>>(d_out, d_a, d_b, M, N);\n", 186 | " \n", 187 | " // Get results from the GPU\n", 188 | " cudaMemcpy(out, d_out, sizeof(float) * (M * N), \n", 189 | " cudaMemcpyDeviceToHost);\n", 190 | " \n", 191 | " // Test the result\n", 192 | " for(int i = 0; i < M; i++)\n", 193 | " {\n", 194 | " for(int j = 0; j < N; j++)\n", 195 | " {\n", 196 | " int index = i*N+j;\n", 197 | " assert(fabs(out[index] - a[index] - b[index]) < MAX_ERR);\n", 198 | " }\n", 199 | " }\n", 200 | " printf(\"out[12] = %f\\n\", out[12]);\n", 201 | " printf(\"PASSED\\n\");\n", 202 | " \n", 203 | " cudaDeviceSynchronize();\n", 204 | " // Free the memory\n", 205 | " cudaFree(d_a);\n", 206 | " cudaFree(d_b);\n", 207 | " cudaFree(d_out);\n", 208 | " free(a);\n", 209 | " free(b);\n", 210 | " free(out);\n", 211 | " \n", 212 | " return 0;\n", 213 | "}" 214 | ], 215 | "execution_count": 5, 216 | "outputs": [ 217 | { 218 | "output_type": "execute_result", 219 | "data": { 220 | "application/vnd.google.colaboratory.intrinsic+json": { 221 | "type": "string" 222 | }, 223 | 
"text/plain": [ 224 | "'a[12] = 3.140000\\nb[12] = 2.000000\\nout[12] = 5.140000\\nPASSED\\n'" 225 | ] 226 | }, 227 | "metadata": { 228 | "tags": [] 229 | }, 230 | "execution_count": 5 231 | } 232 | ] 233 | } 234 | ] 235 | } 236 | -------------------------------------------------------------------------------- /Solution/Exercise_05.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_05.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "gZABpep_V-8C" 24 | }, 25 | "source": [ 26 | "# CUDA Exercise 05\n", 27 | "> Parallelized Vector add. \n", 28 | "\n", 29 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 30 | "\n", 31 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_05.ipynb)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "P401L2N_WG6R" 38 | }, 39 | "source": [ 40 | "## Initialize the CUDA dev environment" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "OONoNFZeV63L", 50 | "outputId": "e15d11f8-6c0f-43b7-b60e-675822ac8794" 51 | }, 52 | "source": [ 53 | "# clone the code repo,\n", 54 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 55 | "%load_ext nvcc_plugin\n", 56 | "\n", 57 | "# Check the environment \n", 58 | "!lsb_release -a\n", 59 | "!nvcc --version\n", 60 | "!nvidia-smi" 61 | ], 62 | "execution_count": 1, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 68 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-2r93udvh\n", 69 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-2r93udvh\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4010fe33cb0bdc3a44bc6c4d10aea34076d9daf8c6daec21c1a1544f0ab1b3f4\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-y67t9ubh/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "Installing collected packages: NVCCPlugin\n", 76 | "Successfully installed NVCCPlugin-0.0.2\n", 77 | "Default out bin result.out\n", 78 | "No LSB modules are available.\n", 79 | "Distributor ID:\tUbuntu\n", 80 | "Description:\tUbuntu 18.04.5 LTS\n", 81 | "Release:\t18.04\n", 82 | "Codename:\tbionic\n", 83 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 84 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 85 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 86 | "Cuda compilation tools, release 11.0, V11.0.221\n", 87 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 88 | "Thu Apr 22 21:31:17 2021 \n", 89 | "+-----------------------------------------------------------------------------+\n", 90 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 91 | "|-------------------------------+----------------------+----------------------+\n", 92 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 93 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 94 | "| | | MIG M. 
|\n", 95 | "|===============================+======================+======================|\n", 96 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 97 | "| N/A 50C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 98 | "| | | N/A |\n", 99 | "+-------------------------------+----------------------+----------------------+\n", 100 | " \n", 101 | "+-----------------------------------------------------------------------------+\n", 102 | "| Processes: |\n", 103 | "| GPU GI CI PID Type Process name GPU Memory |\n", 104 | "| ID ID Usage |\n", 105 | "|=============================================================================|\n", 106 | "| No running processes found |\n", 107 | "+-----------------------------------------------------------------------------+\n" 108 | ], 109 | "name": "stdout" 110 | } 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "DDN2x4izW0rO" 117 | }, 118 | "source": [ 119 | "## Vector Add with Single Thread" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "7d95b219-bf9e-4f5f-e443-e6219f453dd0" 130 | }, 131 | "source": [ 132 | "%%writefile verctor_add_signal_thread.cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000 \n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " for(int i = 0; i < n; i++)\n", 143 | " {\n", 144 | " out[i] = a[i] + b[i];\n", 145 | " }\n", 146 | "}\n", 147 | "\n", 148 | "int main()\n", 149 | "{\n", 150 | " float *a, *b, *out;\n", 151 | " float *d_a, *d_b, *d_out; \n", 152 | "\n", 153 | " // Allocate memory on CPU\n", 154 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 155 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 156 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 
157 | "\n", 158 | " // data initializtion\n", 159 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 160 | " {\n", 161 | " a[i] = 3.0f;\n", 162 | " b[i] = 0.14f;\n", 163 | " }\n", 164 | "\n", 165 | " // Allocate memory on GPU\n", 166 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 167 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 168 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 169 | "\n", 170 | " // copy operator to GPU\n", 171 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 172 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 173 | "\n", 174 | " for(int i=0;i<100;i++)\n", 175 | " {\n", 176 | " // GPU do the work, CPU waits\n", 177 | " vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 178 | " }\n", 179 | " // Get results from the GPU\n", 180 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 181 | " cudaMemcpyDeviceToHost);\n", 182 | " \n", 183 | " // Test the result\n", 184 | " for(int i = 0; i < VECTOR_LENGTH; i++){\n", 185 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 186 | " }\n", 187 | " printf(\"out[0] = %f\\n\", out[0]);\n", 188 | " printf(\"PASSED\\n\");\n", 189 | "\n", 190 | " // Free the memory\n", 191 | " cudaFree(d_a);\n", 192 | " cudaFree(d_b);\n", 193 | " cudaFree(d_out);\n", 194 | " free(a);\n", 195 | " free(b);\n", 196 | " free(out);\n", 197 | "}" 198 | ], 199 | "execution_count": 2, 200 | "outputs": [ 201 | { 202 | "output_type": "stream", 203 | "text": [ 204 | "Writing verctor_add_signal_thread.cu\n" 205 | ], 206 | "name": "stdout" 207 | } 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": { 213 | "id": "TZI-nXBxefbc" 214 | }, 215 | "source": [ 216 | "## Vector Add with Multiple Threads" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "metadata": { 222 | "colab": { 223 | "base_uri": "https://localhost:8080/" 224 | }, 225 | "id": "cxsC_CQRen43", 226 | "outputId": 
"184cc2e3-eea7-4731-df27-2fd165c475b6" 227 | }, 228 | "source": [ 229 | "%%writefile verctor_add_multi_thread.cu\n", 230 | "\n", 231 | "#include \n", 232 | "#include \n", 233 | "#include \n", 234 | "\n", 235 | "#define VECTOR_LENGTH 10000\n", 236 | "#define MAX_ERR 1e-4\n", 237 | "\n", 238 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 239 | "{\n", 240 | " int index = threadIdx.x;\n", 241 | " int stride = blockDim.x;\n", 242 | " for(int i = index; i < n; i=i+stride)\n", 243 | " {\n", 244 | " out[i] = a[i] + b[i];\n", 245 | " }\n", 246 | "}\n", 247 | "\n", 248 | "int main(int argc, char *argv[])\n", 249 | "{\n", 250 | " float *a, *b, *out;\n", 251 | " float *d_a, *d_b, *d_out;\n", 252 | " int list_of_test_block_size[]={1,64,128,256,512,1024};\n", 253 | " int block_size = 1;\n", 254 | " \n", 255 | " if( argc == 2 ) {\n", 256 | " //printf(\"The argument supplied is %s\\n\", argv[1]);\n", 257 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 258 | " //atoi = ascii to int\n", 259 | " \n", 260 | " block_size = list_of_test_block_size[arg1];\n", 261 | " }\n", 262 | " else if( argc > 2 ) {\n", 263 | " printf(\"Too many arguments supplied.\\n\");\n", 264 | " }\n", 265 | " else {\n", 266 | " printf(\"One argument expected.\\n\");\n", 267 | " \n", 268 | " }\n", 269 | " \n", 270 | " printf(\"The Block size is %d.\\n\", block_size);\n", 271 | "\n", 272 | " // Allocate memory on CPU\n", 273 | " a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 274 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 275 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 276 | "\n", 277 | " // data initializtion\n", 278 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 279 | " {\n", 280 | " a[i] = 3.0f;\n", 281 | " b[i] = 0.14f;\n", 282 | " }\n", 283 | "\n", 284 | " // Allocate memory on GPU\n", 285 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 286 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 287 | " 
cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 288 | "\n", 289 | "    // copy operator to GPU\n", 290 | "    cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 291 | "    cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 292 | "\n", 293 | "    for(int i=0;i<100;i++)\n", 294 | "    {\n", 295 | "        // GPU do the work, CPU waits\n", 296 | "        vector_add<<<1,block_size>>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 297 | "    }\n", 298 | "    // Get results from the GPU\n", 299 | "    cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 300 | "               cudaMemcpyDeviceToHost);\n", 301 | "    \n", 302 | "    // Test the result\n", 303 | "    for(int i = 0; i < VECTOR_LENGTH; i++){\n", 304 | "        assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 305 | "    }\n", 306 | "    printf(\"out[0] = %f\\n\", out[0]);\n", 307 | "    printf(\"PASSED\\n\");\n", 308 | "\n", 309 | "    // Free the memory\n", 310 | "    cudaFree(d_a);\n", 311 | "    cudaFree(d_b);\n", 312 | "    cudaFree(d_out);\n", 313 | "    free(a);\n", 314 | "    free(b);\n", 315 | "    free(out);\n", 316 | "}" 317 | ], 318 | "execution_count": 3, 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "text": [ 323 | "Writing verctor_add_multi_thread.cu\n" 324 | ], 325 | "name": "stdout" 326 | } 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "id": "d9Zw1YvsewRK" 333 | }, 334 | "source": [ 335 | "## Evaluation" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "id": "q4j_yDKhfHzv" 342 | }, 343 | "source": [ 344 | "Measuring the time cost of executing the CUDA function with a **single thread**" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "metadata": { 350 | "colab": { 351 | "base_uri": "https://localhost:8080/" 352 | }, 353 | "id": "uOfZmgUxezqF", 354 | "outputId": "5be83f69-6a78-4e48-f87e-83b28c36aac1" 355 | }, 356 | "source": [ 357 | "!nvcc -o verctor_add_signal_thread verctor_add_signal_thread.cu\n", 358 | "!nvprof 
./verctor_add_signal_thread" 359 | ], 360 | "execution_count": 4, 361 | "outputs": [ 362 | { 363 | "output_type": "stream", 364 | "text": [ 365 | "==166== NVPROF is profiling process 166, command: ./verctor_add_signal_thread\n", 366 | "out[0] = 3.140000\n", 367 | "PASSED\n", 368 | "==166== Profiling application: ./verctor_add_signal_thread\n", 369 | "==166== Profiling result:\n", 370 | " Type Time(%) Time Calls Avg Min Max Name\n", 371 | " GPU activities: 99.99% 118.76ms 100 1.1876ms 1.1875ms 1.1882ms vector_add(float*, float*, float*, int)\n", 372 | " 0.01% 9.6960us 2 4.8480us 4.7040us 4.9920us [CUDA memcpy HtoD]\n", 373 | " 0.00% 5.1840us 1 5.1840us 5.1840us 5.1840us [CUDA memcpy DtoH]\n", 374 | " API calls: 72.18% 312.33ms 3 104.11ms 2.8630us 312.32ms cudaMalloc\n", 375 | " 27.39% 118.53ms 3 39.510ms 27.121us 118.47ms cudaMemcpy\n", 376 | " 0.14% 603.24us 1 603.24us 603.24us 603.24us cuDeviceGetPCIBusId\n", 377 | " 0.11% 481.38us 100 4.8130us 3.4180us 35.589us cudaLaunchKernel\n", 378 | " 0.08% 356.39us 1 356.39us 356.39us 356.39us cuDeviceTotalMem\n", 379 | " 0.04% 182.81us 101 1.8100us 133ns 86.635us cuDeviceGetAttribute\n", 380 | " 0.04% 170.18us 3 56.725us 4.5810us 145.75us cudaFree\n", 381 | " 0.01% 28.980us 1 28.980us 28.980us 28.980us cuDeviceGetName\n", 382 | " 0.00% 1.5750us 2 787ns 328ns 1.2470us cuDeviceGet\n", 383 | " 0.00% 1.4200us 3 473ns 232ns 861ns cuDeviceGetCount\n", 384 | " 0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid\n" 385 | ], 386 | "name": "stdout" 387 | } 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": { 393 | "id": "ch5mhas6fIZd" 394 | }, 395 | "source": [ 396 | "Measuring the time cost of executing the CUDA fucntion with **multi-threads**" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "metadata": { 402 | "colab": { 403 | "base_uri": "https://localhost:8080/" 404 | }, 405 | "id": "0wc1X6ZCFAVo", 406 | "outputId": "948c0e1c-a491-4173-9a76-e0bfcb291db5" 407 | }, 408 | "source": [ 409 | "!nvcc -o 
verctor_add_multi_thread verctor_add_multi_thread.cu\n", 410 | "!nvprof ./verctor_add_multi_thread 0\n", 411 | "!nvprof ./verctor_add_multi_thread 1\n", 412 | "!nvprof ./verctor_add_multi_thread 2\n", 413 | "!nvprof ./verctor_add_multi_thread 3\n", 414 | "!nvprof ./verctor_add_multi_thread 4\n", 415 | "!nvprof ./verctor_add_multi_thread 5" 416 | ], 417 | "execution_count": 5, 418 | "outputs": [ 419 | { 420 | "output_type": "stream", 421 | "text": [ 422 | "The Block size is 1.\n", 423 | "==210== NVPROF is profiling process 210, command: ./verctor_add_multi_thread 0\n", 424 | "out[0] = 3.140000\n", 425 | "PASSED\n", 426 | "==210== Profiling application: ./verctor_add_multi_thread 0\n", 427 | "==210== Profiling result:\n", 428 | " Type Time(%) Time Calls Avg Min Max Name\n", 429 | " GPU activities: 99.99% 110.00ms 100 1.1000ms 1.0999ms 1.1004ms vector_add(float*, float*, float*, int)\n", 430 | " 0.01% 9.4400us 2 4.7200us 4.5760us 4.8640us [CUDA memcpy HtoD]\n", 431 | " 0.00% 5.1520us 1 5.1520us 5.1520us 5.1520us [CUDA memcpy DtoH]\n", 432 | " API calls: 67.94% 235.85ms 3 78.615ms 2.9820us 235.84ms cudaMalloc\n", 433 | " 31.64% 109.82ms 3 36.607ms 25.740us 109.77ms cudaMemcpy\n", 434 | " 0.15% 533.49us 1 533.49us 533.49us 533.49us cuDeviceTotalMem\n", 435 | " 0.13% 448.80us 100 4.4880us 3.4570us 33.193us cudaLaunchKernel\n", 436 | " 0.07% 230.29us 3 76.761us 5.3020us 199.20us cudaFree\n", 437 | " 0.06% 193.50us 101 1.9150us 184ns 79.100us cuDeviceGetAttribute\n", 438 | " 0.01% 33.101us 1 33.101us 33.101us 33.101us cuDeviceGetName\n", 439 | " 0.00% 5.6790us 1 5.6790us 5.6790us 5.6790us cuDeviceGetPCIBusId\n", 440 | " 0.00% 2.1510us 3 717ns 201ns 1.4490us cuDeviceGetCount\n", 441 | " 0.00% 1.7380us 2 869ns 272ns 1.4660us cuDeviceGet\n", 442 | " 0.00% 427ns 1 427ns 427ns 427ns cuDeviceGetUuid\n", 443 | "The Block size is 64.\n", 444 | "==221== NVPROF is profiling process 221, command: ./verctor_add_multi_thread 1\n", 445 | "out[0] = 3.140000\n", 446 | "PASSED\n", 447 | 
"==221== Profiling application: ./verctor_add_multi_thread 1\n", 448 | "==221== Profiling result:\n", 449 | " Type Time(%) Time Calls Avg Min Max Name\n", 450 | " GPU activities: 99.66% 3.4722ms 100 34.722us 34.624us 34.945us vector_add(float*, float*, float*, int)\n", 451 | " 0.20% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 452 | " 0.13% 4.5760us 1 4.5760us 4.5760us 4.5760us [CUDA memcpy DtoH]\n", 453 | " API calls: 97.83% 193.30ms 3 64.432ms 3.3410us 193.29ms cudaMalloc\n", 454 | " 1.61% 3.1743ms 3 1.0581ms 24.151us 3.1191ms cudaMemcpy\n", 455 | " 0.24% 475.21us 100 4.7520us 3.4350us 29.099us cudaLaunchKernel\n", 456 | " 0.17% 341.94us 1 341.94us 341.94us 341.94us cuDeviceTotalMem\n", 457 | " 0.07% 145.35us 101 1.4390us 137ns 61.921us cuDeviceGetAttribute\n", 458 | " 0.06% 123.14us 3 41.048us 4.7850us 107.26us cudaFree\n", 459 | " 0.01% 26.334us 1 26.334us 26.334us 26.334us cuDeviceGetName\n", 460 | " 0.00% 5.1070us 1 5.1070us 5.1070us 5.1070us cuDeviceGetPCIBusId\n", 461 | " 0.00% 1.7040us 2 852ns 311ns 1.3930us cuDeviceGet\n", 462 | " 0.00% 1.5310us 3 510ns 195ns 746ns cuDeviceGetCount\n", 463 | " 0.00% 288ns 1 288ns 288ns 288ns cuDeviceGetUuid\n", 464 | "The Block size is 128.\n", 465 | "==232== NVPROF is profiling process 232, command: ./verctor_add_multi_thread 2\n", 466 | "out[0] = 3.140000\n", 467 | "PASSED\n", 468 | "==232== Profiling application: ./verctor_add_multi_thread 2\n", 469 | "==232== Profiling result:\n", 470 | " Type Time(%) Time Calls Avg Min Max Name\n", 471 | " GPU activities: 99.37% 1.8371ms 100 18.371us 18.176us 18.880us vector_add(float*, float*, float*, int)\n", 472 | " 0.39% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 473 | " 0.25% 4.5760us 1 4.5760us 4.5760us 4.5760us [CUDA memcpy DtoH]\n", 474 | " API calls: 98.53% 178.54ms 3 59.513ms 3.2840us 178.53ms cudaMalloc\n", 475 | " 0.85% 1.5462ms 3 515.39us 24.255us 1.4941ms cudaMemcpy\n", 476 | " 0.26% 462.22us 100 4.6220us 3.5560us 25.245us 
cudaLaunchKernel\n", 477 | " 0.19% 342.18us 1 342.18us 342.18us 342.18us cuDeviceTotalMem\n", 478 | " 0.08% 150.30us 101 1.4880us 137ns 61.771us cuDeviceGetAttribute\n", 479 | " 0.07% 124.56us 3 41.519us 4.7580us 104.88us cudaFree\n", 480 | " 0.01% 27.142us 1 27.142us 27.142us 27.142us cuDeviceGetName\n", 481 | " 0.00% 6.0030us 1 6.0030us 6.0030us 6.0030us cuDeviceGetPCIBusId\n", 482 | " 0.00% 1.7140us 3 571ns 232ns 991ns cuDeviceGetCount\n", 483 | " 0.00% 1.1770us 2 588ns 302ns 875ns cuDeviceGet\n", 484 | " 0.00% 292ns 1 292ns 292ns 292ns cuDeviceGetUuid\n", 485 | "The Block size is 256.\n", 486 | "==243== NVPROF is profiling process 243, command: ./verctor_add_multi_thread 3\n", 487 | "out[0] = 3.140000\n", 488 | "PASSED\n", 489 | "==243== Profiling application: ./verctor_add_multi_thread 3\n", 490 | "==243== Profiling result:\n", 491 | " Type Time(%) Time Calls Avg Min Max Name\n", 492 | " GPU activities: 98.87% 1.0244ms 100 10.244us 9.9200us 11.136us vector_add(float*, float*, float*, int)\n", 493 | " 0.69% 7.1360us 2 3.5680us 3.4560us 3.6800us [CUDA memcpy HtoD]\n", 494 | " 0.44% 4.6080us 1 4.6080us 4.6080us 4.6080us [CUDA memcpy DtoH]\n", 495 | " API calls: 98.98% 180.43ms 3 60.143ms 3.3970us 180.42ms cudaMalloc\n", 496 | " 0.38% 692.31us 3 230.77us 23.738us 639.01us cudaMemcpy\n", 497 | " 0.27% 500.12us 100 5.0010us 3.6400us 26.479us cudaLaunchKernel\n", 498 | " 0.20% 367.77us 1 367.77us 367.77us 367.77us cuDeviceTotalMem\n", 499 | " 0.08% 145.39us 101 1.4390us 146ns 60.433us cuDeviceGetAttribute\n", 500 | " 0.07% 121.86us 3 40.621us 4.3540us 106.28us cudaFree\n", 501 | " 0.02% 32.412us 1 32.412us 32.412us 32.412us cuDeviceGetName\n", 502 | " 0.00% 4.7100us 1 4.7100us 4.7100us 4.7100us cuDeviceGetPCIBusId\n", 503 | " 0.00% 1.4430us 3 481ns 196ns 847ns cuDeviceGetCount\n", 504 | " 0.00% 1.1370us 2 568ns 297ns 840ns cuDeviceGet\n", 505 | " 0.00% 288ns 1 288ns 288ns 288ns cuDeviceGetUuid\n", 506 | "The Block size is 512.\n", 507 | "==256== NVPROF is profiling 
process 256, command: ./verctor_add_multi_thread 4\n", 508 | "out[0] = 3.140000\n", 509 | "PASSED\n", 510 | "==256== Profiling application: ./verctor_add_multi_thread 4\n", 511 | "==256== Profiling result:\n", 512 | " Type Time(%) Time Calls Avg Min Max Name\n", 513 | " GPU activities: 98.65% 1.0801ms 100 10.801us 10.592us 11.296us vector_add(float*, float*, float*, int)\n", 514 | " 0.88% 9.6640us 2 4.8320us 4.7040us 4.9600us [CUDA memcpy HtoD]\n", 515 | " 0.46% 5.0880us 1 5.0880us 5.0880us 5.0880us [CUDA memcpy DtoH]\n", 516 | " API calls: 98.92% 184.62ms 3 61.541ms 2.6110us 184.62ms cudaMalloc\n", 517 | " 0.47% 879.32us 3 293.11us 26.589us 797.05us cudaMemcpy\n", 518 | " 0.24% 454.74us 100 4.5470us 3.5170us 30.816us cudaLaunchKernel\n", 519 | " 0.20% 373.11us 1 373.11us 373.11us 373.11us cuDeviceTotalMem\n", 520 | " 0.08% 155.62us 101 1.5400us 139ns 67.367us cuDeviceGetAttribute\n", 521 | " 0.06% 119.57us 3 39.858us 4.2690us 104.01us cudaFree\n", 522 | " 0.02% 29.107us 1 29.107us 29.107us 29.107us cuDeviceGetName\n", 523 | " 0.00% 5.2130us 1 5.2130us 5.2130us 5.2130us cuDeviceGetPCIBusId\n", 524 | " 0.00% 1.4490us 3 483ns 196ns 1.0070us cuDeviceGetCount\n", 525 | " 0.00% 1.1590us 2 579ns 203ns 956ns cuDeviceGet\n", 526 | " 0.00% 317ns 1 317ns 317ns 317ns cuDeviceGetUuid\n", 527 | "The Block size is 1024.\n", 528 | "==267== NVPROF is profiling process 267, command: ./verctor_add_multi_thread 5\n", 529 | "out[0] = 3.140000\n", 530 | "PASSED\n", 531 | "==267== Profiling application: ./verctor_add_multi_thread 5\n", 532 | "==267== Profiling result:\n", 533 | " Type Time(%) Time Calls Avg Min Max Name\n", 534 | " GPU activities: 97.97% 715.27us 100 7.1520us 7.0080us 8.0320us vector_add(float*, float*, float*, int)\n", 535 | " 1.33% 9.6960us 2 4.8480us 4.7040us 4.9920us [CUDA memcpy HtoD]\n", 536 | " 0.70% 5.0880us 1 5.0880us 5.0880us 5.0880us [CUDA memcpy DtoH]\n", 537 | " API calls: 99.13% 182.23ms 3 60.744ms 2.6040us 182.22ms cudaMalloc\n", 538 | " 0.29% 528.43us 
100 5.2840us 3.5320us 28.747us cudaLaunchKernel\n", 539 | " 0.22% 404.82us 3 134.94us 25.338us 351.20us cudaMemcpy\n", 540 | " 0.20% 358.63us 1 358.63us 358.63us 358.63us cuDeviceTotalMem\n", 541 | " 0.08% 146.85us 101 1.4540us 139ns 62.110us cuDeviceGetAttribute\n", 542 | " 0.07% 122.11us 3 40.701us 4.8210us 105.45us cudaFree\n", 543 | " 0.02% 29.790us 1 29.790us 29.790us 29.790us cuDeviceGetName\n", 544 | " 0.00% 5.5170us 1 5.5170us 5.5170us 5.5170us cuDeviceGetPCIBusId\n", 545 | " 0.00% 1.2460us 3 415ns 184ns 737ns cuDeviceGetCount\n", 546 | " 0.00% 1.1670us 2 583ns 223ns 944ns cuDeviceGet\n", 547 | " 0.00% 267ns 1 267ns 267ns 267ns cuDeviceGetUuid\n" 548 | ], 549 | "name": "stdout" 550 | } 551 | ] 552 | } 553 | ] 554 | } 555 | -------------------------------------------------------------------------------- /Solution/Exercise_06.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_06.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "gZABpep_V-8C" 24 | }, 25 | "source": [ 26 | "# CUDA Exercise 06\n", 27 | "> Another approach of parallelized Vector add. \n", 28 | "\n", 29 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 30 | "\n", 31 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_06.ipynb)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "P401L2N_WG6R" 38 | }, 39 | "source": [ 40 | "## Initialize the CUDA dev environment" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "OONoNFZeV63L", 50 | "outputId": "7ea7eb64-bce9-4b3e-fb37-26c22c542977" 51 | }, 52 | "source": [ 53 | "# clone the code repo,\n", 54 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 55 | "%load_ext nvcc_plugin\n", 56 | "\n", 57 | "# Check the environment \n", 58 | "!lsb_release -a\n", 59 | "!nvcc --version\n", 60 | "!nvidia-smi" 61 | ], 62 | "execution_count": 1, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 68 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-0h_on20m\n", 69 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-0h_on20m\n", 70 | "Building wheels for collected packages: NVCCPlugin\n", 71 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 72 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=c3bbe482d2b7bd608c155ee0855393664aee1a212eb71f17e7c1d5c7be4d469c\n", 73 | " Stored in directory: /tmp/pip-ephem-wheel-cache-s37pn594/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 74 | "Successfully built NVCCPlugin\n", 75 | "Installing collected packages: NVCCPlugin\n", 76 | "Successfully installed NVCCPlugin-0.0.2\n", 77 | "Default out bin result.out\n", 78 | "No LSB modules are available.\n", 79 | "Distributor ID:\tUbuntu\n", 80 | "Description:\tUbuntu 18.04.5 LTS\n", 81 | "Release:\t18.04\n", 82 | "Codename:\tbionic\n", 83 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 84 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 85 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 86 | "Cuda compilation tools, release 11.0, V11.0.221\n", 87 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 88 | "Thu Apr 22 21:38:17 2021 \n", 89 | "+-----------------------------------------------------------------------------+\n", 90 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 91 | "|-------------------------------+----------------------+----------------------+\n", 92 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 93 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 94 | "| | | MIG M. 
|\n", 95 | "|===============================+======================+======================|\n", 96 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 97 | "| N/A 64C P8 11W / 70W | 0MiB / 15109MiB | 0% Default |\n", 98 | "| | | N/A |\n", 99 | "+-------------------------------+----------------------+----------------------+\n", 100 | " \n", 101 | "+-----------------------------------------------------------------------------+\n", 102 | "| Processes: |\n", 103 | "| GPU GI CI PID Type Process name GPU Memory |\n", 104 | "| ID ID Usage |\n", 105 | "|=============================================================================|\n", 106 | "| No running processes found |\n", 107 | "+-----------------------------------------------------------------------------+\n" 108 | ], 109 | "name": "stdout" 110 | } 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "id": "Bxx_JzKwgnh1" 117 | }, 118 | "source": [ 119 | "## Vector Add with Multiple Threads across Blocks" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "metadata": { 125 | "colab": { 126 | "base_uri": "https://localhost:8080/" 127 | }, 128 | "id": "egrZEZ3MWaP_", 129 | "outputId": "c9574221-69cc-4343-bfa6-44db49fdadc5" 130 | }, 131 | "source": [ 132 | "%%writefile verctor_add_multi_blocks_thread.cu\n", 133 | "\n", 134 | "#include \n", 135 | "#include \n", 136 | "\n", 137 | "#define VECTOR_LENGTH 10000\n", 138 | "#define MAX_ERR 1e-4\n", 139 | "\n", 140 | "__global__ void vector_add(float *out, float *a, float *b, int n) \n", 141 | "{\n", 142 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 143 | " \n", 144 | " if(tid 2 ) {\n", 165 | " printf(\"Too many arguments supplied.\\n\");\n", 166 | " }\n", 167 | " else {\n", 168 | " printf(\"One argument expected.\\n\");\n", 169 | " \n", 170 | " }\n", 171 | "\n", 172 | " printf(\"The Block size is %d.\\n\", block_size);\n", 173 | "\n", 174 | "\n", 175 | " // Allocate memory on CPU\n", 176 | " a = (float*)malloc(sizeof(float) * 
VECTOR_LENGTH);\n", 177 | " b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 178 | " out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);\n", 179 | "\n", 180 | " // data initializtion\n", 181 | " for(int i = 0; i < VECTOR_LENGTH; i++)\n", 182 | " {\n", 183 | " a[i] = 3.0f;\n", 184 | " b[i] = 0.14f;\n", 185 | " }\n", 186 | "\n", 187 | " // Allocate memory on GPU\n", 188 | " cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);\n", 189 | " cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);\n", 190 | " cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);\n", 191 | "\n", 192 | " // copy operator to GPU\n", 193 | " cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 194 | " cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);\n", 195 | "\n", 196 | " for(int i=0;i<100;i++)\n", 197 | " {\n", 198 | " // GPU do the work, CPU waits\n", 199 | " // Executing kernel \n", 200 | " int grid_size = ((VECTOR_LENGTH + block_size) / block_size);\n", 201 | " vector_add<<>>(d_out, d_a, d_b, VECTOR_LENGTH);\n", 202 | " }\n", 203 | " // Get results from the GPU\n", 204 | " cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, \n", 205 | " cudaMemcpyDeviceToHost);\n", 206 | "\n", 207 | " // Test the result\n", 208 | " for(int i = 0; i < VECTOR_LENGTH; i++){\n", 209 | " assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);\n", 210 | " }\n", 211 | " printf(\"out[0] = %f\\n\", out[0]);\n", 212 | " printf(\"PASSED\\n\");\n", 213 | "\n", 214 | " // Free the memory\n", 215 | " cudaFree(d_a);\n", 216 | " cudaFree(d_b);\n", 217 | " cudaFree(d_out);\n", 218 | " free(a);\n", 219 | " free(b);\n", 220 | " free(out);\n", 221 | " }" 222 | ], 223 | "execution_count": 2, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "text": [ 228 | "Writing verctor_add_multi_blocks_thread.cu\n" 229 | ], 230 | "name": "stdout" 231 | } 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "id": "d9Zw1YvsewRK" 238 | }, 
239 | "source": [ 240 | "## Evaluation" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": { 246 | "id": "ch5mhas6fIZd" 247 | }, 248 | "source": [ 249 | "Measuring the time cost of executing the CUDA function" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "metadata": { 255 | "colab": { 256 | "base_uri": "https://localhost:8080/" 257 | }, 258 | "id": "0wc1X6ZCFAVo", 259 | "outputId": "be4bb169-e77b-4060-daa1-588a5d832419" 260 | }, 261 | "source": [ 262 | "!nvcc -o verctor_add_multi_blocks_thread verctor_add_multi_blocks_thread.cu\n", 263 | "!nvprof ./verctor_add_multi_blocks_thread 0\n", 264 | "!nvprof ./verctor_add_multi_blocks_thread 1\n", 265 | "!nvprof ./verctor_add_multi_blocks_thread 2\n", 266 | "!nvprof ./verctor_add_multi_blocks_thread 3" 267 | ], 268 | "execution_count": 3, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "text": [ 273 | "The Block size is 1.\n", 274 | "==165== NVPROF is profiling process 165, command: ./verctor_add_multi_blocks_thread 0\n", 275 | "out[0] = 3.140000\n", 276 | "PASSED\n", 277 | "==165== Profiling application: ./verctor_add_multi_blocks_thread 0\n", 278 | "==165== Profiling result:\n", 279 | "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n", 280 | " GPU activities:   99.53%  3.0884ms       100  30.884us  30.785us  31.265us  vector_add(float*, float*, float*, int)\n", 281 | "                    0.31%  9.5690us         2  4.7840us  4.6400us  4.9290us  [CUDA memcpy HtoD]\n", 282 | "                    0.16%  5.1200us         1  5.1200us  5.1200us  5.1200us  [CUDA memcpy DtoH]\n", 283 | "      API calls:   98.68%  329.13ms         3  109.71ms  3.1850us  329.12ms  cudaMalloc\n", 284 | "                    0.86%  2.8551ms         3  951.69us  25.687us  2.7982ms  cudaMemcpy\n", 285 | "                    0.14%  471.47us       100  4.7140us  3.2380us  32.273us  cudaLaunchKernel\n", 286 | "                    0.12%  398.11us         1  398.11us  398.11us  398.11us  cuDeviceGetPCIBusId\n", 287 | "                    0.11%  352.50us         1  352.50us  352.50us  352.50us  cuDeviceTotalMem\n", 288 | "                    0.04%  148.32us       101  1.4680us     137ns  63.197us  cuDeviceGetAttribute\n", 289 | "                    0.04%  125.70us         3  41.900us  4.3440us  
110.18us cudaFree\n", 290 | " 0.01% 48.719us 1 48.719us 48.719us 48.719us cuDeviceGetName\n", 291 | " 0.00% 1.6970us 3 565ns 234ns 1.0660us cuDeviceGetCount\n", 292 | " 0.00% 1.4190us 2 709ns 236ns 1.1830us cuDeviceGet\n", 293 | " 0.00% 304ns 1 304ns 304ns 304ns cuDeviceGetUuid\n", 294 | "The Block size is 64.\n", 295 | "==176== NVPROF is profiling process 176, command: ./verctor_add_multi_blocks_thread 1\n", 296 | "out[0] = 3.140000\n", 297 | "PASSED\n", 298 | "==176== Profiling application: ./verctor_add_multi_blocks_thread 1\n", 299 | "==176== Profiling result:\n", 300 | " Type Time(%) Time Calls Avg Min Max Name\n", 301 | " GPU activities: 94.94% 273.86us 100 2.7380us 2.6880us 3.0720us vector_add(float*, float*, float*, int)\n", 302 | " 3.32% 9.5680us 2 4.7840us 4.6400us 4.9280us [CUDA memcpy HtoD]\n", 303 | " 1.74% 5.0240us 1 5.0240us 5.0240us 5.0240us [CUDA memcpy DtoH]\n", 304 | " API calls: 99.28% 177.51ms 3 59.169ms 3.0770us 177.50ms cudaMalloc\n", 305 | " 0.27% 486.52us 100 4.8650us 3.4350us 28.910us cudaLaunchKernel\n", 306 | " 0.20% 366.26us 1 366.26us 366.26us 366.26us cuDeviceTotalMem\n", 307 | " 0.09% 162.01us 101 1.6040us 137ns 65.314us cuDeviceGetAttribute\n", 308 | " 0.08% 137.57us 3 45.855us 4.3990us 122.37us cudaFree\n", 309 | " 0.06% 102.92us 3 34.307us 24.556us 47.219us cudaMemcpy\n", 310 | " 0.01% 24.120us 1 24.120us 24.120us 24.120us cuDeviceGetName\n", 311 | " 0.00% 4.4430us 1 4.4430us 4.4430us 4.4430us cuDeviceGetPCIBusId\n", 312 | " 0.00% 1.8130us 3 604ns 218ns 1.1550us cuDeviceGetCount\n", 313 | " 0.00% 1.1100us 2 555ns 198ns 912ns cuDeviceGet\n", 314 | " 0.00% 300ns 1 300ns 300ns 300ns cuDeviceGetUuid\n", 315 | "The Block size is 128.\n", 316 | "==187== NVPROF is profiling process 187, command: ./verctor_add_multi_blocks_thread 2\n", 317 | "out[0] = 3.140000\n", 318 | "PASSED\n", 319 | "==187== Profiling application: ./verctor_add_multi_blocks_thread 2\n", 320 | "==187== Profiling result:\n", 321 | " Type Time(%) Time Calls Avg Min Max 
Name\n", 322 | " GPU activities: 94.45% 251.05us 100 2.5100us 2.4640us 2.8800us vector_add(float*, float*, float*, int)\n", 323 | " 3.65% 9.6960us 2 4.8480us 4.6720us 5.0240us [CUDA memcpy HtoD]\n", 324 | " 1.90% 5.0560us 1 5.0560us 5.0560us 5.0560us [CUDA memcpy DtoH]\n", 325 | " API calls: 99.34% 180.00ms 3 60.002ms 2.7890us 180.00ms cudaMalloc\n", 326 | " 0.24% 429.38us 100 4.2930us 3.3800us 26.703us cudaLaunchKernel\n", 327 | " 0.20% 357.20us 1 357.20us 357.20us 357.20us cuDeviceTotalMem\n", 328 | " 0.08% 140.11us 101 1.3870us 141ns 59.152us cuDeviceGetAttribute\n", 329 | " 0.07% 121.41us 3 40.470us 24.773us 70.702us cudaMemcpy\n", 330 | " 0.06% 115.47us 3 38.490us 4.5820us 98.706us cudaFree\n", 331 | " 0.02% 29.831us 1 29.831us 29.831us 29.831us cuDeviceGetName\n", 332 | " 0.00% 6.1680us 1 6.1680us 6.1680us 6.1680us cuDeviceGetPCIBusId\n", 333 | " 0.00% 1.7640us 2 882ns 275ns 1.4890us cuDeviceGet\n", 334 | " 0.00% 1.4760us 3 492ns 208ns 888ns cuDeviceGetCount\n", 335 | " 0.00% 256ns 1 256ns 256ns 256ns cuDeviceGetUuid\n", 336 | "The Block size is 256.\n", 337 | "==198== NVPROF is profiling process 198, command: ./verctor_add_multi_blocks_thread 3\n", 338 | "out[0] = 3.140000\n", 339 | "PASSED\n", 340 | "==198== Profiling application: ./verctor_add_multi_blocks_thread 3\n", 341 | "==198== Profiling result:\n", 342 | " Type Time(%) Time Calls Avg Min Max Name\n", 343 | " GPU activities: 94.40% 246.92us 100 2.4690us 2.4320us 2.8160us vector_add(float*, float*, float*, int)\n", 344 | " 3.67% 9.6000us 2 4.8000us 4.6400us 4.9600us [CUDA memcpy HtoD]\n", 345 | " 1.93% 5.0560us 1 5.0560us 5.0560us 5.0560us [CUDA memcpy DtoH]\n", 346 | " API calls: 99.34% 178.61ms 3 59.537ms 3.3080us 178.60ms cudaMalloc\n", 347 | " 0.24% 439.19us 100 4.3910us 3.3750us 32.020us cudaLaunchKernel\n", 348 | " 0.20% 353.33us 1 353.33us 353.33us 353.33us cuDeviceTotalMem\n", 349 | " 0.08% 138.06us 101 1.3660us 134ns 58.617us cuDeviceGetAttribute\n", 350 | " 0.06% 114.41us 3 38.136us 4.2640us 
100.60us cudaFree\n", 351 | " 0.06% 104.06us 3 34.685us 26.218us 44.970us cudaMemcpy\n", 352 | " 0.02% 37.664us 1 37.664us 37.664us 37.664us cuDeviceGetName\n", 353 | " 0.00% 4.8210us 1 4.8210us 4.8210us 4.8210us cuDeviceGetPCIBusId\n", 354 | " 0.00% 1.7230us 3 574ns 212ns 1.1040us cuDeviceGetCount\n", 355 | " 0.00% 1.3420us 2 671ns 301ns 1.0410us cuDeviceGet\n", 356 | " 0.00% 286ns 1 286ns 286ns 286ns cuDeviceGetUuid\n" 357 | ], 358 | "name": "stdout" 359 | } 360 | ] 361 | } 362 | ] 363 | } 364 | -------------------------------------------------------------------------------- /Solution/Exercise_07.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_07.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "h-JwSwNW9QmT" 24 | }, 25 | "source": [ 26 | "\n", 27 | "# CUDA Exercise 07\n", 28 | "> You should try to implement your own solution for vector dot product, and try to parallelize the computation.\n", 29 | "\n", 30 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. 
To launch the Google Colab, please click the below Icon.\n", 31 | "\n", 32 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_07.ipynb)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "cOEai4hb95Ip" 39 | }, 40 | "source": [ 41 | "## Initialize the CUDA dev environment" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "bqmwwI7H5nDx" 48 | }, 49 | "source": [ 50 | "# clone the code repo,\n", 51 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 52 | "%load_ext nvcc_plugin" 53 | ], 54 | "execution_count": null, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "P2Zeyyo4_gNH" 61 | }, 62 | "source": [ 63 | "## Check the environment " 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "id": "N6PT4QpR6oxt" 70 | }, 71 | "source": [ 72 | "!lsb_release -a\n", 73 | "!nvcc --version\n", 74 | "!nvidia-smi" 75 | ], 76 | "execution_count": null, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "TF6KTYqE_n7H" 83 | }, 84 | "source": [ 85 | "## Naive approach of vector dot product" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "Ev5_BW1z80S3" 92 | }, 93 | "source": [ 94 | "%%writefile exercise01.cu\n", 95 | "#include \n", 96 | "#include \n", 97 | "\n", 98 | "#define MAX_ERR 0.1\n", 99 | "#define MULTI_TIMES_RUN 1\n", 100 | "\n", 101 | "__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length) \n", 102 | "{\n", 103 | " extern __shared__ float temp[];\n", 104 | " \n", 105 | " int index = threadIdx.x; // index offset of this thread\n", 106 | " int stride = blockDim.x; // stride step of each iteration\n", 107 | "\n", 108 | " // so if threadIdx.x=0, and blockDim.x=10,\n", 109 | " // then this 
thread is responsible for calculating temp[0], temp[10], temp[20]\n", 110 | " // similarly, the following thread will calculate temp[1], temp[11], temp[21]\n", 111 | " for(int i = index; i < vertor_length; i += stride)\n", 112 | " {\n", 113 | " temp[i] = vector_a[i] * vector_b[i];\n", 114 | " }\n", 115 | " \n", 116 | " __syncthreads(); // synchronize all threads\n", 117 | " \n", 118 | " // The accumulation only needs to happen at thread_0\n", 119 | " if (threadIdx.x == 0)\n", 120 | " {\n", 121 | " float sum = 0;\n", 122 | " for (int i = 0; i < vertor_length; i++)\n", 123 | " {\n", 124 | " sum += temp[i];\n", 125 | " }\n", 126 | " *result=sum;\n", 127 | " }\n", 128 | "}\n", 129 | "\n", 130 | "int main(int argc, char *argv[])\n", 131 | "{\n", 132 | " float *vector_a, *vector_b, *result;\n", 133 | " float *d_vector_a, *d_vector_b, *d_result;\n", 134 | " int list_of_thread_num[]={1,64,128,256,512,1024};\n", 135 | " int list_of_vector_length[]={100,200,1000,2000,10000};\n", 136 | " int thread_num = 1;\n", 137 | " int vector_length = 1000;\n", 138 | " \n", 139 | " if( argc == 3 ) {\n", 140 | " //printf(\"The argument supplied is %s\\n\", argv[1]);\n", 141 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 142 | " //atoi = ascii to int\n", 143 | " int arg2 = atoi(argv[2]); \n", 144 | " \n", 145 | " vector_length = list_of_vector_length[arg1];\n", 146 | " thread_num = list_of_thread_num[arg2];\n", 147 | " }\n", 148 | " else if( argc > 2 ) {\n", 149 | " printf(\"Too many arguments supplied.\\n\");\n", 150 | " }\n", 151 | " else {\n", 152 | " printf(\"Two arguments expected.\\n\");\n", 153 | " \n", 154 | " }\n", 155 | "\n", 156 | " // Allocate memory on CPU\n", 157 | " vector_a = (float*)malloc(sizeof(float) * vector_length);\n", 158 | " vector_b = (float*)malloc(sizeof(float) * vector_length);\n", 159 | " result = (float*)malloc(sizeof(float));\n", 160 | "\n", 161 | " // data initialization\n", 162 | " for(int i = 0; i < vector_length; i++)\n", 163 | " {\n", 164
| " vector_a[i] = 0.1f;\n", 165 | " vector_b[i] = 0.9f;\n", 166 | " }\n", 167 | "\n", 168 | " // Allocate memory on GPU\n", 169 | " cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);\n", 170 | " cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);\n", 171 | " cudaMalloc((void**)&d_result, sizeof(float));\n", 172 | "\n", 173 | " // copy operator to GPU\n", 174 | " cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 175 | " cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 176 | "\n", 177 | " // GPU do the work, CPU waits\n", 178 | "#if MULTI_TIMES_RUN\n", 179 | " for(int i=0; i< 10; i++)\n", 180 | " {\n", 181 | "#endif\n", 182 | " vector_dot_product<<<1,thread_num,sizeof(float) * vector_length>>>(d_result, d_vector_a, d_vector_b, vector_length);\n", 183 | "#if MULTI_TIMES_RUN\n", 184 | " }\n", 185 | " #endif\n", 186 | " \n", 187 | " // Get results from the GPU\n", 188 | " cudaMemcpy(result, d_result, sizeof(float), \n", 189 | " cudaMemcpyDeviceToHost);\n", 190 | " \n", 191 | " // Test the result\n", 192 | " //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);\n", 193 | " \n", 194 | " // you only need them for checking if the math is correct\n", 195 | " printf(\"result[0] = %f\\n\", result[0]);\n", 196 | " // printf(\"PASSED\\n\");\n", 197 | "\n", 198 | " // Free the memory\n", 199 | " cudaFree(d_vector_a);\n", 200 | " cudaFree(d_vector_b);\n", 201 | " cudaFree(d_result);\n", 202 | " free(vector_a);\n", 203 | " free(vector_b);\n", 204 | " free(result);\n", 205 | "\n", 206 | "}" 207 | ], 208 | "execution_count": null, 209 | "outputs": [] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": { 214 | "id": "Unl0xR2C_27V" 215 | }, 216 | "source": [ 217 | "## Optimized approach of vector dot product" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "ba05ukJC8AKq" 224 | }, 225 | "source": [ 226 | "%%writefile
exercise01.cu\n", 227 | "#include \n", 228 | "#include \n", 229 | "\n", 230 | "#define MAX_ERR 0.1\n", 231 | "#define MULTI_TIMES_RUN 1\n", 232 | "\n", 233 | "__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length) \n", 234 | "{\n", 235 | " extern __shared__ float temp[];\n", 236 | " \n", 237 | " int index = threadIdx.x; // index offset of this thread\n", 238 | " int stride = blockDim.x; // stride step of each iteration\n", 239 | "\n", 240 | " temp[threadIdx.x] = 0;\n", 241 | " for(int i = index; i < vertor_length; i += stride)\n", 242 | " {\n", 243 | " temp[threadIdx.x] = temp[threadIdx.x] + vector_a[i] * vector_b[i];\n", 244 | " }\n", 245 | " \n", 246 | " __syncthreads(); // synchronize all threads\n", 247 | " \n", 248 | " // The accumulation only needs to happen at thread_0\n", 249 | " if (threadIdx.x == 0)\n", 250 | " {\n", 251 | " float sum = 0;\n", 252 | " int thread_num = (vertor_length+blockDim.x)/blockDim.x;\n", 253 | " for (int i = 0; i < thread_num; i++)\n", 254 | " {\n", 255 | " sum += temp[i];\n", 256 | " }\n", 257 | " *result=sum;\n", 258 | " }\n", 259 | "}\n", 260 | "\n", 261 | "int main(int argc, char *argv[])\n", 262 | "{\n", 263 | " float *vector_a, *vector_b, *result;\n", 264 | " float *d_vector_a, *d_vector_b, *d_result;\n", 265 | " int list_of_thread_num[]={1,64,128,256,512,1024};\n", 266 | " int list_of_vector_length[]={100,200,1000,2000,10000};\n", 267 | " int thread_num = 1;\n", 268 | " int vector_length = 1000;\n", 269 | " \n", 270 | " if( argc == 3 ) {\n", 271 | " //printf(\"The arguments supplied are %s, %s\\n\", argv[1], argv[2]);\n", 272 | " int arg1 = atoi(argv[1]); //argv[0] is the program name\n", 273 | " //atoi = ascii to int\n", 274 | " int arg2 = atoi(argv[2]); \n", 275 | " \n", 276 | " vector_length = list_of_vector_length[arg1];\n", 277 | " thread_num = list_of_thread_num[arg2];\n", 278 | " }\n", 279 | " else if( argc > 2 ) {\n", 280 | " printf(\"Too many arguments 
supplied.\\n\");\n", 281 | " }\n", 282 | " else {\n", 283 | " printf(\"Two argument expected.\\n\");\n", 284 | " return 0;\n", 285 | " }\n", 286 | "\n", 287 | " // Allocate memory on CPU\n", 288 | " vector_a = (float*)malloc(sizeof(float) * vector_length);\n", 289 | " vector_b = (float*)malloc(sizeof(float) * vector_length);\n", 290 | " result = (float*)malloc(sizeof(float));\n", 291 | "\n", 292 | " // data initializtion\n", 293 | " for(int i = 0; i < vector_length; i++)\n", 294 | " {\n", 295 | " vector_a[i] = 0.1f;\n", 296 | " vector_b[i] = 0.9f;\n", 297 | " }\n", 298 | "\n", 299 | " // Allocate memory on GPU\n", 300 | " cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);\n", 301 | " cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);\n", 302 | " cudaMalloc((void**)&d_result, sizeof(float));\n", 303 | "\n", 304 | " // copy operator to GPU\n", 305 | " cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 306 | " cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);\n", 307 | "\n", 308 | " // GPU do the work, CPU waits\n", 309 | "#if MULTI_TIMES_RUN\n", 310 | " for(int i=0; i< 10; i++)\n", 311 | " {\n", 312 | "#endif\n", 313 | " vector_dot_product<<<1,thread_num,sizeof(float) * thread_num>>>(d_result, d_vector_a, d_vector_b, vector_length);\n", 314 | "#if MULTI_TIMES_RUN\n", 315 | " }\n", 316 | " #endif\n", 317 | " \n", 318 | " // Get results from the GPU\n", 319 | " cudaMemcpy(result, d_result, sizeof(float), \n", 320 | " cudaMemcpyDeviceToHost);\n", 321 | " \n", 322 | " // Test the result\n", 323 | " //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);\n", 324 | " \n", 325 | " // you only need them for checking if the math is correct\n", 326 | " printf(\"result[0] = %f\\n\", result[0]);\n", 327 | " // printf(\"PASSED\\n\");\n", 328 | "\n", 329 | " // Free the memory\n", 330 | " cudaFree(d_vector_a);\n", 331 | " cudaFree(d_vector_b);\n", 332 | " 
cudaFree(d_result);\n", 333 | " free(vector_a);\n", 334 | " free(vector_b);\n", 335 | " free(result);\n", 336 | "}" 337 | ], 338 | "execution_count": null, 339 | "outputs": [] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": { 344 | "id": "_BsEJesxACRz" 345 | }, 346 | "source": [ 347 | "## Evaluation to collect enough information for the benchmark" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "metadata": { 353 | "id": "CjisNLsazjUT" 354 | }, 355 | "source": [ 356 | "!nvcc -o exercise01 exercise01.cu\n", 357 | "!nvprof ./exercise01 0 0\n", 358 | "!nvprof ./exercise01 1 0\n", 359 | "!nvprof ./exercise01 2 0\n", 360 | "!nvprof ./exercise01 3 0\n", 361 | "!nvprof ./exercise01 4 0" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "J20hMfub0Tr2" 370 | }, 371 | "source": [ 372 | "!nvcc -o exercise01 exercise01.cu\n", 373 | "!nvprof ./exercise01 4 0\n", 374 | "!nvprof ./exercise01 4 1\n", 375 | "!nvprof ./exercise01 4 2\n", 376 | "!nvprof ./exercise01 4 3\n", 377 | "!nvprof ./exercise01 4 4" 378 | ], 379 | "execution_count": null, 380 | "outputs": [] 381 | } 382 | ] 383 | } 384 | -------------------------------------------------------------------------------- /Solution/Exercise_08.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_08.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU" 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "h-JwSwNW9QmT" 24 | }, 25 | "source": [ 26 | "\n", 27 | "# CUDA Exercise 08\n", 28 | "> You should try to implement your own solution for matrix vector multiplication, and
try to parallelize the computation.\n", 29 | "\n", 30 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 31 | "\n", 32 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_08.ipynb)\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": { 38 | "id": "cOEai4hb95Ip" 39 | }, 40 | "source": [ 41 | "## Initialize the CUDA dev environment" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "metadata": { 47 | "id": "bqmwwI7H5nDx", 48 | "colab": { 49 | "base_uri": "https://localhost:8080/" 50 | }, 51 | "outputId": "df4692fa-1acf-4689-ce13-f59b43ead1f6" 52 | }, 53 | "source": [ 54 | "# clone the code repo,\n", 55 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 56 | "%load_ext nvcc_plugin" 57 | ], 58 | "execution_count": 1, 59 | "outputs": [ 60 | { 61 | "output_type": "stream", 62 | "text": [ 63 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 64 | " Cloning git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-6ri04v_g\n", 65 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-6ri04v_g\n", 66 | "Building wheels for collected packages: NVCCPlugin\n", 67 | " Building wheel for NVCCPlugin (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 68 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=4d14ae8e1b5d4553791c7785ff742a5ca7908444bfa86c9a7f151acbb55ff62c\n", 69 | " Stored in directory: /tmp/pip-ephem-wheel-cache-83ylvme0/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 70 | "Successfully built NVCCPlugin\n", 71 | "Installing collected packages: NVCCPlugin\n", 72 | "Successfully installed NVCCPlugin-0.0.2\n", 73 | "Default out bin result.out\n" 74 | ], 75 | "name": "stdout" 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": { 82 | "id": "P2Zeyyo4_gNH" 83 | }, 84 | "source": [ 85 | "## Check the environment " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "id": "N6PT4QpR6oxt", 92 | "colab": { 93 | "base_uri": "https://localhost:8080/" 94 | }, 95 | "outputId": "5a8244ea-c4d9-44fe-ba52-f106a709938f" 96 | }, 97 | "source": [ 98 | "!lsb_release -a\n", 99 | "!nvcc --version\n", 100 | "!nvidia-smi" 101 | ], 102 | "execution_count": 2, 103 | "outputs": [ 104 | { 105 | "output_type": "stream", 106 | "text": [ 107 | "No LSB modules are available.\n", 108 | "Distributor ID:\tUbuntu\n", 109 | "Description:\tUbuntu 18.04.5 LTS\n", 110 | "Release:\t18.04\n", 111 | "Codename:\tbionic\n", 112 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 113 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 114 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 115 | "Cuda compilation tools, release 11.0, V11.0.221\n", 116 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 117 | "Sun Apr 25 20:46:45 2021 \n", 118 | "+-----------------------------------------------------------------------------+\n", 119 | "| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 120 | "|-------------------------------+----------------------+----------------------+\n", 121 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
ECC |\n", 122 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 123 | "| | | MIG M. |\n", 124 | "|===============================+======================+======================|\n", 125 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 126 | "| N/A 49C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n", 127 | "| | | N/A |\n", 128 | "+-------------------------------+----------------------+----------------------+\n", 129 | " \n", 130 | "+-----------------------------------------------------------------------------+\n", 131 | "| Processes: |\n", 132 | "| GPU GI CI PID Type Process name GPU Memory |\n", 133 | "| ID ID Usage |\n", 134 | "|=============================================================================|\n", 135 | "| No running processes found |\n", 136 | "+-----------------------------------------------------------------------------+\n" 137 | ], 138 | "name": "stdout" 139 | } 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "id": "TF6KTYqE_n7H" 146 | }, 147 | "source": [ 148 | "## Naive approach of matrix vector multiplication\n", 149 | "Try to optimize it, you can do much better!" 
150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "metadata": { 155 | "id": "Ev5_BW1z80S3", 156 | "colab": { 157 | "base_uri": "https://localhost:8080/" 158 | }, 159 | "outputId": "afc6adc9-ccd6-46c1-9f1e-3b67a7e02e9f" 160 | }, 161 | "source": [ 162 | "%%writefile matrix_vector_multiplication.cu\n", 163 | "#include \n", 164 | "#include \n", 165 | "\n", 166 | "#define M 100\n", 167 | "#define N 100\n", 168 | "#define MAX_ERR 1e-4\n", 169 | "\n", 170 | "__global__ void matrix_vector_multiplication(float* vector_result, float *matrix_a, float *vector_b, int m_row, int n_col) \n", 171 | "{\n", 172 | " extern __shared__ float temp[];\n", 173 | " \n", 174 | " // blockIdx.x => which row\n", 175 | " // blockDim.x => row length\n", 176 | " // threadIdx.x => which element in this row\n", 177 | " \n", 178 | " // Unique tid which can index each single element in the matrix\n", 179 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 180 | "\n", 181 | " // the condiction logic make sure, we only do the calculation in the matrix space\n", 182 | " int size_of_the_matrix = m_row*n_col;\n", 183 | " if(tid>>(d_vector_result, d_martix_a, d_martix_b, M, N);\n", 238 | " \n", 239 | " // Get results from the GPU\n", 240 | " cudaMemcpy(vector_result, d_vector_result, sizeof(float) * M, cudaMemcpyDeviceToHost);\n", 241 | " \n", 242 | " // Test the result\n", 243 | " for(int i = 0; i < M; i++)\n", 244 | " {\n", 245 | " float temp_sum =0;\n", 246 | " for(int j = 0; j < N; j++)\n", 247 | " {\n", 248 | " int index = i*N+j;\n", 249 | " temp_sum = temp_sum + martix_a[index]*martix_b[j]; \n", 250 | " }\n", 251 | " //printf(\"out[%d]: %f, %f\\n\", i, temp_sum, vector_result[i]);\n", 252 | " \n", 253 | " assert(fabs(vector_result[i] - temp_sum) < MAX_ERR);\n", 254 | " }\n", 255 | " printf(\"PASSED\\n\");\n", 256 | "\n", 257 | " // Free the memory\n", 258 | " cudaFree(d_martix_a);\n", 259 | " cudaFree(d_martix_b);\n", 260 | " cudaFree(d_vector_result);\n", 261 | " free(martix_a);\n", 262 
| " free(martix_b);\n", 263 | " free(vector_result);\n", 264 | " \n", 265 | " return 0;\n", 266 | "}" 267 | ], 268 | "execution_count": 4, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "text": [ 273 | "Overwriting matrix_vector_multiplication.cu\n" 274 | ], 275 | "name": "stdout" 276 | } 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "id": "_BsEJesxACRz" 283 | }, 284 | "source": [ 285 | "## Evaluation to collect enough information for the benchmark" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "CjisNLsazjUT", 292 | "colab": { 293 | "base_uri": "https://localhost:8080/" 294 | }, 295 | "outputId": "8f37c2cd-23aa-42c1-ff7f-0e1ac2572987" 296 | }, 297 | "source": [ 298 | "!nvcc -o matrix_vector_multiplication matrix_vector_multiplication.cu\n", 299 | "!nvprof ./matrix_vector_multiplication 0 0\n", 300 | "!nvprof ./matrix_vector_multiplication 1 0\n", 301 | "!nvprof ./matrix_vector_multiplication 2 0\n", 302 | "!nvprof ./matrix_vector_multiplication 3 0\n", 303 | "!nvprof ./matrix_vector_multiplication 4 0" 304 | ], 305 | "execution_count": 5, 306 | "outputs": [ 307 | { 308 | "output_type": "stream", 309 | "text": [ 310 | "==166== NVPROF is profiling process 166, command: ./matrix_vector_multiplication 0 0\n", 311 | "PASSED\n", 312 | "==166== Profiling application: ./matrix_vector_multiplication 0 0\n", 313 | "==166== Profiling result:\n", 314 | " Type Time(%) Time Calls Avg Min Max Name\n", 315 | " GPU activities: 53.11% 9.5670us 1 9.5670us 9.5670us 9.5670us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 316 | " 35.17% 6.3360us 2 3.1680us 1.4080us 4.9280us [CUDA memcpy HtoD]\n", 317 | " 11.72% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memcpy DtoH]\n", 318 | " API calls: 98.77% 317.58ms 3 105.86ms 3.9820us 317.57ms cudaMalloc\n", 319 | " 0.98% 3.1590ms 1 3.1590ms 3.1590ms 3.1590ms cuDeviceGetPCIBusId\n", 320 | " 0.11% 360.55us 1 360.55us 360.55us 360.55us 
cuDeviceTotalMem\n", 321 | " 0.05% 146.39us 101 1.4490us 140ns 62.080us cuDeviceGetAttribute\n", 322 | " 0.04% 138.31us 3 46.102us 4.1920us 125.39us cudaFree\n", 323 | " 0.02% 66.555us 3 22.185us 12.925us 32.811us cudaMemcpy\n", 324 | " 0.01% 34.868us 1 34.868us 34.868us 34.868us cudaLaunchKernel\n", 325 | " 0.01% 30.340us 1 30.340us 30.340us 30.340us cuDeviceGetName\n", 326 | " 0.00% 1.6340us 2 817ns 288ns 1.3460us cuDeviceGet\n", 327 | " 0.00% 1.3770us 3 459ns 237ns 803ns cuDeviceGetCount\n", 328 | " 0.00% 276ns 1 276ns 276ns 276ns cuDeviceGetUuid\n", 329 | "==177== NVPROF is profiling process 177, command: ./matrix_vector_multiplication 1 0\n", 330 | "PASSED\n", 331 | "==177== Profiling application: ./matrix_vector_multiplication 1 0\n", 332 | "==177== Profiling result:\n", 333 | " Type Time(%) Time Calls Avg Min Max Name\n", 334 | " GPU activities: 52.54% 9.5990us 1 9.5990us 9.5990us 9.5990us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 335 | " 36.08% 6.5920us 2 3.2960us 1.6320us 4.9600us [CUDA memcpy HtoD]\n", 336 | " 11.38% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n", 337 | " API calls: 99.60% 180.65ms 3 60.217ms 3.5120us 180.64ms cudaMalloc\n", 338 | " 0.19% 346.94us 1 346.94us 346.94us 346.94us cuDeviceTotalMem\n", 339 | " 0.08% 144.39us 101 1.4290us 139ns 57.541us cuDeviceGetAttribute\n", 340 | " 0.06% 108.45us 3 36.150us 4.3680us 94.815us cudaFree\n", 341 | " 0.04% 65.119us 3 21.706us 12.926us 31.528us cudaMemcpy\n", 342 | " 0.02% 30.745us 1 30.745us 30.745us 30.745us cudaLaunchKernel\n", 343 | " 0.01% 25.721us 1 25.721us 25.721us 25.721us cuDeviceGetName\n", 344 | " 0.00% 7.3610us 1 7.3610us 7.3610us 7.3610us cuDeviceGetPCIBusId\n", 345 | " 0.00% 1.7910us 3 597ns 247ns 894ns cuDeviceGetCount\n", 346 | " 0.00% 1.3040us 2 652ns 326ns 978ns cuDeviceGet\n", 347 | " 0.00% 259ns 1 259ns 259ns 259ns cuDeviceGetUuid\n", 348 | "==188== NVPROF is profiling process 188, command: ./matrix_vector_multiplication 2 0\n", 349 | 
"PASSED\n", 350 | "==188== Profiling application: ./matrix_vector_multiplication 2 0\n", 351 | "==188== Profiling result:\n", 352 | " Type Time(%) Time Calls Avg Min Max Name\n", 353 | " GPU activities: 53.20% 9.5680us 1 9.5680us 9.5680us 9.5680us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 354 | " 35.05% 6.3040us 2 3.1520us 1.4080us 4.8960us [CUDA memcpy HtoD]\n", 355 | " 11.74% 2.1120us 1 2.1120us 2.1120us 2.1120us [CUDA memcpy DtoH]\n", 356 | " API calls: 99.59% 179.94ms 3 59.980ms 3.5560us 179.93ms cudaMalloc\n", 357 | " 0.20% 360.13us 1 360.13us 360.13us 360.13us cuDeviceTotalMem\n", 358 | " 0.08% 138.49us 101 1.3710us 140ns 56.464us cuDeviceGetAttribute\n", 359 | " 0.06% 105.28us 3 35.094us 4.6640us 91.544us cudaFree\n", 360 | " 0.04% 73.811us 3 24.603us 13.530us 32.582us cudaMemcpy\n", 361 | " 0.02% 34.857us 1 34.857us 34.857us 34.857us cuDeviceGetName\n", 362 | " 0.01% 25.780us 1 25.780us 25.780us 25.780us cudaLaunchKernel\n", 363 | " 0.00% 5.4220us 1 5.4220us 5.4220us 5.4220us cuDeviceGetPCIBusId\n", 364 | " 0.00% 1.3330us 3 444ns 198ns 723ns cuDeviceGetCount\n", 365 | " 0.00% 1.1720us 2 586ns 299ns 873ns cuDeviceGet\n", 366 | " 0.00% 254ns 1 254ns 254ns 254ns cuDeviceGetUuid\n", 367 | "==199== NVPROF is profiling process 199, command: ./matrix_vector_multiplication 3 0\n", 368 | "PASSED\n", 369 | "==199== Profiling application: ./matrix_vector_multiplication 3 0\n", 370 | "==199== Profiling result:\n", 371 | " Type Time(%) Time Calls Avg Min Max Name\n", 372 | " GPU activities: 53.21% 9.5680us 1 9.5680us 9.5680us 9.5680us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 373 | " 35.23% 6.3350us 2 3.1670us 1.4070us 4.9280us [CUDA memcpy HtoD]\n", 374 | " 11.57% 2.0800us 1 2.0800us 2.0800us 2.0800us [CUDA memcpy DtoH]\n", 375 | " API calls: 99.58% 178.32ms 3 59.438ms 3.5200us 178.31ms cudaMalloc\n", 376 | " 0.20% 351.78us 1 351.78us 351.78us 351.78us cuDeviceTotalMem\n", 377 | " 0.08% 140.43us 3 46.810us 4.5390us 
127.32us cudaFree\n", 378 | " 0.08% 134.77us 101 1.3340us 137ns 57.331us cuDeviceGetAttribute\n", 379 | " 0.03% 62.196us 3 20.732us 14.455us 27.225us cudaMemcpy\n", 380 | " 0.02% 29.932us 1 29.932us 29.932us 29.932us cuDeviceGetName\n", 381 | " 0.01% 24.688us 1 24.688us 24.688us 24.688us cudaLaunchKernel\n", 382 | " 0.00% 4.7770us 1 4.7770us 4.7770us 4.7770us cuDeviceGetPCIBusId\n", 383 | " 0.00% 1.3490us 3 449ns 200ns 842ns cuDeviceGetCount\n", 384 | " 0.00% 1.1530us 2 576ns 254ns 899ns cuDeviceGet\n", 385 | " 0.00% 253ns 1 253ns 253ns 253ns cuDeviceGetUuid\n", 386 | "==210== NVPROF is profiling process 210, command: ./matrix_vector_multiplication 4 0\n", 387 | "PASSED\n", 388 | "==210== Profiling application: ./matrix_vector_multiplication 4 0\n", 389 | "==210== Profiling result:\n", 390 | " Type Time(%) Time Calls Avg Min Max Name\n", 391 | " GPU activities: 53.37% 9.6320us 1 9.6320us 9.6320us 9.6320us matrix_vector_multiplication(float*, float*, float*, int, int)\n", 392 | " 34.93% 6.3040us 2 3.1520us 1.3760us 4.9280us [CUDA memcpy HtoD]\n", 393 | " 11.70% 2.1110us 1 2.1110us 2.1110us 2.1110us [CUDA memcpy DtoH]\n", 394 | " API calls: 99.58% 177.20ms 3 59.067ms 3.6720us 177.19ms cudaMalloc\n", 395 | " 0.21% 366.81us 1 366.81us 366.81us 366.81us cuDeviceTotalMem\n", 396 | " 0.08% 136.60us 101 1.3520us 138ns 57.487us cuDeviceGetAttribute\n", 397 | " 0.06% 110.76us 3 36.921us 4.7200us 96.591us cudaFree\n", 398 | " 0.03% 61.611us 3 20.537us 13.440us 27.097us cudaMemcpy\n", 399 | " 0.02% 29.652us 1 29.652us 29.652us 29.652us cuDeviceGetName\n", 400 | " 0.02% 28.677us 1 28.677us 28.677us 28.677us cudaLaunchKernel\n", 401 | " 0.00% 6.0860us 1 6.0860us 6.0860us 6.0860us cuDeviceGetPCIBusId\n", 402 | " 0.00% 1.2890us 3 429ns 247ns 732ns cuDeviceGetCount\n", 403 | " 0.00% 1.1320us 2 566ns 280ns 852ns cuDeviceGet\n", 404 | " 0.00% 257ns 1 257ns 257ns 257ns cuDeviceGetUuid\n" 405 | ], 406 | "name": "stdout" 407 | } 408 | ] 409 | } 410 | ] 411 | } 
-------------------------------------------------------------------------------- /Solution/Exercise_09.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Exercise_09.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "_BsEJesxACRz" 10 | ] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "h-JwSwNW9QmT" 26 | }, 27 | "source": [ 28 | "\n", 29 | "# CUDA Exercise 09\n", 30 | "> You should try to implement your own solution for matrix multiplication, and try to parallelize the computation.\n", 31 | "\n", 32 | "This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.\n", 33 | "\n", 34 | "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_09.ipynb)\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "id": "cOEai4hb95Ip" 41 | }, 42 | "source": [ 43 | "## Initialize the CUDA dev environment" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "metadata": { 49 | "id": "bqmwwI7H5nDx", 50 | "colab": { 51 | "base_uri": "https://localhost:8080/" 52 | }, 53 | "outputId": "d963df04-926f-400a-d0e4-2878c4a03198" 54 | }, 55 | "source": [ 56 | "# clone the code repo,\n", 57 | "!pip install git+git://github.com/depctg/nvcc4jupyter.git\n", 58 | "%load_ext nvcc_plugin" 59 | ], 60 | "execution_count": 1, 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "text": [ 65 | "Collecting git+git://github.com/depctg/nvcc4jupyter.git\n", 66 | " Cloning 
git://github.com/depctg/nvcc4jupyter.git to /tmp/pip-req-build-9uosm_fy\n", 67 | " Running command git clone -q git://github.com/depctg/nvcc4jupyter.git /tmp/pip-req-build-9uosm_fy\n", 68 | "Building wheels for collected packages: NVCCPlugin\n", 69 | " Building wheel for NVCCPlugin (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 70 | " Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4334 sha256=b2c7f0347c89a0d2f434e28ded0da15c6996ef06e1885e654b7568adf563eff6\n", 71 | " Stored in directory: /tmp/pip-ephem-wheel-cache-kkvx15za/wheels/1e/43/2d/099cad2b9b02dfa88573f50a22735d8a0b2ba69bf82167b81c\n", 72 | "Successfully built NVCCPlugin\n", 73 | "Installing collected packages: NVCCPlugin\n", 74 | "Successfully installed NVCCPlugin-0.0.2\n", 75 | "Default out bin result.out\n" 76 | ], 77 | "name": "stdout" 78 | } 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "id": "P2Zeyyo4_gNH" 85 | }, 86 | "source": [ 87 | "## Check the environment " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "id": "N6PT4QpR6oxt", 94 | "colab": { 95 | "base_uri": "https://localhost:8080/" 96 | }, 97 | "outputId": "0d20d71f-ff18-4f85-a6e5-8c90e1f97a8a" 98 | }, 99 | "source": [ 100 | "!lsb_release -a\n", 101 | "!nvcc --version\n", 102 | "!nvidia-smi" 103 | ], 104 | "execution_count": 2, 105 | "outputs": [ 106 | { 107 | "output_type": "stream", 108 | "text": [ 109 | "No LSB modules are available.\n", 110 | "Distributor ID:\tUbuntu\n", 111 | "Description:\tUbuntu 18.04.5 LTS\n", 112 | "Release:\t18.04\n", 113 | "Codename:\tbionic\n", 114 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 115 | "Copyright (c) 2005-2020 NVIDIA Corporation\n", 116 | "Built on Wed_Jul_22_19:09:09_PDT_2020\n", 117 | "Cuda compilation tools, release 11.0, V11.0.221\n", 118 | "Build cuda_11.0_bu.TC445_37.28845127_0\n", 119 | "Mon Apr 26 21:01:30 2021 \n", 120 | "+-----------------------------------------------------------------------------+\n", 121 | 
"| NVIDIA-SMI 465.19.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 122 | "|-------------------------------+----------------------+----------------------+\n", 123 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 124 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 125 | "| | | MIG M. |\n", 126 | "|===============================+======================+======================|\n", 127 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 128 | "| N/A 40C P8 9W / 70W | 0MiB / 15109MiB | 0% Default |\n", 129 | "| | | N/A |\n", 130 | "+-------------------------------+----------------------+----------------------+\n", 131 | " \n", 132 | "+-----------------------------------------------------------------------------+\n", 133 | "| Processes: |\n", 134 | "| GPU GI CI PID Type Process name GPU Memory |\n", 135 | "| ID ID Usage |\n", 136 | "|=============================================================================|\n", 137 | "| No running processes found |\n", 138 | "+-----------------------------------------------------------------------------+\n" 139 | ], 140 | "name": "stdout" 141 | } 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "TF6KTYqE_n7H" 148 | }, 149 | "source": [ 150 | "## Matrix Multiplication - Implimentation 01" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "metadata": { 156 | "id": "Ev5_BW1z80S3", 157 | "colab": { 158 | "base_uri": "https://localhost:8080/" 159 | }, 160 | "outputId": "286f06f4-0014-49b2-ac34-21640ade8975" 161 | }, 162 | "source": [ 163 | "%%writefile matrix_mul_01.cu\n", 164 | "// %%cu\n", 165 | "#include \n", 166 | "\n", 167 | "__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){\n", 168 | " int matrix_c_element = 0;\n", 169 | " for (int i = 0; i < matrix_a_column; i++){\n", 170 | " matrix_c_element += 
matrix_a[(threadIdx.x/matrix_b_column)*matrix_a_column+i] * matrix_b[threadIdx.x%matrix_b_column+i*matrix_b_column];\n", 171 | " }\n", 172 | " matrix_c[threadIdx.x]= matrix_c_element;\n", 173 | "}\n", 174 | "\n", 175 | "int main(int argc, char *argv[]){\n", 176 | " \n", 177 | " //===========================================================================\n", 178 | " // Below, there are three example case, which you should only uncomment one\n", 179 | " // of them, to run the test.\n", 180 | " /* Example 1\n", 181 | " int matrix_a[16] = {5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};\n", 182 | " int matrix_a_row = 4;\n", 183 | " int matrix_a_column = 4;\n", 184 | " int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};\n", 185 | " int matrix_b_row = 4;\n", 186 | " int matrix_b_column = 4;\n", 187 | " */\n", 188 | "\n", 189 | " /* Example 2\n", 190 | " int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};\n", 191 | " int matrix_a_row = 4;\n", 192 | " int matrix_a_column = 3;\n", 193 | " int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};\n", 194 | " int matrix_b_row = 3;\n", 195 | " int matrix_b_column = 5;\n", 196 | " */\n", 197 | "\n", 198 | " // random initialization of larger matrixes\n", 199 | " // matrix_a_row * matrix_b_column <= 1024\n", 200 | " int matrix_a_row = 50;\n", 201 | " int matrix_a_column = 30;\n", 202 | " int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));\n", 203 | " for(int i = 0; i < matrix_a_row; i++){\n", 204 | " for(int j = 0; j < matrix_a_column; j++)\n", 205 | " {\n", 206 | " int index = i * matrix_a_column+j;\n", 207 | " matrix_a[index] = 1;\n", 208 | " }\n", 209 | " }\n", 210 | " int matrix_b_row = 30;\n", 211 | " int matrix_b_column = 20;\n", 212 | " int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));\n", 213 | " for(int i = 0; i < matrix_b_row; i++){\n", 214 | " for(int j = 0; j < matrix_b_column; j++)\n", 215 | " {\n", 216 | " int index = i * matrix_b_column+j;\n", 217 | 
" matrix_b[index] = 2;\n", 218 | " }\n", 219 | " }\n", 220 | "\n", 221 | " //===========================================================================\n", 222 | "\n", 223 | " int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));\n", 224 | " int *d_matrix_a, *d_matrix_b, *d_matrix_c;\n", 225 | " \n", 226 | " cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));\n", 227 | " cudaMalloc((void**)&d_matrix_b,sizeof(int) * (matrix_b_row * matrix_b_column));\n", 228 | " cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));\n", 229 | "\n", 230 | " cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);\n", 231 | " cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);\n", 232 | "\n", 233 | " // implement 100 times for getting average execution time\n", 234 | " for(int i=0; i<100;i++){\n", 235 | " matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 236 | " }\n", 237 | "\n", 238 | " cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);\n", 239 | "\n", 240 | " // print matrix_c to check correction\n", 241 | " for(int i = 0; i < matrix_a_row; i++){\n", 242 | " for(int j = 0; j < matrix_b_column; j++){\n", 243 | " int index = i * matrix_b_column +j;\n", 244 | " printf(\"%d, \",matrix_c[index]);\n", 245 | " }\n", 246 | " printf(\"\\n\");\n", 247 | " }\n", 248 | " cudaDeviceSynchronize();\n", 249 | "\n", 250 | " cudaFree(d_matrix_c);\n", 251 | " cudaFree(d_matrix_b);\n", 252 | " cudaFree(d_matrix_a);\n", 253 | "\n", 254 | " return 0;\n", 255 | "}" 256 | ], 257 | "execution_count": 3, 258 | "outputs": [ 259 | { 260 | "output_type": "stream", 261 | "text": [ 262 | "Writing matrix_mul_01.cu\n" 263 | ], 264 | "name": "stdout" 265 | } 266 | ] 267 | }, 268 | { 269 | 
"cell_type": "markdown", 270 | "metadata": { 271 | "id": "_BsEJesxACRz" 272 | }, 273 | "source": [ 274 | "## Evaluation to collect enough information for the benchmark" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "CjisNLsazjUT", 281 | "colab": { 282 | "base_uri": "https://localhost:8080/" 283 | }, 284 | "outputId": "ab265330-1331-44b7-a3ae-15f5334c006a" 285 | }, 286 | "source": [ 287 | "!nvcc -o matrix_mul_01 matrix_mul_01.cu\n", 288 | "!nvprof ./matrix_mul_01\n" 289 | ], 290 | "execution_count": 4, 291 | "outputs": [ 292 | { 293 | "output_type": "stream", 294 | "text": [ 295 | "==165== NVPROF is profiling process 165, command: ./matrix_mul_01\n", 296 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 297 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 298 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 299 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 300 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 301 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 302 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 303 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 304 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 305 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 306 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 307 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 308 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 309 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 310 | "60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 311 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 312 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 313 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 314 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 315 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 316 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 317 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 318 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 319 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 320 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 321 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 322 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 323 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 324 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 325 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 326 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 327 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 328 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 329 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 330 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 331 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, \n", 332 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 333 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 334 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 335 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 336 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 337 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 338 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 339 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 340 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 341 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 342 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 343 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 344 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 345 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 346 | "==165== Profiling application: ./matrix_mul_01\n", 347 | "==165== Profiling result:\n", 348 | " Type Time(%) Time Calls Avg Min Max Name\n", 349 | " GPU activities: 99.40% 1.1060ms 100 11.060us 10.944us 11.360us matrix_mul(int*, int*, int*, int, int, int)\n", 350 | " 0.35% 3.9360us 2 1.9680us 1.6320us 2.3040us [CUDA memcpy HtoD]\n", 351 | " 0.25% 2.7840us 1 2.7840us 2.7840us 2.7840us [CUDA memcpy DtoH]\n", 352 | " API calls: 99.30% 339.36ms 3 113.12ms 3.2070us 339.35ms cudaMalloc\n", 353 | " 0.23% 779.42us 3 259.81us 10.308us 744.60us cudaMemcpy\n", 354 | " 0.14% 492.00us 100 4.9200us 3.7570us 35.725us cudaLaunchKernel\n", 355 | " 0.12% 400.87us 1 400.87us 400.87us 400.87us 
cuDeviceGetPCIBusId\n", 356 | " 0.11% 369.10us 1 369.10us 369.10us 369.10us cuDeviceTotalMem\n", 357 | " 0.06% 193.36us 101 1.9140us 144ns 76.246us cuDeviceGetAttribute\n", 358 | " 0.04% 131.28us 3 43.758us 4.4320us 115.95us cudaFree\n", 359 | " 0.01% 30.707us 1 30.707us 30.707us 30.707us cuDeviceGetName\n", 360 | " 0.00% 7.8160us 1 7.8160us 7.8160us 7.8160us cudaDeviceSynchronize\n", 361 | " 0.00% 1.9930us 3 664ns 216ns 1.3860us cuDeviceGetCount\n", 362 | " 0.00% 1.7150us 2 857ns 202ns 1.5130us cuDeviceGet\n", 363 | " 0.00% 295ns 1 295ns 295ns 295ns cuDeviceGetUuid\n" 364 | ], 365 | "name": "stdout" 366 | } 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "id": "4LefKVzj4VUV" 373 | }, 374 | "source": [ 375 | "## Matrix Multiplication - Implimentation 02" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "metadata": { 381 | "id": "YZvzZt8d4UpL", 382 | "colab": { 383 | "base_uri": "https://localhost:8080/" 384 | }, 385 | "outputId": "77b64349-167d-4632-87a7-f1f8055b7afd" 386 | }, 387 | "source": [ 388 | "%%writefile matrix_mul_02.cu\n", 389 | "//%%cu\n", 390 | "#include \n", 391 | "\n", 392 | "__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){\n", 393 | " int matrix_c_element = 0;\n", 394 | " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", 395 | " for (int i = 0; i < matrix_a_column; i++){\n", 396 | " matrix_c_element += matrix_a[(tid/matrix_b_column)*matrix_a_column+i] * matrix_b[tid%matrix_b_column+i*matrix_b_column];\n", 397 | " }\n", 398 | " matrix_c[tid]= matrix_c_element;\n", 399 | "}\n", 400 | "\n", 401 | "int main(int argc, char *argv[]){\n", 402 | " \n", 403 | " //===========================================================================\n", 404 | " // Below, there are three example case, which you should only uncomment one\n", 405 | " // of them, to run the test.\n", 406 | "\n", 407 | " /* Example 1\n", 408 | " int matrix_a[16] = 
{5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};\n", 409 | " int matrix_a_row = 4;\n", 410 | " int matrix_a_column = 4;\n", 411 | " int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};\n", 412 | " int matrix_b_row = 4;\n", 413 | " int matrix_b_column = 4;\n", 414 | " */\n", 415 | " \n", 416 | " /* Example 2\n", 417 | " int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};\n", 418 | " int matrix_a_row = 4;\n", 419 | " int matrix_a_column = 3;\n", 420 | " int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};\n", 421 | " int matrix_b_row = 3;\n", 422 | " int matrix_b_column = 5;\n", 423 | " */\n", 424 | " \n", 425 | " \n", 426 | " // random initialization of larger matrixes\n", 427 | " // matrix_a_row as number of blocks\n", 428 | " // matrix_b_column as number of threads per block\n", 429 | " int matrix_a_row = 50;\n", 430 | " int matrix_a_column = 30;\n", 431 | " int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));\n", 432 | " for(int i = 0; i < matrix_a_row; i++){\n", 433 | " for(int j = 0; j < matrix_a_column; j++)\n", 434 | " {\n", 435 | " int index = i * matrix_a_column+j;\n", 436 | " matrix_a[index] = 1;\n", 437 | " }\n", 438 | " }\n", 439 | " int matrix_b_row = 30;\n", 440 | " int matrix_b_column = 20;\n", 441 | " int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));\n", 442 | " for(int i = 0; i < matrix_b_row; i++){\n", 443 | " for(int j = 0; j < matrix_b_column; j++)\n", 444 | " {\n", 445 | " int index = i * matrix_b_column+j;\n", 446 | " matrix_b[index] = 2;\n", 447 | " }\n", 448 | " }\n", 449 | " //===========================================================================\n", 450 | "\n", 451 | "\n", 452 | " int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));\n", 453 | " int *d_matrix_a, *d_matrix_b, *d_matrix_c;\n", 454 | " \n", 455 | " cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));\n", 456 | " cudaMalloc((void**)&d_matrix_b,sizeof(int) 
* (matrix_b_row * matrix_b_column));\n", 457 | " cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));\n", 458 | "\n", 459 | " cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);\n", 460 | " cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);\n", 461 | "\n", 462 | " // implement 100 times for getting average execution time\n", 463 | " for(int i=0; i<100;i++){\n", 464 | " matrix_mul<<>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 465 | " \n", 466 | " //for comparison with 01.cu\n", 467 | " //matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);\n", 468 | " }\n", 469 | "\n", 470 | " cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);\n", 471 | "\n", 472 | " // print matrix_c to check correction\n", 473 | " for(int i = 0; i < matrix_a_row; i++){\n", 474 | " for(int j = 0; j < matrix_b_column; j++){\n", 475 | " int index = i * matrix_b_column +j;\n", 476 | " printf(\"%d, \",matrix_c[index]);\n", 477 | " }\n", 478 | " printf(\"\\n\");\n", 479 | " }\n", 480 | " cudaDeviceSynchronize();\n", 481 | "\n", 482 | " cudaFree(d_matrix_c);\n", 483 | " cudaFree(d_matrix_b);\n", 484 | " cudaFree(d_matrix_a);\n", 485 | "\n", 486 | " return 0;\n", 487 | "}" 488 | ], 489 | "execution_count": 5, 490 | "outputs": [ 491 | { 492 | "output_type": "stream", 493 | "text": [ 494 | "Writing matrix_mul_02.cu\n" 495 | ], 496 | "name": "stdout" 497 | } 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "id": "AKNir-yF_F_8" 504 | }, 505 | "source": [ 506 | "## Evaluation to collect enough information for the benchmark" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "metadata": { 512 | "id": "s61EVRmqQ0RF", 513 | "colab": { 514 | "base_uri": 
"https://localhost:8080/" 515 | }, 516 | "outputId": "8a411a83-bcd3-4549-fd3b-a87119bf81c3" 517 | }, 518 | "source": [ 519 | "!nvcc -o matrix_mul_02 matrix_mul_02.cu\n", 520 | "!nvprof ./matrix_mul_02" 521 | ], 522 | "execution_count": 6, 523 | "outputs": [ 524 | { 525 | "output_type": "stream", 526 | "text": [ 527 | "==209== NVPROF is profiling process 209, command: ./matrix_mul_02\n", 528 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 529 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 530 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 531 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 532 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 533 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 534 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 535 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 536 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 537 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 538 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 539 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 540 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 541 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 542 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 543 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 544 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 545 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 546 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 547 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 548 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 549 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 550 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 551 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 552 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 553 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 554 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 555 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 556 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 557 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 558 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 559 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 560 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 561 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 562 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 563 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 564 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 565 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 566 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 567 | "60, 60, 60, 
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 568 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 569 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 570 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 571 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 572 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 573 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 574 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 575 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 576 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 577 | "60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, \n", 578 | "==209== Profiling application: ./matrix_mul_02\n", 579 | "==209== Profiling result:\n", 580 | " Type Time(%) Time Calls Avg Min Max Name\n", 581 | " GPU activities: 98.67% 526.42us 100 5.2640us 5.1830us 5.6000us matrix_mul(int*, int*, int*, int, int, int)\n", 582 | " 0.78% 4.1600us 2 2.0800us 1.6320us 2.5280us [CUDA memcpy HtoD]\n", 583 | " 0.55% 2.9120us 1 2.9120us 2.9120us 2.9120us [CUDA memcpy DtoH]\n", 584 | " API calls: 99.44% 256.98ms 3 85.659ms 3.3400us 256.97ms cudaMalloc\n", 585 | " 0.21% 536.21us 100 5.3620us 3.9960us 34.504us cudaLaunchKernel\n", 586 | " 0.15% 376.96us 1 376.96us 376.96us 376.96us cuDeviceTotalMem\n", 587 | " 0.07% 184.83us 3 61.610us 10.349us 156.72us cudaMemcpy\n", 588 | " 0.06% 157.59us 3 52.528us 3.3410us 145.02us cudaFree\n", 589 | " 0.06% 154.20us 101 1.5260us 143ns 69.320us cuDeviceGetAttribute\n", 590 | " 0.01% 28.938us 1 28.938us 28.938us 28.938us cuDeviceGetName\n", 591 | " 0.00% 7.2200us 1 7.2200us 7.2200us 7.2200us cuDeviceGetPCIBusId\n", 592 | 
" 0.00% 6.6720us 1 6.6720us 6.6720us 6.6720us cudaDeviceSynchronize\n", 593 | " 0.00% 1.8660us 3 622ns 220ns 1.2690us cuDeviceGetCount\n", 594 | " 0.00% 1.6280us 2 814ns 338ns 1.2900us cuDeviceGet\n", 595 | " 0.00% 294ns 1 294ns 294ns 294ns cuDeviceGetUuid\n" 596 | ], 597 | "name": "stdout" 598 | } 599 | ] 600 | } 601 | ] 602 | } --------------------------------------------------------------------------------