├── ci
    ├── demos
    └── run
├── miniapp
    ├── Makefile
    ├── README.md
    ├── era5_preicp_nc_to_bin.py
    └── weather_app.cu
├── README.md
├── LICENSE
└── src
    ├── atomic_flag.cpp
    ├── file_before.cpp
    ├── ticket_lock.cpp
    └── file_after.cpp


/ci/demos:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | set -ex
 3 | 
 4 | (
 5 |   cd src
 6 |   bash ticket_lock.cpp    
 7 |   bash atomic_flag.cpp
 8 |   bash file_before.cpp
 9 |   bash file_after.cpp
10 | )  
11 | 


--------------------------------------------------------------------------------
/ci/run:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | set -ex
 3 | # Runs ./ci/demos inside the NVIDIA HPC SDK container, with the current directory mounted at /src.
 4 | IMG=nvcr.io/nvidia/nvhpc:23.5-devel-cuda12.1-ubuntu22.04
 5 | docker run \
 6 |   --gpus=all \
 7 |   -u "$(id -u):$(id -g)" \
 8 |   -v "$(pwd):/src" \
 9 |   -w /src \
10 |   "$IMG" \
11 |   bash -c "set -ex && ./ci/demos"
12 |   
13 | 


--------------------------------------------------------------------------------
/miniapp/Makefile:
--------------------------------------------------------------------------------
 1 | NVCC ?= nvcc
 2 | .PHONY: all clean
 3 | all: weather_app
 4 | # Generates sm_80 and sm_90 machine code plus compute_90 PTX (see the -gencode flags below).
 5 | weather_app: weather_app.cu
 6 | 	$(NVCC) $^ -o $@ -std=c++11 -gencode arch=compute_80,code=sm_80 \
 7 |                               -gencode arch=compute_90,code=sm_90 \
 8 |                               -gencode arch=compute_90,code=compute_90
 9 | 
10 | clean:
11 | 	$(RM) weather_app
12 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | CUDA 12.2 Heterogeneous Memory Management (HMM) demos
 2 | ===
 3 | 
 4 | This repository contains the HMM demos of the [Simplifying GPU Application Development with HMM (Heterogeneous Memory Management)] blogpost.
 5 | 
 6 | The HMM requirements are described in the [CUDA 12.2 Release Notes].
 7 | The demos require a system with ATS or HMM enabled; this can be verified by querying the Addressing Mode using `nvidia-smi`:
 8 | 
 9 | ```shell
10 | $ nvidia-smi -q | grep Addressing
11 | 
12 | Addressing Mode                       : HMM
13 | ```
14 | 
15 | The demos are available in the [`src/`](./src) directory. On systems with docker installed, they can be run as follows:
16 | 
17 | ```shell
18 | ./ci/run
19 | ```
20 | 
21 | # License
22 | 
23 | See [LICENSE](./LICENSE).
24 | 
25 | [Simplifying GPU Application Development with HMM (Heterogeneous Memory Management)]: link.
26 | [CUDA 12.2 Release Notes]: https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#general-cuda
27 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | SPDX-License-Identifier: MIT
 3 | 
 4 | Permission is hereby granted, free of charge, to any person obtaining a
 5 | copy of this software and associated documentation files (the "Software"),
 6 | to deal in the Software without restriction, including without limitation
 7 | the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 | and/or sell copies of the Software, and to permit persons to whom the
 9 | Software is furnished to do so, subject to the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be included in
12 | all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | DEALINGS IN THE SOFTWARE.
21 | 
22 | 


--------------------------------------------------------------------------------
/miniapp/README.md:
--------------------------------------------------------------------------------
 1 | # ERA5 Total Precipitation Data Aggregation
 2 | The sample code here demonstrates how to aggregate data and generate a few statistics for total precipitation from the ERA5 weather re-analysis dataset.
 3 | 
 4 | ## Data Download
 5 | The input data for this application can be downloaded from [this](https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels?tab=form) link. The ERA5 dataset consists of hourly estimates of several atmospheric variables at a latitude and longitude resolution of 0.25°. The 0.25° resolution results in 721x1440 distinct locations on earth. In the ERA5 dataset, total precipitation data for each month is stored in a separate file in NetCDF format. For our application, we pre-processed the files from NetCDF to a binary format consisting of the raw floating point values. You can use the provided python script [`era5_preicp_nc_to_bin.py`](./era5_preicp_nc_to_bin.py) to convert data from NetCDF to binary format. For our test we used 40 years of “Total precipitation” data from 1981-2020, which amounts to 480 input files with a total input data size of ~1.3 TB.
 6 | 
 7 | ## Build and Run Instructions
 8 | Use the provided `Makefile` to compile the application, which will produce a binary named `weather_app`. The binary takes 3 command-line arguments.
 9 | 
10 | ```
11 | ./weather_app StartYear EndYear /PATH_TO_BINARY_FILES/
12 | ```
13 | 
14 | The application uses HMM to mmap the input binary files and using CUDA computes the total precipitation for each hour for all the days in a year for the input year range. It outputs a csv file (`processed_log.csv`) with average monthly precipitation and average per-hour precipitation for each month of the year. The raw accumulated total precipitation for each hour of the year is also saved to file in binary format.
15 | 


--------------------------------------------------------------------------------
/src/atomic_flag.cpp:
--------------------------------------------------------------------------------
 1 | #if 0
 2 |   set -ex
 3 |   nvc++ -std=c++20 -stdpar=gpu -gpu=nomanaged -o atomic_flag $0
 4 |   ./atomic_flag
 5 |   exit 0
 6 | #endif
 7 | /*
 8 |  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 9 |  * SPDX-License-Identifier: MIT
10 |  *
11 |  * Permission is hereby granted, free of charge, to any person obtaining a
12 |  * copy of this software and associated documentation files (the "Software"),
13 |  * to deal in the Software without restriction, including without limitation
14 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 |  * and/or sell copies of the Software, and to permit persons to whom the
16 |  * Software is furnished to do so, subject to the following conditions:
17 |  *
18 |  * The above copyright notice and this permission notice shall be included in
19 |  * all copies or substantial portions of the Software.
20 |  *
21 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 |  * DEALINGS IN THE SOFTWARE.
28 |  */
29 | #include <algorithm>
30 | #include <atomic>
31 | #include <execution>
32 | #include <iostream>
33 | #include <thread>
34 | 
35 | int main() {
36 |   // Both variables live on main's stack and are captured by reference below.
37 |   int message = 0;
38 |   std::atomic<int> flag = 0;
39 | 
40 |   // Helper thread that runs the parallel algorithm (built with -stdpar=gpu)
41 |   auto worker = std::jthread([&] {
42 |     std::for_each_n(std::execution::par, &message, 1, [&](int& value) {
43 |       value = 42;     // write the payload first ...
44 |       flag.store(1);  // ... then raise the flag the main thread is waiting on
45 |     });
46 |   });
47 | 
48 |   // Spin until the flag has been raised
49 |   while (flag.load() == 0) {
50 |   }
51 |   // The flag was observed, so print the payload
52 |   std::cout << "CPU read message sent by GPU thread: " << message << std::endl;
53 | 
54 |   return 0;
55 | }
56 | 


--------------------------------------------------------------------------------
/miniapp/era5_preicp_nc_to_bin.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: MIT
 3 | #
 4 | # Permission is hereby granted, free of charge, to any person obtaining a
 5 | # copy of this software and associated documentation files (the "Software"),
 6 | # to deal in the Software without restriction, including without limitation
 7 | # the rights to use, copy, modify, merge, publish, distribute, sublicense,
 8 | # and/or sell copies of the Software, and to permit persons to whom the
 9 | # Software is furnished to do so, subject to the following conditions:
10 | #
11 | # The above copyright notice and this permission notice shall be included in
12 | # all copies or substantial portions of the Software.
13 | #
14 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 | # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 | # DEALINGS IN THE SOFTWARE.
21 | 
22 | import netCDF4 as nc
23 | import numpy as np
24 | from datetime import datetime, timedelta
25 | from dateutil.relativedelta import relativedelta
26 | from tqdm import tqdm
27 | 
28 | start_y = 1981
29 | end_y = 2020
30 | 
31 | for y in range(end_y - start_y + 1):
32 |   start_date = datetime(year=start_y+y, month=1, day=1)
33 |   end_date = datetime(year=start_y+y, month=12, day=1)
34 |   n_months = relativedelta(end_date, start_date).months + 1
35 | 
36 |   dates = [start_date + relativedelta(months=x) for x in range(n_months)]
37 | 
38 |   for date in tqdm(dates):
39 |       file_prefix = date.strftime('e5.accumulated_tp_1h.%Y%m')
40 |       input_filename = f"./raw_1hr_all/{file_prefix}.nc"
41 |       output_filename = f"./binary_1hr_all/{file_prefix}.bin"
42 |       print(f"Reading {input_filename} and outputting to {output_filename}")
43 |     
44 |       ds = nc.Dataset(input_filename, "r", format="NETCDF4")
45 |       precip = np.asarray(ds["tp"])
46 |       precip.tofile(output_filename)
47 | 
48 | 


--------------------------------------------------------------------------------
/src/file_before.cpp:
--------------------------------------------------------------------------------
 1 | #if 0
 2 |   set -ex
 3 |   nvc++ -std=c++20 -stdpar=gpu -o file_before $0
 4 |   ./file_before 10
 5 |   exit 0
 6 | #endif
 7 | /*
 8 |  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 9 |  * SPDX-License-Identifier: MIT
10 |  *
11 |  * Permission is hereby granted, free of charge, to any person obtaining a
12 |  * copy of this software and associated documentation files (the "Software"),
13 |  * to deal in the Software without restriction, including without limitation
14 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 |  * and/or sell copies of the Software, and to permit persons to whom the
16 |  * Software is furnished to do so, subject to the following conditions:
17 |  *
18 |  * The above copyright notice and this permission notice shall be included in
19 |  * all copies or substantial portions of the Software.
20 |  *
21 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 |  * DEALINGS IN THE SOFTWARE.
28 |  */
29 | #include <algorithm>
30 | #include <execution>
31 | #include <fstream>
32 | #include <functional>
33 | #include <iostream>
34 | #include <ranges>
35 | #include <span>
36 | #include <stdio.h>
37 | #include <vector>
38 | 
39 | void use_data(std::span<char>);
40 | // Reads up to N bytes from fp, sorts them in descending order with a parallel sort, and prints them.
41 | void sortfile(FILE* fp, int N) {
42 |   std::vector<char> buffer(N);
43 |   // fread may deliver fewer than N bytes (short file or read error); keep only what was actually read.
44 |   buffer.resize(fread(buffer.data(), 1, N, fp));
45 |   std::sort(std::execution::par, buffer.begin(), buffer.end(), std::greater{});
46 |   use_data(std::span{buffer});
47 | }
48 | 
49 | void use_data(std::span<char> data) {  // prints every byte on its own line
50 |   for (std::size_t i = 0; i < data.size(); ++i) std::cout << data[i] << std::endl;
51 | }
52 | 
53 | int main(int argc, char* argv[]) {  // usage: file_before <file bytes>
54 |   if (argc != 2) {
55 |     std::cerr << "ERROR: missing length argument" << std::endl;
56 |     std::cerr << "Usage: " << argv[0] << " <file bytes> " << std::endl;
57 |     abort();
58 |   }
59 |   std::size_t N = std::stoll(argv[1]);
60 | 
61 |   // Generate a file with N bytes: '0', '1', ... (the byte value wraps around modulo 256)
62 |   char const* fname = "file_before.txt";
63 |   {
64 |     std::cout << "Generating file with " << N << " bytes..." << std::endl;
65 |     std::ofstream out(fname);
66 |     if (!out) {
67 |       std::cerr << "File open failed!" << std::endl;
68 |       abort();
69 |     }
70 |     std::vector<unsigned char> buffer(N);
71 |     std::for_each_n(std::execution::par, std::views::iota(0).begin(), N,
72 |                     [&](int i) { buffer[i] = (unsigned char)'0' + (unsigned char)i; });
73 |     out.write((const char*)buffer.data(), N);
74 |   }  // leaving the scope flushes and closes the file before it is read back
75 | 
76 |   FILE* fp = fopen(fname, "r");
77 |   if (fp == nullptr) {
78 |     std::cerr << "Failed to read file" << std::endl;
79 |     abort();
80 |   }
81 |   sortfile(fp, N);
82 |   fclose(fp);  // release the stream once its contents have been consumed
83 |   return 0;
84 | }
85 | 


--------------------------------------------------------------------------------
/src/ticket_lock.cpp:
--------------------------------------------------------------------------------
 1 | #if 0
 2 |   set -ex
 3 |   nvc++ -std=c++20 -stdpar=gpu -gpu=nomanaged -o ticket_lock $0
 4 |   ./ticket_lock
 5 |   exit 0
 6 | #endif
 7 | /*
 8 |  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 9 |  * SPDX-License-Identifier: MIT
10 |  *
11 |  * Permission is hereby granted, free of charge, to any person obtaining a
12 |  * copy of this software and associated documentation files (the "Software"),
13 |  * to deal in the Software without restriction, including without limitation
14 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 |  * and/or sell copies of the Software, and to permit persons to whom the
16 |  * Software is furnished to do so, subject to the following conditions:
17 |  *
18 |  * The above copyright notice and this permission notice shall be included in
19 |  * all copies or substantial portions of the Software.
20 |  *
21 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 |  * DEALINGS IN THE SOFTWARE.
28 |  */
29 | #include <algorithm>
30 | #include <atomic>
31 | #include <execution>
32 | #include <iostream>
33 | #include <ranges>
34 | #include <thread>
35 | 
36 | struct ticket_lock {  // spin lock that serves callers in the order they drew their tickets
37 |   std::atomic<uint32_t> ticket = 0;  // next ticket number to hand out
38 |   std::atomic<uint32_t> lock = 0;    // ticket number currently allowed to proceed
39 |   // Scope guard returned by guard(): its destructor hands the lock to the next ticket.
40 |   struct guard_t {
41 |     ticket_lock* p;
42 |     ~guard_t() { p->lock.fetch_add(1, std::memory_order_release); }
43 |   };
44 |   // Draws a ticket, busy-waits until `lock` reaches it, then returns the guard that releases it.
45 |   guard_t guard() {
46 |     uint32_t t = ticket.fetch_add(1, std::memory_order_relaxed);
47 |     while (lock.load(std::memory_order_acquire) != t)
48 |       ;
49 |     return {this};
50 |   }
51 | };
52 | 
53 | int main() {
54 |   ticket_lock lock;
55 |   int message = 0;
56 |   // message is a plain int: every increment below happens while holding the ticket lock
57 |   int cpu_threads = 64;
58 |   int gpu_threads = 4096;
59 | 
60 |   {
61 |     // Start a new thread that launches a GPU kernel
62 |     auto t = std::jthread([&] {
63 |       std::for_each_n(std::execution::par, std::views::iota(0).begin(), gpu_threads, [&](int) {
64 |         auto g = lock.guard();
65 |         message += 1;
66 |       });
67 |     });
68 | 
69 |     // Start cpu threads
70 |     std::vector<std::jthread> threads;
71 |     threads.reserve(cpu_threads);
72 |     for (int i = 0; i < cpu_threads; ++i) {
73 |       threads.emplace_back([&] {
74 |         auto g = lock.guard();
75 |         message += 1;
76 |       });
77 |     }
78 |   } // All threads complete here (the jthread destructors join)
79 | 
80 |   // Every thread added exactly one, so the total must equal the number of threads
81 |   int should = cpu_threads + gpu_threads;
82 |   if (message != should) {
83 |     std::cerr << "FAILED: message = " << message << " != " << should << " threads" << std::endl;
84 |     return 1;
85 |   }
86 |   std::cerr << "SUCCESS: message = " << message << " == " << should << " threads" << std::endl;
87 | 
88 |   return 0;
89 | }
90 | 


--------------------------------------------------------------------------------
/src/file_after.cpp:
--------------------------------------------------------------------------------
 1 | #if 0
 2 |   set -ex
 3 |   nvc++ -std=c++20 -stdpar=gpu -gpu=nomanaged -o file_after $0
 4 |   ./file_after 10
 5 |   exit 0
 6 | #endif
 7 | /*
 8 |  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 9 |  * SPDX-License-Identifier: MIT
10 |  *
11 |  * Permission is hereby granted, free of charge, to any person obtaining a
12 |  * copy of this software and associated documentation files (the "Software"),
13 |  * to deal in the Software without restriction, including without limitation
14 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15 |  * and/or sell copies of the Software, and to permit persons to whom the
16 |  * Software is furnished to do so, subject to the following conditions:
17 |  *
18 |  * The above copyright notice and this permission notice shall be included in
19 |  * all copies or substantial portions of the Software.
20 |  *
21 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27 |  * DEALINGS IN THE SOFTWARE.
28 |  */
29 | #include <algorithm>
30 | #include <execution>
31 | #include <fcntl.h>
32 | #include <fstream>
33 | #include <functional>
34 | #include <iostream>
35 | #include <ranges>
36 | #include <span>
37 | #include <stdio.h>
38 | #include <sys/mman.h>
39 | #include <vector>
40 | 
41 | void use_data(std::span<char>);
42 | // Maps N bytes of fd as a shared read/write mapping, sorts them in place (descending), and prints them.
43 | void sortfile(int fd, int N) {
44 |   char* buffer = (char*)mmap(NULL, N, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
45 |   if (buffer == MAP_FAILED) {  // mmap signals failure with MAP_FAILED, never with a null pointer
46 |     std::cerr << "Failed to mmap file!" << std::endl;
47 |     abort();
48 |   }
49 |   std::sort(std::execution::par, buffer, buffer + N, std::greater{});
50 |   use_data(std::span{buffer, N});
51 |   munmap(buffer, N);
52 | }
53 | 
54 | void use_data(std::span<char> data) {  // prints every byte on its own line
55 |   for (std::size_t i = 0; i < data.size(); ++i) std::cout << data[i] << std::endl;
56 | }
57 | 
58 | int main(int argc, char* argv[]) {
59 |   if (argc != 2) {
60 |     std::cerr << "ERROR: missing length argument" << std::endl;
61 |     std::cerr << "Usage: " << argv[0] << " <file bytes> " << std::endl;
62 |     abort();
63 |   }
64 |   std::size_t N = std::stoll(argv[1]);
65 |   char const* fname = "file_after.txt";
66 |   // Write N bytes ('0', '1', ...) to the file; the fill runs as a parallel algorithm
67 |   std::cout << "Generating file with " << N << " bytes..." << std::endl;
68 |   {
69 |     std::ofstream out(fname);
70 |     if (!out) {
71 |       std::cerr << "File open failed!" << std::endl;
72 |       abort();
73 |     }
74 |     std::vector<unsigned char> bytes(N);
75 |     auto fill = [&](int idx) { bytes[idx] = (unsigned char)'0' + (unsigned char)idx; };
76 |     std::for_each_n(std::execution::par, std::views::iota(0).begin(), N, fill);
77 |     out.write((const char*)bytes.data(), N);
78 |     out.close();
79 |   }
80 | 
81 |   // Reopen read/write so the mapping created in sortfile can modify the file
82 |   int fd = open(fname, O_RDWR);
83 |   if (fd == -1) {
84 |     std::cerr << "Failed to read file" << std::endl;
85 |     abort();
86 |   }
87 |   sortfile(fd, N);
88 |   return 0;
89 | }
90 | 


--------------------------------------------------------------------------------
/miniapp/weather_app.cu:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  3 |  * SPDX-License-Identifier: MIT
  4 |  *
  5 |  * Permission is hereby granted, free of charge, to any person obtaining a
  6 |  * copy of this software and associated documentation files (the "Software"),
  7 |  * to deal in the Software without restriction, including without limitation
  8 |  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  9 |  * and/or sell copies of the Software, and to permit persons to whom the
 10 |  * Software is furnished to do so, subject to the following conditions:
 11 |  *
 12 |  * The above copyright notice and this permission notice shall be included in
 13 |  * all copies or substantial portions of the Software.
 14 |  *
 15 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 18 |  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 20 |  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 21 |  * DEALINGS IN THE SOFTWARE.
 22 |  */
 23 | 
 24 | #include <cassert>
 25 | #include <cerrno>
 26 | #include <cstdint>
 27 | #include <cub/cub.cuh>
 28 | #include <cuda_runtime_api.h>
 29 | #include <fcntl.h>
 30 | #include <fstream>
 31 | #include <iostream>
 32 | #include <numeric>
 33 | #include <string>
 34 | #include <sys/mman.h>
 35 | #include <unistd.h>
 36 | #include <vector>
 37 | // Makes the enclosing function return -1 if a CUDA call fails; the argument is evaluated exactly once.
 38 | #define CUDA_CHECK(err)                                                                                      \
 39 |   if (cudaError_t cuda_check_status_ = (err)) { /* cudaSuccess is 0, so only errors enter this branch */     \
 40 |     std::cout << "CUDA error at " << __LINE__ << " " << cudaGetErrorString(cuda_check_status_) << std::endl; \
 41 |     return -1;                                                                                               \
 42 |   }
 43 | 
 44 | __constant__ size_t month_day_boundary[13] = {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366};
 45 | 
 46 | __inline__ __device__ int get_month_from_day_of_year(int day_of_year) {
 47 |   // Binary search over month_day_boundary for the month whose day range contains day_of_year.
 48 |   int lo = 0;
 49 |   int hi = 12; // index of the last boundary entry
 50 |   int mid = hi / 2;
 51 | 
 52 |   while (lo <= hi) {
 53 |     // the two cases are complementary: the boundary is either above day_of_year or not
 54 |     if (day_of_year < month_day_boundary[mid])
 55 |       hi = mid - 1;
 56 |     else
 57 |       lo = mid + 1;
 58 |     mid = (hi + lo) / 2;
 59 |   }
 60 |   // for a non-negative day_of_year, mid is now the last boundary index that is <= day_of_year
 61 |   return mid; // this is 0 based
 62 | }
 63 | 
 64 | __global__ void construct_yearly_histogram(float *input_data, int start_year, int end_year,
 65 |                                            size_t input_grid_height, size_t input_grid_width,
 66 |                                            size_t aligned_month_file_map_offset, float *histogram_data) {
 67 |   // end year is included
 68 |   const size_t hours_per_day = 24; // assumed 24 hr data
 69 |   const size_t days_per_leap_year = 366;
 70 |   const size_t months_per_year = 12;
 71 |   // Each thread sums one (day of year, hour, grid cell) over all years and writes one histogram_data entry.
 72 |   // element counts of one hourly grid and of one full day of hourly grids
 73 |   size_t grid_pitch = input_grid_height * input_grid_width;
 74 |   size_t day_grid_pitch = hours_per_day * grid_pitch;
 75 | 
 76 |   // output mapping
 77 |   // total 366 * 24 * 721 * 1440 active threads
 78 |   size_t linear_day_hr_loc_idx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
 79 | 
 80 |   size_t max_active_threads = (days_per_leap_year * (int64_t)day_grid_pitch);
 81 |   // decompose the linear thread index into day of year, hour of day and position inside the grid
 82 |   size_t day_of_year = linear_day_hr_loc_idx / day_grid_pitch; // this is 0-based
 83 |   size_t hour_of_day = (linear_day_hr_loc_idx - (day_of_year * day_grid_pitch)) / grid_pitch;
 84 |   size_t grid_linearized_idx =
 85 |       linear_day_hr_loc_idx - (day_of_year * day_grid_pitch) - (hour_of_day * grid_pitch);
 86 | 
 87 |   size_t grid_y = grid_linearized_idx / input_grid_width;
 88 |   size_t grid_x = grid_linearized_idx % input_grid_width;
 89 | 
 90 |   // month is required as each file is mapped at a separate offset - for page boundary alignment
 91 |   size_t month = (size_t)get_month_from_day_of_year((int)day_of_year);
 92 |   // threads past the last histogram entry do nothing
 93 |   if (linear_day_hr_loc_idx < max_active_threads) {
 94 |     float accum_sum = 0.0f;
 95 | 
 96 |     for (int i = 0; i <= (end_year - start_year); i++) {
 97 |       int year = i + start_year;
 98 |       // aligned_month_file_map_offset: distance, in floats, between the starts of consecutive monthly files
 99 |       size_t access_index = (((size_t)i * months_per_year + month) * aligned_month_file_map_offset) +
100 |                             ((day_of_year - month_day_boundary[month]) * day_grid_pitch) +
101 |                             (hour_of_day * grid_pitch) + grid_y * input_grid_width + grid_x;
102 |       // leap year adjustment for feb
103 |       if (day_of_year == 59) {
104 |         if ((year % 4) == 0) { // NOTE(review): ignores the century rule (e.g. 1900, 2100)
105 |           // 0-based day 59 is Feb 29: it is only read for leap years
106 |           accum_sum += input_data[access_index];
107 |         }
108 |       } else {
109 |         accum_sum += input_data[access_index];
110 |       }
111 |     }
112 |     // write out
113 |     histogram_data[linear_day_hr_loc_idx] = accum_sum;
114 |   }
115 | }
116 | 
117 | // Usage: ./weather_app StartYear EndYear /PATH_TO_BINARY_FILES/
118 | // Maps one binary file per month of [StartYear, EndYear] into a single virtual address range, sums
119 | // every (day, hour, grid cell) over all years on the GPU, then writes processed_log.csv and
120 | // yearly_aggregates.bin. Returns 0 on success and a negative value on failure.
121 | int main(int argc, char *argv[]) {
122 |   if (argc < 4) {
123 |     std::cout << "Usage: " << argv[0] << " StartYear EndYear /PATH_TO_BINARY_FILES/" << std::endl;
124 |     return -1;
125 |   }
126 | 
127 |   // hard coded constants for ERA5
128 |   const int hours_per_day = 24; // assumed 24 hr data
129 |   const int days_per_leap_year = 366;
130 |   const int max_days_per_month = 31;
131 |   const int months_per_year = 12;
132 |   const int input_grid_height = 721;
133 |   const int input_grid_width = 1440;
134 |   int start_year = std::atoi(argv[1]);
135 |   int end_year = std::atoi(argv[2]);
136 |   std::string file_path = std::string(argv[3]);
137 | 
138 |   const int num_years = end_year - start_year + 1;
139 |   if (num_years <= 0) {
140 |     std::cout << "Error: EndYear must not be smaller than StartYear" << std::endl;
141 |     return -1;
142 |   }
143 | 
144 |   size_t max_file_size =
145 |       sizeof(float) * max_days_per_month * hours_per_day * input_grid_height * input_grid_width;
146 | 
147 |   // every file gets its own slot whose size is rounded up to a multiple of 2 MB
148 |   size_t TWO_MB = 2 * 1024 * 1024;
149 |   size_t max_aligned_file_pages = (max_file_size + TWO_MB - 1) / TWO_MB;
150 |   size_t max_aligned_file_size = max_aligned_file_pages * TWO_MB;
151 | 
152 |   std::cout << "aligned size: " << max_aligned_file_size << std::endl;
153 | 
154 |   std::vector<size_t> file_sizes;
155 |   std::vector<int> open_fds;
156 | 
157 |   // Reserve one slot per input file. max_aligned_file_size is already a byte count, so it must not
158 |   // be scaled by sizeof(float) a second time.
159 |   size_t va_alloc_size = (size_t)num_years * months_per_year * max_aligned_file_size;
160 | 
161 |   void *va_alloc = mmap(nullptr, va_alloc_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
162 |   if (va_alloc == MAP_FAILED) {
163 |     std::cout << "Error reserving VA range: " << strerror(errno) << std::endl;
164 |     return -2;
165 |   }
166 | 
167 |   void *running_address = va_alloc;
168 | 
169 |   std::string file_path_base = file_path;
170 | 
171 |   for (int y = start_year; y <= end_year; y++) {
172 |     for (int k = 1; k <= months_per_year; k++) {
173 |       // "YYYYMM.bin" needs 11 bytes including the terminator; use a roomy buffer and a bounded write
174 |       char filestr_buf[32];
175 |       snprintf(filestr_buf, sizeof(filestr_buf), "%d%02d.bin", y, k);
176 |       std::string filename = file_path_base + "e5.accumulated_tp_1h." + std::string(filestr_buf);
177 |       std::cout << "mapping: " << filename << std::endl;
178 | 
179 |       int fd = open(filename.c_str(), O_RDONLY, 0);
180 |       if (fd == -1) {
181 |         std::cout << "Error opening input file: " << filename << std::endl;
182 |         return -1;
183 |       }
184 | 
185 |       // File size in bytes. A file larger than its slot would overwrite the next file's mapping.
186 |       off_t file_end = lseek(fd, 0, SEEK_END);
187 |       if (file_end <= 0 || (size_t)file_end > max_aligned_file_size) {
188 |         close(fd);
189 |         std::cout << "Error: unexpected size for input file: " << filename << std::endl;
190 |         return -1;
191 |       }
192 |       size_t fileByteSize = (size_t)file_end;
193 | 
194 |       char *mapped_addr = NULL;
195 |       // probably need 2 MB pages for perf
196 |       mapped_addr =
197 |           (char *)mmap((void *)running_address, fileByteSize, PROT_READ, MAP_PRIVATE | MAP_FIXED, fd, 0);
198 | 
199 |       if (mapped_addr == MAP_FAILED) {
200 |         close(fd);
201 |         std::cout << "Error mapping input file: " << filename << std::endl;
202 |         return -2;
203 |       }
204 | 
205 |       assert(mapped_addr == (char *)running_address);
206 |       running_address = (void *)((char *)running_address + max_aligned_file_size);
207 | 
208 |       file_sizes.push_back(fileByteSize);
209 |       open_fds.push_back(fd);
210 |     }
211 |   }
212 | 
213 |   // launch kernel and feed in pointer and values
214 |   size_t hist_bins = (size_t)days_per_leap_year * hours_per_day * input_grid_height * input_grid_width;
215 |   size_t histogram_alloc_size = hist_bins * sizeof(float);
216 |   float *histogram_data = NULL;
217 |   CUDA_CHECK(cudaMalloc((void **)&histogram_data, histogram_alloc_size));
218 |   CUDA_CHECK(cudaMemset(histogram_data, 0, histogram_alloc_size));
219 | 
220 |   cudaEvent_t start_event, stop_event;
221 |   CUDA_CHECK(cudaEventCreate(&start_event));
222 |   CUDA_CHECK(cudaEventCreate(&stop_event));
223 | 
224 |   // one thread per histogram entry, 1024 threads per block
225 |   dim3 block(1024, 1, 1);
226 |   dim3 grid(1, 1, 1);
227 | 
228 |   grid.x = (hist_bins + block.x - 1) / block.x;
229 | 
230 |   CUDA_CHECK(cudaEventRecord(start_event));
231 |   construct_yearly_histogram<<<grid, block, 0, NULL>>>(
232 |       reinterpret_cast<float *>(va_alloc), start_year, end_year, (size_t)input_grid_height,
233 |       (size_t)input_grid_width, max_aligned_file_size / sizeof(float), histogram_data);
234 | 
235 |   CUDA_CHECK(cudaGetLastError()); // for catching errors from launch
236 |   CUDA_CHECK(cudaEventRecord(stop_event));
237 |   CUDA_CHECK(cudaEventSynchronize(stop_event));
238 | 
239 |   float time_ms = 0.0f;
240 |   CUDA_CHECK(cudaEventElapsedTime(&time_ms, start_event, stop_event));
241 |   std::cout << "kernel time: " << time_ms << " ms" << std::endl;
242 | 
243 |   CUDA_CHECK(cudaDeviceSynchronize()); // to start reading output histogram on host
244 | 
245 |   size_t month_day_boundary[13] = {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366};
246 |   FILE *fp_log = fopen("processed_log.csv", "w");
247 |   if (fp_log == NULL) {
248 |     std::cout << "Error opening processed_log.csv: " << strerror(errno) << std::endl;
249 |     return -1;
250 |   }
251 |   fprintf(fp_log, "Month,Total Precipitation (m)\n");
252 |   std::vector<std::vector<float>> hourly_sum_per_day;
253 | 
254 |   for (int m = 0; m < months_per_year; m++) {
255 |     size_t start_index = month_day_boundary[m] * hours_per_day * input_grid_height * input_grid_width;
256 |     float local_sum = 0.0f;
257 |     for (int d = 0; d < (month_day_boundary[m + 1] - month_day_boundary[m]); d++) {
258 |       std::vector<float> hour_sum(24);
259 |       for (int h = 0; h < hours_per_day; h++) {
260 |         // host stack array used directly as the reduction output
261 |         float month_sum[16] = {0.0f};
262 |         void *d_temp_storage = NULL;
263 |         size_t temp_storage_bytes = 0;
264 |         // NOTE(review): integer division makes this 360 * 1440 items, i.e. only the first 360 of the
265 |         // 721 grid rows are reduced - confirm whether summing half of the grid is intended.
266 |         size_t num_items = (input_grid_height / 2) * input_grid_width;
267 |         size_t strided_hourly_idx = start_index + (d * hours_per_day * input_grid_height * input_grid_width) +
268 |                                     (h * input_grid_height * input_grid_width);
269 |         float *array_start = &(histogram_data[strided_hourly_idx]);
270 |         // first call only queries the required temporary storage size
271 |         cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, array_start, month_sum, num_items);
272 | 
273 |         d_temp_storage = malloc(temp_storage_bytes); // use HMM B-)
274 |         cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, array_start, month_sum, num_items);
275 |         cudaDeviceSynchronize();
276 | 
277 |         free(d_temp_storage);
278 |         hour_sum[h] = month_sum[0];
279 |       }
280 |       hourly_sum_per_day.push_back(hour_sum);
281 |       local_sum += std::accumulate(hour_sum.begin(), hour_sum.end(), 0.0f);
282 |     }
283 |     std::cout << "Month: " << (m + 1) << " Total Precip: " << local_sum << " m" << std::endl;
284 |     fprintf(fp_log, "%d,%f\n", m + 1, local_sum);
285 |   }
286 | 
287 |   // get per hour total rainfall for each month
288 |   fprintf(fp_log, "Hourly average per-month\n");
289 |   fprintf(fp_log, "Month,Hour,Total Precipitation (m)\n");
290 |   for (int m = 0; m < months_per_year; m++) {
291 |     for (int h = 0; h < hours_per_day; h++) {
292 |       float hour_sum = 0.0f;
293 |       for (int d = 0; d < (month_day_boundary[m + 1] - month_day_boundary[m]); d++) {
294 |         hour_sum += hourly_sum_per_day[month_day_boundary[m] + d][h];
295 |       }
296 |       std::cout << "m: " << m + 1 << " h: " << h + 1 << " hour_sum: " << hour_sum << std::endl;
297 |       fprintf(fp_log, "%d,%d,%f\n", m + 1, h + 1, hour_sum);
298 |     }
299 |     std::cout << std::endl;
300 |   }
301 |   fclose(fp_log);
302 | 
303 |   FILE *fp_out = fopen("yearly_aggregates.bin", "wb");
304 |   if (fp_out == NULL) {
305 |     std::cout << "Error opening yearly_aggregates.bin: " << strerror(errno) << std::endl;
306 |     return -1;
307 |   }
308 |   // histogram_data comes from cudaMalloc, so the host must not read it directly: copy it to a
309 |   // host staging buffer chunk by chunk and write each chunk out.
310 |   const size_t staging_elems = (size_t)64 * 1024 * 1024;
311 |   std::vector<float> staging(staging_elems);
312 |   for (size_t offset = 0; offset < hist_bins; offset += staging_elems) {
313 |     size_t remaining = hist_bins - offset;
314 |     size_t count = (remaining < staging_elems) ? remaining : staging_elems;
315 |     CUDA_CHECK(
316 |         cudaMemcpy(staging.data(), histogram_data + offset, count * sizeof(float), cudaMemcpyDeviceToHost));
317 |     if (fwrite(staging.data(), sizeof(float), count, fp_out) != count) {
318 |       std::cout << "Error writing yearly_aggregates.bin: " << strerror(errno) << std::endl;
319 |       fclose(fp_out);
320 |       return -1;
321 |     }
322 |   }
323 |   fflush(0);
324 |   fclose(fp_out);
325 | 
326 |   CUDA_CHECK(cudaEventDestroy(start_event));
327 |   CUDA_CHECK(cudaEventDestroy(stop_event));
328 | 
329 |   CUDA_CHECK(cudaFree(histogram_data));
330 | 
331 |   // unmap and close every mapped file (one per month and year), not just the first few
332 |   void *unmap_address = va_alloc;
333 |   for (size_t k = 0; k < open_fds.size(); k++) {
334 |     int unmap_return = munmap(unmap_address, file_sizes[k]);
335 |     if (unmap_return != 0) {
336 |       std::cout << "Error unmapping VA alloc range: " << strerror(errno) << std::endl;
337 |     }
338 |     close(open_fds[k]);
339 |     unmap_address = (void *)((char *)unmap_address + max_aligned_file_size);
340 |   }
341 | 
342 |   int unmap_return = munmap(va_alloc, va_alloc_size); // release the whole reserved range
343 |   if (unmap_return != 0) { std::cout << "Error unmapping VA alloc range: " << strerror(errno) << std::endl; }
344 | 
345 |   return 0;
346 | }
296 | 


--------------------------------------------------------------------------------