├── LICENSE
├── README.md
├── base.cpp
├── cover.jpeg
├── data
    ├── create_measurements.py
    └── weather_stations.csv
├── fast.cu
└── run.sh


/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # cuda-1brc
 2 | 
 3 | <img src="cover.jpeg" width="300">
 4 | 
 5 | Can CUDA handle one billion rows of text? Yes.
 6 | 
 7 | `fast.cu` takes **16.8 seconds** to process one billion rows on a V100.
 8 | This is a **60X** speedup over a pure C++ baseline in `base.cpp`.
 9 | 
10 | Check out the [blog](https://tspeterkim.github.io/posts/cuda-1brc) for a detailed explanation.


--------------------------------------------------------------------------------
/base.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <iomanip>
 3 | #include <fstream>
 4 | #include <sstream>
 5 | #include <string>
 6 | #include <map>
 7 | #include <cmath>
 8 | 
 9 | using namespace std;
10 | 
// Running temperature statistics for a single weather station.
// The field order is significant: main() aggregate-initializes a Stat as
// {min, max, sum, count}.
struct Stat {
    float min;   // lowest temperature seen so far
    float max;   // highest temperature seen so far
    float sum;   // sum of all temperatures, divided by count to get the mean
    int count;   // number of measurements folded into the fields above
};
15 | 
16 | int main(int argc, char* argv[]) {
17 |     if (argc < 2) {
18 |         cerr << "Usage: " << argv[0] << " <file path>" << endl;
19 |         return 1;
20 |     }
21 | 
22 |     ifstream file(argv[1]);
23 |     string line;
24 |     map<string, Stat> stationStats;
25 | 
26 |     while (getline(file, line)) {
27 |         istringstream iss(line);
28 |         string station;
29 |         float temp;
30 |         getline(iss, station, ';');
31 |         iss >> temp;
32 | 
33 |         auto it = stationStats.find(station);
34 |         if (it == stationStats.end()) {
35 |             stationStats[station] = {temp, temp, temp, 1};
36 |         } else {
37 |             Stat& s = it->second;
38 |             s.min = min(s.min, temp);
39 |             s.max = max(s.max, temp);
40 |             s.sum += temp;
41 |             s.count++;
42 |         }
43 |     }
44 | 
45 |     ofstream measurements("measurements.out");
46 |     for (auto& pair : stationStats) {
47 |         const Stat& s = pair.second;
48 |         float mean = s.sum / s.count;
49 |         measurements << pair.first << "=" << s.min << "/";
50 |         measurements << fixed << setprecision(1) << mean << "/";
51 |         measurements << s.max << endl;
52 |     }
53 |     return 0;
54 | }
55 | 


--------------------------------------------------------------------------------
/cover.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tspeterkim/cuda-1brc/9678472f1ac081e0e130513a062243996ef7c1b4/cover.jpeg


--------------------------------------------------------------------------------
/data/create_measurements.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #
  3 | #  Copyright 2023 The original authors
  4 | #
  5 | #  Licensed under the Apache License, Version 2.0 (the "License");
  6 | #  you may not use this file except in compliance with the License.
  7 | #  You may obtain a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | #  Unless required by applicable law or agreed to in writing, software
 12 | #  distributed under the License is distributed on an "AS IS" BASIS,
 13 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | #  See the License for the specific language governing permissions and
 15 | #  limitations under the License.
 16 | #
 17 | 
 18 | # Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java
 19 | 
 20 | import os
 21 | import sys
 22 | import random
 23 | import time
 24 | 
 25 | 
 26 | def check_args(file_args):
 27 |     """
 28 |     Sanity checks out input and prints out usage if input is not a positive integer
 29 |     """
 30 |     try:
 31 |         if len(file_args) != 2 or int(file_args[1]) <= 0:
 32 |             raise Exception()
 33 |     except:
 34 |         print("Usage:  create_measurements.sh <positive integer number of records to create>")
 35 |         print("        You can use underscore notation for large number of records.")
 36 |         print("        For example:  1_000_000_000 for one billion")
 37 |         exit()
 38 | 
 39 | 
 40 | def build_weather_station_name_list():
 41 |     """
 42 |     Grabs the weather station names from example data provided in repo and dedups
 43 |     """
 44 |     station_names = []
 45 |     with open('data/weather_stations.csv', 'r') as file:
 46 |         file_contents = file.read()
 47 |     for station in file_contents.splitlines():
 48 |         if "#" in station:
 49 |             next
 50 |         else:
 51 |             station_names.append(station.split(';')[0])
 52 |     return list(set(station_names))
 53 | 
 54 | 
 55 | def convert_bytes(num):
 56 |     """
 57 |     Convert bytes to a human-readable format (e.g., KiB, MiB, GiB)
 58 |     """
 59 |     for x in ['bytes', 'KiB', 'MiB', 'GiB']:
 60 |         if num < 1024.0:
 61 |             return "%3.1f %s" % (num, x)
 62 |         num /= 1024.0
 63 | 
 64 | 
 65 | def format_elapsed_time(seconds):
 66 |     """
 67 |     Format elapsed time in a human-readable format
 68 |     """
 69 |     if seconds < 60:
 70 |         return f"{seconds:.3f} seconds"
 71 |     elif seconds < 3600:
 72 |         minutes, seconds = divmod(seconds, 60)
 73 |         return f"{int(minutes)} minutes {int(seconds)} seconds"
 74 |     else:
 75 |         hours, remainder = divmod(seconds, 3600)
 76 |         minutes, seconds = divmod(remainder, 60)
 77 |         if minutes == 0:
 78 |             return f"{int(hours)} hours {int(seconds)} seconds"
 79 |         else:
 80 |             return f"{int(hours)} hours {int(minutes)} minutes {int(seconds)} seconds"
 81 | 
 82 | 
 83 | def estimate_file_size(weather_station_names, num_rows_to_create):
 84 |     """
 85 |     Tries to estimate how large a file the test data will be
 86 |     """
 87 |     total_name_bytes = sum(len(s.encode("utf-8")) for s in weather_station_names)
 88 |     avg_name_bytes = total_name_bytes / float(len(weather_station_names))
 89 | 
 90 |     # avg_temp_bytes = sum(len(str(n / 10.0)) for n in range(-999, 1000)) / 1999
 91 |     avg_temp_bytes = 4.400200100050025
 92 | 
 93 |     # add 2 for separator and newline
 94 |     avg_line_length = avg_name_bytes + avg_temp_bytes + 2
 95 | 
 96 |     human_file_size = convert_bytes(num_rows_to_create * avg_line_length)
 97 | 
 98 |     return f"Estimated max file size is:  {human_file_size}."
 99 | 
100 | 
def build_test_data(weather_station_names, num_rows_to_create):
    """
    Generates and writes to file the requested length of test data

    Writes exactly num_rows_to_create lines of the form "<station>;<temp>" to
    data/measurements.txt. Stations are drawn at random from a pool of up to
    10,000 names sampled from weather_station_names; temperatures are uniform
    in [-99.9, 99.9] and formatted with one decimal place.

    Rows are produced in batches of 10,000 to avoid one write per line; a
    final, shorter batch covers any remainder so that no requested row is
    dropped. On any error the message is printed and the process exits with
    status 1.
    """
    start_time = time.time()
    coldest_temp = -99.9
    hottest_temp = 99.9
    station_names_10k_max = random.choices(weather_station_names, k=10_000)
    batch_size = 10000  # instead of writing line by line to file, process a batch of stations and put it to disk
    full_batches, leftover_rows = divmod(num_rows_to_create, batch_size)
    total_batches = full_batches + (1 if leftover_rows else 0)
    print('Building test data...')

    try:
        # UTF-8 and a plain '\n' line ending are forced so the output does not
        # depend on the platform defaults; station names contain non-ASCII
        # characters.
        with open("data/measurements.txt", 'w', encoding="utf-8", newline="\n") as file:
            progress = 0
            for chunk in range(total_batches):
                # Only the last batch can be short, and only when the row
                # count is not a multiple of batch_size.
                rows_in_batch = batch_size if chunk < full_batches else leftover_rows

                batch = random.choices(station_names_10k_max, k=rows_in_batch)
                # :.1f formatting is used rather than round() for speed.
                lines = [f"{station};{random.uniform(coldest_temp, hottest_temp):.1f}" for station in batch]
                file.write('\n'.join(lines) + '\n')

                # Update progress bar every 1%
                percent = (chunk + 1) * 100 // total_batches
                if percent != progress:
                    progress = percent
                    bars = '=' * (progress // 2)
                    sys.stdout.write(f"\r[{bars:<50}] {progress}%")
                    sys.stdout.flush()
        sys.stdout.write('\n')
    except Exception as e:
        print("Something went wrong. Printing error info and exiting...")
        print(e)
        sys.exit(1)

    end_time = time.time()
    elapsed_time = end_time - start_time
    file_size = os.path.getsize("data/measurements.txt")
    human_file_size = convert_bytes(file_size)

    print("Test data successfully written to 1brc/data/measurements.txt")
    print(f"Actual file size:  {human_file_size}")
    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")
142 | 
143 | 
def main():
    """
    Script entry point: validates the command line, loads the station names,
    prints the estimated output size, and then generates the measurements
    file.
    """
    check_args(sys.argv)
    row_count = int(sys.argv[1])

    station_names = build_weather_station_name_list()
    size_estimate = estimate_file_size(station_names, row_count)
    print(size_estimate)

    build_test_data(station_names, row_count)
    print("Test data build complete.")
155 | 
156 | 
# Run the generator only when this file is executed as a script. There is
# deliberately no module-level exit(): it would also run on `import` and
# terminate the importing interpreter. A script that reaches its end already
# exits with status 0.
if __name__ == "__main__":
    main()
160 | 


--------------------------------------------------------------------------------
/fast.cu:
--------------------------------------------------------------------------------
  1 | #include <iostream>
  2 | #include <iomanip>
  3 | #include <fstream>
  4 | #include <string>
  5 | #include <set>
  6 | #include <vector>
  7 | #include <chrono>
  8 | #include <cstring>
  9 | 
 10 | #define MAX_CITY_BYTE 100  // City names are at most 100 bytes.
 11 | #define MAX_THREADS_PER_BLOCK 1024
 12 | 
 13 | // File split metadata of the end offset, and length, all in bytes.
 14 | struct Part {
 15 |     long long offset; long long length;
 16 | };
 17 | 
 18 | // Array entry of a given city, keeping track of the temperature statistics.
 19 | struct Stat {
 20 |     char city[MAX_CITY_BYTE];
 21 |     float min = INFINITY; float max = -INFINITY; float sum = 0;
 22 |     int count = 0;
 23 |     Stat() {}
 24 |     Stat(const std::string& init_city) {
 25 |         strncpy(city, init_city.c_str(), init_city.size());
 26 |         city[init_city.size()] = '\0';
 27 |     }
 28 | };
 29 | 
 30 | // CUDA's atomicMin/Max only work with ints
 31 | __device__ static float atomicMin(float* address, float val) {
 32 |     int* address_as_i = (int*) address;
 33 |     int old = *address_as_i, assumed;
 34 |     do {
 35 |         assumed = old;
 36 |         old = ::atomicCAS(address_as_i, assumed,
 37 |         __float_as_int(::fminf(val, __int_as_float(assumed))));
 38 |     } while (assumed != old);
 39 |     return __int_as_float(old);
 40 | }
 41 | 
 42 | __device__ static float atomicMax(float* address, float val) {
 43 |     int* address_as_i = (int*) address;
 44 |     int old = *address_as_i, assumed;
 45 |     do {
 46 |         assumed = old;
 47 |         old = ::atomicCAS(address_as_i, assumed,
 48 |         __float_as_int(::fmaxf(val, __int_as_float(assumed))));
 49 |     } while (assumed != old);
 50 |     return __int_as_float(old);
 51 | }
 52 | 
 53 | // ChatGPT's working solution.
 54 | // Probably could be made more accurate using doubles like the actual strtod.c
 55 | __device__ float cuda_atof(char* str) {
 56 |     float result = 0.0f;
 57 |     int sign = 1; int decimal = 0; int digits = 0;
 58 | 
 59 |     if (*str == '-') {
 60 |         sign = -1;
 61 |         ++str;
 62 |     }
 63 | 
 64 |     while (*str >= '0' && *str <= '9') {
 65 |         result = result * 10.0f + (*str - '0');
 66 |         ++str;
 67 |         ++digits;
 68 |     }
 69 | 
 70 |     if (*str == '.') {
 71 |         ++str;
 72 |         while (*str >= '0' && *str <= '9') {
 73 |             result = result * 10.0f + (*str - '0');
 74 |             ++str;
 75 |             ++digits;
 76 |             ++decimal;
 77 |         }
 78 |     }
 79 |     result *= sign;
 80 | 
 81 |     while (decimal > 0) {
 82 |         result /= 10.0f;
 83 |         --decimal;
 84 |     }
 85 |     return result;
 86 | }
 87 | 
 88 | // Identical to glibc's strcmp.c
 89 | __device__ int cuda_strcmp(const char* p1, const char* p2) {
 90 |     const unsigned char *s1 = (const unsigned char *) p1;
 91 |     const unsigned char *s2 = (const unsigned char *) p2;
 92 |     unsigned char c1, c2;
 93 |     do {
 94 |         c1 = (unsigned char) *s1++;
 95 |         c2 = (unsigned char) *s2++;
 96 |         if (c1 == '\0')
 97 |             return c1 - c2;
 98 |     } while (c1 == c2);
 99 |     return c1 - c2;
100 | }
101 | 
// Looks up city_target in `cities` with a binary search and returns its slot
// index, or -1 when it is not present.
// `cities` is read as n_city consecutive slots of MAX_CITY_BYTE bytes, each
// holding a NUL-terminated name; the search relies on the slots being sorted
// in cuda_strcmp order.
__device__ int get_index(char* cities, char* city_target, int n_city) {
    int lo = 0;
    int hi = n_city - 1;
    while (lo <= hi) {
        const int probe = lo + (hi - lo) / 2;
        const int order = cuda_strcmp(cities + probe * MAX_CITY_BYTE, city_target);
        if (order < 0) {
            lo = probe + 1;   // target sorts after the probed slot
            continue;
        }
        if (order > 0) {
            hi = probe - 1;   // target sorts before the probed slot
            continue;
        }
        return probe;
    }
    return -1;
}
120 | 
// The CUDA kernel. Each thread operates on a different section of the buffer, and updates the statistics.
//
//   buffer        raw text; parts[i].offset - buffer_offset is the position of
//                 part i inside it
//   parts         one entry per thread; only the first part_size are used
//   stats         per-city statistics, indexed by the value get_index returns
//   cities        city lookup table passed through to get_index
//   n_city        number of entries in cities / stats
//   buffer_offset offset subtracted from every part offset
//   part_size     number of valid entries in parts
//
// Each line is expected to be "<city>;<temperature>\n". Rows whose city is
// unknown, or whose fields do not fit the local buffers, are skipped instead
// of being written out of bounds. Text after the last '\n' of a part is
// ignored.
__global__ void process_buffer(char* buffer, Part* parts, Stat* stats, char* cities, int n_city, long long buffer_offset, int part_size) {
    const int part_index = blockIdx.x * blockDim.x + threadIdx.x;

    if (part_index >= part_size)  // For threads that are not assigned any work, return
        return;

    const Part part = parts[part_index];
    const char* text = buffer + (part.offset - buffer_offset);

    char city[MAX_CITY_BYTE];
    char floatstr[6];  // longest temperature float str is "-99.9": 5 bytes plus the terminating NUL
    const int max_city_chars = MAX_CITY_BYTE - 1;   // leave room for the NUL
    const int max_float_chars = sizeof(floatstr) - 1;

    int index = 0;
    bool parsing_city = true;
    bool malformed = false;  // set when a field of the current row overflows its buffer

    // long long counter: part.length is a long long.
    for (long long i = 0; i < part.length; i++) {
        const char c = text[i];
        if (parsing_city) {  // City characters
            if (c == ';') {
                city[index] = '\0';
                index = 0;
                parsing_city = false;
            } else if (c == '\n') {
                // Line without a ';' (e.g. a blank line): drop it and resync
                // on the next line.
                index = 0;
                malformed = false;
            } else if (index < max_city_chars) {
                city[index++] = c;
            } else {
                malformed = true;
            }
        } else {  // Float characters
            if (c == '\n') {
                floatstr[index] = '\0';

                // -1 means the city is not in the lookup table; indexing
                // stats with it would corrupt unrelated memory.
                const int stat_index = malformed ? -1 : get_index(cities, city, n_city);
                if (stat_index >= 0) {
                    const float temp = cuda_atof(floatstr);

                    // The heart of the CUDA kernel.
                    // Update (atomically) the temperature statistics.
                    // Identical in spirit to the C++ baseline.
                    atomicMin(&stats[stat_index].min, temp);
                    atomicMax(&stats[stat_index].max, temp);
                    atomicAdd(&stats[stat_index].sum, temp);
                    atomicAdd(&stats[stat_index].count, 1);
                }

                // reset for next line read
                parsing_city = true;
                index = 0;
                malformed = false;
            } else if (c == '\r') {
                // Tolerate CRLF line endings.
            } else if (index < max_float_chars) {
                floatstr[index++] = c;
            } else {
                malformed = true;
            }
        }
    }
}
174 | 
175 | // Adapted from https://github.com/benhoyt/go-1brc/blob/master/r8.go#L124
176 | std::vector<Part> split_file(std::string input_path, int num_parts) {
177 |     std::ifstream file(input_path, std::ios::binary | std::ios::ate);
178 |     std::streamsize size = file.tellg();
179 |     file.seekg(0, std::ios::beg);
180 | 
181 |     // Using long long is necessary to avoid overflow of file size.
182 |     // e.g. 15B (bytes) for a 1B-row file
183 |     long long split_size = size / num_parts;
184 | 
185 |     std::cout << "Total file size: " << size << ", split size: " << split_size << std::endl;
186 | 
187 |     long long offset = 0;
188 |     std::vector<Part> parts;
189 |     while (offset < size) {
190 |         long long seek_offset = std::max(offset + split_size - MAX_CITY_BYTE, 0LL);
191 |         if (seek_offset > size) {
192 |             parts.back().length += size-offset;
193 |             break;
194 |         }
195 |         file.seekg(seek_offset, std::ios::beg);
196 |         char buf[MAX_CITY_BYTE];
197 |         file.read(buf, MAX_CITY_BYTE);
198 | 
199 |         std::streamsize n = file.gcount();
200 |         std::streamsize newline = -1;
201 |         for (int i = n - 1; i >= 0; --i) {
202 |             if (buf[i] == '\n') {
203 |                 newline = i;
204 |                 break;
205 |             }
206 |         }
207 |         int remaining = n - newline - 1;
208 |         long long next_offset = seek_offset + n - remaining;
209 |         parts.push_back({offset, next_offset-offset});
210 |         offset = next_offset;
211 |     }
212 |     file.close();
213 |     return parts;
214 | }
215 | 
// Loads the set of known city names from data/weather_stations.csv.
//
// Every data line has the form "<city>;<value>"; only the text before the
// first ';' is kept. Lines starting with '#' are comments and, like blank
// lines and lines with an empty city, are skipped. A trailing '\r' is removed
// so files with CRLF line endings work too.
//
// Returns the names in the set's sorted order; the set is empty when the file
// cannot be opened.
std::set<std::string> get_cities() {
    std::set<std::string> all_cities;

    std::ifstream weather_file("data/weather_stations.csv");
    if (!weather_file.is_open()) {
        std::cerr << "get_cities: cannot open data/weather_stations.csv" << std::endl;
        return all_cities;
    }

    std::string line;
    while (std::getline(weather_file, line)) {
        if (!line.empty() && line.back() == '\r')
            line.pop_back();
        if (line.empty() || line[0] == '#')
            continue;

        // find() returns npos when there is no ';', in which case substr
        // keeps the whole line.
        std::string station = line.substr(0, line.find(';'));
        if (!station.empty())
            all_cities.insert(station);
    }
    return all_cities;
}
232 | 
/**
 * Entry point: computes per-city min/mean/max of the measurements file on the GPU.
 *
 * Usage: <file path> <num parts> <batch size>
 *   file path  - measurements file to process.
 *   num parts  - number of splits requested from split_file(); every split is
 *                handed to its own GPU thread.
 *   batch size - maximum number of splits uploaded and processed per kernel launch.
 *
 * Results for every city that received at least one measurement are written to
 * cuda_measurements.out as "<city>=<min>/<mean>/<max>" with one decimal place.
 *
 * @return 0 on success; 1 on invalid arguments or on any file / CUDA failure.
 */
int main(int argc, char* argv[]) {
    if (argc < 4) {
        std::cerr << "Usage: " << argv[0] << " <file path> <num parts> <batch size>" << std::endl;
        return 1;
    }

    std::string input_path = argv[1];
    int num_parts = atoi(argv[2]);
    const int batch_size = atoi(argv[3]);
    // atoi() yields 0 for non-numeric text. A non-positive batch size would make
    // the batch loop below never advance (or index parts backwards).
    if (num_parts <= 0 || batch_size <= 0) {
        std::cerr << "Error: <num parts> and <batch size> must be positive integers" << std::endl;
        return 1;
    }

    // Bending the rules of the challenge here.
    // I'm assuming a file like data/weather_stations.csv is given.
    // This file lists all possible cities that could appear in the input file.
    const std::set<std::string> all_cities = get_cities();
    const int n_city = static_cast<int>(all_cities.size());
    if (n_city == 0) {
        std::cerr << "Error: no cities could be loaded from data/weather_stations.csv" << std::endl;
        return 1;
    }

    // Host-side statistics plus a flat table of fixed-width (MAX_CITY_BYTE),
    // zero-padded city names. Both live on the heap: the name table alone is
    // MAX_CITY_BYTE * n_city bytes, which is too much to place on the stack.
    std::vector<Stat> stats;
    stats.reserve(n_city);
    std::vector<char> cities(static_cast<size_t>(MAX_CITY_BYTE) * n_city, '\0');
    size_t slot = 0;
    for (const auto& city : all_cities) {
        stats.push_back(Stat(city));
        // Bounded copy: never write past this city's slot and always leave at
        // least one terminating '\0' (the table is zero-initialised).
        city.copy(cities.data() + slot * MAX_CITY_BYTE, MAX_CITY_BYTE - 1);
        slot++;
    }

    auto start = std::chrono::high_resolution_clock::now();

    const std::vector<Part> parts = split_file(input_path, num_parts);
    num_parts = static_cast<int>(parts.size());
    if (parts.empty()) {
        std::cerr << "Error: no data to process in " << input_path << std::endl;
        return 1;
    }

    // Total byte length of `count` consecutive splits starting at split `first`.
    const auto batch_bytes = [&parts](int first, int count) {
        long long total = 0;
        for (int i = first; i < first + count; i++)
            total += parts[i].length;
        return total;
    };

    // The text buffers only ever hold one batch, so size them for the largest one.
    long long max_batch_bytes = 0;
    for (int b = 0; b < num_parts; b += batch_size)
        max_batch_bytes = std::max(max_batch_bytes, batch_bytes(b, std::min(batch_size, num_parts - b)));
    const int max_batch_parts = std::min(batch_size, num_parts);

    std::cout << "Required GPU RAM Size (GB): " << max_batch_bytes / 1'000'000'000.0 << std::endl;

    auto end = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double> elapsed = end - start;
    std::cout << "Time taken finding parts: " << elapsed.count() << " seconds" << std::endl;
    start = std::chrono::high_resolution_clock::now();

    Stat* d_stats = nullptr;   // Array of temperature statistics. Each entry corresponds to a different city.
    char* d_buffer = nullptr;  // Holds a subset of the raw text char buffer.
    Part* d_parts = nullptr;   // File splits s.t. each thread can work on a different split.
    char* d_cities = nullptr;  // List of all cities for city -> index lookup.
    char* buffer = nullptr;    // Host staging area for one batch of raw text.

    // Releases everything acquired below; safe to call with pointers still null.
    const auto cleanup = [&]() {
        delete[] buffer;
        cudaFree(d_cities);
        cudaFree(d_parts);
        cudaFree(d_buffer);
        cudaFree(d_stats);
    };
    // Reports a failed CUDA call, frees all resources and returns true so the
    // caller can bail out with `return 1`.
    const auto cuda_failed = [&](cudaError_t status, const char* what) {
        if (status == cudaSuccess)
            return false;
        std::cerr << "Error: " << what << ": " << cudaGetErrorString(status) << std::endl;
        cleanup();
        return true;
    };

    const size_t stats_bytes = stats.size() * sizeof(Stat);
    if (cuda_failed(cudaMalloc(&d_stats, stats_bytes), "cudaMalloc(d_stats)"))
        return 1;
    if (cuda_failed(cudaMemcpy(d_stats, stats.data(), stats_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_stats)"))
        return 1;

    if (cuda_failed(cudaMalloc((void**) &d_buffer, max_batch_bytes * sizeof(char)), "cudaMalloc(d_buffer)"))
        return 1;

    if (cuda_failed(cudaMalloc(&d_parts, max_batch_parts * sizeof(Part)), "cudaMalloc(d_parts)"))
        return 1;

    if (cuda_failed(cudaMalloc(&d_cities, cities.size() * sizeof(char)), "cudaMalloc(d_cities)"))
        return 1;
    if (cuda_failed(cudaMemcpy(d_cities, cities.data(), cities.size() * sizeof(char), cudaMemcpyHostToDevice),
                    "cudaMemcpy(d_cities)"))
        return 1;

    // Launch CUDA kernels that processes different splits of the file.
    // Does it in sequential batches, if GPU RAM is limited.
    std::ifstream file(input_path, std::ios::binary);
    if (!file) {
        std::cerr << "Error: cannot open " << input_path << std::endl;
        cleanup();
        return 1;
    }

    // One host buffer reused by every batch. Allocated with new[] on purpose:
    // it stays uninitialised, so a multi-GB batch is not zero-filled first.
    buffer = new char[max_batch_bytes];

    for (int b = 0; b < num_parts; b += batch_size) {
        const int part_size = std::min(batch_size, num_parts - b);  // splits in this batch
        const long long batch_file_size = batch_bytes(b, part_size);

        file.seekg(parts[b].offset, std::ios::beg);
        if (!file.read(buffer, batch_file_size)) {
            std::cerr << "Error: failed to read " << batch_file_size << " bytes from " << input_path << std::endl;
            cleanup();
            return 1;
        }

        if (cuda_failed(cudaMemcpy(d_buffer, buffer, batch_file_size * sizeof(char), cudaMemcpyHostToDevice),
                        "cudaMemcpy(d_buffer)"))
            return 1;
        if (cuda_failed(cudaMemcpy(d_parts, parts.data() + b, part_size * sizeof(Part), cudaMemcpyHostToDevice),
                        "cudaMemcpy(d_parts)"))
            return 1;

        // Integer ceiling division: enough blocks for one thread per split.
        const int grid_blocks = (part_size + MAX_THREADS_PER_BLOCK - 1) / MAX_THREADS_PER_BLOCK;

        // parts[b].offset is passed as the file offset of the first byte held in d_buffer.
        process_buffer<<<grid_blocks, MAX_THREADS_PER_BLOCK>>>(d_buffer, d_parts, d_stats, d_cities, n_city, parts[b].offset, part_size);
        if (cuda_failed(cudaGetLastError(), "process_buffer launch"))
            return 1;
    }

    // for accurate profiling (cuda calls are async); also surfaces errors
    // raised while the kernels were executing.
    if (cuda_failed(cudaDeviceSynchronize(), "process_buffer execution"))
        return 1;
    end = std::chrono::high_resolution_clock::now();
    elapsed = end - start;
    std::cout << "Time taken in cuda kernel: " << elapsed.count() << " seconds" << std::endl;

    // Write out the results, and complete the challenge.
    if (cuda_failed(cudaMemcpy(stats.data(), d_stats, stats_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(stats)"))
        return 1;
    cleanup();

    std::ofstream measurements("cuda_measurements.out");
    if (!measurements) {
        std::cerr << "Error: cannot open cuda_measurements.out for writing" << std::endl;
        return 1;
    }
    // Stream format flags are sticky: set them once up front so that the very
    // first value written is formatted like all the others.
    measurements << std::fixed << std::setprecision(1);
    for (const Stat& stat : stats) {
        if (stat.count != 0) {
            float mean = stat.sum / stat.count;
            measurements << stat.city << "=" << stat.min << "/" << mean << "/" << stat.max << '\n';
        }
    }

    return 0;
}
333 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
 1 | # Create the 1 billion row file as measurements.txt (~14GB)
 2 | python data/create_measurements.py 1000000000
 3 | 
 4 | # Compile and run the c++ baseline.
 5 | g++ -o base -O2 base.cpp
 6 | time ./base data/measurements.txt  # ~17 mins
 7 | 
 8 | # Compile and run my cuda solution.
 9 | nvcc -o fast -O2 fast.cu
10 | # 1 million threads to execute in batches of max size 600000.
11 | # TODO: Increase the max size by using a GPU with more memory.
12 | time ./fast data/measurements.txt 1000000 600000  # ~17s on V100
13 | 
14 | # Sanity check
15 | # TODO: The cuda floats are off by 0.1 for some cities. Check why.
16 | diff cuda_measurements.out measurements.out
17 | 


--------------------------------------------------------------------------------