├── LICENSE ├── README.md ├── base.cpp ├── cover.jpeg ├── data ├── create_measurements.py └── weather_stations.csv ├── fast.cu └── run.sh /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cuda-1brc 2 | 3 | 4 | 5 | Can CUDA handle one billion rows of text? Yes. 6 | 7 | `fast.cu` takes **16.8 seconds** to process one billion rows on a V100. 8 | This is a **60X** speedup over a pure C++ baseline in `base.cpp`. 9 | 10 | Check out the [blog](https://tspeterkim.github.io/posts/cuda-1brc) for a detailed explanation. 
-------------------------------------------------------------------------------- /base.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | struct Stat { 12 | float min, max, sum; 13 | int count; 14 | }; 15 | 16 | int main(int argc, char* argv[]) { 17 | if (argc < 2) { 18 | cerr << "Usage: " << argv[0] << " " << endl; 19 | return 1; 20 | } 21 | 22 | ifstream file(argv[1]); 23 | string line; 24 | map stationStats; 25 | 26 | while (getline(file, line)) { 27 | istringstream iss(line); 28 | string station; 29 | float temp; 30 | getline(iss, station, ';'); 31 | iss >> temp; 32 | 33 | auto it = stationStats.find(station); 34 | if (it == stationStats.end()) { 35 | stationStats[station] = {temp, temp, temp, 1}; 36 | } else { 37 | Stat& s = it->second; 38 | s.min = min(s.min, temp); 39 | s.max = max(s.max, temp); 40 | s.sum += temp; 41 | s.count++; 42 | } 43 | } 44 | 45 | ofstream measurements("measurements.out"); 46 | for (auto& pair : stationStats) { 47 | const Stat& s = pair.second; 48 | float mean = s.sum / s.count; 49 | measurements << pair.first << "=" << s.min << "/"; 50 | measurements << fixed << setprecision(1) << mean << "/"; 51 | measurements << s.max << endl; 52 | } 53 | return 0; 54 | } 55 | -------------------------------------------------------------------------------- /cover.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tspeterkim/cuda-1brc/9678472f1ac081e0e130513a062243996ef7c1b4/cover.jpeg -------------------------------------------------------------------------------- /data/create_measurements.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2023 The original authors 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the 
# "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java

import os
import sys
import random
import time


def check_args(file_args):
    """
    Validate the command line: exactly one argument, parseable as a positive
    integer. Otherwise print the usage text and exit with status 1.

    :param file_args: the argument vector (``sys.argv``)
    """
    try:
        valid = len(file_args) == 2 and int(file_args[1]) > 0
    except ValueError:
        # Only a failed int() conversion is expected here; anything else
        # should surface rather than be swallowed by a bare except.
        valid = False
    if not valid:
        print("Usage:  create_measurements.py <positive integer number of records to create>")
        print("        You can use underscore notation for large number of records.")
        print("        For example:  1_000_000_000 for one billion")
        sys.exit(1)


def build_weather_station_name_list():
    """
    Read data/weather_stations.csv (relative to the working directory) and
    return the de-duplicated station names, i.e. the text before the first ';'
    on each line. Lines starting with '#' are comments; blank lines are ignored.
    """
    station_names = set()
    # Station names are not ASCII-only, so do not rely on the platform default.
    with open('data/weather_stations.csv', 'r', encoding='utf-8') as file:
        for station in file.read().splitlines():
            # Only a leading '#' marks a comment, matching get_cities() in fast.cu
            if not station or station.startswith("#"):
                continue
            station_names.add(station.split(';')[0])
    return list(station_names)


def convert_bytes(num):
    """
    Convert a byte count to a human-readable string (bytes, KiB, MiB, GiB, TiB).

    Values of 1024 TiB or more are still reported in TiB, so a string is
    always returned.
    """
    units = ['bytes', 'KiB', 'MiB', 'GiB', 'TiB']
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    return "%3.1f %s" % (num, units[-1])


def format_elapsed_time(seconds):
    """
    Format a duration in seconds as "S.sss seconds", "M minutes S seconds" or
    "H hours [M minutes] S seconds" (minutes are omitted when zero).
    """
    if seconds < 60:
        return f"{seconds:.3f} seconds"
    if seconds < 3600:
        minutes, seconds = divmod(seconds, 60)
        return f"{int(minutes)} minutes {int(seconds)} seconds"

    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    if minutes == 0:
        return f"{int(hours)} hours {int(seconds)} seconds"
    return f"{int(hours)} hours {int(minutes)} minutes {int(seconds)} seconds"


def estimate_file_size(weather_station_names, num_rows_to_create):
    """
    Estimate the size of the generated file from the average UTF-8 length of
    the station names and the average length of a formatted temperature.

    :param weather_station_names: non-empty list of station names
    :param num_rows_to_create: number of rows that will be written
    :return: a one-sentence, human-readable estimate
    """
    total_name_bytes = sum(len(s.encode("utf-8")) for s in weather_station_names)
    avg_name_bytes = total_name_bytes / float(len(weather_station_names))

    # avg_temp_bytes = sum(len(str(n / 10.0)) for n in range(-999, 1000)) / 1999
    avg_temp_bytes = 4.400200100050025

    # add 2 for separator and newline
    avg_line_length = avg_name_bytes + avg_temp_bytes + 2

    human_file_size = convert_bytes(num_rows_to_create * avg_line_length)

    return f"Estimated max file size is: {human_file_size}."
99 | 100 | 101 | def build_test_data(weather_station_names, num_rows_to_create): 102 | """ 103 | Generates and writes to file the requested length of test data 104 | """ 105 | start_time = time.time() 106 | coldest_temp = -99.9 107 | hottest_temp = 99.9 108 | station_names_10k_max = random.choices(weather_station_names, k=10_000) 109 | batch_size = 10000 # instead of writing line by line to file, process a batch of stations and put it to disk 110 | chunks = num_rows_to_create // batch_size 111 | print('Building test data...') 112 | 113 | try: 114 | with open("data/measurements.txt", 'w') as file: 115 | progress = 0 116 | for chunk in range(chunks): 117 | 118 | batch = random.choices(station_names_10k_max, k=batch_size) 119 | prepped_deviated_batch = '\n'.join([f"{station};{random.uniform(coldest_temp, hottest_temp):.1f}" for station in batch]) # :.1f should quicker than round on a large scale, because round utilizes mathematical operation 120 | file.write(prepped_deviated_batch + '\n') 121 | 122 | # Update progress bar every 1% 123 | if (chunk + 1) * 100 // chunks != progress: 124 | progress = (chunk + 1) * 100 // chunks 125 | bars = '=' * (progress // 2) 126 | sys.stdout.write(f"\r[{bars:<50}] {progress}%") 127 | sys.stdout.flush() 128 | sys.stdout.write('\n') 129 | except Exception as e: 130 | print("Something went wrong. 
Printing error info and exiting...") 131 | print(e) 132 | exit() 133 | 134 | end_time = time.time() 135 | elapsed_time = end_time - start_time 136 | file_size = os.path.getsize("data/measurements.txt") 137 | human_file_size = convert_bytes(file_size) 138 | 139 | print("Test data successfully written to 1brc/data/measurements.txt") 140 | print(f"Actual file size: {human_file_size}") 141 | print(f"Elapsed time: {format_elapsed_time(elapsed_time)}") 142 | 143 | 144 | def main(): 145 | """ 146 | main program function 147 | """ 148 | check_args(sys.argv) 149 | num_rows_to_create = int(sys.argv[1]) 150 | weather_station_names = [] 151 | weather_station_names = build_weather_station_name_list() 152 | print(estimate_file_size(weather_station_names, num_rows_to_create)) 153 | build_test_data(weather_station_names, num_rows_to_create) 154 | print("Test data build complete.") 155 | 156 | 157 | if __name__ == "__main__": 158 | main() 159 | exit() 160 | -------------------------------------------------------------------------------- /fast.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define MAX_CITY_BYTE 100 // City names are at most 100 bytes. 11 | #define MAX_THREADS_PER_BLOCK 1024 12 | 13 | // File split metadata of the end offset, and length, all in bytes. 14 | struct Part { 15 | long long offset; long long length; 16 | }; 17 | 18 | // Array entry of a given city, keeping track of the temperature statistics. 
19 | struct Stat { 20 | char city[MAX_CITY_BYTE]; 21 | float min = INFINITY; float max = -INFINITY; float sum = 0; 22 | int count = 0; 23 | Stat() {} 24 | Stat(const std::string& init_city) { 25 | strncpy(city, init_city.c_str(), init_city.size()); 26 | city[init_city.size()] = '\0'; 27 | } 28 | }; 29 | 30 | // CUDA's atomicMin/Max only work with ints 31 | __device__ static float atomicMin(float* address, float val) { 32 | int* address_as_i = (int*) address; 33 | int old = *address_as_i, assumed; 34 | do { 35 | assumed = old; 36 | old = ::atomicCAS(address_as_i, assumed, 37 | __float_as_int(::fminf(val, __int_as_float(assumed)))); 38 | } while (assumed != old); 39 | return __int_as_float(old); 40 | } 41 | 42 | __device__ static float atomicMax(float* address, float val) { 43 | int* address_as_i = (int*) address; 44 | int old = *address_as_i, assumed; 45 | do { 46 | assumed = old; 47 | old = ::atomicCAS(address_as_i, assumed, 48 | __float_as_int(::fmaxf(val, __int_as_float(assumed)))); 49 | } while (assumed != old); 50 | return __int_as_float(old); 51 | } 52 | 53 | // ChatGPT's working solution. 
54 | // Probably could be made more accurate using doubles like the actual strtod.c 55 | __device__ float cuda_atof(char* str) { 56 | float result = 0.0f; 57 | int sign = 1; int decimal = 0; int digits = 0; 58 | 59 | if (*str == '-') { 60 | sign = -1; 61 | ++str; 62 | } 63 | 64 | while (*str >= '0' && *str <= '9') { 65 | result = result * 10.0f + (*str - '0'); 66 | ++str; 67 | ++digits; 68 | } 69 | 70 | if (*str == '.') { 71 | ++str; 72 | while (*str >= '0' && *str <= '9') { 73 | result = result * 10.0f + (*str - '0'); 74 | ++str; 75 | ++digits; 76 | ++decimal; 77 | } 78 | } 79 | result *= sign; 80 | 81 | while (decimal > 0) { 82 | result /= 10.0f; 83 | --decimal; 84 | } 85 | return result; 86 | } 87 | 88 | // Identical to glibc's strcmp.c 89 | __device__ int cuda_strcmp(const char* p1, const char* p2) { 90 | const unsigned char *s1 = (const unsigned char *) p1; 91 | const unsigned char *s2 = (const unsigned char *) p2; 92 | unsigned char c1, c2; 93 | do { 94 | c1 = (unsigned char) *s1++; 95 | c2 = (unsigned char) *s2++; 96 | if (c1 == '\0') 97 | return c1 - c2; 98 | } while (c1 == c2); 99 | return c1 - c2; 100 | } 101 | 102 | // Returns the pre-defined index of a city using good ol' binary search. 103 | __device__ int get_index(char* cities, char* city_target, int n_city) { 104 | int left = 0; 105 | int right = n_city - 1; 106 | while (left <= right) { 107 | int mid = left + (right - left) / 2; 108 | const char* city_query = cities + mid * MAX_CITY_BYTE; 109 | 110 | int cmp = cuda_strcmp(city_query, city_target); 111 | if (cmp == 0) 112 | return mid; 113 | else if (cmp < 0) 114 | left = mid + 1; 115 | else 116 | right = mid - 1; 117 | } 118 | return -1; 119 | } 120 | 121 | // The CUDA kernel. Each thread operates on a different section of the buffer, and updates the statistics. 
122 | __global__ void process_buffer(char* buffer, Part* parts, Stat* stats, char* cities, int n_city, long long buffer_offset, int part_size) { 123 | int tx = threadIdx.x; 124 | int bx = blockIdx.x * blockDim.x + tx; 125 | 126 | if (bx >= part_size) // For threads that are not assigned any work, return 127 | return; 128 | 129 | int index = 0; 130 | bool parsing_city = true; 131 | 132 | char city[MAX_CITY_BYTE]; 133 | char floatstr[5]; // longest temperature float str is "-99.9" i.e. 5 bytes 134 | 135 | // An ugly way to do string processing in CUDA. 136 | // I could probably use more helper functions here like my own getline. 137 | for (int i = 0; i < parts[bx].length; i++) { 138 | char c = buffer[parts[bx].offset-buffer_offset + i]; 139 | if (parsing_city) { // City characters 140 | if (c == ';') { 141 | city[index] = '\0'; 142 | index = 0; 143 | parsing_city = false; 144 | } else { 145 | city[index] = c; 146 | index++; 147 | } 148 | } else { // Float characters 149 | if (c == '\n') { 150 | floatstr[index] = '\0'; 151 | 152 | int stat_index = get_index(cities, city, n_city); 153 | float temp = cuda_atof(floatstr); 154 | 155 | // The heart of the CUDA kernel. 156 | // Update (atomically) the temperature statistics. 157 | // Identical in spirit to the C++ baseline. 
158 | atomicMin(&stats[stat_index].min, temp); 159 | atomicMax(&stats[stat_index].max, temp); 160 | atomicAdd(&stats[stat_index].sum, temp); 161 | atomicAdd(&stats[stat_index].count, 1); 162 | 163 | // reset for next line read 164 | parsing_city = true; 165 | index = 0; 166 | floatstr[0] = '\0'; city[0] = '\0'; 167 | } else { 168 | floatstr[index] = c; 169 | index++; 170 | } 171 | } 172 | } 173 | } 174 | 175 | // Adapted from https://github.com/benhoyt/go-1brc/blob/master/r8.go#L124 176 | std::vector split_file(std::string input_path, int num_parts) { 177 | std::ifstream file(input_path, std::ios::binary | std::ios::ate); 178 | std::streamsize size = file.tellg(); 179 | file.seekg(0, std::ios::beg); 180 | 181 | // Using long long is necessary to avoid overflow of file size. 182 | // e.g. 15B (bytes) for a 1B-row file 183 | long long split_size = size / num_parts; 184 | 185 | std::cout << "Total file size: " << size << ", split size: " << split_size << std::endl; 186 | 187 | long long offset = 0; 188 | std::vector parts; 189 | while (offset < size) { 190 | long long seek_offset = std::max(offset + split_size - MAX_CITY_BYTE, 0LL); 191 | if (seek_offset > size) { 192 | parts.back().length += size-offset; 193 | break; 194 | } 195 | file.seekg(seek_offset, std::ios::beg); 196 | char buf[MAX_CITY_BYTE]; 197 | file.read(buf, MAX_CITY_BYTE); 198 | 199 | std::streamsize n = file.gcount(); 200 | std::streamsize newline = -1; 201 | for (int i = n - 1; i >= 0; --i) { 202 | if (buf[i] == '\n') { 203 | newline = i; 204 | break; 205 | } 206 | } 207 | int remaining = n - newline - 1; 208 | long long next_offset = seek_offset + n - remaining; 209 | parts.push_back({offset, next_offset-offset}); 210 | offset = next_offset; 211 | } 212 | file.close(); 213 | return parts; 214 | } 215 | 216 | std::set get_cities() { 217 | std::ifstream weather_file("data/weather_stations.csv"); 218 | std::string line; 219 | std::set all_cities; 220 | 221 | while (getline(weather_file, line)) { 222 | 
std::istringstream iss(line); 223 | if (line[0] == '#') 224 | continue; 225 | std::string station; 226 | std::getline(iss, station, ';'); 227 | all_cities.insert(station); 228 | } 229 | weather_file.close(); 230 | return all_cities; 231 | } 232 | 233 | int main(int argc, char* argv[]) { 234 | if (argc < 4) { 235 | std::cerr << "Usage: " << argv[0] << " " << std::endl; 236 | return 1; 237 | } 238 | 239 | // Bending the rules of the challenge here. 240 | // I'm assuming a file like data/weather_stations.csv is given. 241 | // This file lists all possible cities that could appear in the input file. 242 | std::set all_cities = get_cities(); 243 | 244 | int n_city = all_cities.size(); 245 | Stat* stats = new Stat[n_city]; 246 | int index = 0; 247 | char cities[MAX_CITY_BYTE * n_city] = {'\0'}; 248 | 249 | for (const auto& city : all_cities) { 250 | stats[index] = Stat(city); 251 | strcpy(cities + (index * MAX_CITY_BYTE), city.c_str()); 252 | index++; 253 | } 254 | 255 | auto start = std::chrono::high_resolution_clock::now(); 256 | 257 | std::string input_path = argv[1]; 258 | int num_parts = atoi(argv[2]); int batch_size = atoi(argv[3]); 259 | 260 | std::vector parts = split_file(input_path, num_parts); 261 | num_parts = parts.size(); 262 | 263 | std::cout << "Required GPU RAM Size (GB): " << parts[0].length * batch_size / 1'000'000'000.0 << std::endl; 264 | 265 | auto end = std::chrono::high_resolution_clock::now(); 266 | std::chrono::duration elapsed = end - start; 267 | std::cout << "Time taken finding parts: " << elapsed.count() << " seconds" << std::endl; 268 | start = std::chrono::high_resolution_clock::now(); 269 | 270 | Stat* d_stats; // Array of temperature statistics. Each entry corresponds to a different city. 271 | cudaMalloc(&d_stats, n_city * sizeof(Stat)); 272 | cudaMemcpy(d_stats, stats, n_city * sizeof(Stat), cudaMemcpyHostToDevice); 273 | 274 | char* d_buffer; // Holds a subset of the raw text char buffer. 
275 | cudaMalloc((void**) &d_buffer, 10'000'000'000 * sizeof(char)); 276 | 277 | Part* d_parts; // File splits s.t. each thread can work on a different split. 278 | cudaMalloc(&d_parts, parts.size() * sizeof(Part)); 279 | 280 | char* d_cities; // List of all cities for city -> index lookup. 281 | cudaMalloc(&d_cities, MAX_CITY_BYTE * n_city * sizeof(char)); 282 | cudaMemcpy(d_cities, cities, MAX_CITY_BYTE * n_city * sizeof(char), cudaMemcpyHostToDevice); 283 | 284 | // Launch CUDA kernels that processes different splits of the file. 285 | // Does it in sequential batches, if GPU RAM is limited. 286 | std::ifstream file(input_path, std::ios::binary); 287 | for (int b = 0; b < num_parts; b += batch_size) { 288 | long long batch_file_size = 0; 289 | for (int bi = b; bi < std::min(b + batch_size, num_parts); bi++) 290 | batch_file_size += parts[bi].length; 291 | 292 | file.seekg(parts[b].offset, std::ios::beg); 293 | 294 | char* buffer = new char[batch_file_size]; 295 | file.read(buffer, batch_file_size); 296 | 297 | cudaMemcpy(d_buffer, buffer, batch_file_size * sizeof(char), cudaMemcpyHostToDevice); 298 | 299 | int part_size = batch_size; 300 | if (b + batch_size > num_parts) 301 | part_size = num_parts - b; 302 | cudaMemcpy(d_parts, parts.data() + b, part_size * sizeof(Part), cudaMemcpyHostToDevice); 303 | 304 | int grid_blocks = std::ceil((float) part_size / MAX_THREADS_PER_BLOCK); 305 | 306 | process_buffer<<>>(d_buffer, d_parts, d_stats, d_cities, n_city, parts[b].offset, part_size); 307 | cudaError_t error = cudaGetLastError(); 308 | if (error != cudaSuccess) 309 | std::cerr << "Error: " << cudaGetErrorString(error) << std::endl; 310 | 311 | delete[] buffer; 312 | } 313 | 314 | cudaDeviceSynchronize(); // for accurate profiling (cuda calls are async) 315 | end = std::chrono::high_resolution_clock::now(); 316 | elapsed = end - start; 317 | std::cout << "Time taken in cuda kernel: " << elapsed.count() << " seconds" << std::endl; 318 | 319 | // Write out the 
results, and complete the challenge. 320 | cudaMemcpy(stats, d_stats, n_city * sizeof(Stat), cudaMemcpyDeviceToHost); 321 | std::ofstream measurements("cuda_measurements.out"); 322 | for (int i = 0; i < n_city; i++) { 323 | if (stats[i].count != 0) { 324 | float mean = stats[i].sum / stats[i].count; 325 | measurements << stats[i].city << "=" << stats[i].min << "/"; 326 | measurements << std::fixed << std::setprecision(1) << mean << "/"; 327 | measurements << stats[i].max << std::endl; 328 | } 329 | } 330 | 331 | return 0; 332 | } 333 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Create the 1 billion row file as measurements.txt (~14GB) 2 | python data/create_measurements.py 1000000000 3 | 4 | # Compile and run the c++ baseline. 5 | g++ -o base -O2 base.cpp 6 | time ./base data/measurements.txt # ~17 mins 7 | 8 | # Compile and run my cuda solution. 9 | nvcc -o fast -O2 fast.cu 10 | # 1 million threads to execute in batches of max size 600000. 11 | # TODO: Increase the max size by using a GPU with more memory. 12 | time ./fast data/measurements.txt 1000000 600000 # ~17s on V100 13 | 14 | # Sanity check 15 | # TODO: The cuda floats are off by 0.1 for some cities. Check why. 16 | diff cuda_measurements.out measurements.out 17 | --------------------------------------------------------------------------------