├── .gitignore ├── src ├── rustfmt.toml └── main.rs ├── Cargo.toml └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | Cargo.lock 3 | -------------------------------------------------------------------------------- /src/rustfmt.toml: -------------------------------------------------------------------------------- 1 | struct_lit_width = 50 -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rust-1brc" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | rayon = "1.8" 8 | bstr = "1.9.0" 9 | memmap = "0.7.0" 10 | 11 | [profile.release] 12 | debug = false 13 | lto = true 14 | codegen-units = 1 15 | panic = "abort" 16 | strip = true 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Marko Topolnik 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use bstr::ByteSlice; 2 | use memmap::MmapOptions; 3 | use rayon::prelude::*; 4 | use std::collections::HashMap; 5 | use std::convert::TryInto; 6 | use std::ops::{Shl, Shr}; 7 | use std::{array, io}; 8 | use std::{collections::BTreeMap, fs::File}; 9 | 10 | #[repr(C, align(64))] 11 | struct Stats { 12 | hash: u64, 13 | name_len: u32, 14 | count: u32, 15 | sum: i32, 16 | min: i16, 17 | max: i16, 18 | name: [u8; 104], 19 | } 20 | 21 | impl Default for Stats { 22 | fn default() -> Self { 23 | Self { 24 | name: array::from_fn(|_| 0u8), 25 | hash: Default::default(), 26 | count: Default::default(), 27 | sum: Default::default(), 28 | min: Default::default(), 29 | max: Default::default(), 30 | name_len: Default::default(), 31 | } 32 | } 33 | } 34 | 35 | struct FinalStats { 36 | count: u32, 37 | sum: i32, 38 | min: i16, 39 | max: i16, 40 | } 41 | 42 | fn main() -> io::Result<()> { 43 | let path = "measurements.txt"; 44 | let chunk_count: usize = std::thread::available_parallelism().unwrap().into(); 45 | let file = File::open(path).unwrap(); 46 | let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; 47 | let fsize = mmap.len(); 48 | 49 | let chunk_start_offsets = { 50 | let mut chunk_start_offsets = Vec::with_capacity(chunk_count); 51 | chunk_start_offsets.push(0); 52 | for chunk_index in 1..chunk_count { 53 | let chunk_start = fsize * chunk_index / chunk_count; 54 | let newline_pos = &mmap[chunk_start..].find_byte(b'\n').unwrap(); 55 | chunk_start_offsets.push(chunk_start + newline_pos + 1); 56 | } 57 | chunk_start_offsets 58 | }; 59 | let mut chunks = Vec::new(); 60 | for i in 0..chunk_start_offsets.len() { 61 | let chunk_start = chunk_start_offsets[i]; 62 | let chunk_end = if i < chunk_start_offsets.len() - 1 { 63 | chunk_start_offsets[i + 1] 64 | } else { 65 | fsize 66 | }; 67 | if chunk_start < chunk_end { 68 | chunks.push((chunk_start, chunk_end)); 69 | } 70 | } 71 | 72 | const HASHTABLE_SIZE: usize = 32_768; 73 | 74 | let stats = chunks 75 | .par_iter() 76 | .map(|&(start, limit)| { 77 | let mut hashtable: Vec = Vec::with_capacity(HASHTABLE_SIZE); 78 | for _ in 0..HASHTABLE_SIZE { 79 | hashtable.push(Stats::default()); 80 | } 81 | let mut record_tail = &mmap[start..limit]; 82 | loop { 83 | let pos_of_semicolon = record_tail.find_byte(b';').unwrap(); 84 | let name = &record_tail[..pos_of_semicolon]; 85 | let hash = hash(record_tail, pos_of_semicolon); 86 | let temperature_tail = &record_tail[pos_of_semicolon + 1..]; 87 | let (temperature, pos_of_next_line) = parse_temperature(temperature_tail); 88 | let mut hashtable_index = hash as usize % HASHTABLE_SIZE; 89 | loop { 90 | let stats = &mut hashtable[hashtable_index]; 91 | let name_len = stats.name_len as usize; 92 | if stats.hash == hash 93 | && name_len == name.len() 94 | && &stats.name[..name_len] == name 95 | { 96 | stats.count += 1; 97 | stats.sum += temperature as i32; 98 | stats.min = stats.min.min(temperature); 99 | stats.max = stats.max.max(temperature); 100 | break; 101 | } 102 | if stats.name_len != 0 { 103 | hashtable_index = (hashtable_index + 1) % HASHTABLE_SIZE; 104 | continue; 105 | } 106 | stats.hash = hash; 107 | stats.name_len = pos_of_semicolon as u32; 108 | stats.count = 1; 109 | stats.sum = temperature as i32; 110 | stats.min = temperature; 111 | stats.max = temperature; 112 | stats.name[..name.len()].copy_from_slice(name); 113 | break; 114 | } 115 | if pos_of_next_line >= temperature_tail.len() { 116 | break; 117 | } 118 | record_tail = &temperature_tail[pos_of_next_line..]; 119 | } 120 | hashtable 121 | }) 122 | .fold( 123 | || HashMap::::with_capacity(16_384), 124 | |mut totals, hashtable| { 125 | for stats in hashtable { 126 | if stats.name_len == 0 { 127 | continue; 128 | } 129 | let Stats { name_len, name, count, sum, min, max, .. } = stats; 130 | totals 131 | .entry(String::from_utf8_lossy(&name[..name_len as usize]).into_owned()) 132 | .and_modify(|totals| { 133 | totals.count += count; 134 | totals.sum += sum; 135 | totals.min = (totals.min).min(min); 136 | totals.max = (totals.max).max(max); 137 | }) 138 | .or_insert(FinalStats { count, sum, min, max }); 139 | } 140 | totals 141 | }, 142 | ) 143 | .reduce( 144 | || HashMap::::with_capacity(16_384), 145 | |mut totals, stats_map| { 146 | for (name, FinalStats { count, sum, min, max }) in stats_map { 147 | totals 148 | .entry(name) 149 | .and_modify(|totals| { 150 | totals.count += count; 151 | totals.sum += sum; 152 | totals.min = (totals.min).min(min); 153 | totals.max = (totals.max).max(max); 154 | }) 155 | .or_insert(FinalStats { count, sum, min, max }); 156 | } 157 | totals 158 | }, 159 | ); 160 | 161 | let mut sorted = BTreeMap::new(); 162 | sorted.extend(stats); 163 | print!("{{"); 164 | let mut on_first = true; 165 | for (city, FinalStats { count, sum, min, max, .. }) in sorted { 166 | let (count, sum, min, max) = (count as f32, sum, min, max); 167 | if on_first { 168 | on_first = false; 169 | } else { 170 | print!(", "); 171 | } 172 | print!( 173 | "{}={:.1}/{:.1}/{:.1}", 174 | city, 175 | (min as f64) / 10.0, 176 | ((sum as f64) / (count as f64)).round() / 10.0, 177 | (max as f64) / 10.0 178 | ); 179 | } 180 | println!("}}"); 181 | Ok(()) 182 | } 183 | 184 | fn hash(name_tail: &[u8], pos_of_semicolon: usize) -> u64 { 185 | let seed: u64 = 0x51_7c_c1_b7_27_22_0a_95; 186 | let rot_dist = 17; 187 | 188 | let block = if name_tail.len() >= 8 { 189 | let block = u64::from_le_bytes(name_tail[0..8].try_into().unwrap()); 190 | let shift_distance = 8 * 0.max(8 - pos_of_semicolon as i32); 191 | // Mask out bytes not belonging to name 192 | let mask = (!0u64).shr(shift_distance); 193 | block & mask 194 | } else { 195 | let mut buf = [0u8; 8]; 196 | let copy_len = pos_of_semicolon.min(8); 197 | buf[..copy_len].copy_from_slice(&name_tail[..copy_len]); 198 | u64::from_le_bytes(buf) 199 | }; 200 | let mut hash = block; 201 | hash = hash.wrapping_mul(seed); 202 | hash = hash.rotate_left(rot_dist); 203 | hash 204 | } 205 | 206 | fn parse_temperature(chars: &[u8]) -> (i16, usize) { 207 | if chars.len() >= 8 { 208 | parse_temperature_swar(chars) 209 | } else { 210 | parse_temperature_simple(chars) 211 | } 212 | } 213 | 214 | fn parse_temperature_swar(chars: &[u8]) -> (i16, usize) { 215 | let word = i64::from_le_bytes(chars[0..8].try_into().unwrap()); 216 | let negated = !word; 217 | let dot_pos = (negated & 0x10101000).trailing_zeros(); 218 | let mut signed: i64 = negated.shl(59); 219 | signed = signed.shr(63); 220 | let remove_sign_mask = !(signed & 0xFF); 221 | let digits = (word & remove_sign_mask).shl(28 - dot_pos) & 0x0F000F0F00; 222 | let abs_value = (digits.wrapping_mul(0x640a0001)).shr(32) & 0x3FFi64; 223 | let temperature = (abs_value ^ signed) - signed; 224 | (temperature as i16, (dot_pos / 8 + 3) as usize) 225 | } 226 | 227 | fn parse_temperature_simple(chars: &[u8]) -> (i16, usize) { 228 | let mut i = 0; 229 | let sign = if chars[i] == b'-' { 230 | i += 1; 231 | -1 232 | } else { 233 | 1 234 | }; 235 | let mut temperature: i16 = (chars[i] - b'0') as i16; 236 | i += 1; 237 | if chars[i] == b'.' { 238 | i += 1; 239 | } else { 240 | temperature = 10 * temperature + (chars[i] - b'0') as i16; 241 | i += 2; 242 | } 243 | (sign * (10 * temperature + (chars[i] - b'0') as i16), i + 2) 244 | } 245 | --------------------------------------------------------------------------------