├── .gitignore ├── Analysis.jl ├── Common.jl ├── IntervalModel.jl ├── Main.jl ├── Makefile ├── README.md ├── SizeModel.jl ├── common.h ├── diff_leveldb.patch ├── diff_rocksdb.patch ├── leveldb.cpp ├── leveldb.h ├── leveldb_impl.cpp ├── leveldb_impl.h ├── main.cpp ├── measure_rw.cpp ├── meshdb.cpp ├── meshdb.h ├── rocksdb_impl.cpp ├── rocksdb_impl.h ├── stat.h ├── util.cpp ├── util.h └── zipf.h /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.d 3 | main 4 | measure_rw 5 | perf.data* 6 | output.txt 7 | output_*.txt 8 | output_*.txt.tmp 9 | #!output_sensitivity.txt 10 | #!output_leveldb-sim_*.txt 11 | #!output_leveldb-impl_*.txt 12 | #!output_rocksdb-impl_*.txt 13 | #!output_measure_rw.txt 14 | #!output_universal_compaction.txt 15 | .*.sw? 16 | *.pyc 17 | leveldb_files 18 | rocksdb_files 19 | msls_*.tar.bz2 20 | leveldb 21 | rocksdb -------------------------------------------------------------------------------- /Common.jl: -------------------------------------------------------------------------------- 1 | module Common 2 | # using Ipopt 3 | 4 | export Distribution 5 | 6 | export check_validity 7 | export update_derived_values 8 | 9 | export unique 10 | export unique_inv 11 | 12 | export unique_avg 13 | 14 | export density 15 | export density_sum 16 | export interval_from_density 17 | 18 | export merge 19 | 20 | export ccp 21 | 22 | export load_zipf_compressed 23 | 24 | export geom_mean 25 | 26 | export get_wa 27 | 28 | 29 | type Distribution 30 | count::Int64 31 | c::Array{Float64} 32 | p::Array{Float64} 33 | 34 | # derived values 35 | p1::Array{Float64} 36 | c_log_p1::Array{Float64} 37 | end 38 | 39 | function check_validity(X::Distribution) 40 | @assert X.count > 0 41 | @assert length(X.c) == length(X.p) 42 | 43 | count = 0. 44 | prob = 0. 45 | 46 | for i = 1:length(X.c) 47 | @assert X.c[i] != 0. 48 | @assert X.p[i] != 0. 49 | @assert X.p[i] != 1. 50 | 51 | count += X.c[i] 52 | prob += X.c[i] * X.p[i] 53 | end 54 | 55 | @assert abs(Float64(X.count) / count - 1.) < 0.001 56 | @assert abs(prob - 1.) < 0.001 57 | end 58 | 59 | function update_derived_values(X::Distribution) 60 | X.p1 = Array(Float64, length(X.c)) 61 | X.c_log_p1 = Array(Float64, length(X.c)) 62 | for i = 1:length(X.c) 63 | p1 = 1. - X.p[i] 64 | X.p1[i] = p1 65 | X.c_log_p1[i] = X.c[i] * log(p1) 66 | end 67 | end 68 | 69 | type UniqueParam 70 | X::Distribution 71 | c::Float64 72 | end 73 | 74 | function hash(x::UniqueParam) 75 | hash(x.c) 76 | end 77 | 78 | function isequal(x::UniqueParam, y::UniqueParam) 79 | x.c == y.c && x.X == y.X 80 | end 81 | 82 | # global unique_memoization = Dict{UniqueParam, Float64}() 83 | 84 | function unique(X::Distribution, c::Float64) 85 | # global unique_memoization::Dict{UniqueParam, Float64} 86 | 87 | if c == Inf 88 | return Float64(X.count) 89 | end 90 | 91 | @assert c >= 0. 92 | 93 | # get!(unique_memoization, UniqueParam(X, c)) do 94 | s = Float64(X.count) 95 | for i = 1:length(X.c) 96 | # s -= X.c[i] * ((1. - X.p[i]) ^ c) 97 | s -= X.c[i] * (X.p1[i] ^ c) 98 | end 99 | 100 | s 101 | # end 102 | end 103 | 104 | function unique_diff(X::Distribution, c::Float64) 105 | s = 0. 106 | for i = 1:length(X.c) 107 | # p1 = 1. 
- X.p[i] 108 | # s -= X.c[i] * (p1 ^ c) * log(p1) 109 | s -= X.c_log_p1[i] * (X.p1[i] ^ c) 110 | end 111 | 112 | s 113 | end 114 | 115 | function unique_int(X::Distribution, c0::Float64, c1::Float64) 116 | @assert c0 < c1 117 | f = (c) -> begin 118 | unique(X, c) 119 | end 120 | I, E = quadgk(f, c0, c1, maxevals=10) 121 | I 122 | end 123 | 124 | function discrete_sum(f, a::Float64, b::Float64, maxevals::Int64=10) 125 | pq = Collections.PriorityQueue() 126 | vs = [] 127 | 128 | f_a = f(a) 129 | push!(vs, (a, f_a)) 130 | f_b = f(b) 131 | push!(vs, (b, f_b)) 132 | Collections.enqueue!(pq, (a, b, f_a, f_b), -abs((f_a - f_b) * (a - b))) 133 | eval = 2 134 | 135 | while eval < maxevals 136 | try 137 | a, b, f_a, f_b = Collections.dequeue!(pq) 138 | catch y 139 | if isa(y, BoundsError) 140 | break 141 | end 142 | end 143 | # println("eval=", eval, " a=", a, " b=", b, " f_a=", f_a, " f_b=", f_b, " diff=", abs(f_a - f_b)) 144 | if a + 1. < b 145 | m = round((a + b) / 2.) 146 | @assert a != m 147 | @assert b != m 148 | f_m = f(m) 149 | push!(vs, (m, f_m)) 150 | Collections.enqueue!(pq, (a, m, f_a, f_m), -abs((f_a - f_m) * (a - m))) 151 | Collections.enqueue!(pq, (m, b, f_m, f_b), -abs((f_m - f_b) * (m - b))) 152 | eval += 1 153 | end 154 | end 155 | 156 | sort!(vs) 157 | 158 | sum = 0. 159 | len = length(vs) 160 | for i = 2:len 161 | sum += vs[i - 1][2] * (vs[i][1] - vs[i - 1][1]) 162 | end 163 | sum += vs[end][2] * 1. 164 | 165 | sum 166 | end 167 | 168 | function unique_avg(X::Distribution, c0::Float64, c1::Float64) 169 | # remove a negative range that are not valid for unique(); quadgk() can emit DomainError otherwise 170 | if c0 < 0. 171 | c0 = 0. 172 | end 173 | if c1 < 0. 174 | c1 = 0. 175 | end 176 | 177 | if c0 > c1 178 | 0. 179 | elseif c0 == c1 180 | unique(X, c0) 181 | else 182 | # if c1 / c0 < 100. 183 | # sum = 0. 184 | # count = 0 185 | # for i = 1:int64(ceil(c1 / c0)) 186 | # sum += unique(X, c0 * Float64(i)) 187 | # count += 1 188 | # end 189 | # sum / count 190 | # # discrete_sum(f, start_step, end_step) / (end_step - start_step + 1.) 191 | # else 192 | unique_int(X, c0, c1) / (c1 - c0) 193 | # end 194 | 195 | # start_step = 1. 196 | # end_step = c1 / c0 197 | # f = (step) -> begin 198 | # unique(X, c0 * step) 199 | # end 200 | # discrete_sum(f, start_step, end_step) / (end_step - start_step + 1.) 201 | end 202 | end 203 | 204 | 205 | type UniqueInvParam 206 | X::Distribution 207 | u::Float64 208 | end 209 | 210 | function hash(x::UniqueInvParam) 211 | hash(x.u) 212 | end 213 | 214 | function isequal(x::UniqueInvParam, y::UniqueInvParam) 215 | x.u == y.u && x.X == y.X 216 | end 217 | 218 | # global unique_inv_memoization = Dict{UniqueInvParam, Float64}() 219 | 220 | function unique_inv(X::Distribution, u::Float64) 221 | unique_inv_nt(X, u) 222 | # unique_inv_ipopt(X, u) 223 | end 224 | 225 | function unique_inv_nt(X::Distribution, u::Float64) 226 | # Newton's method 227 | # global unique_inv_memoization::Dict{UniqueInvParam, Float64} 228 | 229 | if u >= Float64(X.count) * (1 - 0.000001) 230 | return Inf 231 | end 232 | 233 | # get!(unique_inv_memoization, UniqueInvParam(X, u)) do 234 | # take u as the initial c 235 | c = u 236 | for count = 1:100 237 | u1 = unique(X, c) 238 | if abs(u1 / u - 1.) < 0.001 239 | break 240 | end 241 | c -= (u1 - u) / unique_diff(X, c) 242 | 243 | if c < 0. 244 | c = 0. 
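# (clamping at 0 keeps c inside unique()'s domain; Newton's method resumes from the boundary on the next iteration)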
245 | end 246 | end 247 | 248 | c 249 | # end 250 | end 251 | 252 | function unique_inv_ipopt(X::Distribution, u::Float64) 253 | # inaccurate 254 | 255 | # if u >= Float64(X.count) * (1 - 0.000001) 256 | # return Inf 257 | # end 258 | 259 | eval_f = (x) -> begin 260 | abs(unique(X, x[1]) - u) / u 261 | end 262 | 263 | eval_grad_f = (x, grad_f) -> begin 264 | # grad_f[1] = unique_diff(X, x[1]) 265 | diff = x[1] * 0.001 266 | grad_f[1] = (abs(unique(X, x[1] + diff) - u) - abs(unique(X, x[1]) - u)) / u / diff 267 | end 268 | 269 | eval_g = (x, g) -> begin 270 | # g[1] = x[1] 271 | end 272 | 273 | eval_jac_g = (x, mode, rows, cols, values) -> begin 274 | # if mode == :Structure 275 | # rows[1] = 1 276 | # cols[1] = 1 277 | # else 278 | # values[1] = 1. 279 | # end 280 | end 281 | 282 | v_L = [1.] 283 | v_U = [Float64(X.c) ^ 2.] 284 | 285 | # g_L = [1.] 286 | # # # g_U = [2.e19] 287 | # g_U = [Float64(X.c) ^ 2.] 288 | g_L = Array(Float64, 0) 289 | g_U = Array(Float64, 0) 290 | 291 | prob = createProblem(1, v_L, v_U, 292 | 0, g_L, g_U, 293 | 0, 0, 294 | eval_f, eval_g, eval_grad_f, eval_jac_g) 295 | 296 | addOption(prob, "hessian_approximation", "limited-memory") 297 | 298 | # addOption(prob, "tol", 0.1) 299 | 300 | addOption(prob, "print_level", 2); 301 | 302 | prob.x = [u] 303 | status = solveProblem(prob) 304 | # ret = Ipopt.ApplicationReturnStatus[status] 305 | # obj_val = prob.obj_val 306 | # println("$obj_val in unique_inv (returned $ret)") 307 | 308 | prob.x[1] 309 | end 310 | 311 | 312 | function density(X::Distribution, interval::Float64, d::Float64) 313 | n = X.count 314 | v = unique(X, d / n * interval) / n 315 | #println(v) 316 | v 317 | end 318 | 319 | function density_sum(X::Distribution, interval::Float64) 320 | @assert interval >= 0. 321 | 322 | n = X.count 323 | 324 | # using integration 325 | f = (d) -> begin 326 | v = density(X, interval, d) 327 | @assert !isnan(v) 328 | v 329 | end 330 | #I, E = quadgk(f, 1., n, maxevals=10) 331 | I, E = quadgk(f, 0., n - 1., maxevals=10) 332 | I 333 | 334 | # using a geometric sum of unique() - this is fast but has a precision issue with large n due to the use of close-to-zero divisions 335 | # s = Float64(n) 336 | # for i = 1:length(X.c) 337 | # s -= X.c[i] * (1. - (X.p1[i] ^ interval)) / (1. - (X.p1[i] ^ (interval / n))) / n 338 | # end 339 | # s 340 | end 341 | 342 | function interval_from_density(X::Distribution, u::Float64) 343 | # fix up an invalid u that can be created by the solver 344 | u = min(u, float(X.count)) 345 | 346 | # unique_inv() * 2 is usually close to the solution 347 | c = unique_inv(X, u) * 2. 348 | for count = 1:100 349 | u1 = density_sum(X, c) 350 | if abs(u1 / u - 1.) < 0.001 351 | break 352 | end 353 | diff = (u1 - density_sum(X, c * 1.01)) / (c - c * 1.01) 354 | if isnan(diff) 355 | println(diff, " ", u1, " ", density_sum(X, c * 1.1), " ", c) 356 | @assert false 357 | end 358 | c -= (u1 - u) / diff 359 | 360 | if c < 0. 361 | c = 0. 362 | end 363 | end 364 | 365 | c 366 | end 367 | 368 | 369 | function merge(X::Distribution, n1::Float64, n2::Float64) 370 | c = unique_inv(X, n1) + unique_inv(X, n2) 371 | unique(X, c) 372 | end 373 | 374 | 375 | function ccp_subset_sum_choose(X::Distribution, q::Int64, pos::Int64, min::Int64, p_sum::Float64) 376 | if pos > q 377 | return 1. / (1. - p_sum) 378 | end 379 | m = X.count 380 | s = 0. 
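# enumerate all q-element coupon subsets in increasing index order;
# each completed subset contributes 1 / (1 - p_sum), the term used by
# ccp()'s inclusion-exclusion formula for the coupon collector's expectation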
381 | for i = min:m 382 | s += ccp_subset_sum_choose(X, q, pos + 1, i + 1, p_sum + X.p[i]) 383 | end 384 | 385 | s 386 | end 387 | 388 | function ccp_subset_sum(X::Distribution, q::Int64) 389 | ccp_subset_sum_choose(X, q, 1, 1, 0.) 390 | end 391 | 392 | function ccp(X::Distribution, j::Int64) 393 | # Coupon collector's problem; expected time to collect j coupons whose distribution is X 394 | # this is quite slow for large X (e.g., > 30) 395 | 396 | m = X.count 397 | for i = 1:m 398 | # ccp_subset_sum() cannot handle non-1 cardinality 399 | @assert X.c[i] == 1. 400 | end 401 | 402 | t = 0. 403 | for q = 0:(j - 1) 404 | t += Float64((-1) ^ (j - 1 - q) * binomial(m - q - 1, m - j)) * ccp_subset_sum(X, q) 405 | end 406 | 407 | t 408 | end 409 | 410 | function zipf(count::Int64, s::Float64) 411 | X = Distribution(count, Array(Float64, count), Array(Float64, count), Array(Float64, 0), Array(Float64, 0)) 412 | p_sum = 0. 413 | for i = 1:count 414 | if s == 0. 415 | p = 1. 416 | elseif s == 1. 417 | p = 1. / (Float64(i)) 418 | else 419 | p = 1. / (Float64(i) ^ s) 420 | end 421 | p_sum += p 422 | X.c[i] = 1. 423 | X.p[i] = p 424 | end 425 | X.p /= p_sum 426 | check_validity(X) 427 | 428 | X 429 | end 430 | 431 | function zipf_compressed(count::Int64, s::Float64, rel_diff::Float64) 432 | X = Distribution(count, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0)) 433 | 434 | p_denom = 0. 435 | 436 | if s == 0. 437 | p = 1. 438 | elseif s == 1. 439 | p = 1. / (Float64(count + 1 - 1)) 440 | else 441 | p = 1. / (Float64(count + 1 - 1) ^ s) 442 | end 443 | c = 1. 444 | p_denom += p 445 | min_p = p 446 | c_sum = c 447 | p_sum = c * p 448 | for i = 2:count 449 | if s == 0. 450 | p = 1. 451 | elseif s == 1. 452 | p = 1. / (Float64(count + 1 - i)) 453 | else 454 | p = 1. / (Float64(count + 1 - i) ^ s) 455 | end 456 | c = 1. 457 | p_denom += p 458 | @assert min_p <= p 459 | if p / min_p - 1. <= rel_diff 460 | c_sum += c 461 | p_sum += c * p 462 | else 463 | push!(X.c, c_sum) 464 | push!(X.p, p_sum / c_sum) 465 | min_p = p 466 | c_sum = c 467 | p_sum = c * p 468 | end 469 | end 470 | push!(X.c, c_sum) 471 | push!(X.p, p_sum / c_sum) 472 | X.p /= p_denom 473 | check_validity(X) 474 | 475 | X 476 | end 477 | 478 | function load_zipf_compressed(count::Int64, s::Float64, rel_diff::Float64) 479 | filename = string("data/zipf_", count, "_", s, "_", rel_diff, ".dat") 480 | 481 | X = Distribution(0, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0)) 482 | try 483 | f = open(filename, "r") 484 | X.count = deserialize(f) 485 | X.c = deserialize(f) 486 | X.p = deserialize(f) 487 | close(f) 488 | catch 489 | println("creating $filename") 490 | X = zipf_compressed(count, s, rel_diff) 491 | f = open(filename, "w") 492 | serialize(f, X.count) 493 | serialize(f, X.c) 494 | serialize(f, X.p) 495 | close(f) 496 | end 497 | X 498 | end 499 | 500 | 501 | function compress(X::Distribution, rel_diff::Float64) 502 | new_X = Distribution(X.count, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0)) 503 | 504 | perm = sortperm(X.p) 505 | 506 | p = X.p[perm[1]] 507 | c = X.c[perm[1]] 508 | min_p = p 509 | c_sum = c 510 | p_sum = c * p 511 | for idx in perm[2:end] 512 | p = X.p[idx] 513 | c = X.c[idx] 514 | @assert min_p <= p 515 | if p / min_p - 1. 
<= rel_diff 516 | c_sum += c 517 | p_sum += c * p 518 | else 519 | push!(new_X.c, c_sum) 520 | push!(new_X.p, p_sum / c_sum) 521 | min_p = p 522 | c_sum = c 523 | p_sum = c * p 524 | end 525 | end 526 | push!(new_X.c, c_sum) 527 | push!(new_X.p, p_sum / c_sum) 528 | check_validity(new_X) 529 | 530 | new_X 531 | end 532 | 533 | function geom_mean(A::Array{Float64}) 534 | s = 0. 535 | for a in A 536 | s += 1. / a 537 | end 538 | 1. / s 539 | end 540 | 541 | function get_wa(wa_r_factor::Float64, t) 542 | wa_r = t[1] 543 | wa_w = t[2] 544 | return sum(wa_w) + wa_r_factor * sum(wa_r) 545 | end 546 | 547 | end 548 | -------------------------------------------------------------------------------- /IntervalModel.jl: -------------------------------------------------------------------------------- 1 | module IntervalModel 2 | 3 | using Common 4 | # using NLopt 5 | using Ipopt 6 | 7 | function init_intervals(log_size::Float64, l0_count::Float64, level_count::Int64) 8 | interval = Array(Float64, 0) 9 | for i = 1:level_count 10 | # push!(interval, log_size * l0_count) 11 | push!(interval, log_size * l0_count * (10. ^ Float64(i - 1))) 12 | end 13 | interval 14 | end 15 | 16 | function calculate_wa_twolevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64}) 17 | # mem->log 18 | wa_r[1] = 0. 19 | wa_w[1] = 1. 20 | 21 | # log->0 22 | wa_r[2] = 0. 23 | wa_w[2] = Common.unique(X, log_size) / log_size 24 | 25 | # ## amortized, full destination level 26 | # # 0->1, 1->2, ... 27 | # for i in 1:(length(intervals) - 1) 28 | # wa[2 + i] = Common.unique(X, intervals[i] + intervals[i + 1]) / intervals[i] 29 | # end 30 | # wa[2 + length(intervals)] = Float64(X.count) / intervals[end] 31 | 32 | # ## amortized, compact entire level 33 | # # 0->1, 1->2, ... 34 | # interval = 0. 35 | # next_interval = intervals[1] 36 | # for i in 1:(length(intervals) - 1) 37 | # interval = interval * 0.5 + next_interval 38 | # next_interval = intervals[i + 1] 39 | # wa[2 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval 40 | # end 41 | # interval = interval * 0.5 + next_interval 42 | # wa[2 + length(intervals)] = Float64(X.count) / interval 43 | 44 | ## deamortized, compact each sstable in a round-robin way 45 | # 0->1, 1->2, ... 46 | interval = 0. 47 | next_interval = intervals[1] 48 | for i in 1:(length(intervals) - 1) 49 | if i == 1 50 | # 0->1 compaction is usually a whole level 51 | # do not use interval_from_density() and adding extra unique() to WA that are caused by using small tables 52 | interval = next_interval 53 | next_interval = intervals[i + 1] 54 | wa_r[2 + i] = (Common.unique(X, log_size) * l0_count + Common.unique(X, next_interval)) / interval 55 | wa_w[2 + i] = Common.unique(X, interval + next_interval) / interval 56 | else 57 | interval = interval + interval_from_density(X, Common.unique(X, next_interval)) 58 | next_interval = intervals[i + 1] 59 | # using additional unique(); see SizeMode.jl for details 60 | wa_r[2 + i] = (Common.unique(X, interval) + Common.unique(X, next_interval) + Common.unique(X, interval) * 1.) / interval 61 | wa_w[2 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval 62 | end 63 | end 64 | interval = interval + interval_from_density(X, Common.unique(X, next_interval)) 65 | # using additional unique(); see SizeMode.jl for details 66 | wa_r[2 + length(intervals)] = (Common.unique(X, interval) + Float64(X.count) + Common.unique(X, interval) * 1.) 
/ interval 67 | wa_w[2 + length(intervals)] = (Float64(X.count) + Common.unique(X, interval) * 1.) / interval 68 | 69 | wa_r, wa_w 70 | end 71 | 72 | function calculate_wa_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}) 73 | wa_r = Array(Float64, 2 + length(intervals)) 74 | wa_w = Array(Float64, 2 + length(intervals)) 75 | calculate_wa_twolevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r, wa_w) 76 | end 77 | 78 | # function calculate_wa_twolevel_ratios(X::Distribution, log_size::Float64, l0_count::Float64, interval_ratios::Array{Float64}) 79 | # current_interval = interval_ratios[1] 80 | # intervals = interval_ratios * (log_size * l0_count / current_interval) 81 | # return calculate_wa_twolevel(X, log_size, l0_count, intervals) 82 | # end 83 | 84 | function calculate_sizes_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}) 85 | sizes = Array(Float64, length(intervals)) 86 | 87 | for i in 1:(length(intervals) - 1) 88 | sizes[i] = Common.unique(X, intervals[i + 1]) 89 | end 90 | sizes[length(intervals)] = Float64(X.count) 91 | 92 | sizes 93 | end 94 | 95 | function calculate_wa_multilevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64}) 96 | # TODO: wa_r 97 | 98 | # mem->log 99 | wa_r[1] = 0. 100 | wa_w[1] = 1. 101 | 102 | # log->0 103 | wa_r[2] = 0. 104 | wa_w[2] = Common.unique(X, log_size) / log_size 105 | 106 | # ## amortized, full destination level 107 | # # 0->1, 1->2, ... 108 | # for i in 1:(length(intervals) - 1) 109 | # # level-0...1 size, level-0...2 size, ... 110 | # level_size = Common.unique(X, geom_mean(intervals[(i + 1):end])) 111 | # wa[2 + i] = level_size / intervals[i] 112 | # end 113 | # wa[2 + length(intervals)] = Float64(X.count) / intervals[end] 114 | 115 | ## amortized, compact entire level (TODO: do we need to modify interval to consider "0.5" factor?) 116 | # 0->1, 1->2, ... 117 | # interval = 0. 118 | # next_interval = geom_mean(intervals) 119 | for i in 1:(length(intervals) - 1) 120 | wa_r[2 + i] = 0. 121 | wa_w[2 + i] = unique_avg(X, geom_mean(intervals[i:end]), geom_mean(intervals[i:end]) * 0.5 + geom_mean(intervals[(i + 1):end])) / intervals[i] 122 | # interval = interval * 0.5 + next_interval 123 | # next_interval = geom_mean(intervals[(i + 1):end]) 124 | # wa[2 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval 125 | end 126 | wa_r[2 + length(intervals)] = 0. 
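# last level: each compaction rewrites the whole key space, so the write cost
# is X.count per intervals[end] inserted entries (wa_r stays 0.; see the TODO above)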
127 | wa_w[2 + length(intervals)] = Float64(X.count) / intervals[end] 128 | 129 | wa_r, wa_w 130 | end 131 | 132 | function calculate_wa_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}) 133 | wa_r = Array(Float64, 2 + length(intervals)) 134 | wa_w = Array(Float64, 2 + length(intervals)) 135 | calculate_wa_multilevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r, wa_w) 136 | end 137 | 138 | # function calculate_wa_multilevel_ratios(X::Distribution, log_size::Float64, l0_count::Float64, interval_ratios::Array{Float64}) 139 | # current_interval = geom_mean(interval_ratios) 140 | # intervals = interval_ratios * (log_size * l0_count / current_interval) 141 | # return calculate_wa_multilevel(X, log_size, l0_count, intervals) 142 | # end 143 | 144 | function calculate_sizes_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}) 145 | sizes = Array(Float64, length(intervals)) 146 | 147 | for i in 1:(length(intervals) - 1) 148 | sizes[i] = Common.unique(X, geom_mean(intervals[i + 1:end])) 149 | end 150 | sizes[length(intervals)] = Float64(X.count) 151 | 152 | sizes 153 | end 154 | 155 | function optimize_wa_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, init_intervals::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64) 156 | n = X.count 157 | level_count = length(init_intervals) 158 | 159 | # v2 = Array(Float64, level_count) 160 | # v2[1] = log_size * l0_counum 161 | 162 | # count = 0 163 | # wa_r = Array(Float64, 2 + level_count) 164 | # wa_w = Array(Float64, 2 + level_count) 165 | # f = (v, grad) -> begin 166 | # count += 1 167 | # v2[2:level_count] = v 168 | # get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 169 | # end 170 | 171 | # v = init_intervals[2:end] 172 | 173 | # opt = Opt(:LN_COBYLA, level_count - 1) 174 | # min_objective!(opt, f) 175 | # # inequality_constraint!(opt, (v, grad) -> log_size * l0_count - v[1]) # <= 0 176 | # for i = 1:(level_count - 2) 177 | # inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1]) # <= 0 178 | # end 179 | # ftol_abs!(opt, ftol) 180 | # maxtime!(opt, max_time) 181 | # @time (minf, minx, ret) = optimize(opt, v) 182 | # println("got $minf at $minx after $count iterations (returned $ret)") 183 | 184 | # cat(1, [log_size * l0_count], minx) 185 | 186 | ####################### 187 | 188 | v2 = Array(Float64, level_count) 189 | v2[1] = log_size * l0_count 190 | 191 | count = 0 192 | wa_r = Array(Float64, 2 + level_count) 193 | wa_w = Array(Float64, 2 + level_count) 194 | 195 | eval_f = (v) -> begin 196 | count += 1 197 | v2[2:level_count] = v 198 | get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 199 | end 200 | 201 | eval_grad_f = (v, grad_f) -> begin 202 | v2[2:level_count] = v 203 | y = get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 204 | for i = 2:level_count 205 | diff = max(v2[i] * 0.001, 1.) 
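# forward finite difference: perturb one interval at a time, re-evaluate the
# objective, and restore; the step is 0.1% of the value with a floor of 1 entry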
206 | org = v2[i] 207 | v2[i] += diff 208 | grad_f[i - 1] = (get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff 209 | v2[i] = org 210 | end 211 | end 212 | 213 | eval_g = (v, g) -> begin 214 | for i = 1:(level_count - 2) 215 | g[i] = v[i] - v[i + 1] 216 | end 217 | end 218 | 219 | # level i's interval - level i+1's interval <= 0 220 | eval_jac_g = (v, mode, rows, cols, values) -> begin 221 | if mode == :Structure 222 | c = 1 223 | for i = 1:level_count - 2 224 | rows[c] = i 225 | cols[c] = i 226 | c += 1 227 | rows[c] = i 228 | cols[c] = i + 1 229 | c += 1 230 | end 231 | else 232 | c = 1 233 | for i = 1:level_count - 2 234 | values[c] = 1. 235 | c += 1 236 | values[c] = -1. 237 | c += 1 238 | end 239 | end 240 | end 241 | 242 | v_L = [log_size * l0_count for i = 1:level_count - 1] 243 | v_U = [2.e19 for i = 1:level_count - 1] 244 | 245 | g_L = [-2.e19 for i = 1:level_count - 2] 246 | g_U = [0. for i = 1:level_count - 2] 247 | 248 | prob = createProblem(level_count - 1, v_L, v_U, 249 | level_count - 2, g_L, g_U, 250 | (level_count - 2) * 2, 0, 251 | eval_f, eval_g, eval_grad_f, eval_jac_g) 252 | 253 | addOption(prob, "hessian_approximation", "limited-memory") 254 | 255 | addOption(prob, "tol", ftol) 256 | addOption(prob, "max_cpu_time", max_time) 257 | addOption(prob, "acceptable_iter", 1000) 258 | 259 | addOption(prob, "print_level", 2) 260 | 261 | prob.x = init_intervals[2:end] 262 | 263 | @time status = solveProblem(prob) 264 | 265 | ret = Ipopt.ApplicationReturnStatus[status] 266 | minf = prob.obj_val 267 | minx = prob.x 268 | println("got $minf at $minx after $count iterations (returned $ret)") 269 | 270 | cat(1, [log_size * l0_count], minx) 271 | end 272 | 273 | function optimize_wa_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, init_intervals::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64) 274 | n = X.count 275 | level_count = length(init_intervals) 276 | 277 | # v2 = Array(Float64, level_count) 278 | 279 | # count = 0 280 | # wa_r = Array(Float64, 2 + level_count) 281 | # wa_w = Array(Float64, 2 + level_count) 282 | # f = (v, grad) -> begin 283 | # count += 1 284 | # # we need to make geom_mean(cat(1, [X], v)) = log_size * l0_count 285 | # # 1/X + .. = 1 / (log_size * l0_count) 286 | # # 1/X = 1 / (log_size * l0_count) - ... 287 | # # X = 1 / (1 / (log_size * l0_count) - ...) 
288 | # # = geom_mean(cat(1, [log_size * l0_count], -v)) 289 | 290 | # # v2[1] = geom_mean(cat(1, [log_size * l0_count], -v)) 291 | 292 | # v2[1] = -(log_size * l0_count) 293 | # v2[2:level_count] = v 294 | # v2[1] = -geom_mean(v2) 295 | # get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 296 | # end 297 | 298 | # v = init_intervals[2:end] 299 | 300 | # opt = Opt(:LN_COBYLA, level_count - 1) 301 | # min_objective!(opt, f) 302 | # for i = 1:(level_count - 2) 303 | # inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1]) # <= 0 304 | # end 305 | # ftol_abs!(opt, ftol) 306 | # maxtime!(opt, max_time) 307 | # @time (minf, minx, ret) = optimize(opt, v) 308 | # println("got $minf at $minx after $count iterations (returned $ret)") 309 | 310 | # x = geom_mean(cat(1, [log_size * l0_count], -minx)) 311 | # cat(1, [x], minx) 312 | 313 | ####################### 314 | 315 | v2 = Array(Float64, level_count) 316 | v2[1] = log_size * l0_count 317 | 318 | count = 0 319 | wa_r = Array(Float64, 2 + level_count) 320 | wa_w = Array(Float64, 2 + level_count) 321 | 322 | eval_f = (v) -> begin 323 | count += 1 324 | v2[1] = -(log_size * l0_count) 325 | v2[2:level_count] = v 326 | v2[1] = -geom_mean(v2) 327 | # note that v2[1] can become negative accidentally, which is not valid for unique() 328 | get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 329 | end 330 | 331 | eval_grad_f = (v, grad_f) -> begin 332 | v2[1] = -(log_size * l0_count) 333 | v2[2:level_count] = v 334 | v2[1] = -geom_mean(v2) 335 | y = get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w)) 336 | for i = 2:level_count 337 | diff = max(v2[i] * 0.001, 1.) 338 | org = v2[i] 339 | v2[i] += diff 340 | v2[1] = -(log_size * l0_count) 341 | v2[1] = -geom_mean(v2) 342 | grad_f[i - 1] = (get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff 343 | v2[i] = org 344 | end 345 | end 346 | 347 | eval_g = (v, g) -> begin 348 | for i = 1:(level_count - 2) 349 | g[i] = v[i] - v[i + 1] 350 | end 351 | end 352 | 353 | # level i's interval - level i+1's interval <= 0 354 | eval_jac_g = (v, mode, rows, cols, values) -> begin 355 | if mode == :Structure 356 | c = 1 357 | for i = 1:level_count - 2 358 | rows[c] = i 359 | cols[c] = i 360 | c += 1 361 | rows[c] = i 362 | cols[c] = i + 1 363 | c += 1 364 | end 365 | else 366 | c = 1 367 | for i = 1:level_count - 2 368 | values[c] = 1 369 | c += 1 370 | values[c] = -1 371 | c += 1 372 | end 373 | end 374 | end 375 | 376 | v_L = [log_size * l0_count for i = 1:level_count - 1] 377 | v_U = [2.e19 for i = 1:level_count - 1] 378 | 379 | g_L = [-2.e19 for i = 1:level_count - 2] 380 | g_U = [0. 
for i = 1:level_count - 2] 381 | 382 | prob = createProblem(level_count - 1, v_L, v_U, 383 | level_count - 2, g_L, g_U, 384 | (level_count - 2) * 2, 0, 385 | eval_f, eval_g, eval_grad_f, eval_jac_g) 386 | 387 | addOption(prob, "hessian_approximation", "limited-memory") 388 | 389 | addOption(prob, "tol", ftol) 390 | addOption(prob, "max_cpu_time", max_time) 391 | addOption(prob, "acceptable_iter", 1000) 392 | 393 | addOption(prob, "print_level", 2) 394 | 395 | prob.x = init_intervals[2:end] 396 | 397 | @time status = solveProblem(prob) 398 | 399 | ret = Ipopt.ApplicationReturnStatus[status] 400 | minf = prob.obj_val 401 | minx = prob.x 402 | println("got $minf at $minx after $count iterations (returned $ret)") 403 | 404 | x = geom_mean(cat(1, [log_size * l0_count], -minx)) 405 | cat(1, [x], minx) 406 | end 407 | 408 | function print_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r_factor::Float64) 409 | level_count = length(intervals) 410 | 411 | println("intervals = ", [iround(v) for v in intervals]) 412 | println("exp. size = ", [iround(Common.unique(X, v)) for v in intervals]) 413 | println("(", [round(intervals[i] / intervals[i - 1] * 100.) / 100. for i in 2:length(intervals)], " X)") 414 | wa = calculate_wa_twolevel(X, log_size, l0_count, intervals) 415 | println("WA (mem->log) = ", wa[2][1]) 416 | println("WA (log->0) = ", wa[2][2]) 417 | for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end 418 | println("WA = ", get_wa(wa_r_factor, wa)) 419 | end 420 | 421 | function print_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r_factor::Float64) 422 | level_count = length(intervals) 423 | 424 | println("intervals = ", [iround(v) for v in intervals]) 425 | println("(", [round(intervals[i] / intervals[i - 1] * 100.) / 100. 
for i in 2:length(intervals)], " X)") 426 | wa = calculate_wa_multilevel(X, log_size, l0_count, intervals) 427 | println("avg L0 intervals = ", iround(geom_mean(intervals))) 428 | println("WA (mem->log) = ", wa[2][1]) 429 | println("WA (log->0) = ", wa[2][2]) 430 | for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end 431 | println("WA = ", get_wa(wa_r_factor, wa)) 432 | end 433 | 434 | end 435 | -------------------------------------------------------------------------------- /Main.jl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/julia 2 | 3 | include("Common.jl") 4 | include("SizeModel.jl") 5 | include("IntervalModel.jl") 6 | include("Analysis.jl") 7 | 8 | #Analysis.run() 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS=\ 2 | -Ileveldb/include \ 3 | -Irocksdb/include \ 4 | -pthread \ 5 | -g -Wall -Wextra -Wsign-conversion -Winline -Wno-unused-function \ 6 | -Wconversion \ 7 | -O3 \ 8 | -march=native \ 9 | -std=c++0x 10 | # -std=c++11 11 | # -fno-omit-frame-pointer 12 | 13 | MAIN_SRC=util.cpp leveldb.cpp leveldb_impl.cpp rocksdb_impl.cpp meshdb.cpp main.cpp 14 | MEASURE_RW_SRC=measure_rw.cpp 15 | 16 | TARGETS=main measure_rw 17 | 18 | MAIN_OBJ=$(patsubst %.cpp,%.o,$(MAIN_SRC)) 19 | MAIN_DEPFILES:=$(patsubst %.cpp,%.d,$(MAIN_SRC)) 20 | 21 | MEASURE_RW_OBJ=$(patsubst %.cpp,%.o,$(MEASURE_RW_SRC)) 22 | MEASURE_RW_DEPFILES:=$(patsubst %.cpp,%.d,$(MEASURE_RW_SRC)) 23 | 24 | all: $(TARGETS) 25 | 26 | main: $(MAIN_OBJ) leveldb/libleveldb.a rocksdb/librocksdb.a 27 | $(CXX) $(CXXFLAGS) -o $@ $^ -lsnappy -lz -lbz2 -lrt 28 | 29 | measure_rw: $(MEASURE_RW_OBJ) 30 | $(CXX) $(CXXFLAGS) -o $@ $^ 31 | 32 | clean: 33 | $(RM) $(MAIN_OBJ) $(MAIN_DEPFILES) $(MEASURE_RW_OBJ) $(MEASURE_RW_DEPFILES) $(TARGETS) 34 | 35 | 36 | # dependency checking from https://stackoverflow.com/a/313787 37 | NODEPS:=clean 38 | 39 | ifeq (0, $(words $(findstring $(MAKECMDGOALS), $(NODEPS)))) 40 | -include $(MAIN_DEPFILES) $(MEASURE_RW_DEPFILES) 41 | endif 42 | 43 | %.d: %.cpp 44 | $(CXX) $(CXXFLAGS) -MM -MT '$(patsubst %.cpp,%.o,$<)' $< -MF $@ 45 | # end 46 | 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Accurate and Fast Evaluation of Multi-Stage Log-Structured Designs 2 | ================================================================== 3 | 4 | 5 | 6 | Contributors 7 | ------------ 8 | 9 | * Hyeontaek Lim (CMU) 10 | 11 | 12 | License 13 | ------- 14 | 15 | Copyright 2014, 2015, 2016 Carnegie Mellon University 16 | 17 | Licensed under the Apache License, Version 2.0 (the "License"); 18 | you may not use this file except in compliance with the License. 19 | You may obtain a copy of the License at 20 | 21 | http://www.apache.org/licenses/LICENSE-2.0 22 | 23 | Unless required by applicable law or agreed to in writing, software 24 | distributed under the License is distributed on an "AS IS" BASIS, 25 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 26 | See the License for the specific language governing permissions and 27 | limitations under the License.
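Usage
-----

A minimal sketch of driving the analytical models from Julia (the actual experiment driver is `Analysis.jl`; the parameter values below are illustrative only, and the snippet assumes the `Ipopt` Julia package is installed and a `data/` directory exists for the cached Zipf distributions):

    include("Common.jl")
    include("SizeModel.jl")

    # compressed Zipf workload: 1M unique keys, skew 0.99, 1% bucketing tolerance
    X = Common.load_zipf_compressed(1000000, 0.99, 0.01)
    # populate X.p1 and X.c_log_p1, which Common.unique() reads
    Common.update_derived_values(X)

    log_size = 10000.   # log size, in entries
    l0_count = 4.       # number of level-0 tables
    # geometric level sizes starting at log_size * l0_count with growth factor 10
    sizes = SizeModel.init_sizes(X, log_size * l0_count, 10.)
    # print per-stage and total write amplification (wa_r_factor = 0. counts writes only)
    SizeModel.print(X, log_size, l0_count, sizes, 0.)

The C++ binaries (`main`, `measure_rw`) build with `make`, which expects `leveldb/` and `rocksdb/` checkouts patched with `diff_leveldb.patch` and `diff_rocksdb.patch`.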
28 | 29 | -------------------------------------------------------------------------------- /SizeModel.jl: -------------------------------------------------------------------------------- 1 | module SizeModel 2 | 3 | using Common 4 | # using NLopt 5 | using Ipopt 6 | 7 | function init_sizes(X::Distribution, l1_size::Float64, growth_factor::Float64=0., level_count::Int64=0) 8 | n = X.count 9 | 10 | if growth_factor != 0. 11 | # we are fine 12 | elseif level_count != 0 13 | growth_factor = exp(log(Float64(n) / l1_size) / Float64(level_count - 1)) 14 | else 15 | @assert false 16 | end 17 | 18 | sizes = Array(Float64, 0) 19 | i = 1 20 | while true 21 | size = l1_size * ceil(growth_factor ^ Float64(i - 1)) 22 | if size < Float64(n) 23 | push!(sizes, size) 24 | else 25 | push!(sizes, Float64(n)) 26 | break 27 | end 28 | i += 1 29 | end 30 | 31 | sizes 32 | end 33 | 34 | function calculate_ra!(X::Distribution, X_q::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, ra::Array{Float64}) 35 | end 36 | 37 | function calculate_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64}) 38 | @assert sizes[end] == Float64(X.count) 39 | 40 | # mem->log 41 | wa_r[1] = 0. 42 | wa_w[1] = 1. 43 | 44 | # log->0 45 | wa_r[2] = 0. 46 | wa_w[2] = Common.unique(X, log_size) / log_size 47 | 48 | # ## amortized, full destination level 49 | # # 0->1 50 | # wa[3] = Common.unique(X, unique_inv(X, sizes[1]) + (log_size * l0_count)) / (log_size * l0_count) 51 | # # 1->2, 2->3, ... 52 | # for i in 1:(length(sizes) - 1) 53 | # if i < length(sizes) - 1 54 | # wa[3 + i] = merge(X, sizes[i + 1], sizes[i]) / unique_inv(X, sizes[i]) 55 | # else 56 | # wa[3 + i] = Float64(X.count) / unique_inv(X, sizes[i]) 57 | # end 58 | # end 59 | 60 | # ## amortized, compact entire level, discrete interval calculation (maybe accurate, but does not work with optimizer) 61 | # # 0->1 62 | # interval = log_size * l0_count 63 | # next_interval = unique_inv(X, sizes[1]) 64 | # effective_next_interval = floor(next_interval / interval + 1.) * interval 65 | # wa[3] = unique_avg(X, interval, effective_next_interval) / interval 66 | # # 1->2, 2->3, ... 67 | # for i in 1:(length(sizes) - 1) 68 | # interval = effective_next_interval 69 | # if i < length(sizes) - 1 70 | # next_interval = unique_inv(X, sizes[i + 1]) 71 | # effective_next_interval = floor(next_interval / interval + 1.) * interval 72 | # wa[3 + i] = unique_avg(X, interval, effective_next_interval) / interval 73 | # else 74 | # wa[3 + i] = sizes[end] / interval 75 | # end 76 | # end 77 | 78 | # note that "interval * 0.5" is the overflown amount that causes compaction 79 | # 0.5 is just an approximate; it should be lower under high skew or with a key count close to the total unique count 80 | # because the level grows slowly as its size approaches the maximum level size. 81 | 82 | # ## amortized, compact entire level, continuous interval calculation (maybe less accurate) 83 | # # 0->1 84 | # interval = log_size * l0_count 85 | # next_interval = unique_inv(X, sizes[1]) 86 | # wa[3] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval 87 | # # 1->2, 2->3, ... 
88 | # for i in 1:(length(sizes) - 1) 89 | # interval = interval * 0.5 + next_interval 90 | # if i < length(sizes) - 1 91 | # next_interval = unique_inv(X, sizes[i + 1]) 92 | # wa[3 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval 93 | # else 94 | # wa[3 + i] = sizes[end] / interval 95 | # end 96 | # end 97 | 98 | ## deamortized, compact each sstable in a round-robin way 99 | # 0->1 100 | interval = log_size * l0_count 101 | next_interval = unique_inv(X, sizes[1]) 102 | wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval 103 | wa_w[3] = Common.unique(X, interval + next_interval) / interval 104 | # 1->2, 2->3, ... 105 | for i in 1:(length(sizes) - 1) 106 | # we need to take the previous interval as part of this interval ("interval +") 107 | # because the current level temporarily has to accommodate the data from the previous level 108 | interval = interval + interval_from_density(X, sizes[i]) 109 | if i < length(sizes) - 1 110 | next_interval = unique_inv(X, sizes[i + 1]) 111 | # plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range 112 | # TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse 113 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval 114 | wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval 115 | else 116 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval 117 | wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval 118 | end 119 | end 120 | 121 | wa_r, wa_w 122 | end 123 | 124 | function calculate_wa(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}) 125 | wa_r = Array(Float64, 2 + length(sizes)) 126 | wa_w = Array(Float64, 2 + length(sizes)) 127 | calculate_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r, wa_w) 128 | end 129 | 130 | function optimize_wa(X::Distribution, log_size::Float64, l0_count::Float64, init_sizes::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64) 131 | n = X.count 132 | level_count = length(init_sizes) 133 | 134 | # v2 = Array(Float64, level_count) 135 | # v2[level_count] = n 136 | 137 | # count = 0 138 | # wa = Array(Float64, 2 + level_count) 139 | # f = (v, grad) -> begin 140 | # if length(grad) > 0 141 | # v2[1:level_count - 1] = v 142 | # y = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa)) 143 | # for i = 1:length(grad) 144 | # org = v2[i] 145 | # v2[i] += 1 146 | # grad[i] = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa)) - y 147 | # v2[i] = org 148 | # end 149 | # end 150 | # count += 1 151 | # v2[1:level_count - 1] = v 152 | # get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa)) 153 | # end 154 | 155 | # gen_g = (i) -> (v, grad) -> begin 156 | # if length(grad) > 0 157 | # for j = 1:length(grad) 158 | # if i == j 159 | # grad[j] = 1. 160 | # elseif i + 1 == j 161 | # grad[j] = -1. 162 | # else 163 | # grad[j] = 0. 164 | # end 165 | # end 166 | # end 167 | # v[i] - v[i + 1] 168 | # end 169 | 170 | # v = init_sizes[1:end - 1] 171 | 172 | # v_L = [0. 
for i = 1:level_count - 1] 173 | # v_U = [2.e19 for i = 1:level_count - 1] 174 | 175 | # opt = Opt(:LN_COBYLA, level_count - 1) 176 | # # opt = Opt(:LD_MMA, level_count - 1) 177 | # min_objective!(opt, f) 178 | # lower_bounds!(opt, v_L) 179 | # upper_bounds!(opt, v_U) 180 | # for i = 1:(level_count - 2) 181 | # # inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1]) # <= 0 182 | # inequality_constraint!(opt, gen_g(i)) # <= 0 183 | # end 184 | # # inequality_constraint!(opt, (v, grad) -> v[level_count - 2] - n) # <= 0 185 | # ftol_abs!(opt, ftol) 186 | # maxtime!(opt, max_time) 187 | # @time (minf, minx, ret) = optimize(opt, v) 188 | # println("got $minf at $minx after $count iterations (returned $ret)") 189 | 190 | # cat(1, minx, [n]) 191 | 192 | ######################### 193 | 194 | v2 = Array(Float64, level_count) 195 | v2[level_count] = n 196 | 197 | count = 0 198 | wa_r = Array(Float64, 2 + level_count) 199 | wa_w = Array(Float64, 2 + level_count) 200 | 201 | eval_f = (v) -> begin 202 | count += 1 203 | v2[1:level_count - 1] = v 204 | get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w)) 205 | end 206 | 207 | eval_grad_f = (v, grad_f) -> begin 208 | v2[1:level_count - 1] = v 209 | y = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w)) 210 | for i = 1:(level_count - 1) 211 | diff = max(v2[i] * 0.001, 1.) 212 | org = v2[i] 213 | v2[i] += diff 214 | grad_f[i] = (get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff 215 | v2[i] = org 216 | end 217 | end 218 | 219 | eval_g = (v, g) -> begin 220 | for i = 1:(level_count - 2) 221 | g[i] = v[i] - v[i + 1] 222 | end 223 | end 224 | 225 | # level i's size - level i+1's size <= 0 226 | eval_jac_g = (v, mode, rows, cols, values) -> begin 227 | if mode == :Structure 228 | c = 1 229 | for i = 1:level_count - 2 230 | rows[c] = i 231 | cols[c] = i 232 | c += 1 233 | rows[c] = i 234 | cols[c] = i + 1 235 | c += 1 236 | end 237 | else 238 | c = 1 239 | for i = 1:level_count - 2 240 | values[c] = 1. 241 | c += 1 242 | values[c] = -1. 243 | c += 1 244 | end 245 | end 246 | end 247 | 248 | v_L = [1. for i = 1:level_count - 1] 249 | v_U = [Float64(n) for i = 1:level_count - 1] 250 | 251 | g_L = [-2.e19 for i = 1:level_count - 2] 252 | g_U = [0. for i = 1:level_count - 2] 253 | 254 | prob = createProblem(level_count - 1, v_L, v_U, 255 | level_count - 2, g_L, g_U, 256 | (level_count - 2) * 2, 0, 257 | eval_f, eval_g, eval_grad_f, eval_jac_g) 258 | 259 | addOption(prob, "hessian_approximation", "limited-memory") 260 | 261 | addOption(prob, "tol", ftol) 262 | addOption(prob, "max_cpu_time", max_time) 263 | addOption(prob, "acceptable_iter", 1000) 264 | 265 | addOption(prob, "print_level", 2) 266 | 267 | prob.x = init_sizes[1:end - 1] 268 | 269 | @time status = solveProblem(prob) 270 | 271 | ret = Ipopt.ApplicationReturnStatus[status] 272 | minf = prob.obj_val 273 | minx = prob.x 274 | println("got $minf at $minx after $count iterations (returned $ret)") 275 | 276 | cat(1, minx, [n]) 277 | end 278 | 279 | 280 | ########### 281 | 282 | function calculate_random_compaction_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64}) 283 | @assert sizes[end] == Float64(X.count) 284 | 285 | # mem->log 286 | wa_r[1] = 0. 287 | wa_w[1] = 1. 288 | 289 | # log->0 290 | wa_r[2] = 0. 
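# flushing the log writes only the unique keys among the log_size logged entries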
291 | wa_w[2] = Common.unique(X, log_size) / log_size 292 | 293 | ## deamortized, compact each sstable in a random way 294 | # 0->1 295 | interval = log_size * l0_count 296 | next_interval = unique_inv(X, sizes[1]) 297 | wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval 298 | wa_w[3] = Common.unique(X, interval + next_interval) / interval 299 | # 1->2, 2->3, ... 300 | for i in 1:(length(sizes) - 1) 301 | # we need to take the previous interval as part of this interval ("interval +") 302 | # because the current level temporarily has to accommodate the data from the previous level 303 | interval = interval + next_interval 304 | if i < length(sizes) - 1 305 | next_interval = unique_inv(X, sizes[i + 1]) 306 | # plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range 307 | # TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse 308 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval 309 | wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval 310 | else 311 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval 312 | wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval 313 | end 314 | end 315 | 316 | wa_r, wa_w 317 | end 318 | 319 | function calculate_random_compaction_wa(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}) 320 | wa_r = Array(Float64, 2 + length(sizes)) 321 | wa_w = Array(Float64, 2 + length(sizes)) 322 | calculate_random_compaction_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r, wa_w) 323 | end 324 | 325 | 326 | 327 | 328 | ########### 329 | 330 | 331 | function calculate_mbuf_wa!(X::Distribution, mbuf_size::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64}) 332 | @assert sizes[end] == Float64(X.count) 333 | 334 | # mem->log 335 | wa_r[1] = 0. 336 | wa_w[1] = 1. 337 | 338 | # log->mbuf 339 | wa_r[2] = 0. 340 | wa_w[2] = 0. 341 | 342 | ## deamortized, compact each sstable in a round-robin way 343 | # mbuf->1 344 | # no overflow from log because mbuf must be compacted proactively. 345 | interval = interval_from_density(X, mbuf_size) 346 | next_interval = unique_inv(X, sizes[1]) 347 | # # however, we have to consider false overlaps since this is now incremental compaction. 348 | # wa[3] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval 349 | wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval 350 | wa_w[3] = Common.unique(X, interval + next_interval) / interval 351 | # 1->2, 2->3, ... 
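# FIXME: wa_r[3] above refers to log_size and l0_count, which are not
# parameters of calculate_mbuf_wa!; as written, the call fails unless a
# caller defines them globally (the commented-out wa[3] formula avoids this)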
352 | for i in 1:(length(sizes) - 1) 353 | # we need to take the previous interval as part of this interval ("interval +") 354 | # because the current level temporarily has to accommodate the data from the previous level 355 | interval = interval + interval_from_density(X, sizes[i]) 356 | if i < length(sizes) - 1 357 | next_interval = unique_inv(X, sizes[i + 1]) 358 | # plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range 359 | # TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse 360 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval 361 | wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval 362 | else 363 | wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval 364 | wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval 365 | end 366 | end 367 | 368 | wa_r, wa_w 369 | end 370 | 371 | function calculate_mbuf_wa(X::Distribution, mbuf_size::Float64, sizes::Array{Float64}) 372 | wa_r = Array(Float64, 2 + length(sizes)) 373 | wa_w = Array(Float64, 2 + length(sizes)) 374 | calculate_mbuf_wa!(X::Distribution, mbuf_size::Float64, sizes::Array{Float64}, wa_r, wa_w) 375 | end 376 | 377 | function optimize_mbuf_wa(X::Distribution, mbuf_size::Float64, init_sizes::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64) 378 | n = X.count 379 | level_count = length(init_sizes) 380 | 381 | v2 = Array(Float64, level_count) 382 | v2[level_count] = n 383 | 384 | count = 0 385 | wa = Array(Float64, 2 + level_count) 386 | 387 | eval_f = (v) -> begin 388 | count += 1 389 | v2[1:level_count - 1] = v 390 | get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa)) 391 | end 392 | 393 | eval_grad_f = (v, grad_f) -> begin 394 | v2[1:level_count - 1] = v 395 | y = get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa)) 396 | for i = 1:(level_count - 1) 397 | diff = max(v2[i] * 0.001, 1.) 398 | org = v2[i] 399 | v2[i] += diff 400 | grad_f[i] = (get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa)) - y) / diff 401 | v2[i] = org 402 | end 403 | end 404 | 405 | eval_g = (v, g) -> begin 406 | for i = 1:(level_count - 2) 407 | g[i] = v[i] - v[i + 1] 408 | end 409 | end 410 | 411 | # level i's size - level i+1's size <= 0 412 | eval_jac_g = (v, mode, rows, cols, values) -> begin 413 | if mode == :Structure 414 | c = 1 415 | for i = 1:level_count - 2 416 | rows[c] = i 417 | cols[c] = i 418 | c += 1 419 | rows[c] = i 420 | cols[c] = i + 1 421 | c += 1 422 | end 423 | else 424 | c = 1 425 | for i = 1:level_count - 2 426 | values[c] = 1. 427 | c += 1 428 | values[c] = -1. 429 | c += 1 430 | end 431 | end 432 | end 433 | 434 | # v_L = [1. for i = 1:level_count - 1] 435 | v_L = [mbuf_size for i = 1:level_count - 1] 436 | v_U = [Float64(n) for i = 1:level_count - 1] 437 | 438 | g_L = [-2.e19 for i = 1:level_count - 2] 439 | g_U = [0. 
for i = 1:level_count - 2] 440 | 441 | prob = createProblem(level_count - 1, v_L, v_U, 442 | level_count - 2, g_L, g_U, 443 | (level_count - 2) * 2, 0, 444 | eval_f, eval_g, eval_grad_f, eval_jac_g) 445 | 446 | addOption(prob, "hessian_approximation", "limited-memory") 447 | 448 | addOption(prob, "tol", ftol) 449 | addOption(prob, "max_cpu_time", max_time) 450 | addOption(prob, "acceptable_iter", 1000) 451 | 452 | addOption(prob, "print_level", 2) 453 | 454 | prob.x = init_sizes[1:end - 1] 455 | 456 | @time status = solveProblem(prob) 457 | 458 | ret = Ipopt.ApplicationReturnStatus[status] 459 | minf = prob.obj_val 460 | minx = prob.x 461 | println("got $minf at $minx after $count iterations (returned $ret)") 462 | 463 | cat(1, minx, [n]) 464 | end 465 | 466 | 467 | 468 | 469 | 470 | 471 | ########### 472 | 473 | 474 | 475 | function print(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r_factor::Float64) 476 | level_count = length(sizes) 477 | 478 | println("sizes = ", [iround(v) for v in sizes]) 479 | println("(", [round(sizes[i] / sizes[i - 1] * 100.) / 100. for i in 2:length(sizes)], " X)") 480 | wa = calculate_wa(X, log_size, l0_count, sizes) 481 | println("WA (mem->log) = ", wa[2][1]) 482 | println("WA (log->0) = ", wa[2][2]) 483 | for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end 484 | println("WA = ", get_wa(wa_r_factor, wa)) 485 | end 486 | 487 | 488 | ## TODO: COLA and SAMT should be moved to IntervalModel 489 | 490 | # COLA 491 | function calculate_wa_cola!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r::Array{Float64}, wa_w::Array{Float64}) 492 | # mem->log 493 | wa_r[1] = 0. 494 | wa_w[1] = 1. 495 | 496 | # mem->1, 1->2, 2->3, ... 497 | interval = 0. 498 | next_interval = log_size 499 | for i in 0:(L - 2) 500 | interval = next_interval 501 | next_interval = interval * r 502 | r_ = 0. 503 | w = 0. 504 | # a level accepts merges up to r-1 times. 505 | # this means that we set r (g in the COLA paper) to be (B^e + 1), which is still in Theta(B^e). 506 | # choosing r in that way makes the number of levels bounded by O(log_{B^e + 1} N) = O(log_r N), 507 | # which results in the level count we intend to obtain. 508 | for j in 0:(r - 2) 509 | if i == 0 510 | r_ += Common.unique(X, interval * j) 511 | else 512 | r_ += Common.unique(X, interval) + Common.unique(X, interval * j) 513 | end 514 | w += Common.unique(X, interval + interval * j) 515 | end 516 | wa_r[2 + i] += r_ / next_interval 517 | wa_w[2 + i] += w / next_interval 518 | end 519 | 520 | # (L-1)->L 521 | interval = next_interval 522 | wa_r[2 + L - 1] = (Common.unique(X, interval) + X.count) / interval 523 | wa_w[2 + L - 1] = X.count / interval 524 | 525 | wa_r, wa_w 526 | end 527 | 528 | function calculate_wa_cola(X::Distribution, log_size::Float64, r::Int64, L::Int64) 529 | wa_r = Array(Float64, 1 + L) 530 | wa_w = Array(Float64, 1 + L) 531 | calculate_wa_cola!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r, wa_w) 532 | end 533 | 534 | # SAMT 535 | function calculate_wa_samt!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r::Array{Float64}, wa_w::Array{Float64}) 536 | # mem->log 537 | wa_r[1] = 0. 538 | wa_w[1] = 1. 539 | 540 | # mem->1, 1->2, 2->3, ... 541 | interval = 0. 
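# each level's merge interval grows by the slot count r; the loop below
# charges every one of the r slot writes unique(X, interval) entries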
542 | next_interval = log_size 543 | for i in 0:(L - 2) 544 | interval = next_interval 545 | next_interval = interval * r 546 | # a level has r slots to put merges 547 | # actually, we do not write anything to the last slot because 548 | # we can merge the level into the next level, which makes 549 | # COLA and SAMT identical when r = 2 550 | #wa[2 + i] = ((r - 1) * Common.unique(X, interval)) / next_interval 551 | # but we choose to do maintain full r slots because the SAMT paper seems to intend it. 552 | # this makes SAMT more expensive (and wasteful) than COLA with r = 2. 553 | # however, SAMT usually uses r = 4, and the compaction only needs to do up to r-way merge 554 | # (not ((r-1)^l)-way in COLA), which makes more sense in a practical standpoint. 555 | wa_r[2 + i] = (r * Common.unique(X, interval)) / next_interval 556 | wa_w[2 + i] = (r * Common.unique(X, interval)) / next_interval 557 | end 558 | 559 | # (L-1)->L 560 | interval = next_interval 561 | wa_r[2 + L - 1] = (r * Common.unique(X, interval) + X.count) / interval 562 | wa_w[2 + L - 1] = X.count / interval 563 | 564 | wa_r, wa_w 565 | end 566 | 567 | function calculate_wa_samt(X::Distribution, log_size::Float64, r::Int64, L::Int64) 568 | wa_r = Array(Float64, 1 + L) 569 | wa_w = Array(Float64, 1 + L) 570 | calculate_wa_samt!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r, wa_w) 571 | end 572 | 573 | ########### 574 | 575 | 576 | # original SILT with major compaction from HashStore to SortedStore 577 | function calculate_wa_silt!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, wa_r::Array{Float64}, wa_w::Array{Float64}) 578 | convert_interval = unique_inv(X, hash_size * hash_occupancy) 579 | 580 | # TODO: wa_r 581 | 582 | # mem->log store 583 | wa_r[1] = 0. 584 | wa_w[1] = 1. 585 | 586 | # log store->hash store 587 | wa_r[2] = 0. 588 | wa_w[2] = hash_size / convert_interval 589 | 590 | # hash stores->sorted store 591 | wa_r[3] = 0. 592 | wa_w[3] = X.count / (convert_interval * hash_count) 593 | 594 | wa_r, wa_w 595 | end 596 | 597 | function calculate_wa_silt(X::Distribution, log_size::Float64, hash_occupancy::Float64, hash_count::Int64) 598 | wa = Array(Float64, 3) 599 | calculate_wa_silt!(X::Distribution, log_size::Float64, hash_occupancy::Float64, hash_count::Int64, wa) 600 | end 601 | 602 | 603 | # SILT + minor compaction among HashStore; assume any size of HashStore can be created 604 | function calculate_wa_silt_multi!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64, wa_r::Array{Float64}, wa_w::Array{Float64}) 605 | convert_interval = unique_inv(X, hash_size * hash_occupancy) 606 | 607 | # TODO: wa_r 608 | 609 | # the interval of minor compaction 610 | minor_compaction_interval = convert_interval * hash_count 611 | 612 | # the number of minor compaction to trigger major compaction; the last minor compaction does not actually write data 613 | minor_compaction_count = floor(unique_inv(X, X.count * hash_threshold) / minor_compaction_interval) 614 | @assert minor_compaction_count >= 1.0 615 | 616 | # the interval of major compaction 617 | major_compaction_interval = minor_compaction_interval * minor_compaction_count 618 | 619 | # mem->log 620 | wa_r[1] = 0. 621 | wa_w[1] = 1. 622 | 623 | # log store->hash store 624 | wa_r[2] = 0. 625 | wa_w[2] = hash_size / convert_interval 626 | 627 | # hash stores->hash store 628 | wa_r[3] = 0. 629 | wa_w[3] = 0. 
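# accumulate minor compaction costs: the j-th merge combines one interval's
# worth of new hash stores with j intervals' worth of accumulated data, and
# the output pays the 1/hash_occupancy space overhead of the hash store format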
630 | if minor_compaction_count >= 2 631 | for j in 0:(minor_compaction_count - 2) 632 | wa_w[3] += (Common.unique(X, minor_compaction_interval + minor_compaction_interval * j) / hash_occupancy) / major_compaction_interval 633 | end 634 | end 635 | 636 | # hash stores->sorted 637 | wa_r[4] = 0. 638 | wa_w[4] = X.count / major_compaction_interval 639 | 640 | wa_r, wa_w 641 | end 642 | 643 | function calculate_wa_silt_multi(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64) 644 | wa = Array(Float64, 4) 645 | calculate_wa_silt_multi!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64, wa) 646 | end 647 | 648 | 649 | end 650 | 651 | -------------------------------------------------------------------------------- /common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | -------------------------------------------------------------------------------- /diff_leveldb.patch: -------------------------------------------------------------------------------- 1 | diff --git a/db/dbformat.h b/db/dbformat.h 2 | index 5d8a032..20cccec 100644 3 | --- a/db/dbformat.h 4 | +++ b/db/dbformat.h 5 | @@ -25,10 +25,12 @@ static const int kNumLevels = 7; 6 | static const int kL0_CompactionTrigger = 4; 7 | 8 | // Soft limit on number of level-0 files. We slow down writes at this point. 9 | -static const int kL0_SlowdownWritesTrigger = 8; 10 | +//static const int kL0_SlowdownWritesTrigger = 8; 11 | +static const int kL0_SlowdownWritesTrigger = 4; 12 | 13 | // Maximum number of level-0 files. We stop writes at this point. 14 | -static const int kL0_StopWritesTrigger = 12; 15 | +//static const int kL0_StopWritesTrigger = 12; 16 | +static const int kL0_StopWritesTrigger = 4; 17 | 18 | // Maximum level to which a new compacted memtable is pushed if it 19 | // does not create overlap. We try to push to level 2 to avoid the 20 | diff --git a/db/version_set.cc b/db/version_set.cc 21 | index aa83df5..f5d8937 100644 22 | --- a/db/version_set.cc 23 | +++ b/db/version_set.cc 24 | @@ -1038,7 +1038,13 @@ void VersionSet::Finalize(Version* v) { 25 | } else { 26 | // Compute the ratio of current size to size limit. 27 | const uint64_t level_bytes = TotalFileSize(v->files_[level]); 28 | - score = static_cast(level_bytes) / MaxBytesForLevel(level); 29 | + // score = static_cast(level_bytes) / MaxBytesForLevel(level); 30 | + // MSLS 31 | + if (level < options_->custom_level_size_count) 32 | + score = static_cast(level_bytes) / static_cast(options_->custom_level_sizes[level]); 33 | + else { 34 | + score = static_cast(level_bytes) / MaxBytesForLevel(level); 35 | + } 36 | } 37 | 38 | if (score > best_score) { 39 | @@ -1286,6 +1292,8 @@ Compaction* VersionSet::PickCompaction() { 40 | // c->inputs_[0] earlier and replace it with an overlapping set 41 | // which will include the picked file. 
42 | current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); 43 | + // MSLS - test 44 | + //current_->GetOverlappingInputs(0, NULL, NULL, &c->inputs_[0]); 45 | assert(!c->inputs_[0].empty()); 46 | } 47 | 48 | diff --git a/include/leveldb/options.h b/include/leveldb/options.h 49 | index fdda718..a6955d4 100644 50 | --- a/include/leveldb/options.h 51 | +++ b/include/leveldb/options.h 52 | @@ -135,6 +135,15 @@ struct Options { 53 | // Default: NULL 54 | const FilterPolicy* filter_policy; 55 | 56 | + // MSLS: Use custom level sizes if custom_level_size_count != 0. 57 | + // custom_level_size_count is the maximum level number to change the size. 58 | + // custom_level_sizes[i] specifies the maximum size of level-i (i < custom_level_size_count). 59 | + // custom_level_sizes[0] is ignored. 60 | + // 61 | + // Default: 0, NULL 62 | + size_t custom_level_size_count; 63 | + const size_t* custom_level_sizes; 64 | + 65 | // Create an Options object with default values for all fields. 66 | Options(); 67 | }; 68 | diff --git a/util/crc32c.cc b/util/crc32c.cc 69 | index 6db9e77..7adb5f9 100644 70 | --- a/util/crc32c.cc 71 | +++ b/util/crc32c.cc 72 | @@ -284,6 +284,9 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { 73 | } 74 | 75 | uint32_t Extend(uint32_t crc, const char* buf, size_t size) { 76 | + // MSLS 77 | + return 0; 78 | + /* 79 | const uint8_t *p = reinterpret_cast(buf); 80 | const uint8_t *e = p + size; 81 | uint32_t l = crc ^ 0xffffffffu; 82 | @@ -326,6 +329,7 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) { 83 | #undef STEP4 84 | #undef STEP1 85 | return l ^ 0xffffffffu; 86 | + */ 87 | } 88 | 89 | } // namespace crc32c 90 | diff --git a/util/options.cc b/util/options.cc 91 | index 76af5b9..98ff188 100644 92 | --- a/util/options.cc 93 | +++ b/util/options.cc 94 | @@ -22,7 +22,9 @@ Options::Options() 95 | block_size(4096), 96 | block_restart_interval(16), 97 | compression(kSnappyCompression), 98 | - filter_policy(NULL) { 99 | + filter_policy(NULL), 100 | + custom_level_size_count(0), 101 | + custom_level_sizes(NULL) { 102 | } 103 | 104 | 105 | -------------------------------------------------------------------------------- /diff_rocksdb.patch: -------------------------------------------------------------------------------- 1 | diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc 2 | index 02f53e5..502d98d 100644 3 | --- a/db/compaction_picker.cc 4 | +++ b/db/compaction_picker.cc 5 | @@ -878,15 +878,37 @@ Compaction* LevelCompactionPicker::PickCompaction( 6 | assert(i == 0 || score <= vstorage->CompactionScore(i - 1)); 7 | if (score >= 1) { 8 | output_level = (level == 0) ? vstorage->base_level() : level + 1; 9 | - if (PickCompactionBySize(vstorage, level, output_level, &inputs, 10 | - &parent_index, &base_index) && 11 | - ExpandWhileOverlapping(cf_name, vstorage, &inputs)) { 12 | - // found the compaction! 13 | - break; 14 | + // MSLS 15 | + //if (PickCompactionBySize(vstorage, level, output_level, &inputs, 16 | + // &parent_index, &base_index) && 17 | + // ExpandWhileOverlapping(cf_name, vstorage, &inputs)) { 18 | + // // found the compaction! 19 | + // break; 20 | + //} else { 21 | + // // didn't find the compaction, clear the inputs 22 | + // inputs.clear(); 23 | + //} 24 | + if (!mutable_cf_options.use_leveldb_table_selection) { 25 | + if (PickCompactionBySize(vstorage, level, output_level, &inputs, 26 | + &parent_index, &base_index) && 27 | + ExpandWhileOverlapping(cf_name, vstorage, &inputs)) { 28 | + // found the compaction! 
29 | + break; 30 | + } else { 31 | + // didn't find the compaction, clear the inputs 32 | + inputs.clear(); 33 | + } 34 | } else { 35 | - // didn't find the compaction, clear the inputs 36 | - inputs.clear(); 37 | - } 38 | + if (PickCompactionLevelDB(vstorage, level, output_level, &inputs, 39 | + &parent_index, &base_index) && 40 | + ExpandWhileOverlapping(cf_name, vstorage, &inputs)) { 41 | + // found the compaction! 42 | + break; 43 | + } else { 44 | + // didn't find the compaction, clear the inputs 45 | + inputs.clear(); 46 | + } 47 | + } 48 | } 49 | } 50 | 51 | @@ -1074,6 +1096,73 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage, 52 | return inputs->size() > 0; 53 | } 54 | 55 | +// MSLS 56 | +bool LevelCompactionPicker::PickCompactionLevelDB(VersionStorageInfo* vstorage, 57 | + int level, int output_level, 58 | + CompactionInputFiles* inputs, 59 | + int* parent_index, 60 | + int* base_index) { 61 | + // level 0 files are overlapping. So we cannot pick more 62 | + // than one concurrent compactions at this level. This 63 | + // could be made better by looking at key-ranges that are 64 | + // being compacted at level 0. 65 | + if (level == 0 && !level0_compactions_in_progress_.empty()) { 66 | + return false; 67 | + } 68 | + 69 | + inputs->clear(); 70 | + 71 | + assert(level >= 0); 72 | + 73 | + const std::vector& level_files = vstorage->LevelFiles(level); 74 | + 75 | + std::string last_key = vstorage->LastKey(level); 76 | + bool respect_last_key = true; 77 | + 78 | + for (unsigned int i = 0; i < level_files.size() * 2; i++) { 79 | + int index = (vstorage->NextCompactionIndex(level) + i) % (int)level_files.size(); 80 | + assert(index >= 0 && static_cast(index) < level_files.size()); 81 | + 82 | + FileMetaData* f = level_files[index]; 83 | + 84 | + if (i != 0 && index == 0) { 85 | + if (respect_last_key) { 86 | + respect_last_key = false; 87 | + } 88 | + } 89 | + 90 | + if (respect_last_key && f->smallest.Encode().ToString() < last_key) { 91 | + //printf("too small key\n"); 92 | + continue; 93 | + } 94 | + 95 | + // do not pick a file to compact if it is being compacted 96 | + // from n-1 level. 97 | + if (f->being_compacted) { 98 | + //printf("being compacted\n"); 99 | + continue; 100 | + } 101 | + 102 | + // Do not pick this file if its parents at level+1 are being compacted. 
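+ // (Skipped files are reconsidered on a later pick; the enclosing scan visits
+ // each file at most twice per call, wrapping the round-robin cursor once.)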
103 | + // Maybe we can avoid redoing this work in SetupOtherInputs
104 | + *parent_index = -1;
105 | + if (RangeInCompaction(vstorage, &f->smallest, &f->largest, output_level,
106 | + parent_index)) {
107 | + //printf("parents being compacted\n");
108 | + continue;
109 | + }
110 | + inputs->files.push_back(f);
111 | + inputs->level = level;
112 | + *base_index = index;
113 | + vstorage->SetNextCompactionIndex(level, index);
114 | + vstorage->SetLastKey(level, f->largest.Encode().ToString());
115 | + //printf("%d %d %d/%zu %hhx %hhx\n", level, i, index, level_files.size(), last_key.c_str()[0], f->largest.Encode().ToString().c_str()[0]);
116 | + break;
117 | + }
118 | +
119 | + return inputs->size() > 0;
120 | +}
121 | +
122 | #ifndef ROCKSDB_LITE
123 | bool UniversalCompactionPicker::NeedsCompaction(
124 | const VersionStorageInfo* vstorage) const {
125 | diff --git a/db/compaction_picker.h b/db/compaction_picker.h
126 | index 1d1abe3..b30b3df 100644
127 | --- a/db/compaction_picker.h
128 | +++ b/db/compaction_picker.h
129 | @@ -210,6 +210,13 @@ class LevelCompactionPicker : public CompactionPicker {
130 | VersionStorageInfo* vstorage,
131 | CompactionInputFiles* inputs,
132 | int* level, int* output_level);
133 | +
134 | + // MSLS
135 | + // Similar to PickCompactionBySize except it chooses files
136 | + // in a round-robin fashion in the key space, like LevelDB does.
137 | + bool PickCompactionLevelDB(VersionStorageInfo* vstorage, int level,
138 | + int output_level, CompactionInputFiles* inputs,
139 | + int* parent_index, int* base_index);
140 | };
141 | 
142 | #ifndef ROCKSDB_LITE
143 | diff --git a/db/version_set.cc b/db/version_set.cc
144 | index cedaa3e..73e45b1 100644
145 | --- a/db/version_set.cc
146 | +++ b/db/version_set.cc
147 | @@ -795,7 +795,8 @@ VersionStorageInfo::VersionStorageInfo(
148 | accumulated_num_deletions_(0),
149 | num_samples_(0),
150 | estimated_compaction_needed_bytes_(0),
151 | - finalized_(false) {
152 | + finalized_(false),
153 | + last_key_(num_levels_) {
154 | if (ref_vstorage != nullptr) {
155 | accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
156 | accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
157 | @@ -804,6 +805,7 @@ VersionStorageInfo::VersionStorageInfo(
158 | ref_vstorage->accumulated_num_non_deletions_;
159 | accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
160 | num_samples_ = ref_vstorage->num_samples_;
161 | + last_key_ = ref_vstorage->last_key_;
162 | }
163 | }
164 | 
165 | @@ -1148,6 +1150,8 @@ void VersionStorageInfo::ComputeCompactionScore(
166 | } else {
167 | score = static_cast<double>(num_sorted_runs) /
168 | mutable_cf_options.level0_file_num_compaction_trigger;
169 | + // MSLS - An earlier version of RocksDB prioritized level-0 -> level-1 compaction, which caused starvation that prevented level-2, level-3, ... from being built.
170 | + // This has been reverted back to LevelDB's method in newer RocksDB versions.
171 | }
172 | } else {
173 | // Compute the ratio of current size to size limit.
174 | @@ -1803,6 +1807,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions, 175 | } 176 | } 177 | } 178 | + 179 | + // MSLS 180 | + for (auto i = 0u; i < options.custom_level_size_count; i++) { 181 | + if (i < level_max_bytes_.size()) { 182 | + level_max_bytes_[i] = options.custom_level_sizes[i]; 183 | + } 184 | + } 185 | } 186 | 187 | uint64_t VersionStorageInfo::EstimateLiveDataSize() const { 188 | diff --git a/db/version_set.h b/db/version_set.h 189 | index 7707bb1..825ad62 100644 190 | --- a/db/version_set.h 191 | +++ b/db/version_set.h 192 | @@ -323,6 +323,17 @@ class VersionStorageInfo { 193 | return estimated_compaction_needed_bytes_; 194 | } 195 | 196 | + // MSLS 197 | + void SetLastKey(int level, const std::string& last_key) { 198 | + assert(static_cast(level) < last_key_.size()); 199 | + last_key_[level] = last_key; 200 | + } 201 | + 202 | + const std::string& LastKey(int level) { 203 | + assert(static_cast(level) < last_key_.size()); 204 | + return last_key_[level]; 205 | + } 206 | + 207 | private: 208 | const InternalKeyComparator* internal_comparator_; 209 | const Comparator* user_comparator_; 210 | @@ -408,6 +419,9 @@ class VersionStorageInfo { 211 | // No copying allowed 212 | VersionStorageInfo(const VersionStorageInfo&) = delete; 213 | void operator=(const VersionStorageInfo&) = delete; 214 | + 215 | + // MSLS 216 | + std::vector last_key_; 217 | }; 218 | 219 | class Version { 220 | diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h 221 | index 23b8507..294111c 100644 222 | --- a/include/rocksdb/options.h 223 | +++ b/include/rocksdb/options.h 224 | @@ -1116,6 +1116,20 @@ struct DBOptions { 225 | // Default: nullptr (disabled) 226 | // Not supported in ROCKSDB_LITE mode! 227 | std::shared_ptr row_cache; 228 | + 229 | + // MSLS: Use custom level sizes if custom_level_size_count != 0. 230 | + // custom_level_size_count is the maximum level number to change the size. 231 | + // custom_level_sizes[i] specifies the maximum size of level-i (i < custom_level_size_count). 232 | + // custom_level_sizes[0] is ignored. 233 | + // 234 | + // Default: 0, NULL 235 | + size_t custom_level_size_count; 236 | + const size_t* custom_level_sizes; 237 | + 238 | + // MSLS: Use LevelDB-style circular table selection for compaction. 
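+ // When enabled, compaction picks files in key order starting after the
+ // last compacted key, wrapping around, rather than picking the largest file.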
239 | + // 240 | + // Default: false 241 | + bool use_leveldb_table_selection; 242 | }; 243 | 244 | // Options to control the behavior of a database (passed to DB::Open) 245 | diff --git a/util/crc32c.cc b/util/crc32c.cc 246 | index b8d281a..87d884d 100644 247 | --- a/util/crc32c.cc 248 | +++ b/util/crc32c.cc 249 | @@ -394,7 +394,9 @@ bool IsFastCrc32Supported() { 250 | Function ChosenExtend = Choose_Extend(); 251 | 252 | uint32_t Extend(uint32_t crc, const char* buf, size_t size) { 253 | - return ChosenExtend(crc, buf, size); 254 | + // MSLS 255 | + return 0; 256 | + //return ChosenExtend(crc, buf, size); 257 | } 258 | 259 | } // namespace crc32c 260 | diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h 261 | index 4110ecc..38bb271 100644 262 | --- a/util/mutable_cf_options.h 263 | +++ b/util/mutable_cf_options.h 264 | @@ -44,7 +44,10 @@ struct MutableCFOptions { 265 | max_sequential_skip_in_iterations( 266 | options.max_sequential_skip_in_iterations), 267 | paranoid_file_checks(options.paranoid_file_checks), 268 | - compaction_measure_io_stats(options.compaction_measure_io_stats) 269 | + compaction_measure_io_stats(options.compaction_measure_io_stats), 270 | + custom_level_size_count(options.custom_level_size_count), 271 | + custom_level_sizes(options.custom_level_sizes), 272 | + use_leveldb_table_selection(options.use_leveldb_table_selection) 273 | 274 | { 275 | RefreshDerivedOptions(ioptions); 276 | @@ -76,7 +79,10 @@ struct MutableCFOptions { 277 | max_subcompactions(1), 278 | max_sequential_skip_in_iterations(0), 279 | paranoid_file_checks(false), 280 | - compaction_measure_io_stats(false) {} 281 | + compaction_measure_io_stats(false), 282 | + custom_level_size_count(0), 283 | + custom_level_sizes(NULL), 284 | + use_leveldb_table_selection(false) {} 285 | 286 | // Must be called after any change to MutableCFOptions 287 | void RefreshDerivedOptions(const ImmutableCFOptions& ioptions); 288 | @@ -132,6 +138,11 @@ struct MutableCFOptions { 289 | bool paranoid_file_checks; 290 | bool compaction_measure_io_stats; 291 | 292 | + // MSLS 293 | + size_t custom_level_size_count; 294 | + const size_t* custom_level_sizes; 295 | + bool use_leveldb_table_selection; 296 | + 297 | // Derived options 298 | // Per-level target file size. 
299 | std::vector max_file_size; 300 | diff --git a/util/options.cc b/util/options.cc 301 | index 7f3bf75..d8ff0dd 100644 302 | --- a/util/options.cc 303 | +++ b/util/options.cc 304 | @@ -250,7 +250,10 @@ DBOptions::DBOptions() 305 | enable_thread_tracking(false), 306 | delayed_write_rate(1024U * 1024U), 307 | skip_stats_update_on_db_open(false), 308 | - wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) { 309 | + wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords), 310 | + custom_level_size_count(0), 311 | + custom_level_sizes(NULL), 312 | + use_leveldb_table_selection(false) { 313 | } 314 | 315 | DBOptions::DBOptions(const Options& options) 316 | @@ -305,7 +308,10 @@ DBOptions::DBOptions(const Options& options) 317 | delayed_write_rate(options.delayed_write_rate), 318 | skip_stats_update_on_db_open(options.skip_stats_update_on_db_open), 319 | wal_recovery_mode(options.wal_recovery_mode), 320 | - row_cache(options.row_cache) {} 321 | + row_cache(options.row_cache), 322 | + custom_level_size_count(options.custom_level_size_count), 323 | + custom_level_sizes(options.custom_level_sizes), 324 | + use_leveldb_table_selection(options.use_leveldb_table_selection) {} 325 | 326 | static const char* const access_hints[] = { 327 | "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" 328 | -------------------------------------------------------------------------------- /leveldb.cpp: -------------------------------------------------------------------------------- 1 | #include "leveldb.h" 2 | #include "util.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | // #define REMEMBER_NEXT_FIRST_KEY 9 | 10 | LevelDB::LevelDB(const LevelDBParams& params, std::vector& stats) 11 | : params_(params), stats_(stats) { 12 | log_bytes_ = 0; 13 | // for log and level-0 that do not use compact() 14 | for (auto i = stats_.size(); i < 2; i++) stats_.push_back(Stat()); 15 | 16 | levels_.push_back(sstables_t()); 17 | 18 | level_bytes_.push_back(0); 19 | level_bytes_threshold_.push_back( 20 | static_cast(-1)); // level-0 can accept any SSTable size 21 | 22 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear) 23 | level_next_compaction_key_.push_back(LevelDBKeyMax); 24 | else if (params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst) 25 | level_next_compaction_key_.push_back(LevelDBKeyMin); 26 | 27 | inserts_ = 0; 28 | level_overflows_.push_back(0); 29 | level_compactions_.push_back(0); 30 | level_overlapping_sstables_.push_back(0); 31 | level_overlapping_sstables_false_.push_back(0); 32 | level_sweeps_.push_back(0); 33 | 34 | next_version_ = 0; 35 | } 36 | 37 | LevelDB::~LevelDB() { 38 | for (std::size_t level = 0; level < levels_.size(); level++) 39 | for (auto& sstable : levels_[level]) delete sstable; 40 | } 41 | 42 | void LevelDB::print_status() const { 43 | printf("log: %zu items, %lu bytes\n", log_.size(), log_bytes_); 44 | for (std::size_t i = 0; i < levels_.size(); i++) { 45 | double overlaps = 0.; 46 | double overlaps_false = 0.; 47 | if (level_compactions_[i] != 0) { 48 | overlaps = level_overlapping_sstables_[i] / 49 | static_cast(level_compactions_[i]); 50 | overlaps_false = level_overlapping_sstables_false_[i] / 51 | static_cast(level_compactions_[i]); 52 | } 53 | uint64_t interval = 0; 54 | if (level_sweeps_[i] > 0) interval = inserts_ / level_sweeps_[i]; 55 | printf( 56 | "level-%zu: %5zu tables, %14lu bytes, %6lu overflows, %6lu " 57 | "compactions, %5.2lf avg overlaps (%.2lf false), %4lu sweeps " 58 | "(interval=%8lu)\n", 59 | i, levels_[i].size(), 
level_bytes_[i], level_overflows_[i], 60 | level_compactions_[i], overlaps, overlaps_false, level_sweeps_[i], 61 | interval); 62 | } 63 | } 64 | 65 | void LevelDB::dump_state(FILE* fp) const { 66 | // XXX: Memtable is not dumped now. 67 | fprintf(fp, "next_version:%lu\n", next_version_); 68 | 69 | fprintf(fp, "log:\n"); 70 | dump_state(fp, log_); 71 | 72 | fprintf(fp, "levels:\n"); 73 | for (std::size_t level = 0; level < levels_.size(); level++) { 74 | auto& sstables = levels_[level]; 75 | fprintf(fp, "level:\n"); 76 | for (std::size_t i = 0; i < sstables.size(); i++) { 77 | fprintf(fp, "sstable:\n"); 78 | dump_state(fp, *sstables[i]); 79 | } 80 | } 81 | } 82 | 83 | void LevelDB::dump_state(FILE* fp, const sstable_t& l) { 84 | for (std::size_t i = 0; i < l.size(); i++) dump_state(fp, l[i]); 85 | } 86 | 87 | void LevelDB::dump_state(FILE* fp, const LevelDBItem& item) { 88 | #ifdef LEVELDB_TRACK_VERSION 89 | fprintf(fp, "item:%u,%lu,%u,%s\n", item.key, item.version, 90 | item.size & LevelDBItemSizeMask, 91 | item.size == LevelDBItemDeletion ? "T" : "F"); 92 | #else 93 | fprintf(fp, "item:%u,0,%u,%s\n", item.key, item.size & LevelDBItemSizeMask, 94 | item.size == LevelDBItemDeletion ? "T" : "F"); 95 | #endif 96 | } 97 | 98 | void LevelDB::put(LevelDBKey key, uint32_t item_size) { 99 | #ifdef LEVELDB_TRACK_VERSION 100 | LevelDBItem item{key, item_size, next_version_++}; 101 | #else 102 | LevelDBItem item{key, item_size}; 103 | #endif 104 | inserts_++; 105 | append_to_log(item); 106 | } 107 | 108 | void LevelDB::del(LevelDBKey key) { 109 | #ifdef LEVELDB_TRACK_VERSION 110 | LevelDBItem item{key, LevelDBItemDeletion, next_version_++}; 111 | #else 112 | LevelDBItem item{key, LevelDBItemDeletion}; 113 | #endif 114 | append_to_log(item); 115 | } 116 | 117 | uint64_t LevelDB::get(LevelDBKey key) { 118 | // TODO: Implement 119 | (void)key; 120 | return 0; 121 | } 122 | 123 | void LevelDB::force_compact() { 124 | flush_log(); 125 | 126 | for (std::size_t level = 0; level < levels_.size() - 1; level++) { 127 | std::vector> sstable_indices; 128 | sstable_indices.push_back(std::vector()); 129 | sstable_indices.back().push_back(0); 130 | while (levels_[level].size() > 0) { 131 | compact(level, sstable_indices); 132 | } 133 | } 134 | } 135 | 136 | void LevelDB::append_to_log(const LevelDBItem& item) { 137 | log_.push_back(item); 138 | 139 | // Update statistics. 140 | auto new_log_bytes = log_bytes_ + item.size; 141 | // auto log_bytes_d = log_bytes_ / 4096; 142 | // auto new_log_bytes_d = new_log_bytes / 4096; 143 | // if (log_bytes_d != new_log_bytes_d) { 144 | // // New blocks are written. 145 | // stat_.write((new_log_bytes_d - log_bytes_d) * 4096); 146 | // } 147 | stats_[0].write(item.size); 148 | log_bytes_ = new_log_bytes; 149 | 150 | if (log_bytes_ > params_.log_size_threshold) flush_log(); 151 | } 152 | 153 | void LevelDB::flush_log() { 154 | if (log_.size() == 0) return; 155 | 156 | // Simplified for simulation; a new SSTable is created from the memtable, 157 | // causing no disk read. 158 | sort_items(log_); 159 | levels_t sstable_runs; 160 | sstable_runs.push_back(sstables_t()); 161 | sstable_runs.back().push_back(&log_); 162 | merge_sstables(sstable_runs, 0); 163 | delete_log(); 164 | 165 | // TODO: LevelDB computes the score of each level: [current table count / 166 | // compaction trigger] (for level = 0) or [current level byte size / max level 167 | // byte size] (for level >= 1). 168 | // It picks a level of the highest score in VersionSet::Finalize() 169 | // (db/version_set.cc). 
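// For example (illustrative numbers): with a 10 MB level-1 limit and a 100 MB
// level-2 limit, a 15 MB level-1 scores 1.5 while a 120 MB level-2 scores 1.2,
// so level-1 is compacted first even though both exceed their limits.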
170 | // Our checking is fine because compaction here is done synchronously 171 | // and lower levels tend to get a higher score until being compacted. 172 | for (std::size_t level = 0; level < levels_.size(); level++) 173 | check_compaction(level); 174 | } 175 | 176 | void LevelDB::delete_log() { 177 | // stat_.del(log_bytes_ / 4096 * 4096); 178 | stats_[0].del(log_bytes_); 179 | log_.clear(); 180 | log_bytes_ = 0; 181 | } 182 | 183 | struct _LevelDBKeyComparer { 184 | bool operator()(const LevelDBItem& a, const LevelDBItem& b) const { 185 | return a.key < b.key; 186 | } 187 | }; 188 | 189 | void LevelDB::sort_items(sstable_t& items) { 190 | std::stable_sort(items.begin(), items.end(), _LevelDBKeyComparer()); 191 | } 192 | 193 | struct _LevelDBSSTableComparer { 194 | LevelDB::sstable_t** sstables; 195 | std::size_t* sstables_pos; 196 | 197 | bool operator()(const std::size_t& a, const std::size_t& b) const { 198 | auto& item_a = (*sstables[a])[sstables_pos[a]]; 199 | auto& item_b = (*sstables[b])[sstables_pos[b]]; 200 | // Since std::make_heap makes a max-heap, we use a comparator with the 201 | // opposite result. 202 | if (item_a.key > item_b.key) 203 | return true; 204 | else if (item_a.key == item_b.key && a > b) 205 | return true; 206 | return false; 207 | } 208 | }; 209 | 210 | void LevelDB::merge_sstables(const levels_t& sstable_runs, std::size_t level) { 211 | // The current SSTable in each run. 212 | std::size_t sstables_idx[sstable_runs.size()]; 213 | sstable_t* sstables[sstable_runs.size()]; 214 | 215 | // The current item in each run's current SSTable. 216 | std::size_t sstables_pos[sstable_runs.size()]; 217 | 218 | for (std::size_t i = 0; i < sstable_runs.size(); i++) { 219 | assert(sstable_runs[i].size() != 0); 220 | sstables_idx[i] = 0; 221 | sstables[i] = sstable_runs[i][sstables_idx[i]]; 222 | sstables_pos[i] = 0; 223 | } 224 | 225 | // Initialize push. 226 | push_state state; 227 | push_init(state, level); 228 | 229 | // Initialize a heap. 230 | std::vector heap; 231 | _LevelDBSSTableComparer comp{sstables, sstables_pos}; 232 | sequence(sstable_runs.size(), heap); 233 | std::make_heap(heap.begin(), heap.end(), comp); 234 | 235 | while (heap.size() != 0) { 236 | // Get the smallest key's SSTable index. 237 | auto i = heap.front(); 238 | std::pop_heap(heap.begin(), heap.end(), comp); 239 | heap.pop_back(); 240 | 241 | // Discover how many keys we can take from this SSTable. 242 | sstable_t* sstable = sstables[i]; 243 | std::size_t size = sstable->size(); 244 | 245 | std::size_t start = sstables_pos[i]; 246 | std::size_t end; 247 | if (heap.size() == 0) 248 | // No other SSTables; we can take the remaining items in this SSTable. 249 | end = size; 250 | else { 251 | // Get the next smallest key's SSTable index (besides i's). 252 | auto j = heap.front(); 253 | LevelDBKey next_possible_key = (*sstables[j])[sstables_pos[j]].key; 254 | 255 | end = start + 1; 256 | while (end < size && (*sstable)[end].key < next_possible_key) end++; 257 | } 258 | 259 | push_items(state, *sstable, start, end); 260 | 261 | if (end < size) { 262 | // More items in this SSTable. 263 | sstables_pos[i] = end; 264 | 265 | heap.push_back(i); 266 | std::push_heap(heap.begin(), heap.end(), comp); 267 | } else { 268 | // No more items in this SSTable. Select the next SSTable in the same 269 | // run. 
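// (Each run's SSTables are disjoint and sorted, so only the head table of
// each run needs to sit in the heap at any time.)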
270 | sstables_idx[i]++; 271 | if (sstables_idx[i] < sstable_runs[i].size()) { 272 | sstables[i] = sstable_runs[i][sstables_idx[i]]; 273 | sstables_pos[i] = 0; 274 | 275 | heap.push_back(i); 276 | std::push_heap(heap.begin(), heap.end(), comp); 277 | } else { 278 | // all SSTables in the same run have been consumed. 279 | } 280 | } 281 | } 282 | 283 | push_flush(state); 284 | } 285 | 286 | void LevelDB::check_compaction(std::size_t level) { 287 | if (level == 0) { 288 | // Compact if we have too many level-0 SSTables. 289 | if (levels_[0].size() >= params_.level0_sstable_count_threshold) { 290 | level_overflows_[0]++; 291 | level_sweeps_[0]++; 292 | std::vector> sstable_indices; 293 | for (std::size_t i = 0; i < levels_[0].size(); i++) { 294 | sstable_indices.push_back(std::vector()); 295 | sstable_indices.back().push_back(i); 296 | } 297 | compact(0, sstable_indices); 298 | assert(levels_[0].size() == 0); 299 | } 300 | } else { 301 | // Compact if we have too much data in this level. 302 | if (level_bytes_[level] > level_bytes_threshold_[level]) { 303 | level_overflows_[level]++; 304 | std::vector> sstable_indices; 305 | sstable_indices.push_back(std::vector()); 306 | 307 | while (level_bytes_[level] > level_bytes_threshold_[level]) { 308 | sstable_indices.back().clear(); 309 | 310 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear || 311 | params_.compaction_mode == 312 | LevelDBCompactionMode::kLinearNextFirst) { 313 | // Find the next table to compact. 314 | auto& level_tables = levels_[level]; 315 | std::size_t count = level_tables.size(); 316 | std::size_t i; 317 | for (i = 0; i < count; i++) { 318 | auto& sstable = *level_tables[i]; 319 | 320 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear) { 321 | if (sstable.front().key > level_next_compaction_key_[level]) 322 | break; 323 | } else if (params_.compaction_mode == 324 | LevelDBCompactionMode::kLinearNextFirst) { 325 | if (sstable.front().key >= level_next_compaction_key_[level]) 326 | break; 327 | } 328 | } 329 | if (i == count) { 330 | i = 0; 331 | level_sweeps_[level]++; 332 | } 333 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear) { 334 | level_next_compaction_key_[level] = level_tables[i]->back().key; 335 | } else if (params_.compaction_mode == 336 | LevelDBCompactionMode::kLinearNextFirst) { 337 | if (i < count - 1) 338 | level_next_compaction_key_[level] = 339 | level_tables[i + 1]->front().key; 340 | else 341 | level_next_compaction_key_[level] = LevelDBKeyMax; 342 | } 343 | 344 | sstable_indices.back().push_back(i); 345 | } else if (params_.compaction_mode == 346 | LevelDBCompactionMode::kMostNarrow) { 347 | auto& level_tables = levels_[level]; 348 | std::size_t count = level_tables.size(); 349 | 350 | // TODO: This is quite slow -- O(N). We may probably want to make it 351 | // O(logN) with a priority queue. 
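// (Sketch of one possible O(logN) variant, not implemented here: keep a
// min-heap of (width, table index) pairs and re-insert entries as tables
// are added and removed by compaction.)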
352 | std::size_t selected = count; 353 | LevelDBKey min_width = 0; 354 | for (std::size_t i = 0; i < count; i++) { 355 | auto& sstable = *level_tables[i]; 356 | LevelDBKey width = sstable.back().key - sstable.front().key; 357 | if (selected == count || min_width > width) { 358 | min_width = width; 359 | selected = i; 360 | } 361 | } 362 | assert(selected != count); 363 | sstable_indices.back().push_back(selected); 364 | } else if (params_.compaction_mode == 365 | LevelDBCompactionMode::kLeastOverlap) { 366 | auto& level_tables = levels_[level]; 367 | std::size_t count = level_tables.size(); 368 | 369 | if (level < levels_.size() - 1) { 370 | // TODO: This is quite slow -- O(N). We may probably want to make 371 | // it O(logN) with some magic (this is complicated because overlaps 372 | // change as we compact). 373 | auto& level_tables_next = levels_[level + 1]; 374 | std::size_t selected = count; 375 | std::size_t min_overlap = 0; 376 | std::size_t sstable_idx_start = 0; 377 | std::size_t sstable_idx_end = 0; 378 | for (std::size_t i = 0; i < count; i++) { 379 | auto& sstable = *level_tables[i]; 380 | if (sstable_idx_end > 0) sstable_idx_start = sstable_idx_end - 1; 381 | while (sstable_idx_start < level_tables_next.size() && 382 | level_tables_next[sstable_idx_start]->back().key < 383 | sstable.front().key) 384 | sstable_idx_start++; 385 | sstable_idx_end = sstable_idx_start; 386 | while (sstable_idx_end < level_tables_next.size() && 387 | level_tables_next[sstable_idx_end]->front().key < 388 | sstable.back().key) 389 | sstable_idx_end++; 390 | 391 | std::size_t overlap = sstable_idx_end - sstable_idx_start; 392 | // if (overlap != 0) { 393 | // printf("range: [%u,%u]\n", sstable.front().key, 394 | // sstable.back().key); 395 | // printf("overlap: %zu[%u,%u] - %zu[%u,%u]\n", 396 | // sstable_idx_start, 397 | // level_tables_next[sstable_idx_start]->front().key, 398 | // level_tables_next[sstable_idx_start]->back().key, 399 | // sstable_idx_end - 1, level_tables_next[sstable_idx_end - 400 | // 1]->front().key, level_tables_next[sstable_idx_end - 401 | // 1]->back().key); 402 | // } 403 | if (selected == count || min_overlap > overlap) { 404 | min_overlap = overlap; 405 | selected = i; 406 | } 407 | } 408 | assert(selected != count); 409 | sstable_indices.back().push_back(selected); 410 | } else { 411 | // We cannot use find_overlapping_tables() if the next level is not 412 | // created yet. 413 | sstable_indices.back().push_back(0); 414 | } 415 | } else if (params_.compaction_mode == 416 | LevelDBCompactionMode::kLargestRatio) { 417 | auto& level_tables = levels_[level]; 418 | std::size_t count = level_tables.size(); 419 | 420 | if (level < levels_.size() - 1) { 421 | // TODO: This is quite slow -- O(N). We may probably want to make 422 | // it O(logN) with some magic (this is complicated because overlaps 423 | // change as we compact). 
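// The scan below slides two cursors (sstable_idx_start/end) monotonically over
// the sorted next-level tables, so counting overlaps for every table in this
// level costs roughly O(N + M) rather than O(N * M) comparisons.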
424 | auto& level_tables_next = levels_[level + 1]; 425 | std::size_t selected = count; 426 | double max_ratio = 0.; 427 | std::size_t sstable_idx_start = 0; 428 | std::size_t sstable_idx_end = 0; 429 | for (std::size_t i = 0; i < count; i++) { 430 | auto& sstable = *level_tables[i]; 431 | if (sstable_idx_end > 0) sstable_idx_start = sstable_idx_end - 1; 432 | while (sstable_idx_start < level_tables_next.size() && 433 | level_tables_next[sstable_idx_start]->back().key < 434 | sstable.front().key) 435 | sstable_idx_start++; 436 | sstable_idx_end = sstable_idx_start; 437 | while (sstable_idx_end < level_tables_next.size() && 438 | level_tables_next[sstable_idx_end]->front().key < 439 | sstable.back().key) 440 | sstable_idx_end++; 441 | 442 | // TODO: Use LevelDBItem::size instead of the item count. 443 | std::size_t s = 0; 444 | for (std::size_t j = sstable_idx_start; j < sstable_idx_end; j++) 445 | s += level_tables_next[j]->size(); 446 | // Make division cleaner. 447 | if (s == 0) s = 1; 448 | 449 | double ratio = 450 | static_cast(sstable.size()) / static_cast(s); 451 | if (selected == count || max_ratio < ratio) { 452 | max_ratio = ratio; 453 | selected = i; 454 | } 455 | } 456 | assert(selected != count); 457 | sstable_indices.back().push_back(selected); 458 | } else { 459 | // We cannot use find_overlapping_tables() if the next level is not 460 | // created yet. 461 | sstable_indices.back().push_back(0); 462 | } 463 | } else if (params_.compaction_mode == 464 | LevelDBCompactionMode::kWholeLevel) { 465 | level_sweeps_[level]++; 466 | sequence(levels_[level].size(), sstable_indices.back()); 467 | } else 468 | assert(false); 469 | 470 | compact(level, sstable_indices); 471 | } 472 | } 473 | } 474 | } 475 | 476 | void LevelDB::push_init(push_state& state, std::size_t level) { 477 | state.level = level; 478 | 479 | state.pending_item = nullptr; 480 | 481 | state.current_sstable = nullptr; 482 | 483 | state.current_sstable_size = 0; 484 | state.use_split_key = false; 485 | } 486 | 487 | void LevelDB::push_items(push_state& state, const sstable_t& sstable, 488 | std::size_t start, std::size_t end) { 489 | assert(start != end); 490 | 491 | bool level0 = (state.level == 0); 492 | bool last_level = (state.level == levels_.size() - 1); 493 | 494 | if (state.pending_item == nullptr) { 495 | state.pending_item = &sstable[start]; 496 | start++; 497 | } 498 | 499 | while (start != end) { 500 | bool drop_pending_item = false; 501 | if (state.pending_item->size == LevelDBItemDeletion && last_level) 502 | drop_pending_item = true; 503 | else if (state.pending_item->key == sstable[start].key) { 504 | #ifdef LEVELDB_TRACK_VERSION 505 | if (state.pending_item->version >= sstable[start].version) 506 | printf("pv %lu cv %lu level %zu start %zu end %zu\n", 507 | state.pending_item->version, sstable[start].version, state.level, 508 | start, end); 509 | assert(state.pending_item->version < sstable[start].version); 510 | #endif 511 | drop_pending_item = true; 512 | } 513 | 514 | if (!drop_pending_item) { 515 | if (state.current_sstable == nullptr) 516 | state.current_sstable = new sstable_t(); 517 | 518 | state.current_sstable->push_back(*state.pending_item); 519 | state.current_sstable_size += 520 | state.pending_item->size & LevelDBItemSizeMask; 521 | 522 | if (state.current_sstable->size() == 1 && !params_.use_custom_sizes) { 523 | // Determine the split key; the current SSTable should not contain this 524 | // split key, otherwise it will overlap with too many SSTables in the 525 | // next level. 
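// For example, with sstable_overlap_threshold = 10: if the pending item could
// overlap next-level tables starting at index i, the new SSTable is cut at the
// first key of table i + 10, so it never overlaps more than 10 of them.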
526 | if (level0 || last_level) 527 | state.use_split_key = false; 528 | else { 529 | auto& level_tables = levels_[state.level + 1]; 530 | std::size_t count = level_tables.size(); 531 | 532 | std::size_t i; 533 | // Choose the first SSTable in the next level that can potentially 534 | // overlap. 535 | // TODO: Use binary search and memorization from previous run. 536 | for (i = 0; i < count; i++) { 537 | auto& sstable = *level_tables[i]; 538 | if (state.pending_item->key <= sstable.back().key) break; 539 | } 540 | // XXX: This follows LevelDB's impl.html, but the actual 541 | // implementation uses bytes instead of the number of SSTables. 542 | // See kMaxGrandParentOverlapBytes (db/version_set.cc). 543 | std::size_t end = 544 | std::min(i + params_.sstable_overlap_threshold, count); 545 | if (end < count) { 546 | // Remember the split key. 547 | state.use_split_key = true; 548 | state.split_key = level_tables[end]->front().key; 549 | } else { 550 | // Splitting by key will never happen because there will be few 551 | // overlapping tables. 552 | state.use_split_key = false; 553 | } 554 | } 555 | } 556 | } 557 | 558 | state.pending_item = &sstable[start]; 559 | 560 | bool need_new_sstable = false; 561 | if (state.use_split_key && state.pending_item->key >= state.split_key) 562 | need_new_sstable = true; 563 | else { 564 | uint64_t item_size = state.pending_item->size & LevelDBItemSizeMask; 565 | // Level-0 generates only one SSTable per merge. Otherwise, we obey the 566 | // maximum SSTable size. 567 | if (!level0 && 568 | state.current_sstable_size + item_size > 569 | params_.sstable_size_threshold) 570 | need_new_sstable = true; 571 | } 572 | 573 | if (need_new_sstable) { 574 | if (state.current_sstable != nullptr) { 575 | state.current_sstable->shrink_to_fit(); 576 | state.completed_sstables.push_back(state.current_sstable); 577 | level_bytes_[state.level] += state.current_sstable_size; 578 | stats_[1 + state.level].write(state.current_sstable_size); 579 | 580 | state.current_sstable = nullptr; 581 | 582 | state.current_sstable_size = 0; 583 | state.use_split_key = false; 584 | } 585 | } 586 | 587 | start++; 588 | } 589 | } 590 | 591 | void LevelDB::push_flush(push_state& state) { 592 | // printf("push_flush level %zu\n", state.level); 593 | bool level0 = (state.level == 0); 594 | bool last_level = (state.level == levels_.size() - 1); 595 | 596 | // Flush the pending item. 597 | if (state.pending_item != nullptr) { 598 | bool drop_pending_item = false; 599 | if (state.pending_item->size == LevelDBItemDeletion && last_level) 600 | drop_pending_item = true; 601 | 602 | if (!drop_pending_item) { 603 | if (state.current_sstable == nullptr) { 604 | state.current_sstable = new sstable_t(); 605 | state.current_sstable_size = 0; 606 | } 607 | 608 | state.current_sstable->push_back(*state.pending_item); 609 | state.current_sstable_size += 610 | state.pending_item->size & LevelDBItemSizeMask; 611 | } 612 | } 613 | 614 | // Flush the current SSTable. 615 | if (state.current_sstable != nullptr) { 616 | state.current_sstable->shrink_to_fit(); 617 | state.completed_sstables.push_back(state.current_sstable); 618 | level_bytes_[state.level] += state.current_sstable_size; 619 | stats_[1 + state.level].write(state.current_sstable_size); 620 | } 621 | 622 | // Insert new SSTables into the level. 
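// Level-0 SSTables may overlap, so new tables are simply appended; for other
// levels the insertion point is located by key so the level stays sorted.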
623 | if (level0) 624 | levels_[0].insert(levels_[0].end(), state.completed_sstables.begin(), 625 | state.completed_sstables.end()); 626 | else { 627 | auto& level_tables = levels_[state.level]; 628 | std::size_t count = level_tables.size(); 629 | 630 | std::size_t i; 631 | for (i = 0; i < count; i++) { 632 | auto& sstable = *level_tables[i]; 633 | if (state.pending_item->key <= sstable.back().key) break; 634 | } 635 | 636 | level_tables.insert( 637 | std::next(level_tables.begin(), static_cast(i)), 638 | state.completed_sstables.begin(), state.completed_sstables.end()); 639 | } 640 | } 641 | 642 | void LevelDB::find_overlapping_tables( 643 | std::size_t level, const LevelDBKey& first, const LevelDBKey& last, 644 | std::vector& out_sstable_indices) { 645 | assert(level >= 1); 646 | assert(level < levels_.size()); 647 | 648 | // TODO: Use binary search to reduce the search range. 649 | 650 | auto& level_tables = levels_[level]; 651 | std::size_t count = level_tables.size(); 652 | out_sstable_indices.clear(); 653 | 654 | for (std::size_t i = 0; i < count; i++) { 655 | auto& sstable = *level_tables[i]; 656 | if (!(last < sstable.front().key || sstable.back().key < first)) 657 | out_sstable_indices.push_back(i); 658 | } 659 | } 660 | 661 | void LevelDB::compact( 662 | std::size_t level, 663 | const std::vector>& sstable_indices) { 664 | // printf("compact level %zu\n", level); 665 | 666 | // Ensure we have all necessary data structures for the next level. 667 | if (levels_.size() <= level + 1) { 668 | levels_.push_back(sstables_t()); 669 | level_bytes_.push_back(0); 670 | 671 | for (auto i = stats_.size(); i < 2 + level + 1; i++) 672 | stats_.push_back(Stat()); 673 | level_overflows_.push_back(0); 674 | level_compactions_.push_back(0); 675 | level_overlapping_sstables_.push_back(0); 676 | level_overlapping_sstables_false_.push_back(0); 677 | level_sweeps_.push_back(0); 678 | 679 | // E.g., level_size for level-1 = params_.level_size_ratio 680 | // E.g., level_size for level-2 = params_.level_size_ratio * 681 | // params_.growth_factor 682 | uint64_t level_size = params_.level_size_ratio; 683 | for (std::size_t i = 1; i < level + 1; i++) 684 | level_size *= params_.growth_factor; 685 | 686 | if (params_.use_custom_sizes) { 687 | level_size = 0; 688 | std::ifstream ifs("output_sensitivity.txt"); 689 | while (!ifs.eof()) { 690 | std::string line; 691 | std::getline(ifs, line); 692 | 693 | std::istringstream iss(line); 694 | std::vector tokens{std::istream_iterator{iss}, 695 | std::istream_iterator{}}; 696 | 697 | if (tokens.size() < 4) continue; 698 | if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" && 699 | tokens[0] != "sensitivity_log_size_leveldb_best_sizes") 700 | continue; 701 | if (static_cast(atol(tokens[1].c_str())) != 702 | params_.hint_num_unique_keys) 703 | continue; 704 | if (atof(tokens[2].c_str()) != params_.hint_theta) continue; 705 | if (static_cast(atol(tokens[3].c_str())) != 706 | params_.log_size_threshold) 707 | continue; 708 | 709 | assert(level < tokens.size() - 5); 710 | // Assume the item size of 1000 bytes. 711 | level_size = static_cast( 712 | atof(tokens[5 + level].c_str()) * 1000. 
+ 0.5); 713 | break; 714 | } 715 | assert(level_size != 0); 716 | } 717 | printf("level-%zu: max size %lu bytes\n", level + 1, level_size); 718 | level_bytes_threshold_.push_back(level_size); 719 | 720 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear || 721 | params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst) 722 | level_next_compaction_key_.push_back(LevelDBKeyMax); 723 | } 724 | 725 | // Discover SSTables to merge. 726 | std::vector sstable_indices_current; 727 | for (auto& sstable_indices_sub : sstable_indices) 728 | for (auto i : sstable_indices_sub) sstable_indices_current.push_back(i); 729 | 730 | std::vector sstable_indices_next; 731 | LevelDBKey min_key; 732 | LevelDBKey max_key; 733 | if (params_.compaction_mode == LevelDBCompactionMode::kLinear || 734 | params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst || 735 | params_.compaction_mode == LevelDBCompactionMode::kMostNarrow || 736 | params_.compaction_mode == LevelDBCompactionMode::kLeastOverlap || 737 | params_.compaction_mode == LevelDBCompactionMode::kLargestRatio) { 738 | min_key = LevelDBKeyMax; 739 | max_key = LevelDBKeyMin; 740 | for (auto i : sstable_indices_current) { 741 | min_key = std::min(min_key, levels_[level][i]->front().key); 742 | max_key = std::max(max_key, levels_[level][i]->back().key); 743 | } 744 | find_overlapping_tables(level + 1, min_key, max_key, sstable_indices_next); 745 | } else if (params_.compaction_mode == LevelDBCompactionMode::kWholeLevel) { 746 | min_key = LevelDBKeyMin; 747 | max_key = LevelDBKeyMax; 748 | sequence(levels_[level + 1].size(), sstable_indices_next); 749 | } else 750 | assert(false); 751 | 752 | // level_compactions_[level] += sstable_indices_current.size(); 753 | // level_overlapping_sstables_[level] += sstable_indices_next.size(); 754 | 755 | // level_compactions_[level]++; 756 | // level_overlapping_sstables_[level] += 757 | // static_cast(sstable_indices_next.size()) / 758 | // static_cast(sstable_indices_current.size()); 759 | 760 | // TODO: Use LevelDBItem::size instead of the item count. 
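// s0 and s1 below count items in the merged current-level and next-level
// tables; s1 / s0 estimates the per-compaction overlap, and s1_false counts
// next-level items outside [min_key, max_key] that are rewritten only because
// their table's key range straddles the compaction boundary.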
761 | uint64_t s0 = 0; 762 | uint64_t s1 = 0; 763 | uint64_t s1_false = 0; 764 | for (auto i : sstable_indices_current) s0 += levels_[level][i]->size(); 765 | for (auto i : sstable_indices_next) s1 += levels_[level + 1][i]->size(); 766 | for (auto i : sstable_indices_next) 767 | for (auto& item : *levels_[level + 1][i]) 768 | if (item.key < min_key || item.key > max_key) s1_false++; 769 | level_compactions_[level]++; 770 | level_overlapping_sstables_[level] += 771 | static_cast(s1) / static_cast(s0); 772 | level_overlapping_sstables_false_[level] += 773 | static_cast(s1_false) / static_cast(s0); 774 | 775 | // printf("overlapping\n"); 776 | // printf(" level %zu (%zu):", level, levels_[level].size()); 777 | // for (auto i : sstable_indices_current) 778 | // printf(" %zu", i); 779 | // printf("\n level %zu (%zu):", level + 1, levels_[level + 1].size()); 780 | // for (auto i : sstable_indices_next) 781 | // printf(" %zu", i); 782 | // printf("\n"); 783 | 784 | levels_t source_sstables; 785 | if (sstable_indices_next.size() != 0) { 786 | source_sstables.push_back(sstables_t()); 787 | for (auto i : sstable_indices_next) { 788 | source_sstables.back().push_back(levels_[level + 1][i]); 789 | 790 | std::uint64_t sstable_size = 0; 791 | for (auto& item : *source_sstables.back().back()) 792 | sstable_size += item.size & LevelDBItemSizeMask; 793 | level_bytes_[level + 1] -= sstable_size; 794 | stats_[1 + level + 1].read(sstable_size); 795 | stats_[1 + level + 1].del(sstable_size); 796 | } 797 | } 798 | for (auto& sstable_indices_sub : sstable_indices) { 799 | source_sstables.push_back(sstables_t()); 800 | for (auto i : sstable_indices_sub) { 801 | source_sstables.back().push_back(levels_[level][i]); 802 | 803 | std::uint64_t sstable_size = 0; 804 | for (auto& item : *source_sstables.back().back()) 805 | sstable_size += item.size & LevelDBItemSizeMask; 806 | level_bytes_[level] -= sstable_size; 807 | // We are reading from level, but let level+1 have the numbers to follow 808 | // the convention used in the analysis 809 | // stats_[1 + level].read(sstable_size); 810 | stats_[1 + level + 1].read(sstable_size); 811 | stats_[1 + level].del(sstable_size); 812 | } 813 | } 814 | 815 | { 816 | std::sort(sstable_indices_current.begin(), sstable_indices_current.end()); 817 | std::reverse(sstable_indices_current.begin(), 818 | sstable_indices_current.end()); 819 | for (auto i : sstable_indices_current) remove_sstable(level, i); 820 | 821 | std::reverse(sstable_indices_next.begin(), sstable_indices_next.end()); 822 | for (auto i : sstable_indices_next) remove_sstable(level + 1, i); 823 | } 824 | 825 | merge_sstables(source_sstables, level + 1); 826 | 827 | // Delete old SSTables. 
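// (remove_sstable() only unlinks tables from their level; their memory is
// released here.)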
828 | for (auto& sstables : source_sstables)
829 | for (auto& sstable : sstables) delete sstable;
830 | }
831 | 
832 | LevelDB::sstable_t* LevelDB::remove_sstable(std::size_t level,
833 | std::size_t idx) {
834 | sstable_t* t = levels_[level][idx];
835 | 
836 | for (auto j = idx; j < levels_[level].size() - 1; j++)
837 | levels_[level][j] = levels_[level][j + 1];
838 | levels_[level].pop_back();
839 | 
840 | return t;
841 | }
842 | 
--------------------------------------------------------------------------------
/leveldb.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "common.h"
4 | #include "stat.h"
5 | #include <vector>
6 | 
7 | // #define LEVELDB_TRACK_VERSION
8 | 
9 | typedef uint32_t LevelDBKey;
10 | static const uint32_t LevelDBKeyMin = 0;
11 | static const uint32_t LevelDBKeyMax = static_cast<uint32_t>(-1);
12 | 
13 | enum class LevelDBCompactionMode {
14 | // LevelDB's default compaction; pick one SSTable and pick the next linearly.
15 | kLinear = 0,
16 | // Similar to above but remember the first key of the next available SSTable
17 | // instead of the last key of the compacted SSTable.
18 | kLinearNextFirst = 1,
19 | // Pick an SSTable with the narrowest key range.
20 | kMostNarrow = 2,
21 | // Pick an SSTable with the fewest next-level SSTables that overlap
22 | // with it.
23 | kLeastOverlap = 3,
24 | // Pick an SSTable whose size ratio to the next-level overlapping SSTables'
25 | // size (potentially the inverse of write amplification) is the greatest; this
26 | // is similar to HyperLevelDB's strategy (see VersionSet::PickCompaction() in
27 | // HyperLevelDB/db/version_set.cc).
28 | kLargestRatio = 4,
29 | // Always compact the whole level (like LSM-tree).
30 | kWholeLevel = 5,
31 | 
32 | // RocksDB - Pick an SSTable whose size is the maximum (default) + 1
33 | // compaction thread
34 | kRocksDBMaxSize = 6,
35 | // RocksDB - Pick an SSTable in the same way as kLinear + 1 compaction thread
36 | kRocksDBLinear = 7,
37 | // RocksDB - kRocksDBMaxSize + 4 compaction threads
38 | kRocksDBMaxSizeMT = 8,
39 | // RocksDB - kRocksDBLinear + 4 compaction threads
40 | kRocksDBLinearMT = 9,
41 | 
42 | // RocksDB - Universal Compaction
43 | kRocksDBUniversal = 10,
44 | };
45 | 
46 | struct LevelDBParams {
47 | // When a log file exceeds this size, a new Level-0 SSTable is created, and a
48 | // new log file is created.
49 | uint64_t log_size_threshold;
50 | // When level 0 ("young") has this many SSTables, all of them are merged
51 | // into the next level.
52 | uint64_t level0_sstable_count_threshold;
53 | // When an SSTable file exceeds this size, a new SSTable is created.
54 | uint64_t sstable_size_threshold;
55 | // When a level-L SSTable's key range overlaps with this many level-(L+1)
56 | // SSTables, a new level-L SSTable is created.
57 | uint64_t sstable_overlap_threshold;
58 | // When the level L is (growth factor)^L * (level size ratio) bytes big, a
59 | // level-L SSTable and all overlapping level-(L+1) SSTables are merged and
60 | // form new level-(L+1) SSTables. The level-L SSTable is chosen in a
61 | // round-robin way.
62 | uint64_t growth_factor;
63 | // The size of level 1.
64 | uint64_t level_size_ratio;
65 | 
66 | // The compaction mode.
67 | LevelDBCompactionMode compaction_mode;
68 | 
69 | // Use custom level sizes.
70 | bool use_custom_sizes;
71 | // Hints used for custom_sizes
72 | uint64_t hint_num_unique_keys;
73 | double hint_theta;
74 | 
75 | // Enable fsync for implementation-based tests.
76 | bool enable_fsync; 77 | 78 | LevelDBParams() { 79 | log_size_threshold = 80 | 4 * 1048576; // write_buffer_size (include/leveldb/options.h) 81 | level0_sstable_count_threshold = 82 | 4; // When LevelDB triggers compaction (db/dbformat.h) 83 | // level0_sstable_count_threshold = 8; // When LevelDB slows down new 84 | // insertion 85 | // level0_sstable_count_threshold = 12; // When LevelDB stops handling 86 | // new insertion 87 | sstable_size_threshold = 88 | 2 * 1048576; // kTargetFileSize (db/version_set.cc) 89 | sstable_overlap_threshold = 90 | 10; // kMaxGrandParentOverlapBytes (db/version_set.cc) 91 | growth_factor = 10; // MaxBytesForLevel() (db/version_set.cc) 92 | level_size_ratio = 10 * 1048576; // MaxBytesForLevel() (db/version_set.cc) 93 | 94 | use_custom_sizes = false; 95 | hint_num_unique_keys = 0; 96 | hint_theta = 0.; 97 | 98 | enable_fsync = false; 99 | } 100 | }; 101 | 102 | struct LevelDBItem { 103 | LevelDBKey key; 104 | uint32_t size; 105 | #ifdef LEVELDB_TRACK_VERSION 106 | uint64_t version; 107 | #endif 108 | }; 109 | 110 | static const uint32_t LevelDBItemSizeMask = 0x7fffffffU; 111 | static const uint32_t LevelDBItemDeletion = 0x80000010U; 112 | 113 | // A LevelDB simulation based on 114 | // https://leveldb.googlecode.com/svn/trunk/doc/impl.html 115 | class LevelDB { 116 | public: 117 | LevelDB(const LevelDBParams& params, std::vector& stats); 118 | ~LevelDB(); 119 | 120 | // Prints the summary of the store. 121 | void print_status() const; 122 | 123 | // Writes the current items in the store to the file. 124 | void dump_state(FILE* fp) const; 125 | 126 | // Puts a new item in the store. 127 | void put(LevelDBKey key, uint32_t item_size); 128 | 129 | // Deletes an item from the store. 130 | void del(LevelDBKey key); 131 | 132 | // Gets an item from the store. 133 | uint64_t get(LevelDBKey key); 134 | 135 | // Forces compaction until there is no SSTable except the last level. 136 | void force_compact(); 137 | 138 | typedef std::vector sstable_t; 139 | typedef std::vector sstables_t; 140 | typedef std::vector levels_t; 141 | 142 | // typedef std::vector item_ptr_t; 143 | 144 | protected: 145 | // Adds a new item to the log. 146 | void append_to_log(const LevelDBItem& item); 147 | 148 | // Flushes all in-memory data to disk. This effectively creates new level-0 149 | // SSTables from the Memtable. 150 | void flush_log(); 151 | 152 | // Deletes the log. 153 | void delete_log(); 154 | 155 | // Sorts items in place. 156 | void sort_items(sstable_t& items); 157 | 158 | // Merges SSTables and emits SSTable in the specified level. Items at a later 159 | // position take precedence. 160 | void merge_sstables(const levels_t& source_sstables, std::size_t level); 161 | 162 | // Check if we need new compaction. 163 | void check_compaction(std::size_t level); 164 | 165 | // Pushes items to a level, creating SSTables. 166 | struct push_state { 167 | std::size_t level; 168 | 169 | const LevelDBItem* pending_item; 170 | 171 | sstable_t* current_sstable; 172 | 173 | uint64_t current_sstable_size; 174 | bool use_split_key; 175 | LevelDBKey split_key; 176 | 177 | sstables_t completed_sstables; 178 | }; 179 | void push_init(push_state& state, std::size_t level); 180 | void push_items(push_state& state, const sstable_t& sstable, 181 | std::size_t start, std::size_t end); 182 | void push_flush(push_state& state); 183 | 184 | // Finds all overlapping SSTables in the level. 
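// A table [front, back] overlaps [first, last] iff
// !(last < front || back < first).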
185 | void find_overlapping_tables(std::size_t level, const LevelDBKey& first,
186 | const LevelDBKey& last,
187 | std::vector<std::size_t>& out_sstable_indices);
188 | 
189 | // Performs compaction with SSTables from the level and all overlapping
190 | // SSTables in the next level.
191 | void compact(std::size_t level,
192 | const std::vector<std::vector<std::size_t>>& sstable_indices);
193 | 
194 | // Removes an SSTable from the level. This does not release the memory
195 | // used by the SSTable.
196 | sstable_t* remove_sstable(std::size_t level, std::size_t idx);
197 | 
198 | // Writes an item list to the file.
199 | static void dump_state(FILE* fp, const sstable_t& l);
200 | static void dump_state(FILE* fp, const LevelDBItem& item);
201 | 
202 | private:
203 | LevelDBParams params_;
204 | std::vector<Stat>& stats_;
205 | sstable_t log_;
206 | uint64_t log_bytes_;
207 | levels_t levels_;
208 | std::vector<uint64_t> level_bytes_;
209 | std::vector<uint64_t> level_bytes_threshold_;
210 | // for LevelDBCompactionMode::kLinear and
211 | // LevelDBCompactionMode::kLinearNextFirst
212 | std::vector<LevelDBKey> level_next_compaction_key_;
213 | uint64_t inserts_;
214 | std::vector<uint64_t> level_overflows_;
215 | std::vector<uint64_t> level_compactions_;
216 | std::vector<double> level_overlapping_sstables_;
217 | std::vector<double> level_overlapping_sstables_false_;
218 | std::vector<uint64_t> level_sweeps_;
219 | uint64_t next_version_;
220 | };
221 | 
--------------------------------------------------------------------------------
/leveldb_impl.cpp:
--------------------------------------------------------------------------------
1 | #include "leveldb_impl.h"
2 | #include "leveldb/db.h"
3 | #include "leveldb/env.h"
4 | #include <fstream>
5 | #include <sstream>
6 | #include <iterator>
7 | #include <sys/types.h>
8 | #include <sys/stat.h>
9 | #include <unistd.h>
10 | 
11 | #define OVERRIDE override
12 | // #define OVERRIDE
13 | 
14 | // A wrapper for SequentialFile that forwards the data read information to
15 | // LevelDBImpl.
16 | class LevelDBSequentialFile : public leveldb::SequentialFile {
17 | public:
18 | LevelDBSequentialFile(LevelDBImpl* leveldb_impl, leveldb::SequentialFile* t)
19 | : leveldb::SequentialFile(), leveldb_impl_(leveldb_impl), target_(t) {}
20 | 
21 | virtual ~LevelDBSequentialFile() OVERRIDE { delete target_; }
22 | 
23 | virtual leveldb::Status Read(size_t n, leveldb::Slice* result,
24 | char* scratch) OVERRIDE {
25 | leveldb_impl_->Read(n);
26 | return target_->Read(n, result, scratch);
27 | }
28 | 
29 | virtual leveldb::Status Skip(uint64_t n) OVERRIDE { return target_->Skip(n); }
30 | 
31 | private:
32 | class LevelDBImpl* leveldb_impl_;
33 | leveldb::SequentialFile* target_;
34 | };
35 | 
36 | // A wrapper for RandomAccessFile that forwards the data read information to
37 | // LevelDBImpl.
38 | class LevelDBRandomAccessFile : public leveldb::RandomAccessFile {
39 | public:
40 | LevelDBRandomAccessFile(LevelDBImpl* leveldb_impl,
41 | leveldb::RandomAccessFile* t)
42 | : leveldb::RandomAccessFile(), leveldb_impl_(leveldb_impl), target_(t) {}
43 | 
44 | virtual ~LevelDBRandomAccessFile() OVERRIDE { delete target_; }
45 | 
46 | virtual leveldb::Status Read(uint64_t offset, size_t n,
47 | leveldb::Slice* result,
48 | char* scratch) const OVERRIDE {
49 | leveldb_impl_->Read(n);
50 | return target_->Read(offset, n, result, scratch);
51 | }
52 | 
53 | private:
54 | class LevelDBImpl* leveldb_impl_;
55 | leveldb::RandomAccessFile* target_;
56 | };
57 | 
58 | // A wrapper for WritableFile that forwards the data append information to
59 | // LevelDBImpl.
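// Unlike the read-side wrappers, this one also intercepts Sync() so fsync can
// be skipped for faster experiments (see LevelDBParams::enable_fsync).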
60 | class LevelDBWritableFile : public leveldb::WritableFile { 61 | public: 62 | LevelDBWritableFile(LevelDBImpl* leveldb_impl, leveldb::WritableFile* t) 63 | : leveldb::WritableFile(), leveldb_impl_(leveldb_impl), target_(t) {} 64 | 65 | virtual ~LevelDBWritableFile() OVERRIDE { delete target_; } 66 | 67 | virtual leveldb::Status Append(const leveldb::Slice& data) OVERRIDE { 68 | leveldb_impl_->Append(data.size()); 69 | return target_->Append(data); 70 | } 71 | 72 | virtual leveldb::Status Close() OVERRIDE { return target_->Close(); } 73 | 74 | virtual leveldb::Status Flush() OVERRIDE { return target_->Flush(); } 75 | 76 | virtual leveldb::Status Sync() OVERRIDE { 77 | if (leveldb_impl_->params_.enable_fsync) 78 | return target_->Sync(); 79 | else { 80 | // Let's ignore Sync() for faster experiments. 81 | return leveldb::Status::OK(); 82 | } 83 | } 84 | 85 | private: 86 | class LevelDBImpl* leveldb_impl_; 87 | leveldb::WritableFile* target_; 88 | }; 89 | 90 | // A wrapper for Env that forwards the file deletion information to LevelDBImpl. 91 | class LevelDBEnv : public leveldb::EnvWrapper { 92 | public: 93 | LevelDBEnv(LevelDBImpl* leveldb_impl) 94 | : leveldb::EnvWrapper(leveldb::Env::Default()), 95 | leveldb_impl_(leveldb_impl) {} 96 | 97 | virtual ~LevelDBEnv() OVERRIDE {} 98 | 99 | virtual leveldb::Status NewSequentialFile( 100 | const std::string& f, leveldb::SequentialFile** r) OVERRIDE { 101 | leveldb::Status status = target()->NewSequentialFile(f, r); 102 | if (*r != NULL) *r = new LevelDBSequentialFile(leveldb_impl_, *r); 103 | return status; 104 | } 105 | 106 | virtual leveldb::Status NewRandomAccessFile( 107 | const std::string& f, leveldb::RandomAccessFile** r) OVERRIDE { 108 | leveldb::Status status = target()->NewRandomAccessFile(f, r); 109 | if (*r != NULL) *r = new LevelDBRandomAccessFile(leveldb_impl_, *r); 110 | return status; 111 | } 112 | 113 | virtual leveldb::Status NewWritableFile(const std::string& f, 114 | leveldb::WritableFile** r) OVERRIDE { 115 | leveldb::Status status = target()->NewWritableFile(f, r); 116 | if (*r != NULL) *r = new LevelDBWritableFile(leveldb_impl_, *r); 117 | return status; 118 | } 119 | 120 | virtual leveldb::Status DeleteFile(const std::string& f) OVERRIDE { 121 | struct stat st; 122 | memset(&st, 0, sizeof(st)); 123 | // XXX: The file length *might* not be as large as its actual content 124 | // because the directory metadata can be updated later than the appends. 125 | int ret = stat(f.c_str(), &st); 126 | if (ret == 0) leveldb_impl_->Delete(static_cast(st.st_size)); 127 | 128 | return target()->DeleteFile(f); 129 | } 130 | 131 | private: 132 | class LevelDBImpl* leveldb_impl_; 133 | }; 134 | 135 | LevelDBImpl::LevelDBImpl(const LevelDBParams& params, std::vector& stats) 136 | : params_(params), stats_(stats) { 137 | stats_.push_back(Stat()); 138 | 139 | pthread_mutex_init(&stats_mutex_, NULL); 140 | read_ = 0; 141 | appended_ = 0; 142 | 143 | // Clean up old files. 144 | leveldb::DestroyDB("leveldb_files", leveldb::Options()); 145 | 146 | options_ = new leveldb::Options(); 147 | 148 | options_->create_if_missing = true; 149 | 150 | // Turn off Snappy. 151 | options_->compression = leveldb::CompressionType::kNoCompression; 152 | 153 | // Use our Env to gather statistics. 154 | options_->env = new LevelDBEnv(this); 155 | 156 | // Limit the max open file count. 157 | options_->max_open_files = 900; 158 | 159 | // Configure the write buffer size. 
160 |   options_->write_buffer_size = params.log_size_threshold;
161 | 
162 |   // Do not overload insert.
163 |   // These are hardcoded in leveldb/db/dbformat.h
164 |   // options_->level0_file_num_compaction_trigger = 4;
165 |   // options_->level0_slowdown_writes_trigger = 4;
166 |   // options_->level0_stop_writes_trigger = 4;
167 | 
168 |   // Use custom level sizes.
169 |   if (params_.use_custom_sizes) {
170 |     std::size_t* custom_level_sizes = new std::size_t[20];
171 | 
172 |     std::ifstream ifs("output_sensitivity.txt");
173 |     while (!ifs.eof()) {
174 |       std::string line;
175 |       std::getline(ifs, line);
176 | 
177 |       std::istringstream iss(line);
178 |       std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
179 |                                       std::istream_iterator<std::string>{}};
180 | 
181 |       if (tokens.size() < 5) continue;
182 |       if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" &&
183 |           tokens[0] != "sensitivity_log_size_leveldb_best_sizes")
184 |         continue;
185 |       if (static_cast<uint64_t>(atol(tokens[1].c_str())) !=
186 |           params_.hint_num_unique_keys)
187 |         continue;
188 |       if (atof(tokens[2].c_str()) != params_.hint_theta) continue;
189 |       if (static_cast<uint64_t>(atol(tokens[3].c_str())) !=
190 |           params_.log_size_threshold)
191 |         continue;
192 | 
193 |       options_->custom_level_size_count = tokens.size() - 5 + 1;
194 | 
195 |       custom_level_sizes[0] = 0;
196 |       std::size_t level;
197 |       for (level = 1; level < options_->custom_level_size_count; level++) {
198 |         custom_level_sizes[level] = static_cast<std::size_t>(
199 |             atof(tokens[5 + level - 1].c_str()) * 1000. + 0.5);
200 |         printf("level-%zu: %zu\n", level, custom_level_sizes[level]);
201 |       }
202 |       // Make the last level very large so that it does not spill.
203 |       level--;
204 |       custom_level_sizes[level] = 1000000000000000LU;
205 |       printf("level-%zu: %zu (expanded)\n", level, custom_level_sizes[level]);
206 |       printf("\n");
207 |       break;
208 |     }
209 |     assert(options_->custom_level_size_count != 0);
210 | 
211 |     options_->custom_level_sizes = custom_level_sizes;
212 |   }
213 | 
214 |   leveldb::Status status = leveldb::DB::Open(*options_, "leveldb_files", &db_);
215 |   if (!status.ok()) {
216 |     printf("%s\n", status.ToString().c_str());
217 |     assert(false);
218 |   }
219 | 
220 |   memset(value_buf_, 0, sizeof(value_buf_));
221 | }
222 | 
223 | LevelDBImpl::~LevelDBImpl() {
224 |   delete db_;
225 | 
226 |   delete options_->env;
227 |   if (params_.use_custom_sizes) delete[] options_->custom_level_sizes;
228 |   delete options_;
229 | 
230 |   pthread_mutex_destroy(&stats_mutex_);
231 | }
232 | 
233 | void LevelDBImpl::print_status() const {
234 |   // Force updating stats.
235 |   const_cast<LevelDBImpl*>(this)->Delete(0);
236 | }
237 | 
238 | void LevelDBImpl::dump_state(FILE* fp) const {
239 |   // TODO: Implement.
240 |   (void)fp;
241 | }
242 | 
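// Note on the custom-size input (inferred from the parser in the constructor
// above; the concrete values below are illustrative, not from a real run):
// each matching line in output_sensitivity.txt is whitespace-separated as
//
//   sensitivity_log_size_leveldb_best_sizes <num-unique-keys> <theta> \
//       <log-size> <skipped> <level-1-KB> <level-2-KB> ...
//
// tokens[4] is unused, each remaining token is multiplied by 1000 to get the
// level size in bytes, and the last level is then overridden to be
// effectively unbounded.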
243 | void LevelDBImpl::put(LevelDBKey key, uint32_t item_size) {
244 |   // LevelDB includes the full SSTable file size when calculating the level
245 |   // size;
246 |   // we account for the average space overhead per item in LevelDB so that
247 |   // the average stored size becomes similar to item_size.
248 |   const uint32_t overhead = 18;
249 | 
250 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
251 |   uint32_t value_size =
252 |       static_cast<uint32_t>(static_cast<std::size_t>(item_size) - sizeof(key)) -
253 |       overhead;
254 |   assert(value_size < sizeof(value_buf_));
255 |   leveldb::Slice s_value(value_buf_, value_size);
256 | 
257 |   leveldb::Status status = db_->Put(leveldb::WriteOptions(), s_key, s_value);
258 |   if (!status.ok()) {
259 |     printf("%s\n", status.ToString().c_str());
260 |     assert(false);
261 |   }
262 | }
263 | 
264 | void LevelDBImpl::del(LevelDBKey key) {
265 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
266 | 
267 |   leveldb::Status status = db_->Delete(leveldb::WriteOptions(), s_key);
268 |   if (!status.ok()) {
269 |     printf("%s\n", status.ToString().c_str());
270 |     assert(false);
271 |   }
272 | }
273 | 
274 | uint64_t LevelDBImpl::get(LevelDBKey key) {
275 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
276 |   std::string s_value;
277 |   uint64_t value;
278 | 
279 |   leveldb::Status status = db_->Get(leveldb::ReadOptions(), s_key, &s_value);
280 |   if (!status.ok()) {
281 |     printf("%s\n", status.ToString().c_str());
282 |     assert(false);
283 |   }
284 |   assert(s_value.size() >= sizeof(uint64_t));
285 |   value = *reinterpret_cast<const uint64_t*>(s_value.data());
286 |   return value;
287 | }
288 | 
289 | void LevelDBImpl::force_compact() {
290 |   db_->CompactRange(NULL, NULL);
291 | 
292 |   // Force stat update.
293 |   Delete(0);
294 | }
295 | 
296 | void LevelDBImpl::Read(std::size_t len) { __sync_fetch_and_add(&read_, len); }
297 | 
298 | void LevelDBImpl::Append(std::size_t len) {
299 |   __sync_fetch_and_add(&appended_, len);
300 | }
301 | 
302 | void LevelDBImpl::Delete(std::size_t len) {
303 |   uint64_t read = read_;
304 |   __sync_fetch_and_sub(&read_, read);
305 |   uint64_t appended = appended_;
306 |   __sync_fetch_and_sub(&appended_, appended);
307 | 
308 |   pthread_mutex_lock(&stats_mutex_);
309 |   if (read != 0) stats_.back().read(read);
310 |   if (appended != 0) stats_.back().write(appended);
311 |   if (len != 0) stats_.back().del(len);
312 |   pthread_mutex_unlock(&stats_mutex_);
313 | }
314 | 
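Read(), Append(), and Delete() above form a small instrumentation pattern: the hot I/O path only bumps lock-free counters, and Delete() (also invoked with len 0 by print_status() to force a flush) drains them into the shared Stat under a mutex. A minimal self-contained sketch of the same accumulate-then-flush idea (the IoCounter type and its names are illustrative, not from this repository):

    #include <pthread.h>
    #include <stdint.h>

    struct IoCounter {
      volatile uint64_t pending = 0;                   // bumped on the I/O path
      uint64_t total = 0;                              // protected by mu
      pthread_mutex_t mu = PTHREAD_MUTEX_INITIALIZER;

      void add(uint64_t n) { __sync_fetch_and_add(&pending, n); }  // hot path

      void flush() {                       // cold path
        uint64_t n = pending;              // snapshot, then subtract what we saw
        __sync_fetch_and_sub(&pending, n);
        pthread_mutex_lock(&mu);
        total += n;
        pthread_mutex_unlock(&mu);
      }
    };

Subtracting only the snapshot (rather than zeroing) keeps any bytes that arrive between the snapshot and the subtraction for the next flush.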
--------------------------------------------------------------------------------
/leveldb_impl.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "leveldb.h"
4 | #include <pthread.h>
5 | 
6 | namespace leveldb {
7 | // For forward declaration.
8 | class DB;
9 | class Options;
10 | }
11 | 
12 | // An interface to the LevelDB implementation
13 | class LevelDBImpl {
14 |   friend class LevelDBSequentialFile;
15 |   friend class LevelDBRandomAccessFile;
16 |   friend class LevelDBWritableFile;
17 |   friend class LevelDBEnv;
18 | 
19 |  public:
20 |   LevelDBImpl(const LevelDBParams& params, std::vector<Stat>& stats);
21 |   ~LevelDBImpl();
22 | 
23 |   // Prints the summary of the store.
24 |   void print_status() const;
25 | 
26 |   // Writes the current items in the store to the file.
27 |   void dump_state(FILE* fp) const;
28 | 
29 |   // Puts a new item in the store.
30 |   void put(LevelDBKey key, uint32_t item_size);
31 | 
32 |   // Deletes an item from the store.
33 |   void del(LevelDBKey key);
34 | 
35 |   // Gets an item from the store.
36 |   uint64_t get(LevelDBKey key);
37 | 
38 |   // Forces compaction until there is no SSTable except in the last level.
39 |   void force_compact();
40 | 
41 |  protected:
42 |   void Read(std::size_t len);
43 |   void Append(std::size_t len);
44 |   void Delete(std::size_t len);
45 | 
46 |  private:
47 |   LevelDBParams params_;
48 |   std::vector<Stat>& stats_;
49 | 
50 |   leveldb::Options* options_;
51 |   leveldb::DB* db_;
52 | 
53 |   pthread_mutex_t stats_mutex_;
54 |   volatile uint64_t read_;
55 |   volatile uint64_t appended_;
56 | 
57 |   char value_buf_[1024];
58 | };
59 | 
--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
1 | #include "common.h"
2 | #include "util.h"
3 | #include "zipf.h"
4 | #include "leveldb.h"
5 | #include "leveldb_impl.h"
6 | #include "rocksdb_impl.h"
7 | #include "meshdb.h"
8 | #include <sys/time.h>
9 | 
10 | enum class ActiveKeyMode {
11 |   kEntire = 0,
12 |   kClustered = 1,
13 |   kScattered = 2,
14 | };
15 | 
16 | enum class DependencyMode {
17 |   kIndependent = 0,
18 |   kClustered = 1,
19 |   kScattered = 2,
20 |   kSequential = 3,
21 | };
22 | 
23 | /*
24 | class ItemLifetimeInfo : public MeshDBItemLifetimeInfo {
25 |  public:
26 |   ItemLifetimeInfo(const zipf_gen_state& zipf_state, std::size_t
27 |       num_unique_keys, const std::vector<uint32_t>& keys) {
28 |     item_class_.resize(num_unique_keys);
29 |     item_lifetime_.resize(num_unique_keys);
30 |     class_lifetime_.resize(3);
31 | 
32 |     std::vector<double> item_prob;
33 |     item_prob.resize(num_unique_keys);
34 | 
35 |     std::vector<double> class_prob_sum;
36 |     std::vector<std::size_t> class_count;
37 |     class_prob_sum.resize(3);
38 |     class_count.resize(3);
39 | 
40 |     double prob_sum = 0;
41 |     for (std::size_t i = 0; i < num_unique_keys; i++) {
42 |       uint64_t key = static_cast<uint64_t>(keys[i]);
43 |       double prob = zipf_prob(&zipf_state, i);
44 |       item_prob[key] = prob;
45 |       prob_sum += prob;
46 |     }
47 | 
48 |     for (std::size_t key = 0; key < num_unique_keys; key++) {
49 |       double prob = item_prob[key];
50 |       uint64_t lifetime = static_cast<uint64_t>(prob_sum / prob);
51 |       // ^ the inverse of the actual probability
52 |       // if (key < 100)
53 |       //   printf("prob=%lf prob_sum=%lf lifetime=%zu\n", prob,
54 |       //          prob_sum, lifetime);
55 | 
56 |       std::size_t item_class = 0;
57 |       if (lifetime < 100000)
58 |         item_class = 0;
59 |       else if (lifetime < 500000)
60 |         item_class = 1;
61 |       else
62 |         item_class = 2;
63 | 
64 |       item_class_[key] = item_class;
65 |       item_lifetime_[key] = lifetime;
66 | 
67 |       class_prob_sum[item_class] += prob;
68 |       class_count[item_class]++;
69 |     }
70 | 
71 |     for (std::size_t i = 0; i < 3; i++) {
72 |       class_lifetime_[i] = static_cast<uint64_t>(prob_sum /
73 |           (class_prob_sum[i] / static_cast<double>(class_count[i])));
74 |       printf("class_count[%zu]=%zu\n", i, class_count[i]);
75 |       printf("class_lifetime[%zu]=%zu\n", i, class_lifetime_[i]);
76 |     }
77 |   }
78 | 
79 |   virtual ~ItemLifetimeInfo() {}
80 |   virtual std::size_t item_class(MeshDBKey key) { return item_class_[key]; }
81 |   virtual uint64_t item_lifetime(MeshDBKey key) { return item_lifetime_[key];
82 |   }
83 |   virtual uint64_t class_lifetime(std::size_t lifetime_class) { return
84 |       class_lifetime_[lifetime_class]; }
85 | 
86 |  private:
87 |   std::vector<std::size_t> item_class_;
88 |   std::vector<uint64_t> item_lifetime_;
89 |   std::vector<uint64_t> class_lifetime_;
90 | };
91 | */
92 | 
93 | void print_stats(std::vector<Stat>& stats, uint64_t insert_bytes) {
94 |   double wa_r_sum = 0.;
95 |   double wa_w_sum = 0.;
96 |   for (std::size_t i = 0; i < stats.size(); i++) {
97 |     if (i == 0)
98 |       printf("<overall> stats\n");
99 |     else
100 |       printf("<level %zu> stats\n", i - 1);
101 |     stats[i].print_status();
102 |     double wa_r = static_cast<double>(stats[i].read_bytes()) /
103 |                   static_cast<double>(insert_bytes);
104 |     double wa_w = static_cast<double>(stats[i].write_bytes()) /
105 |                   static_cast<double>(insert_bytes);
106 |     printf("WA_r: %5.2lf\n", wa_r);
107 |     printf("WA_w: %5.2lf\n", wa_w);
108 |     wa_r_sum += wa_r;
109 |     wa_w_sum += wa_w;
110 |   }
111 |   printf("WA_r sum: %5.2lf\n", wa_r_sum);
112 |   printf("WA_w sum: %5.2lf\n", wa_w_sum);
113 | }
114 | 
115 | uint64_t get_usec() {
116 |   struct timeval tv_now;
117 |   gettimeofday(&tv_now, NULL);
118 | 
119 |   return (uint64_t)tv_now.tv_sec * 1000000UL + (uint64_t)tv_now.tv_usec;
120 | }
121 | 
122 | template <class StoreType>
123 | void test(const char* store_type_name, uint32_t num_unique_keys,
124 |           ActiveKeyMode active_key_mode, DependencyMode dependency_mode,
125 |           uint64_t num_requests, double theta,
126 |           LevelDBCompactionMode compaction_mode, uint64_t wb_size,
127 |           bool enable_fsync, bool use_custom_sizes,
128 |           const std::vector<uint64_t>& dump_points) {
129 |   // The number of unique keys.
130 |   // uint32_t num_unique_keys = 2 * 1000 * 1000;
131 |   // The item size.
132 |   uint32_t item_size = 1000;
133 |   // The number of requests.
134 |   // uint64_t num_requests = 20 * 1000 * 1000;
135 |   // The skew of key popularity. -1. = uniform, no randomness; 0. = uniform;
136 |   // 0.99 = skewed; 40. = one key
137 |   // double theta = -1.;
138 |   // double theta = 0.;
139 |   // double theta = 0.99;
140 | 
141 |   printf("store_type=%s\n", store_type_name);
142 |   printf("num_unique_keys=%u\n", num_unique_keys);
143 |   printf("active_key_mode=%u\n", static_cast<unsigned>(active_key_mode));
144 |   printf("dependency_mode=%u\n", static_cast<unsigned>(dependency_mode));
145 |   printf("item_size=%u\n", item_size);
146 |   printf("num_requests=%lu\n", num_requests);
147 |   printf("theta=%lf\n", theta);
148 |   printf("compaction_mode=%u\n", static_cast<unsigned>(compaction_mode));
149 |   printf("wb_size=%lu\n", wb_size);
150 |   printf("enable_fsync=%s\n", enable_fsync ? "1" : "0");
151 |   printf("use_custom_sizes=%s\n", use_custom_sizes ? "1" : "0");
152 |   printf("\n");
153 |   fflush(stdout);
154 | 
155 |   bool verbose = true;
156 |   // bool verbose = false;
157 | 
158 |   // Generate keys.
159 |   // Uses uint32_t instead of uint64_t to reduce cache pollution.
160 |   // TODO: Use hashing instead of the shuffled key array.
161 |   std::vector<uint32_t> keys;
162 |   // assert(num_unique_keys < (1UL << 32));
163 |   sequence(num_unique_keys, keys);
164 |   // Comment this out to disable hashing.
165 |   shuffle(keys);
166 | 
167 |   // Initialize request generation.
168 |   zipf_gen_state zipf_state;
169 |   zipf_init(&zipf_state, static_cast<uint64_t>(num_unique_keys), theta, 1);
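  // Illustrative aside (not part of the original logic): each request below
  // draws a Zipf-distributed rank and maps it through the shuffled key array,
  //
  //   uint64_t rank = zipf_next(&zipf_state);  // rank 0 is the most popular
  //   uint32_t key = keys[rank];               // shuffle scatters hot ranks
  //
  // so popularity is skewed (theta=0.99) or uniform (theta=0.) while hot keys
  // stay spread across the key space; per the comment above, theta=-1. is
  // uniform without randomness.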
170 | 
171 |   // ItemLifetimeInfo lifetime_info(zipf_state, num_unique_keys, keys);
172 |   // for (std::size_t i = 0; i < 4; i++)
173 |   //   printf("class_lifetime(%zu)=%lu\n", i,
174 |   //          lifetime_info.class_lifetime(i));
175 |   // printf("item_class(0)=%lu\n", lifetime_info.item_class(keys[0]));
176 |   // printf("item_class(100)=%lu\n", lifetime_info.item_class(keys[100]));
177 |   // printf("item_class(10000)=%lu\n", lifetime_info.item_class(keys[10000]));
178 |   // printf("item_class(1000000)=%lu\n",
179 |   //        lifetime_info.item_class(keys[1000000]));
180 | 
181 |   // Main simulation.
182 |   std::vector<Stat> stats;
183 |   LevelDBParams params;
184 |   params.compaction_mode = compaction_mode;
185 |   params.log_size_threshold = wb_size;
186 |   params.enable_fsync = enable_fsync;
187 |   params.use_custom_sizes = use_custom_sizes;
188 |   params.hint_num_unique_keys = num_unique_keys;
189 |   params.hint_theta = theta;
190 | 
191 |   StoreType store(params, stats);
192 | 
193 |   // MeshDBParams params;
194 |   // MeshDB store(params, stat, &lifetime_info);
195 | 
196 |   // std::size_t next_dump = 0;
197 |   (void)dump_points;
198 | 
199 |   // const uint64_t request_batch_size = 1000000;  // for debugging
200 |   const uint64_t request_batch_size = 10000000;
201 |   uint64_t num_processed_requests;
202 |   uint64_t start_t;
203 | 
204 |   start_t = get_usec();
205 | 
206 |   {
207 |     printf("initial insertion of %u items\n\n", num_unique_keys);
208 |     fflush(stdout);
209 | 
210 |     num_processed_requests = 0;
211 |     uint32_t key = 0;
212 |     while (num_processed_requests < static_cast<uint64_t>(num_unique_keys)) {
213 |       uint64_t this_request_batch_size = request_batch_size;
214 |       if (num_processed_requests + this_request_batch_size > num_unique_keys)
215 |         this_request_batch_size = num_unique_keys - num_processed_requests;
216 | 
217 |       for (uint64_t i = 0; i < this_request_batch_size; i++) {
218 |         // for sequential insert
219 |         store.put(key, item_size);
220 |         // for random insert
221 |         // store.put(keys[key], item_size);
222 |         key++;
223 |       }
224 |       num_processed_requests += this_request_batch_size;
225 | 
226 |       if (verbose) {
227 |         printf("key %lu/%u inserted\n", num_processed_requests,
228 |                num_unique_keys);
229 |         store.print_status();
230 |         print_stats(stats,
231 |                     num_processed_requests * static_cast<uint64_t>(item_size));
232 |         printf("\n");
233 |         fflush(stdout);
234 |       }
235 |     }
236 | 
237 |     printf("key %lu/%u inserted\n", num_processed_requests, num_unique_keys);
238 |     store.print_status();
239 |     print_stats(stats,
240 |                 num_processed_requests * static_cast<uint64_t>(item_size));
241 |     printf("\n");
242 |     fflush(stdout);
243 |   }
244 | 
245 |   printf("elapsed time: %.3lf seconds\n\n",
246 |          (double)(get_usec() - start_t) / 1000000.);
247 | 
248 |   for (auto& stat : stats) stat.reset();
249 | 
250 |   // How small a fraction of the keys is used in the main transaction?
251 |   const uint32_t active_key_factor = 10;
252 | 
253 |   // How many keys are dependent on each other?
254 |   const int dependency_factor = 10;
255 | 
256 |   // Reinitialize request generation.
257 |   uint32_t num_active_keys;
258 |   switch (active_key_mode) {
259 |     case ActiveKeyMode::kEntire:
260 |       num_active_keys = num_unique_keys;
261 |       break;
262 |     case ActiveKeyMode::kClustered:
263 |       num_active_keys = num_unique_keys / active_key_factor;
264 |       sequence(num_active_keys, keys);
265 |       shuffle(keys);
266 |       break;
267 |     case ActiveKeyMode::kScattered:
268 |       num_active_keys = num_unique_keys / active_key_factor;
269 |       break;
270 |     default:
271 |       assert(false);
272 |       return;
273 |   }
274 |   zipf_init(&zipf_state, static_cast<uint64_t>(num_active_keys), theta, 2);
275 | 
276 |   start_t = get_usec();
277 | 
278 |   {
279 |     printf("main transaction of %lu requests\n\n", num_requests);
280 |     fflush(stdout);
281 | 
282 |     num_processed_requests = 0;
283 |     while (num_processed_requests < num_requests) {
284 |       uint64_t this_request_batch_size = request_batch_size;
285 |       if (num_processed_requests + this_request_batch_size > num_requests)
286 |         this_request_batch_size = num_requests - num_processed_requests;
287 | 
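      // Worked example of the modes below (illustrative): with
      // dependency_factor = 10 and a drawn key of 1234567, kClustered rounds
      // down and writes keys 1234560..1234569 back-to-back, whereas
      // kScattered reduces the key modulo key_skip and writes key,
      // key + key_skip, key + 2*key_skip, ... -- the same ten dependent
      // updates, contiguous in one case and strided across the key space in
      // the other.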
288 |       // Process a batch of requests.
289 |       switch (dependency_mode) {
290 |         case DependencyMode::kIndependent: {
291 |           for (uint64_t i = 0; i < this_request_batch_size; i++) {
292 |             uint32_t key = keys[zipf_next(&zipf_state)];
293 |             // uint32_t key = keys[static_cast<uint32_t>(rand()) %
294 |             //                     num_unique_keys];
295 |             // uint32_t key = static_cast<uint32_t>(rand() % num_unique_keys);
296 |             store.put(key, item_size);
297 | 
298 |             /*
299 |             if (next_dump < dump_points.size() && dump_points[next_dump] ==
300 |                 num_processed_requests + i + 1) {
301 |               char filename[1024];
302 |               snprintf(filename, 1024, "output_state_%lu.txt",
303 |                        dump_points[next_dump]);
304 |               FILE* fp_state = fopen(filename, "wt");
305 |               store.dump_state(fp_state);
306 |               fclose(fp_state);
307 |               next_dump++;
308 |             }
309 |             */
310 |           }
311 |         } break;
312 | 
313 |         case DependencyMode::kClustered: {
314 |           this_request_batch_size =
315 |               (this_request_batch_size + dependency_factor - 1) /
316 |               dependency_factor * dependency_factor;
317 |           for (uint64_t i = 0; i < this_request_batch_size;
318 |                i += dependency_factor) {
319 |             uint32_t key = keys[zipf_next(&zipf_state)];
320 |             key = key / dependency_factor * dependency_factor;
321 |             store.put(key, item_size);
322 | 
323 |             for (int j = 1; j < dependency_factor; j++) {
324 |               key++;
325 |               if (key >= num_unique_keys) key -= num_unique_keys;
326 |               store.put(key, item_size);
327 |             }
328 |           }
329 |         } break;
330 | 
331 |         case DependencyMode::kScattered: {
332 |           const uint32_t key_skip = num_unique_keys / dependency_factor;
333 |           this_request_batch_size =
334 |               (this_request_batch_size + dependency_factor - 1) /
335 |               dependency_factor * dependency_factor;
336 |           for (uint64_t i = 0; i < this_request_batch_size;
337 |                i += dependency_factor) {
338 |             uint32_t key = keys[zipf_next(&zipf_state)];
339 |             key = key % key_skip;
340 |             store.put(key, item_size);
341 | 
342 |             for (int j = 1; j < dependency_factor; j++) {
343 |               key += key_skip;
344 |               if (key >= num_unique_keys) key -= num_unique_keys;
345 |               store.put(key, item_size);
346 |             }
347 |           }
348 |         } break;
349 | 
350 |         case DependencyMode::kSequential: {
351 |           for (uint64_t i = 0; i < this_request_batch_size; i++) {
352 |             uint32_t key = static_cast<uint32_t>((num_processed_requests + i) %
353 |                                                  num_active_keys);
354 |             store.put(key, item_size);
355 |           }
356 |         } break;
357 | 
358 |         default:
359 |           assert(false);
360 |           return;
361 |       }
362 |       num_processed_requests += this_request_batch_size;
363 | 
364 |       if (verbose) {
365 |         printf("request %lu/%lu processed\n", num_processed_requests,
366 |                num_requests);
367 |         store.print_status();
368 |         print_stats(stats,
369 |                     num_processed_requests * static_cast<uint64_t>(item_size));
370 |         printf("\n");
371 |         fflush(stdout);
372 |       }
373 |     }
374 | 
375 |     printf("request %lu/%lu processed\n", num_processed_requests, num_requests);
376 |     store.print_status();
377 |     print_stats(stats,
378 |                 num_processed_requests * static_cast<uint64_t>(item_size));
379 |     printf("\n");
380 |     fflush(stdout);
381 |   }
382 | 
383 |   printf("elapsed time: %.3lf seconds\n\n",
384 |          (double)(get_usec() - start_t) / 1000000.);
385 | 
386 |   if (false) {
387 |     printf("forcing compaction\n");
388 |     fflush(stdout);
389 |     store.force_compact();
390 | 
391 |     store.print_status();
392 |     print_stats(stats,
393 |                 num_processed_requests * static_cast<uint64_t>(item_size));
394 |     printf("\n");
395 |     fflush(stdout);
396 |   }
397 | 
398 |   /*
399 |   {
400 |     FILE* fp_state = fopen("output_state_final.txt", "wt");
401 |     store.dump_state(fp_state);
402 |     fclose(fp_state);
403 |   }
404 | 
405 |   // Write the key probability file.
406 |   {
407 |     std::vector<double> prob;
408 |     prob.resize(num_unique_keys);
409 |     for (std::size_t i = 0; i < num_unique_keys; i++) {
410 |       uint64_t key = static_cast<uint64_t>(keys[i]);
411 |       prob[key] = zipf_prob(&zipf_state, i);
412 |     }
413 | 
414 |     FILE* fp_prob = fopen("output_prob.txt", "wt");
415 |     for (std::size_t i = 0; i < num_unique_keys; i++)
416 |       fprintf(fp_prob, "prob:%lf\n", prob[i]);
417 |     fclose(fp_prob);
418 |   }
419 |   */
420 | }
421 | 
422 | int main(int argc, const char* argv[]) {
423 |   if (argc < 11) {
424 |     printf(
425 |         "%s STORE-TYPE NUM-UNIQUE-KEYS ACTIVE-KEY-MODE DEPENDENCY-MODE "
426 |         "NUM-REQUESTS ZIPF-THETA COMPACTION-MODE WB-SIZE ENABLE-FSYNC "
427 |         "USE-CUSTOM-SIZES [DUMP-POINTS]\n",
428 |         argv[0]);
429 |     printf("STORE-TYPE: leveldb-sim, leveldb-impl, rocksdb-impl\n");
430 |     printf("NUM-UNIQUE-KEYS: 1000000, ...\n");
431 |     printf("ACTIVE-KEY-MODE: 0, 1, 2\n");
432 |     printf("DEPENDENCY-MODE: 0, 1, 2, 3\n");
433 |     printf("NUM-REQUESTS: 10000000, ...\n");
434 |     printf("ZIPF-THETA: 0.00, 0.99, ...\n");
435 |     printf("COMPACTION-MODE: 0, 1, 2, ...\n");
436 |     printf("WB-SIZE: 4194304, ...\n");
437 |     printf("ENABLE-FSYNC: 0, 1\n");
438 |     printf("USE-CUSTOM-SIZES: 0, 1\n");
439 |     return 1;
440 |   }
441 |   int store_type;
442 |   if (strcmp(argv[1], "leveldb-sim") == 0)
443 |     store_type = 0;
444 |   else if (strcmp(argv[1], "leveldb-impl") == 0)
445 |     store_type = 1;
446 |   else if (strcmp(argv[1], "rocksdb-impl") == 0)
447 |     store_type = 2;
448 |   else {
449 |     printf("invalid STORE-TYPE\n");
450 |     return 1;
451 |   }
452 | 
453 |   uint32_t num_unique_keys = static_cast<uint32_t>(atoi(argv[2]));
454 |   ActiveKeyMode active_key_mode = static_cast<ActiveKeyMode>(atoi(argv[3]));
455 |   DependencyMode dependency_mode = static_cast<DependencyMode>(atoi(argv[4]));
456 |   uint64_t num_requests = static_cast<uint64_t>(atol(argv[5]));
457 |   double theta = atof(argv[6]);
458 |   LevelDBCompactionMode compaction_mode =
459 |       static_cast<LevelDBCompactionMode>(atoi(argv[7]));
460 |   uint64_t wb_size = static_cast<uint64_t>(atol(argv[8]));
461 |   bool enable_fsync = atoi(argv[9]) != 0;
462 |   bool use_custom_sizes = atoi(argv[10]) != 0;
463 |   std::vector<uint64_t> dump_points;
464 |   for (int i = 11; i < argc; i++)
465 |     dump_points.push_back(static_cast<uint64_t>(atol(argv[i])));
466 | 
467 |   if (store_type == 0)
468 |     test<LevelDB>("leveldb-sim", num_unique_keys, active_key_mode,
469 |                   dependency_mode, num_requests, theta, compaction_mode,
470 |                   wb_size, enable_fsync, use_custom_sizes, dump_points);
471 |   else if (store_type == 1)
472 |     test<LevelDBImpl>("leveldb-impl", num_unique_keys, active_key_mode,
473 |                       dependency_mode, num_requests, theta, compaction_mode,
474 |                       wb_size, enable_fsync, use_custom_sizes, dump_points);
475 |   else if (store_type == 2)
476 |     test<RocksDBImpl>("rocksdb-impl", num_unique_keys, active_key_mode,
477 |                       dependency_mode, num_requests, theta, compaction_mode,
478 |                       wb_size, enable_fsync, use_custom_sizes, dump_points);
479 |   else
480 |     assert(false);
481 | 
482 |   return 0;
483 | }
484 | 
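For reference, a typical invocation of the harness above (argument values are illustrative; the order follows the usage string printed by main()):

    ./main leveldb-impl 2000000 0 0 20000000 0.99 0 4194304 0 0

This runs the LevelDBImpl store over 2M unique keys and 20M Zipf(0.99) requests with a 4 MiB write buffer, fsync disabled, and default level sizes.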
--------------------------------------------------------------------------------
/measure_rw.cpp:
--------------------------------------------------------------------------------
1 | #include "common.h"
2 | #include <stdio.h>
3 | 
4 | #include <fcntl.h>
5 | #include <string.h>
6 | #include <sys/time.h>
7 | #include <sys/types.h>
8 | #include <unistd.h>
9 | 
10 | uint64_t get_usec() {
11 |   struct timeval tv_now;
12 |   gettimeofday(&tv_now, NULL);
13 | 
14 |   return (uint64_t)tv_now.tv_sec * 1000000UL + (uint64_t)tv_now.tv_usec;
15 | }
16 | 
17 | int main(int argc, const char* argv[]) {
18 |   if (argc < 2) {
19 |     printf("%s PATH\n", argv[0]);
20 |     return 1;
21 |   }
22 | 
23 |   // write 1 GiB
24 |   const size_t stride_len = 4 * 1048576;
25 |   const size_t stride_count = 256;
26 | 
27 |   char* bytes = new char[stride_len];
28 |   char* p;
29 | 
30 |   int fd = open(argv[1], O_CREAT | O_RDWR | O_TRUNC, 0644);
31 |   double rw_cost_sum = 0.;
32 | 
33 |   for (int i = 0; i < 10; i++) {
34 |     uint64_t start_t;
35 |     double elapsed;
36 |     size_t remaining_len;
37 | 
38 |     printf("seq %d\n", i + 1);
39 |     fflush(stdout);
40 | 
41 |     memset(bytes, (i % 254) + 1, stride_len);
42 | 
43 |     // write data
44 |     start_t = get_usec();
45 |     lseek(fd, 0, SEEK_SET);
46 |     for (size_t stride = 0; stride < stride_count; stride++) {
47 |       p = bytes;
48 |       remaining_len = stride_len;
49 |       while (remaining_len > 0) {
50 |         // Write only the remaining bytes so that a short write cannot run
51 |         // past the end of the buffer.
52 |         ssize_t wrote_len = write(fd, p, remaining_len);
53 |         if (wrote_len < 0) {
54 |           perror("");
55 |           close(fd);
56 |           return 1;
57 |         }
58 |         p += wrote_len;
59 |         remaining_len -= static_cast<size_t>(wrote_len);
60 |       }
61 |     }
62 |     fdatasync(fd);
63 |     posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
64 |     elapsed = (double)(get_usec() - start_t) / 1000000.;
65 |     double write_tput = (stride_len * stride_count) / elapsed;
66 | 
67 |     // read data
68 |     start_t = get_usec();
69 |     lseek(fd, 0, SEEK_SET);
70 |     for (size_t stride = 0; stride < stride_count; stride++) {
71 |       p = bytes;
72 |       remaining_len = stride_len;
73 |       while (remaining_len > 0) {
74 |         ssize_t read_len = read(fd, p, remaining_len);
75 |         if (read_len < 0) {
76 |           perror("");
77 |           close(fd);
78 |           return 1;
79 |         }
80 |         p += read_len;
81 |         remaining_len -= static_cast<size_t>(read_len);
82 |       }
83 |     }
84 |     elapsed = (double)(get_usec() - start_t) / 1000000.;
85 |     double read_tput = (stride_len * stride_count) / elapsed;
86 | 
87 |     double rw_cost = write_tput / read_tput;
88 |     rw_cost_sum += rw_cost;
89 | 
90 |     printf("write tput = %7.2lf MB/s\n", write_tput / 1000000.);
91 |     printf("read tput  = %7.2lf MB/s\n", read_tput / 1000000.);
92 |     printf("r/w cost   = %7.3lf\n", rw_cost);
93 |     printf("     (avg) = %7.3lf\n", rw_cost_sum / static_cast<double>(i + 1));
94 |     printf("\n");
95 |     fflush(stdout);
96 |   }
97 | 
98 |   close(fd);
99 |   return 0;
100 | }
101 | 
--------------------------------------------------------------------------------
/meshdb.cpp:
--------------------------------------------------------------------------------
1 | #include "meshdb.h"
2 | #include "util.h"
3 | // #include
4 | 
5 | MeshDB::MeshDB(const MeshDBParams& params, Stat& stat,
6 |                MeshDBItemLifetimeInfo* lifetime_info)
7 |     : params_(params), stat_(stat), lifetime_info_(lifetime_info) {
8 |   log_bytes_ = 0;
9 |   next_version_ = 0;
10 |   updates_since_last_compaction_ = 0;
11 |   next_compaction_key_ = 0;
12 |   compaction_rand_seed_ = 0;
13 | 
14 |   for (std::size_t i = 0; i < 1 + num_lifetime_classes; i++)
15 |     level_bytes_[i] = 0;
16 | 
17 |   // for (std::size_t i = 0; i < num_lifetime_classes; i++)
18 |   //   compaction_weight_[i] = 1. / static_cast<double>(num_lifetime_classes);
19 |   compaction_weight_[0] = 1.;
20 |   for (std::size_t i = 1; i < num_lifetime_classes; i++)
21 |     compaction_weight_[i] = compaction_weight_[i - 1] / 10;
22 |   // compaction_weight_[i] = compaction_weight_[i - 1] / 4;
23 | 
24 |   compaction_weight_[0] = 1. / 18000.;
25 |   compaction_weight_[1] = 1. / 130000.;
26 |   compaction_weight_[2] = 1. / 1000000.;
27 | 
28 |   global_mutation_rate_ = 0.5;
29 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
30 |     level_mutation_rate_[i] = 0.5;
31 | 
32 |   for (std::size_t i = 0; i < num_lifetime_classes - 1; i++)
33 |     lifetime_threshold_[i] = static_cast<double>(i + 1);
34 | }
35 | 
36 | MeshDB::~MeshDB() {}
37 | 
38 | void MeshDB::print_status() const {
39 |   printf("log: %zu items, %lu bytes\n", log_.size(), log_bytes_);
40 |   for (std::size_t i = 0; i < 1 + num_lifetime_classes; i++)
41 |     printf("level-%zu: %zu tables, %lu bytes\n", i, levels_[i].size(),
42 |            level_bytes_[i]);
43 | }
44 | 
45 | void MeshDB::dump_state(FILE* fp) const {
46 |   // XXX: The memtable is not dumped for now.
47 |   fprintf(fp, "next_version:%lu\n", next_version_);
48 | 
49 |   fprintf(fp, "log:\n");
50 |   dump_state(fp, log_);
51 | 
52 |   fprintf(fp, "levels:\n");
53 |   for (std::size_t level = 0; level < 1 + num_lifetime_classes; level++) {
54 |     auto& sstables = levels_[level];
55 |     fprintf(fp, "level:\n");
56 |     for (std::size_t i = 0; i < sstables.size(); i++) {
57 |       fprintf(fp, "sstable:\n");
58 |       dump_state(fp, *sstables[i]);
59 |     }
60 |   }
61 | }
62 | 
63 | void MeshDB::dump_state(FILE* fp, const sstable_t& l) {
64 |   for (std::size_t i = 0; i < l.size(); i++) dump_state(fp, l[i]);
65 | }
66 | 
67 | void MeshDB::dump_state(FILE* fp, const MeshDBItem& item) {
68 |   fprintf(fp, "item:%lu,%lu,%lu,%s\n", item.key, item.version, item.size,
69 |           item.deletion ? "T" : "F");
70 | }
71 | 
72 | void MeshDB::put(MeshDBKey key, uint64_t item_size) {
73 |   MeshDBItem item{key, next_version_++, item_size, false};
74 |   append_to_log(item);
75 | }
76 | 
77 | void MeshDB::del(MeshDBKey key) {
78 |   MeshDBItem item{key, next_version_++, 16, true};
79 |   append_to_log(item);
80 | }
81 | 
82 | uint64_t MeshDB::get(MeshDBKey key) {
83 |   // TODO: Implement.
84 |   (void)key;
85 |   return 0;
86 | }
87 | 
88 | void MeshDB::force_compact() {
89 |   flush_log();
90 | 
91 |   std::size_t num_steps = 10;
92 | 
93 |   next_compaction_key_ = 0;
94 |   while (next_compaction_key_ < 2000000) {
95 |     // XXX: hardcoding to set a key range
96 |     for (std::size_t step = 0; step < num_steps; step++) {
97 |       MeshDBKey first = next_compaction_key_;
98 |       MeshDBKey last =
99 |           next_compaction_key_ +
100 |           2000000 / params_.level0_sstable_count_threshold / num_steps;
101 |       next_compaction_key_ +=
102 |           2000000 / params_.level0_sstable_count_threshold / num_steps;
103 | 
104 |       compact(1 + num_lifetime_classes, first, last);
105 |     }
106 |   }
107 |   next_compaction_key_ = 0;
108 | }
109 | 
110 | void MeshDB::append_to_log(const MeshDBItem& item) {
111 |   log_.push_back(item);
112 | 
113 |   // Update statistics.
114 |   auto new_log_bytes = log_bytes_ + item.size;
115 |   // auto log_bytes_d = log_bytes_ / 4096;
116 |   // auto new_log_bytes_d = new_log_bytes / 4096;
117 |   // if (log_bytes_d != new_log_bytes_d) {
118 |   //   // New blocks are written.
119 |   //   stat_.write((new_log_bytes_d - log_bytes_d) * 4096);
120 |   // }
121 |   stat_.write(item.size);
122 |   log_bytes_ = new_log_bytes;
123 | 
124 |   updates_since_last_compaction_ += 1;
125 | 
126 |   if (log_bytes_ > params_.log_size_threshold) flush_log();
127 | }
128 | 
129 | void MeshDB::flush_log() {
130 |   if (log_.size() == 0) return;
131 | 
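  // Sketch of the policy below: one compaction is triggered per flush, and
  // its depth is drawn at random from compaction_weight_. accum_p[i] holds
  // w_i + w_{i+1} + ... + w_{last}, so scanning from the deepest class for
  // the first i with r < accum_p[i] selects depth 2 + i with probability
  // w_i / (w_0 + ... + w_{last}).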
132 |   // Simplified for simulation; a new SSTable is created from the memtable,
133 |   // causing no disk read.
134 |   item_ptr_t items;
135 |   item_ptr_t items2;
136 |   sort_items(log_, items);
137 |   deduplicate_items(items, items2);
138 |   sstable_locs_t new_sstable_locs;
139 |   create_sstables(1, items2, new_sstable_locs);
140 |   delete_log();
141 | 
142 |   double accum_p[num_lifetime_classes];
143 | 
144 |   static int print_count = 0;
145 |   if (print_count++ % 100 == 0) {
146 |     printf("global_mutation_rate=%lf\n", global_mutation_rate_);
147 |     for (std::size_t i = 0; i < num_lifetime_classes; i++)
148 |       printf("level_mutation_rate[%zu]=%lf\n", i, level_mutation_rate_[i]);
149 |     for (std::size_t i = 0; i < num_lifetime_classes; i++)
150 |       printf("compaction_weight[%zu]=%lf\n", i, compaction_weight_[i]);
151 |     // for (std::size_t i = 0; i < num_lifetime_classes - 1; i++)
152 |     //   printf("lifetime_threshold[%zu]=%lf\n", i, lifetime_threshold_[i]);
153 |   }
154 | 
155 |   // std::size_t num_steps = 10;
156 |   std::size_t num_steps = 1;
157 | 
158 |   for (std::size_t step = 0; step < num_steps; step++) {
159 |     double accum_weight = 0.;
160 |     {
161 |       std::size_t lifetime_class = num_lifetime_classes - 1;
162 |       while (true) {
163 |         accum_weight += compaction_weight_[lifetime_class];
164 |         accum_p[lifetime_class] = accum_weight;
165 |         if (lifetime_class == 0) break;
166 |         lifetime_class--;
167 |       }
168 |     }
169 | 
170 |     double r = fast_rand_d(&compaction_rand_seed_) * accum_weight;
171 | 
172 |     std::size_t num_levels = 2;
173 |     {
174 |       std::size_t lifetime_class = num_lifetime_classes - 1;
175 |       while (true) {
176 |         if (r < accum_p[lifetime_class]) {
177 |           num_levels = 2 + lifetime_class;
178 |           break;
179 |         }
180 |         if (lifetime_class == 0) break;
181 |         lifetime_class--;
182 |       }
183 |     }
184 | 
185 |     // XXX: hardcoding to set a key range
186 |     MeshDBKey first = next_compaction_key_;
187 |     MeshDBKey last =
188 |         next_compaction_key_ +
189 |         2000000 / (params_.level0_sstable_count_threshold + 1) / num_steps;
190 |     next_compaction_key_ +=
191 |         2000000 / (params_.level0_sstable_count_threshold + 1) / num_steps;
192 |     if (next_compaction_key_ >= 2000000) next_compaction_key_ = 0;
193 | 
194 |     bool any_key_in_level0 = true;
195 |     std::vector<std::size_t> sstable_indices;
196 |     find_overlapping_tables(0, first, last, sstable_indices);
197 |     for (auto i : sstable_indices)
198 |       for (auto& item : *levels_[0][i])
199 |         if (first <= item.key && item.key <= last) {
200 |           any_key_in_level0 = true;
201 |           break;
202 |         }
203 |     if (any_key_in_level0 /*|| num_levels > 2*/) {
204 |       // printf("compact: num_levels=%zu first=%lu last=%lu\n", num_levels,
205 |       //        first, last);
206 |       compact(num_levels, first, last);
207 |     }
208 |   }
209 | 
210 |   // printf("\n");
211 | }
212 | 
213 | void MeshDB::delete_log() {
214 |   // stat_.del(log_bytes_ / 4096 * 4096);
215 |   stat_.del(log_bytes_);
216 |   log_.clear();
217 |   log_bytes_ = 0;
218 | }
219 | 
220 | struct _MeshDBDereferenceComparer {
221 |   bool operator()(const MeshDBItem* a, const MeshDBItem* b) const {
222 |     auto& item_a = *a;
223 |     auto& item_b = *b;
224 |     if (item_a.key < item_b.key)
225 |       return true;
226 |     else if (item_a.key == item_b.key && item_a.version < item_b.version)
227 |       return true;
228 |     return false;
229 |   }
230 | };
231 | 
232 | void MeshDB::sort_items(sstable_t& items, item_ptr_t& out_items) {
233 |   std::size_t count = items.size();
234 |   out_items.clear();
235 |   out_items.reserve(count);
236 |   for (auto& item : items) out_items.push_back(&item);
237 |   std::sort(out_items.begin(), out_items.end(), _MeshDBDereferenceComparer());
238 | }
239 | 
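// Aside: the comparer below intentionally returns "greater than".
// std::make_heap/push_heap/pop_heap maintain a max-heap, so inverting the
// comparison surfaces the smallest (key, version) pair first -- the usual
// min-heap idiom, equivalent in spirit to:
//
//   std::priority_queue<std::size_t, std::vector<std::size_t>,
//                       std::greater<std::size_t>> min_heap;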
240 | struct _MeshDBSSTableComparer {
241 |   const MeshDB::sstables_t& sstables;
242 |   const std::vector<std::size_t>& sstables_pos;
243 | 
244 |   bool operator()(const std::size_t& a, const std::size_t& b) const {
245 |     auto& item_a = (*sstables[a])[sstables_pos[a]];
246 |     auto& item_b = (*sstables[b])[sstables_pos[b]];
247 |     if (item_a.key > item_b.key)
248 |       return true;
249 |     else if (item_a.key == item_b.key && item_a.version > item_b.version)
250 |       return true;
251 |     return false;
252 |   }
253 | };
254 | 
255 | void MeshDB::merge_items(const sstables_t& sstables, item_ptr_t& out_items) {
256 |   std::size_t total_count = 0;
257 |   std::vector<std::size_t> heap;
258 |   std::vector<std::size_t> sstable_pos;
259 |   for (std::size_t i = 0; i < sstables.size(); i++) {
260 |     total_count += sstables[i]->size();
261 |     if (sstables[i]->size() > 0) heap.push_back(i);
262 |     sstable_pos.push_back(0);
263 |   }
264 | 
265 |   out_items.clear();
266 |   out_items.reserve(total_count);
267 | 
268 |   // Since std::make_heap makes a max-heap, we use a comparator with the
269 |   // opposite result.
270 |   _MeshDBSSTableComparer comp{sstables, sstable_pos};
271 | 
272 |   std::make_heap(heap.begin(), heap.end(), comp);
273 |   while (heap.size() > 0) {
274 |     auto sstable_index = heap.front();
275 | 
276 |     std::pop_heap(heap.begin(), heap.end(), comp);
277 |     heap.pop_back();
278 | 
279 |     auto& sstable = sstables[sstable_index];
280 |     // assert(sstable_pos[sstable_index] < sstable->size());
281 |     auto& item = (*sstable)[sstable_pos[sstable_index]++];
282 | 
283 |     out_items.push_back(&item);
284 |     // if (out_items.size() >= 2)
285 |     //   assert(out_items[out_items.size() - 2]->key <=
286 |     //          out_items[out_items.size() - 1]->key);
287 | 
288 |     if (sstable_pos[sstable_index] < sstable->size()) {
289 |       heap.push_back(sstable_index);
290 |       std::push_heap(heap.begin(), heap.end(), comp);
291 |     }
292 |   }
293 | 
294 |   // assert(out_items.size() == total_count);
295 |   // for (std::size_t i = 0; i < sstables.size(); i++)
296 |   //   assert(sstable_pos[i] == sstables[i]->size());
297 | }
298 | 
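// Example of the pass below: deduplicate_items() keeps only the last item of
// each run of equal keys. Because merge_items() emits versions in increasing
// order within a key, the survivor is always the newest version, e.g.
//
//   (k1,v1) (k1,v3) (k2,v2)  ->  (k1,v3) (k2,v2)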
299 | void MeshDB::deduplicate_items(const item_ptr_t& items, item_ptr_t& out_items) {
300 |   std::size_t count = items.size();
301 | 
302 |   out_items.clear();
303 |   if (count == 0) return;
304 |   out_items.reserve(count);
305 | 
306 |   for (std::size_t i = 0; i < count - 1; i++) {
307 |     if (items[i]->key != items[i + 1]->key) out_items.push_back(items[i]);
308 |   }
309 |   out_items.push_back(items[count - 1]);
310 | }
311 | 
312 | void MeshDB::insert_sstable(std::size_t level, sstable_t* sstable) {
313 |   assert(sstable->size() != 0);
314 | 
315 |   // TODO: Use binary search to find the insertion point.
316 |   auto it = levels_[level].begin();
317 |   std::size_t idx = 0;
318 |   while (it != levels_[level].end() &&
319 |          (*it)->front().key <= sstable->front().key) {
320 |     ++it;
321 |     idx++;
322 |   }
323 |   levels_[level].insert(it, sstable);
324 |   // levels_[level].push_back(sstable);
325 | }
326 | 
327 | MeshDB::sstable_t* MeshDB::remove_sstable(std::size_t level, std::size_t idx) {
328 |   sstable_t* t = levels_[level][idx];
329 | 
330 |   for (auto j = idx; j < levels_[level].size() - 1; j++)
331 |     levels_[level][j] = levels_[level][j + 1];
332 |   levels_[level].pop_back();
333 | 
334 |   return t;
335 | }
336 | 
337 | void MeshDB::create_sstables(std::size_t num_levels, const item_ptr_t& items,
338 |                              sstable_locs_t& out_new_sstables) {
339 |   const std::size_t last_level = 1 + num_lifetime_classes - 1;
340 | 
341 |   sstable_t* sstables[num_levels];
342 |   // The current SSTable size in bytes.
343 |   uint64_t sstable_sizes[num_levels];
344 | 
345 |   for (std::size_t i = 0; i < num_levels; i++) {
346 |     sstables[i] = nullptr;
347 |     sstable_sizes[i] = 0;
348 |   }
349 | 
350 |   auto insert_f = [&](std::size_t level) {
351 |     insert_sstable(level, sstables[level]);
352 |     out_new_sstables.push_back(std::make_pair(sstables[level], level));
353 |     level_bytes_[level] += sstable_sizes[level];
354 |     stat_.write(sstable_sizes[level]);
355 |     sstables[level] = nullptr;
356 |     sstable_sizes[level] = 0;
357 |   };
358 | 
359 |   for (auto& item : items) {
360 |     std::size_t level = 1 + lifetime_info_->item_class(item->key);
361 | 
362 |     // uint64_t item_lifetime = lifetime_info_->item_lifetime(item->key);
363 |     // std::size_t item_class;
364 |     // for (item_class = 0; item_class < num_lifetime_classes - 1; item_class++)
365 |     //   if (static_cast<double>(item_lifetime) <=
366 |     //       lifetime_threshold_[item_class])
367 |     //     break;
368 |     // std::size_t level = 1 + item_class;
369 | 
370 |     // std::size_t level = num_levels - 1;
371 | 
372 |     if (level >= num_levels) level = num_levels - 1;
373 | 
374 |     // Deletion is discarded when there are no more levels.
375 |     // TODO: this leaves lots of deletion tombstones if the item's lifetime
376 |     // class is not the last one.
377 |     if (item->deletion && level == last_level) continue;
378 | 
379 |     if (sstables[level]) {
380 |       bool need_new_sstable = false;
381 |       if (sstable_sizes[level] + item->size > params_.sstable_size_threshold) {
382 |         // Stop adding new items if this SSTable becomes too large.
383 |         need_new_sstable = true;
384 |       }
385 | 
386 |       if (need_new_sstable) insert_f(level);
387 |     }
388 | 
389 |     if (!sstables[level]) sstables[level] = new sstable_t();
390 |     sstables[level]->push_back(*item);
391 |     sstable_sizes[level] += item->size;
392 |   }
393 |   for (std::size_t level = 0; level < num_levels; level++) {
394 |     if (sstables[level]) {
395 |       // Add any pending SSTable still under construction.
396 |       insert_f(level);
397 |     }
398 |   }
399 | }
400 | 
401 | void MeshDB::find_overlapping_tables(
402 |     std::size_t level, const MeshDBKey& first, const MeshDBKey& last,
403 |     std::vector<std::size_t>& out_sstable_indices) {
404 |   // assert(level >= 1);
405 |   // assert(level < levels_.size());
406 | 
407 |   // TODO: Use binary search to reduce the search range.
408 | 
409 |   auto& level_tables = levels_[level];
410 |   std::size_t count = level_tables.size();
411 |   out_sstable_indices.clear();
412 | 
413 |   for (std::size_t i = 0; i < count; i++) {
414 |     auto& sstable = *level_tables[i];
415 |     if (!(last < sstable.front().key || sstable.back().key < first))
416 |       out_sstable_indices.push_back(i);
417 |   }
418 | }
419 | 
420 | struct _MeshDBReverseInt {
421 |   bool operator()(const std::size_t& a, const std::size_t& b) const {
422 |     return a > b;
423 |   }
424 | };
425 | 
426 | // struct phash {
427 | //   template <class T1, class T2>
428 | //   std::size_t operator()(const std::pair<T1, T2>& x) const {
429 | //     return std::hash<T1>()(x.first) ^ std::hash<T2>()(x.second);
430 | //   }
431 | // };
432 | 
433 | void MeshDB::compact(std::size_t num_levels, const MeshDBKey& first,
434 |                      const MeshDBKey& last) {
435 |   std::vector<sstable_t*> merge_source;
436 |   std::vector<std::pair<std::size_t, std::size_t>> sstables_to_delete;
437 | 
438 |   // std::unordered_map<std::pair<MeshDBKey, uint64_t>, std::size_t, phash>
439 |   //     org_level;
440 |   std::size_t org_size = 0;
441 | 
442 |   for (std::size_t level = 0; level < num_levels; level++) {
443 |     std::vector<std::size_t> sstable_indices;
444 |     find_overlapping_tables(level, first, last, sstable_indices);
445 | 
446 |     sstable_t* temp_sstable;
447 |     if (level > 0) {
448 |       temp_sstable = new sstable_t();
449 |       merge_source.push_back(temp_sstable);
450 |     }
451 | 
452 |     for (auto& i : sstable_indices) {
453 |       if (level == 0) {
454 |         temp_sstable = new sstable_t();
455 |         merge_source.push_back(temp_sstable);
456 |       }
457 | 
458 |       auto& org_sstable = *levels_[level][i];
459 | 
460 |       uint64_t sstable_size = 0;
461 |       std::size_t item_start = 0;
462 |       while (item_start < org_sstable.size() &&
463 |              org_sstable[item_start].key < first)
464 |         item_start++;
465 | 
466 |       std::size_t item_end = item_start;
467 |       while (item_end < org_sstable.size() &&
468 |              org_sstable[item_end].key <= last) {
469 |         temp_sstable->push_back(org_sstable[item_end]);
470 |         if (level == num_levels - 1) {
471 |           // org_level[std::make_pair(org_sstable[item_end].key,
472 |           //                          org_sstable[item_end].version)] = level;
473 |           org_size += org_sstable[item_end].size;
474 |         }
475 |         sstable_size += org_sstable[item_end].size;
476 |         item_end++;
477 |       }
478 | 
479 |       stat_.read(sstable_size);
480 |       stat_.del(sstable_size);
481 |       level_bytes_[level] -= sstable_size;
482 | 
483 |       org_sstable.erase(
484 |           org_sstable.begin() + static_cast<std::ptrdiff_t>(item_start),
485 |           org_sstable.begin() + static_cast<std::ptrdiff_t>(item_end));
486 |       if (org_sstable.size() == 0) {
487 |         sstables_to_delete.push_back(std::make_pair(level, i));
488 |         delete &org_sstable;
489 |       }
490 |     }
491 |   }
492 | 
493 |   std::reverse(sstables_to_delete.begin(), sstables_to_delete.end());
494 |   for (auto p : sstables_to_delete) remove_sstable(p.first, p.second);
495 | 
496 |   item_ptr_t items;
497 |   merge_items(merge_source, items);
498 | 
499 |   item_ptr_t items2;
500 |   deduplicate_items(items, items2);
501 | 
502 |   // Calculate the mutation rate and modify the compaction weight.
503 |   std::size_t unmodified_size = 0;
504 |   if (org_size != 0) {
505 |     for (auto& item : items2) {
506 |       // Count the bytes of surviving items that came from the last level's
507 |       // merge source (a pointer-range membership test).
508 |       if (item - &merge_source.back()->front() >= 0 &&
509 |           &merge_source.back()->back() - item >= 0)
510 |         unmodified_size += item->size;
511 |     }
512 |   }
513 | 
514 |   // Create new SSTables.
515 |   sstable_locs_t new_sstable_locs;
516 |   create_sstables(num_levels, items2, new_sstable_locs);
517 | 
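  // The mutation-rate trackers updated at the end of this function are
  // exponential moving averages with smoothing factor 0.01: starting from the
  // initial 0.5, a steady observed rate m converges as
  // 0.5 * 0.99^n + m * (1 - 0.99^n) after n compactions, i.e. with a time
  // constant of roughly 100 compactions.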
518 |   // Delete the temporary merge sources.
519 |   for (auto& sstable : merge_source) delete sstable;
520 | 
521 |   // Calculate the mutation rate and modify the compaction weight.
522 |   // std::size_t unmodified_size = 0;
523 |   // for (auto& p : new_sstable_locs) {
524 |   //   if (p.second != num_levels - 1)
525 |   //     continue;
526 |   //   for (auto& item : *p.first) {
527 |   //     if (org_level[std::make_pair(item.key, item.version)] == p.second)
528 |   //       unmodified_size += item.size;
529 |   //   }
530 |   // }
531 |   double mutation_rate;
532 |   if (org_size != 0)
533 |     mutation_rate =
534 |         1. -
535 |         static_cast<double>(unmodified_size) / static_cast<double>(org_size);
536 |   else
537 |     mutation_rate = 0.;
538 |   if (mutation_rate < 0.) mutation_rate = 0.;
539 |   if (mutation_rate > 1.) mutation_rate = 1.;
540 | 
541 |   // printf("num_levels=%zu mutation_rate=%lf\n", num_levels, mutation_rate);
542 |   // if (mutation_rate < params_.target_mutation_rate) {
543 |   //   // Too little mutation; decrease the weight for less frequent compaction.
544 |   //   compaction_weight_[num_levels - 2] /= 1.01;
545 |   // } else {
546 |   //   // Too much mutation; increase the weight for more frequent compaction.
547 |   //   compaction_weight_[num_levels - 2] *= 1.01;
548 |   // }
549 | 
550 |   // double weight_sum = 0.;
551 |   // for (std::size_t i = 0; i < num_lifetime_classes; i++)
552 |   //   weight_sum += compaction_weight_[i];
553 | 
554 |   // if (num_levels < 1 + num_lifetime_classes) {
555 |   //   // if (mutation_rate * compaction_weight_[num_levels - 2] <
556 |   //   //     level_mutation_rate_[num_levels - 1] *
557 |   //   //     compaction_weight_[num_levels - 1])
558 |   //   if (mutation_rate < level_mutation_rate_[num_levels - 1])
559 |   //     // compaction_weight_[num_levels - 2] /= pow(1.01, (1. /
560 |   //     //     (compaction_weight_[num_levels - 2] / weight_sum)));
561 |   //     compaction_weight_[num_levels - 2] /= 1.01;
562 |   //   else
563 |   //     // compaction_weight_[num_levels - 2] *= pow(1.01, (1. /
564 |   //     //     (compaction_weight_[num_levels - 2] / weight_sum)));
565 |   //     compaction_weight_[num_levels - 2] *= 1.01;
566 |   // }
567 | 
568 |   // Normalize the weights.
569 |   double weight_sum = 0.;
570 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
571 |     weight_sum += compaction_weight_[i];
572 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
573 |     compaction_weight_[i] /= weight_sum;
574 | 
575 |   global_mutation_rate_ = global_mutation_rate_ * 0.99 + mutation_rate * 0.01;
576 |   level_mutation_rate_[num_levels - 2] =
577 |       level_mutation_rate_[num_levels - 2] * 0.99 + mutation_rate * 0.01;
578 | }
579 | 
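A minimal sketch of driving the MeshDB interface declared in meshdb.h below (illustrative only: Stat comes from stat.h, and the base MeshDBItemLifetimeInfo places every key in lifetime class 0, so this is not a tuned setup):

    #include "meshdb.h"

    int main() {
      MeshDBParams params;                   // defaults: 4 MiB log, 2 MiB SSTables
      Stat stat;
      MeshDBItemLifetimeInfo lifetime_info;  // default: every key in class 0
      MeshDB store(params, stat, &lifetime_info);

      for (MeshDBKey key = 0; key < 100000; key++)
        store.put(key, /*item_size=*/1000);  // triggers log flushes + compactions
      store.del(42);
      store.force_compact();
      store.print_status();
      return 0;
    }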
--------------------------------------------------------------------------------
/meshdb.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "common.h"
4 | #include "stat.h"
5 | #include <stdio.h>
6 | #include <vector>
7 | 
8 | typedef uint64_t MeshDBKey;
9 | 
10 | struct MeshDBParams {
11 |   // When a log file exceeds this size, a new Level-0 SSTable is created, and a
12 |   // new log file is created.
13 |   uint64_t log_size_threshold;
14 |   // When level 0 ("young") has this many SSTables, all of them are merged
15 |   // into the next level.
16 |   uint64_t level0_sstable_count_threshold;
17 |   // When an SSTable file exceeds this size, a new SSTable is created.
18 |   uint64_t sstable_size_threshold;
19 |   // Adjust the compaction frequency to meet this mutation rate for SSTables
20 |   // in the last level of compaction.
21 |   double target_mutation_rate;
22 | 
23 |   MeshDBParams() {
24 |     log_size_threshold = 4 * 1048576;
25 |     level0_sstable_count_threshold = 4;
26 |     // level0_sstable_count_threshold = 12;
27 |     sstable_size_threshold = 2 * 1048576;
28 |     target_mutation_rate = 0.10;
29 |   }
30 | };
31 | 
32 | struct MeshDBItem {
33 |   MeshDBKey key;
34 |   uint64_t version;
35 |   uint64_t size;
36 |   bool deletion;
37 | };
38 | 
39 | class MeshDBItemLifetimeInfo {
40 |  public:
41 |   virtual ~MeshDBItemLifetimeInfo() {}
42 |   virtual std::size_t item_class(MeshDBKey key) {
43 |     (void)key;
44 |     return 0;
45 |   }
46 |   virtual uint64_t item_lifetime(MeshDBKey key) {
47 |     (void)key;
48 |     return 1;
49 |   }
50 |   virtual uint64_t class_lifetime(std::size_t lifetime_class) {
51 |     (void)lifetime_class;
52 |     return 1;
53 |   }
54 | };
55 | 
56 | // A MeshDB: a multi-level store that groups items by expected lifetime.
57 | class MeshDB {
58 |  public:
59 |   static const std::size_t num_lifetime_classes =
60 |       3;  // TODO: Allow a custom class count.
61 |   // static const std::size_t num_lifetime_classes = 5;  // TODO: Allow a
62 |   //                                                     // custom class count.
63 | 
64 |   MeshDB(const MeshDBParams& params, Stat& stat,
65 |          MeshDBItemLifetimeInfo* lifetime_info);
66 |   ~MeshDB();
67 | 
68 |   // Prints the summary of the store.
69 |   void print_status() const;
70 | 
71 |   // Writes the current items in the store to the file.
72 |   void dump_state(FILE* fp) const;
73 | 
74 |   // Puts a new item in the store.
75 |   void put(MeshDBKey key, uint64_t item_size);
76 | 
77 |   // Deletes an item from the store.
78 |   void del(MeshDBKey key);
79 | 
80 |   // Gets an item from the store.
81 |   uint64_t get(MeshDBKey key);
82 | 
83 |   // Forces compaction until there is no successor SSTable.
84 |   void force_compact();
85 | 
86 |   typedef std::vector<MeshDBItem> sstable_t;
87 |   typedef std::vector<sstable_t*> sstables_t;
88 |   typedef std::vector<std::pair<sstable_t*, std::size_t>> sstable_locs_t;
89 | 
90 |   typedef std::vector<MeshDBItem*> item_ptr_t;
91 | 
92 |  protected:
93 |   // Adds a new item to the log.
94 |   void append_to_log(const MeshDBItem& item);
95 | 
96 |   // Flushes all in-memory data to disk. This effectively creates new level-0
97 |   // SSTables from the memtable.
98 |   void flush_log();
99 | 
100 |   // Deletes the log.
101 |   void delete_log();
102 | 
103 |   // Sorts items.
104 |   void sort_items(sstable_t& items, item_ptr_t& out_items);
105 | 
106 |   // Merges SSTables.
107 |   void merge_items(const sstables_t& sstables, item_ptr_t& out_items);
108 | 
109 |   // Removes duplicate items, keeping the latest version of each key. The
110 |   // items must be sorted by key.
111 |   void deduplicate_items(const item_ptr_t& items, item_ptr_t& out_items);
112 | 
113 |   // Creates new SSTables from the given items.
114 |   void create_sstables(std::size_t num_levels, const item_ptr_t& items,
115 |                        sstable_locs_t& out_new_sstables);
116 | 
117 |   // Finds all overlapping SSTables in the level.
118 |   void find_overlapping_tables(std::size_t level, const MeshDBKey& first,
119 |                                const MeshDBKey& last,
120 |                                std::vector<std::size_t>& out_sstable_indices);
121 | 
122 |   // Performs compaction over the key range with all overlapping SSTables in
123 |   // the first num_levels levels.
124 |   void compact(std::size_t num_levels, const MeshDBKey& first,
125 |                const MeshDBKey& last);
126 | 
127 |   // Inserts a new SSTable into the level.
128 |   void insert_sstable(std::size_t level, sstable_t* sstable);
129 | 
130 |   // Removes an SSTable from the level. This does not release the memory used
131 |   // by the SSTable.
132 |   sstable_t* remove_sstable(std::size_t level, std::size_t idx);
133 | 
134 |   // Writes an item list to the file.
135 |   static void dump_state(FILE* fp, const sstable_t& l);
136 |   static void dump_state(FILE* fp, const MeshDBItem& item);
137 | 
138 |  private:
139 |   MeshDBParams params_;
140 |   Stat& stat_;
141 |   MeshDBItemLifetimeInfo* lifetime_info_;
142 |   sstable_t log_;
143 |   uint64_t log_bytes_;
144 |   sstables_t levels_[1 + num_lifetime_classes];
145 |   uint64_t level_bytes_[1 + num_lifetime_classes];
146 |   uint64_t next_version_;
147 |   uint64_t updates_since_last_compaction_;
148 |   MeshDBKey next_compaction_key_;
149 |   uint64_t compaction_rand_seed_;
150 |   double compaction_weight_[num_lifetime_classes];
151 |   double global_mutation_rate_;
152 |   double level_mutation_rate_[num_lifetime_classes];
153 |   double lifetime_threshold_[num_lifetime_classes - 1];
154 | };
155 | 
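Unlike the LevelDB wrappers earlier, RocksDB's Env hands files out through std::unique_ptr, so each wrapper in rocksdb_impl.cpp below stores a pointer to a heap-allocated unique_ptr holder and deletes that holder in its destructor. A stripped-down sketch of the ownership dance (the File and FileWrapper types are simplified stand-ins, not the actual RocksDB API):

    #include <memory>

    struct File {};  // stand-in for rocksdb::SequentialFile et al.

    struct FileWrapper {
      explicit FileWrapper(std::unique_ptr<File>* t) : target_(t) {}
      ~FileWrapper() { delete target_; }  // frees the holder, and thus the File
      std::unique_ptr<File>* target_;
    };

    // Creation mirrors RocksDBEnv::New*File() below:
    //   auto* holder = new std::unique_ptr<File>();
    //   // ... the wrapped Env fills *holder ...
    //   // on success, a FileWrapper takes over the holder;
    //   // on failure, the holder is deleted instead.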
--------------------------------------------------------------------------------
/rocksdb_impl.cpp:
--------------------------------------------------------------------------------
1 | #include "rocksdb_impl.h"
2 | #pragma GCC diagnostic push
3 | #pragma GCC diagnostic ignored "-Wunused-parameter"
4 | #pragma GCC diagnostic ignored "-Winline"
5 | #include "rocksdb/db.h"
6 | #include "rocksdb/env.h"
7 | #pragma GCC diagnostic pop
8 | #include <fstream>
9 | #include <iterator>
10 | #include <sstream>
11 | #include <string>
12 | #include <string.h>
13 | #include <sys/stat.h>
14 | 
15 | #define OVERRIDE override
16 | // #define OVERRIDE
17 | 
18 | // A wrapper for SequentialFile that forwards the data read information to
19 | // RocksDBImpl.
20 | class RocksDBSequentialFile : public rocksdb::SequentialFile {
21 |  public:
22 |   RocksDBSequentialFile(RocksDBImpl* rocksdb_impl,
23 |                         std::unique_ptr<rocksdb::SequentialFile>* t)
24 |       : rocksdb::SequentialFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
25 | 
26 |   virtual ~RocksDBSequentialFile() OVERRIDE { delete target_; }
27 | 
28 |   virtual rocksdb::Status Read(size_t n, rocksdb::Slice* result,
29 |                                char* scratch) OVERRIDE {
30 |     rocksdb_impl_->Read(n);
31 |     return (*target_)->Read(n, result, scratch);
32 |   }
33 | 
34 |   virtual rocksdb::Status Skip(uint64_t n) OVERRIDE {
35 |     return (*target_)->Skip(n);
36 |   }
37 | 
38 |   virtual rocksdb::Status InvalidateCache(size_t offset,
39 |                                           size_t length) OVERRIDE {
40 |     return (*target_)->InvalidateCache(offset, length);
41 |   }
42 | 
43 |  private:
44 |   class RocksDBImpl* rocksdb_impl_;
45 |   std::unique_ptr<rocksdb::SequentialFile>* target_;
46 | };
47 | 
48 | // A wrapper for RandomAccessFile that forwards the data read information to
49 | // RocksDBImpl.
50 | class RocksDBRandomAccessFile : public rocksdb::RandomAccessFile {
51 |  public:
52 |   RocksDBRandomAccessFile(RocksDBImpl* rocksdb_impl,
53 |                           std::unique_ptr<rocksdb::RandomAccessFile>* t)
54 |       : rocksdb::RandomAccessFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
55 | 
56 |   virtual ~RocksDBRandomAccessFile() OVERRIDE { delete target_; }
57 | 
58 |   virtual rocksdb::Status Read(uint64_t offset, size_t n,
59 |                                rocksdb::Slice* result,
60 |                                char* scratch) const OVERRIDE {
61 |     rocksdb_impl_->Read(n);
62 |     return (*target_)->Read(offset, n, result, scratch);
63 |   }
64 | 
65 |   virtual size_t GetUniqueId(char* id, size_t max_size) const OVERRIDE {
66 |     return (*target_)->GetUniqueId(id, max_size);
67 |   }
68 | 
69 |   virtual void Hint(AccessPattern pattern) OVERRIDE {
70 |     (*target_)->Hint(pattern);
71 |   }
72 | 
73 |   virtual rocksdb::Status InvalidateCache(size_t offset,
74 |                                           size_t length) OVERRIDE {
75 |     return (*target_)->InvalidateCache(offset, length);
76 |   }
77 | 
78 |  private:
79 |   class RocksDBImpl* rocksdb_impl_;
80 |   std::unique_ptr<rocksdb::RandomAccessFile>* target_;
81 | };
82 | 
83 | // A wrapper for WritableFile that forwards the data append information to
84 | // RocksDBImpl.
85 | class RocksDBWritableFile : public rocksdb::WritableFile {
86 |  public:
87 |   RocksDBWritableFile(RocksDBImpl* rocksdb_impl,
88 |                       std::unique_ptr<rocksdb::WritableFile>* t)
89 |       : rocksdb::WritableFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
90 | 
91 |   virtual ~RocksDBWritableFile() OVERRIDE { delete target_; }
92 | 
93 |   virtual rocksdb::Status Append(const rocksdb::Slice& data) OVERRIDE {
94 |     rocksdb_impl_->Append(data.size());
95 |     return (*target_)->Append(data);
96 |   }
97 | 
98 |   virtual rocksdb::Status Close() OVERRIDE { return (*target_)->Close(); }
99 | 
100 |   virtual rocksdb::Status Flush() OVERRIDE { return (*target_)->Flush(); }
101 | 
102 |   virtual rocksdb::Status Sync() OVERRIDE {
103 |     if (rocksdb_impl_->params_.enable_fsync)
104 |       return (*target_)->Sync();
105 |     else {
106 |       // Let's ignore Sync() for faster experiments.
107 |       return rocksdb::Status::OK();
108 |     }
109 |   }
110 | 
111 |   virtual rocksdb::Status Fsync() OVERRIDE {
112 |     if (rocksdb_impl_->params_.enable_fsync)
113 |       return (*target_)->Fsync();
114 |     else {
115 |       // Let's ignore Fsync() for faster experiments.
116 |       return rocksdb::Status::OK();
117 |     }
118 |   }
119 | 
120 |   virtual bool IsSyncThreadSafe() const OVERRIDE {
121 |     return (*target_)->IsSyncThreadSafe();
122 |   }
123 | 
124 |   virtual void SetIOPriority(rocksdb::Env::IOPriority pri) OVERRIDE {
125 |     (*target_)->SetIOPriority(pri);
126 |   }
127 | 
128 |   virtual rocksdb::Env::IOPriority GetIOPriority() OVERRIDE {
129 |     return (*target_)->GetIOPriority();
130 |   }
131 | 
132 |   virtual uint64_t GetFileSize() OVERRIDE { return (*target_)->GetFileSize(); }
133 | 
134 |   virtual void GetPreallocationStatus(size_t* block_size,
135 |                                       size_t* last_allocated_block) OVERRIDE {
136 |     (*target_)->GetPreallocationStatus(block_size, last_allocated_block);
137 |   }
138 | 
139 |   virtual size_t GetUniqueId(char* id, size_t max_size) const OVERRIDE {
140 |     return (*target_)->GetUniqueId(id, max_size);
141 |   }
142 | 
143 |   virtual rocksdb::Status InvalidateCache(size_t offset,
144 |                                           size_t length) OVERRIDE {
145 |     return (*target_)->InvalidateCache(offset, length);
146 |   }
147 | 
148 |  private:
149 |   class RocksDBImpl* rocksdb_impl_;
150 |   std::unique_ptr<rocksdb::WritableFile>* target_;
151 | };
152 | 
153 | class RocksDBDirectory : public rocksdb::Directory {
154 |  public:
155 |   RocksDBDirectory(RocksDBImpl* rocksdb_impl,
156 |                    std::unique_ptr<rocksdb::Directory>* t)
157 |       : rocksdb::Directory(), rocksdb_impl_(rocksdb_impl), target_(t) {}
158 | 
159 |   virtual ~RocksDBDirectory() OVERRIDE { delete target_; }
160 | 
161 |   virtual rocksdb::Status Fsync() OVERRIDE {
162 |     if (rocksdb_impl_->params_.enable_fsync)
163 |       return (*target_)->Fsync();
164 |     else {
165 |       // Let's ignore Fsync() for faster experiments.
166 |       return rocksdb::Status::OK();
167 |     }
168 |   }
169 | 
170 |  private:
171 |   class RocksDBImpl* rocksdb_impl_;
172 |   std::unique_ptr<rocksdb::Directory>* target_;
173 | };
174 | 
175 | // A wrapper for Env that forwards the file deletion information to RocksDBImpl.
176 | class RocksDBEnv : public rocksdb::EnvWrapper {
177 |  public:
178 |   RocksDBEnv(RocksDBImpl* rocksdb_impl)
179 |       : rocksdb::EnvWrapper(rocksdb::Env::Default()),
180 |         rocksdb_impl_(rocksdb_impl) {}
181 | 
182 |   virtual ~RocksDBEnv() OVERRIDE {}
183 | 
184 |   virtual rocksdb::Status NewSequentialFile(
185 |       const std::string& f, std::unique_ptr<rocksdb::SequentialFile>* r,
186 |       const rocksdb::EnvOptions& options) OVERRIDE {
187 |     std::unique_ptr<rocksdb::SequentialFile>* r2 =
188 |         new std::unique_ptr<rocksdb::SequentialFile>();
189 |     rocksdb::Status status = target()->NewSequentialFile(f, r2, options);
190 |     if (*r2 != NULL)
191 |       r->reset(new RocksDBSequentialFile(rocksdb_impl_, r2));
192 |     else
193 |       delete r2;
194 |     return status;
195 |   }
196 | 
197 |   virtual rocksdb::Status NewRandomAccessFile(
198 |       const std::string& f, std::unique_ptr<rocksdb::RandomAccessFile>* r,
199 |       const rocksdb::EnvOptions& options) OVERRIDE {
200 |     std::unique_ptr<rocksdb::RandomAccessFile>* r2 =
201 |         new std::unique_ptr<rocksdb::RandomAccessFile>();
202 |     rocksdb::Status status = target()->NewRandomAccessFile(f, r2, options);
203 |     if (*r2 != NULL)
204 |       r->reset(new RocksDBRandomAccessFile(rocksdb_impl_, r2));
205 |     else
206 |       delete r2;
207 |     return status;
208 |   }
209 | 
210 |   virtual rocksdb::Status NewWritableFile(
211 |       const std::string& f, std::unique_ptr<rocksdb::WritableFile>* r,
212 |       const rocksdb::EnvOptions& options) OVERRIDE {
213 |     std::unique_ptr<rocksdb::WritableFile>* r2 =
214 |         new std::unique_ptr<rocksdb::WritableFile>();
215 |     rocksdb::Status status = target()->NewWritableFile(f, r2, options);
216 |     if (*r2 != NULL)
217 |       r->reset(new RocksDBWritableFile(rocksdb_impl_, r2));
218 |     else
219 |       delete r2;
220 |     return status;
221 |   }
222 | 
223 |   virtual rocksdb::Status NewDirectory(
224 |       const std::string& f, std::unique_ptr<rocksdb::Directory>* r) OVERRIDE {
225 |     std::unique_ptr<rocksdb::Directory>* r2 =
226 |         new std::unique_ptr<rocksdb::Directory>();
227 |     rocksdb::Status status = target()->NewDirectory(f, r2);
228 |     if (*r2 != NULL)
229 |       r->reset(new RocksDBDirectory(rocksdb_impl_, r2));
230 |     else
231 |       delete r2;
232 |     return status;
233 |   }
234 | 
235 |   virtual rocksdb::Status DeleteFile(const std::string& f) OVERRIDE {
236 |     struct stat st;
237 |     memset(&st, 0, sizeof(st));
238 |     // XXX: The file length *might* not be as large as its actual content
239 |     // because the directory metadata can be updated later than the appends.
240 |     int ret = stat(f.c_str(), &st);
241 |     if (ret == 0) rocksdb_impl_->Delete(static_cast<std::size_t>(st.st_size));
242 | 
243 |     return target()->DeleteFile(f);
244 |   }
245 | 
246 |  private:
247 |   class RocksDBImpl* rocksdb_impl_;
248 | };
249 | 
250 | RocksDBImpl::RocksDBImpl(const LevelDBParams& params, std::vector<Stat>& stats)
251 |     : params_(params), stats_(stats) {
252 |   stats_.push_back(Stat());
253 | 
254 |   pthread_mutex_init(&stats_mutex_, NULL);
255 |   read_ = 0;
256 |   appended_ = 0;
257 | 
258 |   // Clean up old files.
259 |   rocksdb::DestroyDB("rocksdb_files", rocksdb::Options());
260 | 
261 |   options_ = new rocksdb::Options();
262 | 
263 |   options_->create_if_missing = true;
264 | 
265 |   // Turn off Snappy.
266 |   options_->compression = rocksdb::CompressionType::kNoCompression;
267 | 
268 |   // Use our Env to gather statistics.
269 |   options_->env = new RocksDBEnv(this);
270 | 
271 |   // Limit the max open file count.
272 |   options_->max_open_files = 900;
273 | 
274 |   // Configure the write buffer size.
275 |   options_->write_buffer_size = params.log_size_threshold;
276 | 
277 |   // Do not overload insert.
278 |   options_->level0_file_num_compaction_trigger = 4;
279 |   options_->level0_slowdown_writes_trigger = 4;
280 |   options_->level0_stop_writes_trigger = 4;
281 | 
283 |   if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSize ||
284 |       params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSizeMT)
285 |     options_->use_leveldb_table_selection = false;
286 |   else if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinear ||
287 |            params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinearMT)
288 |     options_->use_leveldb_table_selection = true;
289 |   else if (params_.compaction_mode ==
290 |            LevelDBCompactionMode::kRocksDBUniversal) {
291 |     options_->use_leveldb_table_selection =
292 |         false;  // This will be ignored anyway.
293 |     options_->compaction_style = rocksdb::kCompactionStyleUniversal;
294 |     // Use a few more level-0 files.
295 |     // options_->level0_file_num_compaction_trigger = 8;
296 |     options_->level0_file_num_compaction_trigger = 12;
297 |     // We have to adjust the maximum level-0 file count because RocksDB gets
298 |     // stuck in a deadlock otherwise.
299 |     options_->level0_slowdown_writes_trigger =
300 |         options_->level0_file_num_compaction_trigger + 2;
301 |     options_->level0_stop_writes_trigger =
302 |         options_->level0_file_num_compaction_trigger + 2;
303 | 
304 |     // Adjust size_ratio to handle skewed workloads gracefully without having to
305 |     // increase the file count much.
306 |     // options.compaction_options_universal.size_ratio = 10;
307 |   } else
308 |     assert(false);
309 | 
310 |   // Use multiple threads if requested.
311 |   if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSizeMT ||
312 |       params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinearMT) {
313 |     // 1 thread is dedicated as a "background flush" thread
314 |     // (DBOptions::IncreaseParallelism() in rocksdb/util/options.cc)
315 |     options_->IncreaseParallelism(4 + 1);
316 |   }
317 | 
318 |   // Turn off checksumming for faster experiments (even though we already
319 |   // disabled crc32c).
320 |   // options_->verify_checksums_in_compaction = false;
321 | 
322 |   // Use custom level sizes from output_sensitivity.txt (tokens per line: tag, #unique keys, theta, log size, a metric (ignored), then per-level sizes).
323 |   if (params_.use_custom_sizes) {
324 |     std::size_t* custom_level_sizes = new std::size_t[20];
325 | 
326 |     std::ifstream ifs("output_sensitivity.txt");
327 |     while (!ifs.eof()) {
328 |       std::string line;
329 |       std::getline(ifs, line);
330 | 
331 |       std::istringstream iss(line);
332 |       std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
333 |                                       std::istream_iterator<std::string>{}};
334 | 
335 |       if (tokens.size() < 5) continue;
336 |       if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" &&
337 |           tokens[0] != "sensitivity_log_size_leveldb_best_sizes")
338 |         continue;
339 |       if (static_cast<uint64_t>(atol(tokens[1].c_str())) !=
340 |           params_.hint_num_unique_keys)
341 |         continue;
342 |       if (atof(tokens[2].c_str()) != params_.hint_theta) continue;
343 |       if (static_cast<uint64_t>(atol(tokens[3].c_str())) !=
344 |           params_.log_size_threshold)
345 |         continue;
346 | 
347 |       options_->custom_level_size_count = tokens.size() - 5 + 1;
348 | 
349 |       custom_level_sizes[0] = 0;
350 |       std::size_t level;
351 |       for (level = 1; level < options_->custom_level_size_count; level++) {
352 |         custom_level_sizes[level] = static_cast<std::size_t>(
353 |             atof(tokens[5 + level - 1].c_str()) * 1000. + 0.5);
354 |         printf("level-%zu: %zu\n", level, custom_level_sizes[level]);
355 |       }
356 |       // Make the last level very large so that it does not spill.
357 |       level--;
358 |       custom_level_sizes[level] = 1000000000000000LU;
359 |       printf("level-%zu: %zu (expanded)\n", level, custom_level_sizes[level]);
360 |       printf("\n");
361 |       break;
362 |     }
363 |     assert(options_->custom_level_size_count != 0);
364 | 
365 |     options_->custom_level_sizes = custom_level_sizes;
366 |   }
367 | 
368 |   rocksdb::Status status = rocksdb::DB::Open(*options_, "rocksdb_files", &db_);
369 |   if (!status.ok()) {
370 |     printf("%s\n", status.ToString().c_str());
371 |     assert(false);
372 |   }
373 | 
374 |   memset(value_buf_, 0, sizeof(value_buf_));
375 | }
376 | 
377 | RocksDBImpl::~RocksDBImpl() {
378 |   delete db_;
379 | 
380 |   delete options_->env;
381 |   if (params_.use_custom_sizes) delete[] options_->custom_level_sizes;
382 |   delete options_;
383 | 
384 |   pthread_mutex_destroy(&stats_mutex_);
385 | }
386 | 
387 | void RocksDBImpl::print_status() const {
388 |   // Force a stats update.
389 |   const_cast<RocksDBImpl*>(this)->Delete(0);
390 | }
391 | 
392 | void RocksDBImpl::dump_state(FILE* fp) const {
393 |   // TODO: Implement.
394 |   (void)fp;
395 | }
396 | 
397 | void RocksDBImpl::put(LevelDBKey key, uint32_t item_size) {
398 |   // LevelDB includes the full SSTable file size when calculating the level
399 |   // size; we subtract the average per-item space overhead in LevelDB so that
400 |   // the average stored size stays close to item_size.  E.g., an item of size
401 |   // 100 with an 8-byte key gets a 100 - 8 - 18 = 74-byte value.
402 |   const uint32_t overhead = 18;
403 | 
404 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
405 |   uint32_t value_size =
406 |       static_cast<uint32_t>(static_cast<std::size_t>(item_size) - sizeof(key)) -
407 |       overhead;
408 |   assert(value_size < sizeof(value_buf_));
409 |   rocksdb::Slice s_value(value_buf_, value_size);
410 | 
411 |   rocksdb::Status status = db_->Put(rocksdb::WriteOptions(), s_key, s_value);
412 |   if (!status.ok()) {
413 |     printf("%s\n", status.ToString().c_str());
414 |     assert(false);
415 |   }
416 | }
417 | 
418 | void RocksDBImpl::del(LevelDBKey key) {
419 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
420 | 
421 |   rocksdb::Status status = db_->Delete(rocksdb::WriteOptions(), s_key);
422 |   if (!status.ok()) {
423 |     printf("%s\n", status.ToString().c_str());
424 |     assert(false);
425 |   }
426 | }
427 | 
428 | uint64_t RocksDBImpl::get(LevelDBKey key) {
429 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
430 |   std::string s_value;
431 |   uint64_t value;
432 | 
433 |   rocksdb::Status status = db_->Get(rocksdb::ReadOptions(), s_key, &s_value);
434 |   if (!status.ok()) {
435 |     printf("%s\n", status.ToString().c_str());
436 |     assert(false);
437 |   }
438 |   assert(s_value.size() >= sizeof(uint64_t));
439 |   value = *reinterpret_cast<const uint64_t*>(s_value.data());
440 |   return value;
441 | }
442 | 
443 | void RocksDBImpl::force_compact() {
444 |   rocksdb::CompactRangeOptions options;
445 |   options.change_level = false;
446 |   options.target_level = -1;
447 |   options.target_path_id = 0;
448 | 
449 |   db_->CompactRange(options, NULL, NULL);
450 | 
451 |   // Force a stat update.
452 |   Delete(0);
453 | }
454 | 
455 | void RocksDBImpl::Read(std::size_t len) { __sync_fetch_and_add(&read_, len); }
456 | 
457 | void RocksDBImpl::Append(std::size_t len) {
458 |   __sync_fetch_and_add(&appended_, len);
459 | }
460 | 
461 | void RocksDBImpl::Delete(std::size_t len) {
462 |   // Drain the atomic read/append counters into the shared Stat object.
463 |   uint64_t read = read_;
464 |   __sync_fetch_and_sub(&read_, read);
465 |   uint64_t appended = appended_;
466 |   __sync_fetch_and_sub(&appended_, appended);
467 | 
468 |   pthread_mutex_lock(&stats_mutex_);
469 |   if (read != 0) stats_.back().read(read);
470 |   if (appended != 0) stats_.back().write(appended);
471 |   if (len != 0) stats_.back().del(len);
472 |   pthread_mutex_unlock(&stats_mutex_);
473 | }
--------------------------------------------------------------------------------
/rocksdb_impl.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "leveldb.h"
4 | #include <vector>
5 | 
6 | namespace rocksdb {
7 | // For forward declaration.
8 | class DB;
9 | class Options;
10 | }
11 | 
12 | // An interface to the RocksDB implementation
13 | class RocksDBImpl {
14 |   friend class RocksDBSequentialFile;
15 |   friend class RocksDBRandomAccessFile;
16 |   friend class RocksDBWritableFile;
17 |   friend class RocksDBDirectory;
18 |   friend class RocksDBEnv;
19 | 
20 |  public:
21 |   RocksDBImpl(const LevelDBParams& params, std::vector<Stat>& stats);
22 |   ~RocksDBImpl();
23 | 
24 |   // Prints the summary of the store.
25 |   void print_status() const;
26 | 
27 |   // Writes the current items in the store to the file.
28 |   void dump_state(FILE* fp) const;
29 | 
30 |   // Puts a new item in the store.
31 |   void put(LevelDBKey key, uint32_t item_size);
32 | 
33 |   // Deletes an item from the store.
34 |   void del(LevelDBKey key);
35 | 
36 |   // Gets an item from the store.
37 |   uint64_t get(LevelDBKey key);
38 | 
39 |   // Forces compaction until no SSTables remain except in the last level.
40 |   void force_compact();
41 | 
42 |  protected:
43 |   void Read(std::size_t len);
44 |   void Append(std::size_t len);
45 |   void Delete(std::size_t len);
46 | 
47 |  private:
48 |   LevelDBParams params_;
49 |   std::vector<Stat>& stats_;
50 | 
51 |   rocksdb::Options* options_;
52 |   rocksdb::DB* db_;
53 | 
54 |   pthread_mutex_t stats_mutex_;
55 |   volatile uint64_t read_;
56 |   volatile uint64_t appended_;
57 | 
58 |   char value_buf_[1024];
59 | };
60 | 
--------------------------------------------------------------------------------
/stat.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "common.h"
4 | 
5 | class Stat {
6 |  public:
7 |   Stat() { reset_all(); }
8 | 
9 |   void reset() {
10 |     read_count_ = 0;
11 |     read_bytes_ = 0;
12 |     write_count_ = 0;
13 |     write_bytes_ = 0;
14 |     delete_count_ = 0;
15 |     delete_bytes_ = 0;
16 |   }
17 | 
18 |   void reset_all() {
19 |     reset();
20 |     current_bytes_ = 0;
21 |   }
22 | 
23 |   void read(uint64_t num_bytes) {
24 |     read_count_++;
25 |     read_bytes_ += static_cast<int64_t>(num_bytes);
26 |   }
27 | 
28 |   void write(uint64_t num_bytes) {
29 |     write_count_++;
30 |     write_bytes_ += static_cast<int64_t>(num_bytes);
31 |     current_bytes_ += static_cast<int64_t>(num_bytes);
32 |   }
33 | 
34 |   void overwrite(uint64_t num_bytes) {
35 |     write_count_++;
36 |     write_bytes_ += static_cast<int64_t>(num_bytes);
37 |   }
38 | 
39 |   void del(uint64_t num_bytes) {
40 |     delete_count_++;
41 |     delete_bytes_ += static_cast<int64_t>(num_bytes);
42 |     current_bytes_ -= static_cast<int64_t>(num_bytes);
43 |   }
44 | 
45 |   int64_t read_count() const { return read_count_; }
46 |   int64_t read_bytes() const { return read_bytes_; }
47 |   int64_t write_count() const { return write_count_; }
48 |   int64_t write_bytes() const { return write_bytes_; }
49 |   int64_t delete_count() const { return delete_count_; }
50 |   int64_t delete_bytes() const { return delete_bytes_; }
51 |   int64_t current_bytes() const { return current_bytes_; }
52 | 
53 |   void print_status() const {
54 |     printf("Read: %ld times, %ld bytes\n", read_count_, read_bytes_);
55 |     printf("Write: %ld times, %ld bytes\n", write_count_, write_bytes_);
56 |     printf("Delete: %ld times, %ld bytes\n", delete_count_, delete_bytes_);
57 |     printf("Current size: %ld bytes\n", current_bytes_);
58 |   }
59 | 
60 |  private:
61 |   int64_t read_count_;
62 |   int64_t read_bytes_;
63 |   int64_t write_count_;
64 |   int64_t write_bytes_;
65 |   int64_t delete_count_;
66 |   int64_t delete_bytes_;
67 |   int64_t current_bytes_;
68 | };
69 | 
--------------------------------------------------------------------------------
/util.cpp:
--------------------------------------------------------------------------------
1 | #include "util.h"
2 | #include <algorithm>
3 | #include <random>
4 | 
5 | template <class T>
6 | void sequence(T n, std::vector<T>& out) {
7 |   out.clear();
8 |   out.reserve(static_cast<std::size_t>(n));
9 |   for (T i = 0; i < n; i++) out.push_back(i);
10 | }
11 | 
12 | template <class T>
13 | void shuffle(std::vector<T>& v) {
14 |   unsigned int seed =
15 |       0;  // std::chrono::system_clock::now().time_since_epoch().count();
16 |   std::shuffle(v.begin(), v.end(), std::default_random_engine(seed));
17 |   // std::size_t count = v.size();
18 |   // for (std::size_t i = 0; i < count; i++) {
19 |   //   std::size_t j = i + (rand() % (count - i));
20 |   //   std::swap(v[i], v[j]);
21 |   // }
22 | }
23 | 
24 | template void sequence(uint64_t n, std::vector<uint64_t>& out);
25 | template void shuffle(std::vector<uint64_t>& v);
26 | template void sequence(uint32_t n, std::vector<uint32_t>& out);
27 | template void shuffle(std::vector<uint32_t>& v);
28 | 
29 | void uniform_pdf(uint64_t n, std::vector<double>& out_pdf) {
30 |   out_pdf.clear();
31 |   out_pdf.reserve(n);
32 |   for (uint64_t i = 0; i < n; i++) out_pdf.push_back(1.);
33 | }
34 | 
35 | void pdf_to_cdf(const std::vector<double>& pdf, std::vector<double>& out_cdf) {
36 |   std::size_t count = pdf.size();
37 |   out_cdf.clear();
38 |   out_cdf.reserve(count);
39 |   double s = 0.;
40 |   for (std::size_t i = 0; i < count; i++) {
41 |     s += pdf[i];
42 |     out_cdf.push_back(s);
43 |   }
44 | }
45 | 
46 | // def sample(cdf, count):
47 | //   """Gets samples from CDF."""
48 | //   r = random.random
49 | //   b = bisect.bisect_left
50 | //   s = cdf[-1]
51 | //   result = [0] * count
52 | //   for i in range(count):
53 | //     v = r() * s
54 | //     k = b(cdf, v)
55 | //     result[i] = k
56 | //   return result
57 | 
--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "common.h"
4 | #include <stdint.h>
5 | #include <vector>
6 | 
7 | // A sequence of increasing numbers.
8 | template <class T>
9 | void sequence(T n, std::vector<T>& out);
10 | 
11 | // In-place shuffling.
12 | template <class T>
13 | void shuffle(std::vector<T>& v);
14 | 
15 | // PDF of the uniform distribution.
16 | void uniform_pdf(uint64_t n, std::vector<double>& out_pdf);
17 | 
18 | // Converts a PDF to a CDF.
19 | void pdf_to_cdf(const std::vector<double>& pdf, std::vector<double>& out_cdf);
20 | 
21 | // Fast random number generators.
22 | static uint32_t fast_rand(uint64_t* state) {
23 |   // The same LCG parameters as Java's java.util.Random.
24 |   *state = (*state * 0x5deece66dUL + 0xbUL) & ((1UL << 48) - 1);
25 |   return (uint32_t)(*state >> (48 - 32));
26 | }
27 | 
28 | static double fast_rand_d(uint64_t* state) {
29 |   *state = (*state * 0x5deece66dUL + 0xbUL) & ((1UL << 48) - 1);
30 |   return (double)*state / (double)((1UL << 48) - 1);
31 | }
--------------------------------------------------------------------------------
/zipf.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "common.h"
4 | #include "util.h"
5 | #include <assert.h>
6 | #include <math.h>
7 | #include <stdio.h>
8 | #include <string.h>
9 | 
10 | struct zipf_gen_state {
11 |   uint64_t n;       // number of items (input)
12 |   double theta;     // skewness (input) in (0, 1); or -1. = sequential, 0. = uniform, >= 40. = always key 0
13 |   double alpha;     // only depends on theta
14 |   double thres;     // only depends on theta
15 |   uint64_t last_n;  // last n used to calculate the following
16 |   double dbl_n;
17 |   double zetan;
18 |   double eta;
19 |   // unsigned short rand_state[3];  // prng state
20 |   uint64_t rand_state;
21 | };
22 | 
23 | static double pow_approx(double a, double b) {
24 |   // from
25 |   // http://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/
26 | 
27 |   // calculate approximation with fraction of the exponent
28 |   int e = (int)b;
29 |   union {
30 |     double d;
31 |     int x[2];
32 |   } u = {a};
33 |   u.x[1] = (int)((b - (double)e) * (double)(u.x[1] - 1072632447) + 1072632447.);
34 |   u.x[0] = 0;
35 | 
36 |   // exponentiation by squaring with the exponent's integer part
37 |   // double r = u.d makes everything much slower, not sure why
38 |   // TODO: use popcount?
39 |   double r = 1.;
40 |   while (e) {
41 |     if (e & 1) r *= a;
42 |     a *= a;
43 |     e >>= 1;
44 |   }
45 | 
46 |   return r * u.d;
47 | }
48 | 
49 | static void zipf_init(struct zipf_gen_state* state, uint64_t n, double theta,
50 |                       uint64_t rand_seed) {
51 |   assert(n > 0);
52 |   if (theta > 0.992 && theta < 1)
53 |     fprintf(stderr, "theta > 0.992 will be inaccurate due to approximation\n");
54 |   if (theta >= 1. && theta < 40.) {
55 |     fprintf(stderr, "theta in [1., 40.) is not supported\n");
56 |     assert(false);
57 |   }
58 |   assert(theta == -1. || (theta >= 0. && theta < 1.) || theta >= 40.);
59 |   assert(rand_seed < (1UL << 48));
60 |   memset(state, 0, sizeof(struct zipf_gen_state));
61 |   state->n = n;
62 |   state->theta = theta;
63 |   if (theta == -1.)
64 |     rand_seed = rand_seed % n;
65 |   else if (theta > 0. && theta < 1.) {
66 |     state->alpha = 1. / (1. - theta);
67 |     state->thres = 1. + pow_approx(0.5, theta);
68 |   } else {
69 |     state->alpha = 0.;  // unused
70 |     state->thres = 0.;  // unused
71 |   }
72 |   state->last_n = 0;
73 |   state->zetan = 0.;
74 |   // state->rand_state[0] = (unsigned short)(rand_seed >> 0);
75 |   // state->rand_state[1] = (unsigned short)(rand_seed >> 16);
76 |   // state->rand_state[2] = (unsigned short)(rand_seed >> 32);
77 |   state->rand_state = rand_seed;
78 | }
79 | 
80 | static void zipf_init_copy(struct zipf_gen_state* state,
81 |                            const struct zipf_gen_state* src_state,
82 |                            uint64_t rand_seed) {
83 |   assert(rand_seed < (1UL << 48));
84 |   memcpy(state, src_state, sizeof(struct zipf_gen_state));
85 |   // state->rand_state[0] = (unsigned short)(rand_seed >> 0);
86 |   // state->rand_state[1] = (unsigned short)(rand_seed >> 16);
87 |   // state->rand_state[2] = (unsigned short)(rand_seed >> 32);
88 |   state->rand_state = rand_seed;
89 | }
90 | 
91 | static void zipf_change_n(struct zipf_gen_state* state, uint64_t n) {
92 |   state->n = n;
93 | }
94 | 
95 | static double zeta(uint64_t last_n, double last_sum, uint64_t n, double theta) {
96 |   if (last_n > n) {
97 |     last_n = 0;
98 |     last_sum = 0.;
99 |   }
100 |   while (last_n < n) {
101 |     last_sum += 1. / pow_approx((double)last_n + 1., theta);
102 |     last_n++;
103 |   }
104 |   return last_sum;
105 | }
106 | 
107 | static uint64_t zipf_next(struct zipf_gen_state* state) {
108 |   if (state->last_n != state->n) {
109 |     if (state->theta > 0. && state->theta < 1.) {
110 |       state->zetan = zeta(state->last_n, state->zetan, state->n, state->theta);
111 |       state->eta = (1. - pow_approx(2. / (double)state->n, 1. - state->theta)) /
112 |                    (1. - zeta(0, 0., 2, state->theta) / state->zetan);
113 |     }
114 |     state->last_n = state->n;
115 |     state->dbl_n = (double)state->n;
116 |   }
117 | 
118 |   if (state->theta == -1.) {
119 |     uint64_t v = state->rand_state;
120 |     if (++state->rand_state >= state->n) state->rand_state = 0;
121 |     return v;
122 |   } else if (state->theta == 0.) {
123 |     double u = fast_rand_d(&state->rand_state);
124 |     return (uint64_t)(state->dbl_n * u);
125 |   } else if (state->theta >= 40.) {
126 |     return 0UL;
127 |   } else {
128 |     // from J. Gray et al. Quickly generating billion-record synthetic
129 |     // databases. In SIGMOD, 1994.
130 | 
131 |     // double u = erand48(state->rand_state);
132 |     double u = fast_rand_d(&state->rand_state);
133 |     double uz = u * state->zetan;
134 |     if (uz < 1.)
135 |       return 0UL;
136 |     else if (uz < state->thres)
137 |       return 1UL;
138 |     else
139 |       return (uint64_t)(state->dbl_n *
140 |                         pow_approx(state->eta * (u - 1.) + 1., state->alpha));
141 |   }
142 | }
143 | 
144 | static double zipf_prob(const struct zipf_gen_state* state, uint64_t i) {
145 |   // This must be called after at least one zipf_next() invocation.
146 |   if (state->theta == -1.)
147 |     return 1.;
148 |   else if (state->theta == 0.)
149 |     return 1.;
150 |   else if (state->theta >= 40.) {
151 |     if (i == 0)
152 |       return 1.;
153 |     else
154 |       return 0.;
155 |   } else {
156 |     return 1. / pow_approx((double)i + 1., state->theta);
157 |   }
158 | }
159 | 
160 | static void test_zipf(double theta) {
161 |   double zetan = 0.;
162 |   const uint64_t n = 1000000UL;
163 |   uint64_t i;
164 | 
165 |   for (i = 0; i < n; i++) zetan += 1. / pow((double)i + 1., theta);
166 | 
167 |   struct zipf_gen_state state;
168 |   if (theta < 1. || theta >= 40.) zipf_init(&state, n, theta, 0);
169 | 
170 |   uint64_t num_key0 = 0;
171 |   const uint64_t num_samples = 10000000UL;
172 |   if (theta < 1. || theta >= 40.) {
173 |     for (i = 0; i < num_samples; i++)
174 |       if (zipf_next(&state) == 0) num_key0++;
175 |   }
176 | 
177 |   printf("theta = %lf; using pow(): %.10lf", theta, 1. / zetan);
178 |   if (theta < 1. || theta >= 40.)
179 |     printf(", using approx-pow(): %.10lf",
180 |            (double)num_key0 / (double)num_samples);
181 |   printf("\n");
182 | }
183 | 
--------------------------------------------------------------------------------
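
Usage note (not part of the repository sources): the sketch below shows how the Zipf generator in zipf.h is typically driven, mirroring test_zipf() above. Call zipf_init() once with the key-space size, a theta outside [1., 40.), and a seed below 2^48, then call zipf_next() per sample. The main() wrapper and the constants are illustrative assumptions, not code from this repo.

#include <stdio.h>
#include "zipf.h"

int main() {
  // 1M keys, theta = 0.99 (skewed; must lie outside [1., 40.)), seed = 1.
  struct zipf_gen_state state;
  zipf_init(&state, 1000000UL, 0.99, 1);

  // Draw 10M samples and count how often the hottest key (key 0) appears.
  const uint64_t num_samples = 10000000UL;
  uint64_t num_key0 = 0;
  for (uint64_t i = 0; i < num_samples; i++)
    if (zipf_next(&state) == 0) num_key0++;

  // For theta in (0, 1), this should approximate 1 / zeta(n, theta).
  printf("measured P(key 0) = %.6lf\n",
         (double)num_key0 / (double)num_samples);
  return 0;
}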