├── .gitignore
├── Analysis.jl
├── Common.jl
├── IntervalModel.jl
├── Main.jl
├── Makefile
├── README.md
├── SizeModel.jl
├── common.h
├── diff_leveldb.patch
├── diff_rocksdb.patch
├── leveldb.cpp
├── leveldb.h
├── leveldb_impl.cpp
├── leveldb_impl.h
├── main.cpp
├── measure_rw.cpp
├── meshdb.cpp
├── meshdb.h
├── rocksdb_impl.cpp
├── rocksdb_impl.h
├── stat.h
├── util.cpp
├── util.h
└── zipf.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | *.d
 3 | main
 4 | measure_rw
 5 | perf.data*
 6 | output.txt
 7 | output_*.txt
 8 | output_*.txt.tmp
 9 | #!output_sensitivity.txt
10 | #!output_leveldb-sim_*.txt
11 | #!output_leveldb-impl_*.txt
12 | #!output_rocksdb-impl_*.txt
13 | #!output_measure_rw.txt
14 | #!output_universal_compaction.txt
15 | .*.sw?
16 | *.pyc
17 | leveldb_files
18 | rocksdb_files
19 | msls_*.tar.bz2
20 | leveldb
21 | rocksdb


--------------------------------------------------------------------------------
/Common.jl:
--------------------------------------------------------------------------------
  1 | module Common
  2 | # using Ipopt
  3 | 
  4 | export Distribution
  5 | 
  6 | export check_validity
  7 | export update_derived_values
  8 | 
  9 | export unique
 10 | export unique_inv
 11 | 
 12 | export unique_avg
 13 | 
 14 | export density
 15 | export density_sum
 16 | export interval_from_density
 17 | 
 18 | export merge
 19 | 
 20 | export ccp
 21 | 
 22 | export load_zipf_compressed
 23 | 
 24 | export geom_mean
 25 | 
 26 | export get_wa
 27 | 
 28 | 
 29 | type Distribution
 30 | 	count::Int64
 31 | 	c::Array{Float64}
 32 | 	p::Array{Float64}
 33 | 
 34 | 	# derived values
 35 | 	p1::Array{Float64}
 36 | 	c_log_p1::Array{Float64}
 37 | end
 38 | 
 39 | function check_validity(X::Distribution)
 40 | 	@assert X.count > 0
 41 | 	@assert length(X.c) == length(X.p)
 42 | 
 43 | 	count = 0.
 44 | 	prob = 0.
 45 | 
 46 | 	for i = 1:length(X.c)
 47 | 		@assert X.c[i] != 0.
 48 | 		@assert X.p[i] != 0.
 49 | 		@assert X.p[i] != 1.
 50 | 
 51 | 		count += X.c[i]
 52 | 		prob += X.c[i] * X.p[i]
 53 | 	end
 54 | 
 55 | 	@assert abs(Float64(X.count) / count - 1.) < 0.001
 56 | 	@assert abs(prob - 1.) < 0.001
 57 | end
 58 | 
 59 | function update_derived_values(X::Distribution)
 60 | 	X.p1 = Array(Float64, length(X.c))
 61 | 	X.c_log_p1 = Array(Float64, length(X.c))
 62 | 	for i = 1:length(X.c)
 63 | 		p1 = 1. - X.p[i]
 64 | 		X.p1[i] = p1
 65 | 		X.c_log_p1[i] = X.c[i] * log(p1)
 66 | 	end
 67 | end
 68 | 
 69 | type UniqueParam
 70 | 	X::Distribution
 71 | 	c::Float64
 72 | end
 73 | 
 74 | function hash(x::UniqueParam)
 75 | 	hash(x.c)
 76 | end
 77 | 
 78 | function isequal(x::UniqueParam, y::UniqueParam)
 79 | 	x.c == y.c && x.X == y.X
 80 | end
 81 | 
 82 | # global unique_memoization = Dict{UniqueParam, Float64}()
 83 | 
 84 | function unique(X::Distribution, c::Float64)
 85 | 	# global unique_memoization::Dict{UniqueParam, Float64}
 86 | 
 87 | 	if c == Inf
 88 | 		return Float64(X.count)
 89 | 	end
 90 | 
 91 | 	@assert c >= 0.
 92 | 
 93 | 	# get!(unique_memoization, UniqueParam(X, c)) do
 94 | 		s = Float64(X.count)
 95 | 		for i = 1:length(X.c)
 96 | 			# s -= X.c[i] * ((1. - X.p[i]) ^ c)
 97 | 			s -= X.c[i] * (X.p1[i] ^ c)
 98 | 		end
 99 | 
100 | 		s
101 | 	# end
102 | end
103 | 
104 | function unique_diff(X::Distribution, c::Float64)
105 |     s = 0.
106 | 	for i = 1:length(X.c)
107 |         # p1 = 1. - X.p[i]
108 |         # s -= X.c[i] * (p1 ^ c) * log(p1)
109 |         s -= X.c_log_p1[i] * (X.p1[i] ^ c)
110 |     end
111 | 
112 |     s
113 | end
114 | 
115 | function unique_int(X::Distribution, c0::Float64, c1::Float64)
116 | 	@assert c0 < c1
117 | 	f = (c) -> begin
118 | 		unique(X, c)
119 | 	end
120 | 	I, E = quadgk(f, c0, c1, maxevals=10)
121 | 	I
122 | end
123 | 
124 | function discrete_sum(f, a::Float64, b::Float64, maxevals::Int64=10)
125 | 	pq = Collections.PriorityQueue()
126 | 	vs = []
127 | 
128 | 	f_a = f(a)
129 | 	push!(vs, (a, f_a))
130 | 	f_b = f(b)
131 | 	push!(vs, (b, f_b))
132 | 	Collections.enqueue!(pq, (a, b, f_a, f_b), -abs((f_a - f_b) * (a - b)))
133 | 	eval = 2
134 | 
135 | 	while eval < maxevals
136 | 		try
137 | 			a, b, f_a, f_b = Collections.dequeue!(pq)
138 | 		catch y
139 | 			if isa(y, BoundsError)
140 | 				break
141 | 			end
142 | 		end
143 | 		# println("eval=", eval, " a=", a, " b=", b, " f_a=", f_a, " f_b=", f_b, " diff=", abs(f_a - f_b))
144 | 		if a + 1. < b
145 | 			m = round((a + b) / 2.)
146 | 			@assert a != m
147 | 			@assert b != m
148 | 			f_m = f(m)
149 | 			push!(vs, (m, f_m))
150 | 			Collections.enqueue!(pq, (a, m, f_a, f_m), -abs((f_a - f_m) * (a - m)))
151 | 			Collections.enqueue!(pq, (m, b, f_m, f_b), -abs((f_m - f_b) * (m - b)))
152 | 			eval += 1
153 | 		end
154 | 	end
155 | 
156 | 	sort!(vs)
157 | 
158 | 	sum = 0.
159 | 	len = length(vs)
160 | 	for i = 2:len
161 | 		sum += vs[i - 1][2] * (vs[i][1] - vs[i - 1][1])
162 | 	end
163 | 	sum += vs[end][2] * 1.
164 | 
165 | 	sum
166 | end
167 | 
168 | function unique_avg(X::Distribution, c0::Float64, c1::Float64)
169 | 	# remove a negative range that are not valid for unique(); quadgk() can emit DomainError otherwise
170 | 	if c0 < 0.
171 | 		c0 = 0.
172 | 	end
173 | 	if c1 < 0.
174 | 		c1 = 0.
175 | 	end
176 | 
177 | 	if c0 > c1
178 | 		0.
179 | 	elseif c0 == c1
180 | 		unique(X, c0)
181 | 	else
182 | 		# if c1 / c0 < 100.
183 | 		#	sum = 0.
184 | 		#	count = 0
185 | 		#	for i = 1:int64(ceil(c1 / c0))
186 | 		#		sum += unique(X, c0 * Float64(i))
187 | 		#		count += 1
188 | 		#	end
189 | 		#	sum / count
190 | 		#	# discrete_sum(f, start_step, end_step) / (end_step - start_step + 1.)
191 | 		# else
192 | 			unique_int(X, c0, c1) / (c1 - c0)
193 | 		# end
194 | 
195 | 		# start_step = 1.
196 | 		# end_step = c1 / c0
197 | 		# f = (step) -> begin
198 | 		#	unique(X, c0 * step)
199 | 		# end
200 | 		# discrete_sum(f, start_step, end_step) / (end_step - start_step + 1.)
201 | 	end
202 | end
203 | 
204 | 
205 | type UniqueInvParam
206 | 	X::Distribution
207 | 	u::Float64
208 | end
209 | 
210 | function hash(x::UniqueInvParam)
211 | 	hash(x.u)
212 | end
213 | 
214 | function isequal(x::UniqueInvParam, y::UniqueInvParam)
215 | 	x.u == y.u && x.X == y.X
216 | end
217 | 
218 | # global unique_inv_memoization = Dict{UniqueInvParam, Float64}()
219 | 
220 | function unique_inv(X::Distribution, u::Float64)
221 | 	unique_inv_nt(X, u)
222 | 	# unique_inv_ipopt(X, u)
223 | end
224 | 
225 | function unique_inv_nt(X::Distribution, u::Float64)
226 |     # Newton's method
227 | 	# global unique_inv_memoization::Dict{UniqueInvParam, Float64}
228 | 
229 | 	if u >= Float64(X.count) * (1 - 0.000001)
230 | 		return Inf
231 | 	end
232 | 
233 | 	# get!(unique_inv_memoization, UniqueInvParam(X, u)) do
234 | 	    # take u as the initial c
235 | 	    c = u
236 | 	    for count = 1:100
237 | 	        u1 = unique(X, c)
238 | 	        if abs(u1 / u - 1.) < 0.001
239 | 	            break
240 | 	        end
241 | 	        c -= (u1 - u) / unique_diff(X, c)
242 | 
243 | 			if c < 0.
244 | 				c = 0.
245 | 			end
246 | 	    end
247 | 
248 | 	    c
249 | 	# end
250 | end
251 | 
252 | function unique_inv_ipopt(X::Distribution, u::Float64)
253 | 	# inaccurate
254 | 
255 | 	# if u >= Float64(X.count) * (1 - 0.000001)
256 | 	#	return Inf
257 | 	# end
258 | 
259 | 	eval_f = (x) -> begin
260 | 		abs(unique(X, x[1]) - u) / u
261 | 	end
262 | 
263 | 	eval_grad_f = (x, grad_f) -> begin
264 | 		# grad_f[1] = unique_diff(X, x[1])
265 | 		diff = x[1] * 0.001
266 | 		grad_f[1] = (abs(unique(X, x[1] + diff) - u) - abs(unique(X, x[1]) - u)) / u / diff
267 | 	end
268 | 
269 | 	eval_g = (x, g) -> begin
270 | 		# g[1] = x[1]
271 | 	end
272 | 
273 | 	eval_jac_g = (x, mode, rows, cols, values) -> begin
274 | 		# if mode == :Structure
275 | 		#	rows[1] = 1
276 | 		#	cols[1] = 1
277 | 		# else
278 | 		#	values[1] = 1.
279 | 		# end
280 | 	end
281 | 
282 | 	v_L = [1.]
283 | 	v_U = [Float64(X.c) ^ 2.]
284 | 
285 | 	# g_L = [1.]
286 | 	# # # g_U = [2.e19]
287 | 	# g_U = [Float64(X.c) ^ 2.]
288 | 	g_L = Array(Float64, 0)
289 | 	g_U = Array(Float64, 0)
290 | 
291 | 	prob = createProblem(1, v_L, v_U,
292 | 						 0, g_L, g_U,
293 | 						 0, 0,
294 | 						 eval_f, eval_g, eval_grad_f, eval_jac_g)
295 | 
296 | 	addOption(prob, "hessian_approximation", "limited-memory")
297 | 
298 | 	# addOption(prob, "tol", 0.1)
299 | 
300 | 	addOption(prob, "print_level", 2);
301 | 
302 | 	prob.x = [u]
303 | 	status = solveProblem(prob)
304 | 	# ret = Ipopt.ApplicationReturnStatus[status]
305 | 	# obj_val = prob.obj_val
306 | 	# println("$obj_val in unique_inv (returned $ret)")
307 | 
308 | 	prob.x[1]
309 | end
310 | 
311 | 
312 | function density(X::Distribution, interval::Float64, d::Float64)
313 | 	n = X.count
314 | 	v = unique(X, d / n * interval) / n
315 | 	#println(v)
316 | 	v
317 | end
318 | 
319 | function density_sum(X::Distribution, interval::Float64)
320 | 	@assert interval >= 0.
321 | 
322 | 	n = X.count
323 | 
324 | 	# using integration
325 | 	f = (d) -> begin
326 | 		v = density(X, interval, d)
327 | 		@assert !isnan(v)
328 | 		v
329 | 	end
330 | 	#I, E = quadgk(f, 1., n, maxevals=10)
331 | 	I, E = quadgk(f, 0., n - 1., maxevals=10)
332 | 	I
333 | 
334 | 	# using a geometric sum of unique() - this is fast but has a precision issue with large n due to the use of close-to-zero divisions
335 | 	# s = Float64(n)
336 | 	# for i = 1:length(X.c)
337 | 	#	s -= X.c[i] * (1. - (X.p1[i] ^ interval)) / (1. - (X.p1[i] ^ (interval / n))) / n
338 | 	# end
339 | 	# s
340 | end
341 | 
342 | function interval_from_density(X::Distribution, u::Float64)
343 | 	# fix up an invalid u that can be created by the solver
344 | 	u = min(u, float(X.count))
345 | 
346 | 	# unique_inv() * 2 is usually close to the solution
347 | 	c = unique_inv(X, u) * 2.
348 | 	for count = 1:100
349 | 	    u1 = density_sum(X, c)
350 | 	    if abs(u1 / u - 1.) < 0.001
351 | 	        break
352 | 	    end
353 | 	    diff = (u1 - density_sum(X, c * 1.01)) / (c - c * 1.01)
354 | 		if isnan(diff)
355 | 			println(diff, " ", u1, " ", density_sum(X, c * 1.1), " ", c)
356 | 			@assert false
357 | 		end
358 | 	    c -= (u1 - u) / diff
359 | 
360 | 		if c < 0.
361 | 			c = 0.
362 | 		end
363 | 	end
364 | 
365 | 	c
366 | end
367 | 
368 | 
369 | function merge(X::Distribution, n1::Float64, n2::Float64)
370 |     c = unique_inv(X, n1) + unique_inv(X, n2)
371 |     unique(X, c)
372 | end
373 | 
374 | 
375 | function ccp_subset_sum_choose(X::Distribution, q::Int64, pos::Int64, min::Int64, p_sum::Float64)
376 | 	if pos > q
377 | 		return 1. / (1. - p_sum)
378 | 	end
379 | 	m = X.count
380 | 	s = 0.
381 | 	for i = min:m
382 | 		s += ccp_subset_sum_choose(X, q, pos + 1, i + 1, p_sum + X.p[i])
383 | 	end
384 | 
385 | 	s
386 | end
387 | 
388 | function ccp_subset_sum(X::Distribution, q::Int64)
389 | 	ccp_subset_sum_choose(X, q, 1, 1, 0.)
390 | end
391 | 
392 | function ccp(X::Distribution, j::Int64)
393 | 	# Coupon collector's problem; expected time to collect j coupons whose distribution is X
394 | 	# this is quite slow for large X (e.g., > 30)
395 | 
396 | 	m = X.count
397 | 	for i = 1:m
398 | 		# ccp_subset_sum() cannot handle non-1 cardinality
399 | 		@assert X.c[i] == 1.
400 | 	end
401 | 
402 | 	t = 0.
403 | 	for q = 0:(j - 1)
404 | 		t += Float64((-1) ^ (j - 1 - q) * binomial(m - q - 1, m - j)) * ccp_subset_sum(X, q)
405 | 	end
406 | 
407 | 	t
408 | end
409 | 
410 | function zipf(count::Int64, s::Float64)
411 | 	X = Distribution(count, Array(Float64, count), Array(Float64, count), Array(Float64, 0), Array(Float64, 0))
412 | 	p_sum = 0.
413 | 	for i = 1:count
414 | 		if s == 0.
415 | 			p = 1.
416 | 		elseif s == 1.
417 | 			p = 1. / (Float64(i))
418 | 		else
419 | 			p = 1. / (Float64(i) ^ s)
420 | 		end
421 | 		p_sum += p
422 | 		X.c[i] = 1.
423 | 		X.p[i] = p
424 | 	end
425 | 	X.p /= p_sum
426 | 	check_validity(X)
427 | 
428 | 	X
429 | end
430 | 
431 | function zipf_compressed(count::Int64, s::Float64, rel_diff::Float64)
432 | 	X = Distribution(count, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0))
433 | 
434 | 	p_denom = 0.
435 | 
436 | 	if s == 0.
437 | 		p = 1.
438 | 	elseif s == 1.
439 | 		p = 1. / (Float64(count + 1 - 1))
440 | 	else
441 | 		p = 1. / (Float64(count + 1 - 1) ^ s)
442 | 	end
443 | 	c = 1.
444 | 	p_denom += p
445 | 	min_p = p
446 | 	c_sum = c
447 | 	p_sum = c * p
448 | 	for i = 2:count
449 | 		if s == 0.
450 | 			p = 1.
451 | 		elseif s == 1.
452 | 			p = 1. / (Float64(count + 1 - i))
453 | 		else
454 | 			p = 1. / (Float64(count + 1 - i) ^ s)
455 | 		end
456 | 		c = 1.
457 | 		p_denom += p
458 | 		@assert min_p <= p
459 | 		if p / min_p - 1. <= rel_diff
460 | 			c_sum += c
461 | 			p_sum += c * p
462 | 		else
463 | 			push!(X.c, c_sum)
464 | 			push!(X.p, p_sum / c_sum)
465 | 			min_p = p
466 | 			c_sum = c
467 | 			p_sum = c * p
468 | 		end
469 | 	end
470 | 	push!(X.c, c_sum)
471 | 	push!(X.p, p_sum / c_sum)
472 | 	X.p /= p_denom
473 | 	check_validity(X)
474 | 
475 | 	X
476 | end
477 | 
478 | function load_zipf_compressed(count::Int64, s::Float64, rel_diff::Float64)
479 | 	filename = string("data/zipf_", count, "_", s, "_", rel_diff, ".dat")
480 | 
481 | 	X = Distribution(0, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0))
482 | 	try
483 | 		f = open(filename, "r")
484 | 		X.count = deserialize(f)
485 | 		X.c = deserialize(f)
486 | 		X.p = deserialize(f)
487 | 		close(f)
488 | 	catch
489 | 		println("creating $filename")
490 | 		X = zipf_compressed(count, s, rel_diff)
491 | 		f = open(filename, "w")
492 | 		serialize(f, X.count)
493 | 		serialize(f, X.c)
494 | 		serialize(f, X.p)
495 | 		close(f)
496 | 	end
497 | 	X
498 | end
499 | 
500 | 
501 | function compress(X::Distribution, rel_diff::Float64)
502 | 	new_X = Distribution(X.count, Array(Float64, 0), Array(Float64, 0), Array(Float64, 0), Array(Float64, 0))
503 | 
504 | 	perm = sortperm(X.p)
505 | 
506 | 	p = X.p[perm[1]]
507 | 	c = X.c[perm[1]]
508 | 	min_p = p
509 | 	c_sum = c
510 | 	p_sum = c * p
511 | 	for idx in perm[2:end]
512 | 		p = X.p[idx]
513 | 		c = X.c[idx]
514 | 		@assert min_p <= p
515 | 		if p / min_p - 1. <= rel_diff
516 | 			c_sum += c
517 | 			p_sum += c * p
518 | 		else
519 | 			push!(new_X.c, c_sum)
520 | 			push!(new_X.p, p_sum / c_sum)
521 | 			min_p = p
522 | 			c_sum = c
523 | 			p_sum = c * p
524 | 		end
525 | 	end
526 | 	push!(new_X.c, c_sum)
527 | 	push!(new_X.p, p_sum / c_sum)
528 | 	check_validity(new_X)
529 | 
530 | 	new_X
531 | end
532 | 
533 | function geom_mean(A::Array{Float64})
534 | 	s = 0.
535 | 	for a in A
536 | 		s += 1. / a
537 | 	end
538 | 	1. / s
539 | end
540 | 
541 | function get_wa(wa_r_factor::Float64, t)
542 | 	wa_r = t[1]
543 | 	wa_w = t[2]
544 | 	return sum(wa_w) + wa_r_factor * sum(wa_r)
545 | end
546 | 
547 | end
548 | 


--------------------------------------------------------------------------------
/IntervalModel.jl:
--------------------------------------------------------------------------------
  1 | module IntervalModel
  2 | 
  3 | using Common
  4 | # using NLopt
  5 | using Ipopt
  6 | 
  7 | function init_intervals(log_size::Float64, l0_count::Float64, level_count::Int64)
  8 | 	interval = Array(Float64, 0)
  9 | 	for i = 1:level_count
 10 | 		# push!(interval, log_size * l0_count)
 11 | 		push!(interval, log_size * l0_count * (10. ^ Float64(i - 1)))
 12 | 	end
 13 | 	interval
 14 | end
 15 | 
 16 | function calculate_wa_twolevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64})
 17 | 	# mem->log
 18 | 	wa_r[1] = 0.
 19 | 	wa_w[1] = 1.
 20 | 
 21 | 	# log->0
 22 | 	wa_r[2] = 0.
 23 | 	wa_w[2] = Common.unique(X, log_size) / log_size
 24 | 
 25 | 	# ## amortized, full destination level
 26 | 	# # 0->1, 1->2, ...
 27 | 	# for i in 1:(length(intervals) - 1)
 28 | 	#	wa[2 + i] = Common.unique(X, intervals[i] + intervals[i + 1]) / intervals[i]
 29 | 	# end
 30 | 	# wa[2 + length(intervals)] = Float64(X.count) / intervals[end]
 31 | 
 32 | 	# ## amortized, compact entire level
 33 | 	# # 0->1, 1->2, ...
 34 | 	# interval = 0.
 35 | 	# next_interval = intervals[1]
 36 | 	# for i in 1:(length(intervals) - 1)
 37 | 	#	interval = interval * 0.5 + next_interval
 38 | 	#	next_interval = intervals[i + 1]
 39 | 	#	wa[2 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval
 40 | 	# end
 41 | 	# interval = interval * 0.5 + next_interval
 42 | 	# wa[2 + length(intervals)] = Float64(X.count) / interval
 43 | 
 44 | 	## deamortized, compact each sstable in a round-robin way
 45 | 	# 0->1, 1->2, ...
 46 | 	interval = 0.
 47 | 	next_interval = intervals[1]
 48 | 	for i in 1:(length(intervals) - 1)
 49 | 		if i == 1
 50 | 			# 0->1 compaction is usually a whole level
 51 | 			# do not use interval_from_density() and adding extra unique() to WA that are caused by using small tables
 52 | 			interval = next_interval
 53 | 			next_interval = intervals[i + 1]
 54 | 			wa_r[2 + i] = (Common.unique(X, log_size) * l0_count + Common.unique(X, next_interval)) / interval
 55 | 			wa_w[2 + i] = Common.unique(X, interval + next_interval) / interval
 56 | 		else
 57 | 			interval = interval + interval_from_density(X, Common.unique(X, next_interval))
 58 | 			next_interval = intervals[i + 1]
 59 | 			# using additional unique(); see SizeMode.jl for details
 60 | 			wa_r[2 + i] = (Common.unique(X, interval) + Common.unique(X, next_interval) + Common.unique(X, interval) * 1.) / interval
 61 | 			wa_w[2 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval
 62 | 		end
 63 | 	end
 64 | 	interval = interval + interval_from_density(X, Common.unique(X, next_interval))
 65 | 	# using additional unique(); see SizeMode.jl for details
 66 | 	wa_r[2 + length(intervals)] = (Common.unique(X, interval) + Float64(X.count) + Common.unique(X, interval) * 1.) / interval
 67 | 	wa_w[2 + length(intervals)] = (Float64(X.count) + Common.unique(X, interval) * 1.) / interval
 68 | 
 69 | 	wa_r, wa_w
 70 | end
 71 | 
 72 | function calculate_wa_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64})
 73 | 	wa_r = Array(Float64, 2 + length(intervals))
 74 | 	wa_w = Array(Float64, 2 + length(intervals))
 75 | 	calculate_wa_twolevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r, wa_w)
 76 | end
 77 | 
 78 | # function calculate_wa_twolevel_ratios(X::Distribution, log_size::Float64, l0_count::Float64, interval_ratios::Array{Float64})
 79 | #	current_interval = interval_ratios[1]
 80 | #	intervals = interval_ratios * (log_size * l0_count / current_interval)
 81 | #	return calculate_wa_twolevel(X, log_size, l0_count, intervals)
 82 | # end
 83 | 
 84 | function calculate_sizes_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64})
 85 | 	sizes = Array(Float64, length(intervals))
 86 | 
 87 | 	for i in 1:(length(intervals) - 1)
 88 | 		sizes[i] = Common.unique(X, intervals[i + 1])
 89 | 	end
 90 | 	sizes[length(intervals)] = Float64(X.count)
 91 | 
 92 | 	sizes
 93 | end
 94 | 
 95 | function calculate_wa_multilevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64})
 96 | 	# TODO: wa_r
 97 | 
 98 | 	# mem->log
 99 | 	wa_r[1] = 0.
100 | 	wa_w[1] = 1.
101 | 
102 | 	# log->0
103 | 	wa_r[2] = 0.
104 | 	wa_w[2] = Common.unique(X, log_size) / log_size
105 | 
106 | 	# ## amortized, full destination level
107 | 	# # 0->1, 1->2, ...
108 | 	# for i in 1:(length(intervals) - 1)
109 | 	#	# level-0...1 size, level-0...2 size, ...
110 | 	#	level_size = Common.unique(X, geom_mean(intervals[(i + 1):end]))
111 | 	#	wa[2 + i] = level_size / intervals[i]
112 | 	# end
113 | 	# wa[2 + length(intervals)] = Float64(X.count) / intervals[end]
114 | 
115 | 	## amortized, compact entire level (TODO: do we need to modify interval to consider "0.5" factor?)
116 | 	# 0->1, 1->2, ...
117 | 	# interval = 0.
118 | 	# next_interval = geom_mean(intervals)
119 | 	for i in 1:(length(intervals) - 1)
120 | 		wa_r[2 + i] = 0.
121 | 		wa_w[2 + i] = unique_avg(X, geom_mean(intervals[i:end]), geom_mean(intervals[i:end]) * 0.5 + geom_mean(intervals[(i + 1):end])) / intervals[i]
122 | 		# interval = interval * 0.5 + next_interval
123 | 		# next_interval = geom_mean(intervals[(i + 1):end])
124 | 		# wa[2 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval
125 | 	end
126 | 	wa_r[2 + length(intervals)] = 0.
127 | 	wa_w[2 + length(intervals)] = Float64(X.count) / intervals[end]
128 | 
129 | 	wa_r, wa_w
130 | end
131 | 
132 | function calculate_wa_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64})
133 | 	wa_r = Array(Float64, 2 + length(intervals))
134 | 	wa_w = Array(Float64, 2 + length(intervals))
135 | 	calculate_wa_multilevel!(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r, wa_w)
136 | end
137 | 
138 | # function calculate_wa_multilevel_ratios(X::Distribution, log_size::Float64, l0_count::Float64, interval_ratios::Array{Float64})
139 | #	current_interval = geom_mean(interval_ratios)
140 | #	intervals = interval_ratios * (log_size * l0_count / current_interval)
141 | #	return calculate_wa_multilevel(X, log_size, l0_count, intervals)
142 | # end
143 | 
144 | function calculate_sizes_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64})
145 | 	sizes = Array(Float64, length(intervals))
146 | 
147 | 	for i in 1:(length(intervals) - 1)
148 | 		sizes[i] = Common.unique(X, geom_mean(intervals[i + 1:end]))
149 | 	end
150 | 	sizes[length(intervals)] = Float64(X.count)
151 | 
152 | 	sizes
153 | end
154 | 
155 | function optimize_wa_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, init_intervals::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64)
156 | 	n = X.count
157 | 	level_count = length(init_intervals)
158 | 
159 | 	# v2 = Array(Float64, level_count)
160 | 	# v2[1] = log_size * l0_counum
161 | 
162 | 	# count = 0
163 | 	# wa_r = Array(Float64, 2 + level_count)
164 | 	# wa_w = Array(Float64, 2 + level_count)
165 | 	# f = (v, grad) -> begin
166 | 	#	count += 1
167 | 	#	v2[2:level_count] = v
168 | 	#	get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w))
169 | 	# end
170 | 
171 | 	# v = init_intervals[2:end]
172 | 
173 | 	# opt = Opt(:LN_COBYLA, level_count - 1)
174 | 	# min_objective!(opt, f)
175 | 	# # inequality_constraint!(opt, (v, grad) -> log_size * l0_count - v[1])		# <= 0
176 | 	# for i = 1:(level_count - 2)
177 | 	#	inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1])		# <= 0
178 | 	# end
179 | 	# ftol_abs!(opt, ftol)
180 | 	# maxtime!(opt, max_time)
181 | 	# @time (minf, minx, ret) = optimize(opt, v)
182 | 	# println("got $minf at $minx after $count iterations (returned $ret)")
183 | 
184 | 	# cat(1, [log_size * l0_count], minx)
185 | 
186 | 	#######################
187 | 
188 | 	v2 = Array(Float64, level_count)
189 | 	v2[1] = log_size * l0_count
190 | 
191 | 	count = 0
192 | 	wa_r = Array(Float64, 2 + level_count)
193 | 	wa_w = Array(Float64, 2 + level_count)
194 | 
195 | 	eval_f = (v) -> begin
196 | 		count += 1
197 | 		v2[2:level_count] = v
198 | 		get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w))
199 | 	end
200 | 
201 | 	eval_grad_f = (v, grad_f) -> begin
202 | 		v2[2:level_count] = v
203 | 		y = get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w))
204 | 		for i = 2:level_count
205 | 			diff = max(v2[i] * 0.001, 1.)
206 | 			org = v2[i]
207 | 			v2[i] += diff
208 | 			grad_f[i - 1] = (get_wa(wa_r_factor, calculate_wa_twolevel!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff
209 | 			v2[i] = org
210 | 		end
211 | 	end
212 | 
213 | 	eval_g = (v, g) -> begin
214 | 		for i = 1:(level_count - 2)
215 | 			g[i] = v[i] - v[i + 1]
216 | 		end
217 | 	end
218 | 
219 | 	# level i's interval - level i+1's interval <= 0
220 | 	eval_jac_g = (v, mode, rows, cols, values) -> begin
221 | 		if mode == :Structure
222 | 			c = 1
223 | 			for i = 1:level_count - 2
224 | 				rows[c] = i
225 | 				cols[c] = i
226 | 				c += 1
227 | 				rows[c] = i
228 | 				cols[c] = i + 1
229 | 				c += 1
230 | 			end
231 | 		else
232 | 			c = 1
233 | 			for i = 1:level_count - 2
234 | 				values[c] = 1.
235 | 				c += 1
236 | 				values[c] = -1.
237 | 				c += 1
238 | 			end
239 | 		end
240 | 	end
241 | 
242 | 	v_L = [log_size * l0_count for i = 1:level_count - 1]
243 | 	v_U = [2.e19 for i = 1:level_count - 1]
244 | 
245 | 	g_L = [-2.e19 for i = 1:level_count - 2]
246 | 	g_U = [0. for i = 1:level_count - 2]
247 | 
248 | 	prob = createProblem(level_count - 1, v_L, v_U,
249 | 						 level_count - 2, g_L, g_U,
250 | 						 (level_count - 2) * 2, 0,
251 | 						 eval_f, eval_g, eval_grad_f, eval_jac_g)
252 | 
253 | 	addOption(prob, "hessian_approximation", "limited-memory")
254 | 
255 | 	addOption(prob, "tol", ftol)
256 | 	addOption(prob, "max_cpu_time", max_time)
257 | 	addOption(prob, "acceptable_iter", 1000)
258 | 
259 | 	addOption(prob, "print_level", 2)
260 | 
261 | 	prob.x = init_intervals[2:end]
262 | 
263 | 	@time status = solveProblem(prob)
264 | 
265 | 	ret = Ipopt.ApplicationReturnStatus[status]
266 | 	minf = prob.obj_val
267 | 	minx = prob.x
268 | 	println("got $minf at $minx after $count iterations (returned $ret)")
269 | 
270 | 	cat(1, [log_size * l0_count], minx)
271 | end
272 | 
273 | function optimize_wa_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, init_intervals::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64)
274 | 	n = X.count
275 | 	level_count = length(init_intervals)
276 | 
277 | 	# v2 = Array(Float64, level_count)
278 | 
279 | 	# count = 0
280 | 	# wa_r = Array(Float64, 2 + level_count)
281 | 	# wa_w = Array(Float64, 2 + level_count)
282 | 	# f = (v, grad) -> begin
283 | 	#	count += 1
284 | 	#	# we need to make geom_mean(cat(1, [X], v)) = log_size * l0_count
285 | 	#	# 1/X + ..  = 1 / (log_size * l0_count)
286 | 	#	# 1/X = 1 / (log_size * l0_count) - ...
287 | 	#	# X = 1 / (1 / (log_size * l0_count) - ...)
288 | 	#	#   = geom_mean(cat(1, [log_size * l0_count], -v))
289 | 
290 | 	#	# v2[1] = geom_mean(cat(1, [log_size * l0_count], -v))
291 | 
292 | 	#	v2[1] = -(log_size * l0_count)
293 | 	#	v2[2:level_count] = v
294 | 	#	v2[1] = -geom_mean(v2)
295 | 	#	get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w))
296 | 	# end
297 | 
298 | 	# v = init_intervals[2:end]
299 | 
300 | 	# opt = Opt(:LN_COBYLA, level_count - 1)
301 | 	# min_objective!(opt, f)
302 | 	# for i = 1:(level_count - 2)
303 | 	#	inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1])		# <= 0
304 | 	# end
305 | 	# ftol_abs!(opt, ftol)
306 | 	# maxtime!(opt, max_time)
307 | 	# @time (minf, minx, ret) = optimize(opt, v)
308 | 	# println("got $minf at $minx after $count iterations (returned $ret)")
309 | 
310 | 	# x = geom_mean(cat(1, [log_size * l0_count], -minx))
311 | 	# cat(1, [x], minx)
312 | 
313 | 	#######################
314 | 
315 | 	v2 = Array(Float64, level_count)
316 | 	v2[1] = log_size * l0_count
317 | 
318 | 	count = 0
319 | 	wa_r = Array(Float64, 2 + level_count)
320 | 	wa_w = Array(Float64, 2 + level_count)
321 | 
322 | 	eval_f = (v) -> begin
323 | 		count += 1
324 | 		v2[1] = -(log_size * l0_count)
325 | 		v2[2:level_count] = v
326 | 		v2[1] = -geom_mean(v2)
327 | 		# note that v2[1] can become negative accidentally, which is not valid for unique()
328 | 		get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w))
329 | 	end
330 | 
331 | 	eval_grad_f = (v, grad_f) -> begin
332 | 		v2[1] = -(log_size * l0_count)
333 | 		v2[2:level_count] = v
334 | 		v2[1] = -geom_mean(v2)
335 | 		y = get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w))
336 | 		for i = 2:level_count
337 | 			diff = max(v2[i] * 0.001, 1.)
338 | 			org = v2[i]
339 | 			v2[i] += diff
340 | 			v2[1] = -(log_size * l0_count)
341 | 			v2[1] = -geom_mean(v2)
342 | 			grad_f[i - 1] = (get_wa(wa_r_factor, calculate_wa_multilevel!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff
343 | 			v2[i] = org
344 | 		end
345 | 	end
346 | 
347 | 	eval_g = (v, g) -> begin
348 | 		for i = 1:(level_count - 2)
349 | 			g[i] = v[i] - v[i + 1]
350 | 		end
351 | 	end
352 | 
353 | 	# level i's interval - level i+1's interval <= 0
354 | 	eval_jac_g = (v, mode, rows, cols, values) -> begin
355 | 		if mode == :Structure
356 | 			c = 1
357 | 			for i = 1:level_count - 2
358 | 				rows[c] = i
359 | 				cols[c] = i
360 | 				c += 1
361 | 				rows[c] = i
362 | 				cols[c] = i + 1
363 | 				c += 1
364 | 			end
365 | 		else
366 | 			c = 1
367 | 			for i = 1:level_count - 2
368 | 				values[c] = 1
369 | 				c += 1
370 | 				values[c] = -1
371 | 				c += 1
372 | 			end
373 | 		end
374 | 	end
375 | 
376 | 	v_L = [log_size * l0_count for i = 1:level_count - 1]
377 | 	v_U = [2.e19 for i = 1:level_count - 1]
378 | 
379 | 	g_L = [-2.e19 for i = 1:level_count - 2]
380 | 	g_U = [0. for i = 1:level_count - 2]
381 | 
382 | 	prob = createProblem(level_count - 1, v_L, v_U,
383 | 						 level_count - 2, g_L, g_U,
384 | 						 (level_count - 2) * 2, 0,
385 | 						 eval_f, eval_g, eval_grad_f, eval_jac_g)
386 | 
387 | 	addOption(prob, "hessian_approximation", "limited-memory")
388 | 
389 | 	addOption(prob, "tol", ftol)
390 | 	addOption(prob, "max_cpu_time", max_time)
391 | 	addOption(prob, "acceptable_iter", 1000)
392 | 
393 | 	addOption(prob, "print_level", 2)
394 | 
395 | 	prob.x = init_intervals[2:end]
396 | 
397 | 	@time status = solveProblem(prob)
398 | 
399 | 	ret = Ipopt.ApplicationReturnStatus[status]
400 | 	minf = prob.obj_val
401 | 	minx = prob.x
402 | 	println("got $minf at $minx after $count iterations (returned $ret)")
403 | 
404 | 	x = geom_mean(cat(1, [log_size * l0_count], -minx))
405 | 	cat(1, [x], minx)
406 | end
407 | 
408 | function print_twolevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r_factor::Float64)
409 | 	level_count = length(intervals)
410 | 
411 | 	println("intervals = ", [iround(v) for v in intervals])
412 | 	println("exp. size = ", [iround(Common.unique(X, v)) for v in intervals])
413 | 	println("(", [round(intervals[i] / intervals[i - 1] * 100.) / 100. for i in 2:length(intervals)], " X)")
414 | 	wa = calculate_wa_twolevel(X, log_size, l0_count, intervals)
415 | 	println("WA (mem->log) = ", wa[2][1])
416 | 	println("WA (log->0) = ", wa[2][2])
417 | 	for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end
418 | 	println("WA = ", get_wa(wa_r_factor, wa))
419 | end
420 | 
421 | function print_multilevel(X::Distribution, log_size::Float64, l0_count::Float64, intervals::Array{Float64}, wa_r_factor::Float64)
422 | 	level_count = length(intervals)
423 | 
424 | 	println("intervals = ", [iround(v) for v in intervals])
425 | 	println("(", [round(intervals[i] / intervals[i - 1] * 100.) / 100. for i in 2:length(intervals)], " X)")
426 | 	wa = calculate_wa_multilevel(X, log_size, l0_count, intervals)
427 | 	println("avg L0 intervals = ", iround(geom_mean(intervals)))
428 | 	println("WA (mem->log) = ", wa[2][1])
429 | 	println("WA (log->0) = ", wa[2][2])
430 | 	for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end
431 | 	println("WA = ", get_wa(wa_r_factor, wa))
432 | end
433 | 
434 | end
435 | 


--------------------------------------------------------------------------------
/Main.jl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/julia
2 | 
3 | include("Common.jl")
4 | include("SizeModel.jl")
5 | include("IntervalModel.jl")
6 | include("Analysis.jl")
7 | 
8 | #Analysis.run()
9 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CXXFLAGS=\
 2 | 	-Ileveldb/include \
 3 | 	-Irocksdb/include \
 4 | 	-pthread \
 5 | 	-g -Wall -Wextra -Wsign-conversion -Winline -Wno-unused-function \
 6 | 	-Wconversion \
 7 | 	-O3 \
 8 | 	-march=native \
 9 | 	-std=c++0x
10 | 	# -std=c++11
11 | 	# -fno-omit-frame-pointer
12 | 
13 | MAIN_SRC=util.cpp leveldb.cpp leveldb_impl.cpp rocksdb_impl.cpp meshdb.cpp main.cpp
14 | MEASURE_RW_SRC=measure_rw.cpp
15 | 
16 | TARGETS=main measure_rw
17 | 
18 | MAIN_OBJ=$(patsubst %.cpp,%.o,$(MAIN_SRC))
19 | MAIN_DEPFILES:=$(patsubst %.cpp,%.d,$(MAIN_SRC))
20 | 
21 | MEASURE_RW_OBJ=$(patsubst %.cpp,%.o,$(MEASURE_RW_SRC))
22 | MEASURE_RW_DEPFILES:=$(patsubst %.cpp,%.d,$(MEASURE_RW_SRC))
23 | 
24 | all: $(TARGETS)
25 | 
26 | main: $(MAIN_OBJ) leveldb/libleveldb.a rocksdb/librocksdb.a
27 | 	$(CXX) $(CXXFLAGS) -o $@ $^ -lsnappy -lz -lbz2 -lrt
28 | 
29 | measure_rw: $(MEASURE_RW_OBJ)
30 | 	$(CXX) $(CXXFLAGS) -o $@ $^
31 | 
32 | clean:
33 | 	$(RM) $(MAIN_OBJ) $(MAIN_DEPFILES) $(MEASURE_RW_OBJ) $(MEASURE_RW_DEPFILES) $(TARGETS)
34 | 
35 | 
36 | # dependancy checking from https://stackoverflow.com/a/313787
37 | NODEPS:=clean
38 | 
39 | ifeq (0, $(words $(findstring $(MAKECMDGOALS), $(NODEPS))))
40 | 	-include $(DEPFILES)
41 | endif
42 | 
43 | %.d: %.cpp
44 | 	$(CXX) $(CXXFLAGS) -MM -MT '$(patsubst %.cpp,%.o,$<)' $< -MF $@
45 | # end
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Accurate and Fast Evaluation of Multi-Stage Log-Structured Designs
 2 | ==================================================================
 3 | 
 4 | 
 5 | 
 6 | Contributors
 7 | ------------
 8 | 
 9 |  * Hyeontaek Lim (CMU)
10 | 
11 | 
12 | License
13 | -------
14 | 
15 | 	Copyright 2014, 2015, 2016 Carnegie Mellon University
16 | 
17 | 	Licensed under the Apache License, Version 2.0 (the "License");
18 | 	you may not use this file except in compliance with the License.
19 | 	You may obtain a copy of the License at
20 | 
21 | 	    http://www.apache.org/licenses/LICENSE-2.0
22 | 
23 | 	Unless required by applicable law or agreed to in writing, software
24 | 	distributed under the License is distributed on an "AS IS" BASIS,
25 | 	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 | 	See the License for the specific language governing permissions and
27 | 	limitations under the License.
28 | 
29 | 


--------------------------------------------------------------------------------
/SizeModel.jl:
--------------------------------------------------------------------------------
  1 | module SizeModel
  2 | 
  3 | using Common
  4 | # using NLopt
  5 | using Ipopt
  6 | 
  7 | function init_sizes(X::Distribution, l1_size::Float64, growth_factor::Float64=0., level_count::Int64=0)
  8 | 	n = X.count
  9 | 
 10 | 	if growth_factor != 0.
 11 | 		# we are fine
 12 | 	elseif level_count != 0
 13 | 		growth_factor = exp(log(Float64(n) / l1_size) / Float64(level_count - 1))
 14 | 	else
 15 | 		@assert false
 16 | 	end
 17 | 
 18 | 	sizes = Array(Float64, 0)
 19 | 	i = 1
 20 | 	while true
 21 | 		size = l1_size * ceil(growth_factor ^ Float64(i - 1))
 22 | 		if size < Float64(n)
 23 | 			push!(sizes, size)
 24 | 		else
 25 | 			push!(sizes, Float64(n))
 26 | 			break
 27 | 		end
 28 | 		i += 1
 29 | 	end
 30 | 
 31 | 	sizes
 32 | end
 33 | 
 34 | function calculate_ra!(X::Distribution, X_q::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, ra::Array{Float64})
 35 | end
 36 | 
 37 | function calculate_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64})
 38 | 	@assert sizes[end] == Float64(X.count)
 39 | 
 40 | 	# mem->log
 41 | 	wa_r[1] = 0.
 42 | 	wa_w[1] = 1.
 43 | 
 44 | 	# log->0
 45 | 	wa_r[2] = 0.
 46 | 	wa_w[2] = Common.unique(X, log_size) / log_size
 47 | 
 48 | 	# ## amortized, full destination level
 49 | 	# # 0->1
 50 | 	# wa[3] = Common.unique(X, unique_inv(X, sizes[1]) + (log_size * l0_count)) / (log_size * l0_count)
 51 | 	# # 1->2, 2->3, ...
 52 | 	# for i in 1:(length(sizes) - 1)
 53 | 	#	if i < length(sizes) - 1
 54 | 	#		wa[3 + i] = merge(X, sizes[i + 1], sizes[i]) / unique_inv(X, sizes[i])
 55 | 	#	else
 56 | 	#		wa[3 + i] = Float64(X.count) / unique_inv(X, sizes[i])
 57 | 	#	end
 58 | 	# end
 59 | 
 60 | 	# ## amortized, compact entire level, discrete interval calculation (maybe accurate, but does not work with optimizer)
 61 | 	# # 0->1
 62 | 	# interval = log_size * l0_count
 63 | 	# next_interval = unique_inv(X, sizes[1])
 64 | 	# effective_next_interval = floor(next_interval / interval + 1.) * interval
 65 | 	# wa[3] = unique_avg(X, interval, effective_next_interval) / interval
 66 | 	# # 1->2, 2->3, ...
 67 | 	# for i in 1:(length(sizes) - 1)
 68 | 	#	interval = effective_next_interval
 69 | 	#	if i < length(sizes) - 1
 70 | 	#		next_interval = unique_inv(X, sizes[i + 1])
 71 | 	#		effective_next_interval = floor(next_interval / interval + 1.) * interval
 72 | 	#		wa[3 + i] = unique_avg(X, interval, effective_next_interval) / interval
 73 | 	#	else
 74 | 	#		wa[3 + i] = sizes[end] / interval
 75 | 	#	end
 76 | 	# end
 77 | 
 78 | 	# note that "interval * 0.5" is the overflown amount that causes compaction
 79 | 	# 0.5 is just an approximate; it should be lower under high skew or with a key count close to the total unique count
 80 | 	# because the level grows slowly as its size approaches the maximum level size.
 81 | 
 82 | 	# ## amortized, compact entire level, continuous interval calculation (maybe less accurate)
 83 | 	# # 0->1
 84 | 	# interval = log_size * l0_count
 85 | 	# next_interval = unique_inv(X, sizes[1])
 86 | 	# wa[3] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval
 87 | 	# # 1->2, 2->3, ...
 88 | 	# for i in 1:(length(sizes) - 1)
 89 | 	#   interval = interval * 0.5 + next_interval
 90 | 	#   if i < length(sizes) - 1
 91 | 	#   	next_interval = unique_inv(X, sizes[i + 1])
 92 | 	#   	wa[3 + i] = unique_avg(X, interval, interval * 0.5 + next_interval) / interval
 93 | 	#   else
 94 | 	#   	wa[3 + i] = sizes[end] / interval
 95 | 	#   end
 96 | 	# end
 97 | 
 98 | 	## deamortized, compact each sstable in a round-robin way
 99 | 	# 0->1
100 | 	interval = log_size * l0_count
101 | 	next_interval = unique_inv(X, sizes[1])
102 | 	wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval
103 | 	wa_w[3] = Common.unique(X, interval + next_interval) / interval
104 | 	# 1->2, 2->3, ...
105 | 	for i in 1:(length(sizes) - 1)
106 | 		# we need to take the previous interval as part of this interval ("interval +")
107 | 		# because the current level temporarily has to accommodate the data from the previous level
108 | 		interval = interval + interval_from_density(X, sizes[i])
109 | 		if i < length(sizes) - 1
110 | 			next_interval = unique_inv(X, sizes[i + 1])
111 | 			# plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range
112 | 			# TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse
113 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval
114 | 			wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval
115 | 		else
116 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval
117 | 			wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval
118 | 		end
119 | 	end
120 | 
121 | 	wa_r, wa_w
122 | end
123 | 
124 | function calculate_wa(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64})
125 | 	wa_r = Array(Float64, 2 + length(sizes))
126 | 	wa_w = Array(Float64, 2 + length(sizes))
127 | 	calculate_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r, wa_w)
128 | end
129 | 
130 | function optimize_wa(X::Distribution, log_size::Float64, l0_count::Float64, init_sizes::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64)
131 | 	n = X.count
132 | 	level_count = length(init_sizes)
133 | 
134 | 	# v2 = Array(Float64, level_count)
135 | 	# v2[level_count] = n
136 | 
137 | 	# count = 0
138 | 	# wa = Array(Float64, 2 + level_count)
139 | 	# f = (v, grad) -> begin
140 | 	#	if length(grad) > 0
141 | 	#		v2[1:level_count - 1] = v
142 | 	#		y = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa))
143 | 	#		for i = 1:length(grad)
144 | 	#			org = v2[i]
145 | 	#			v2[i] += 1
146 | 	#			grad[i] = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa)) - y
147 | 	#			v2[i] = org
148 | 	#		end
149 | 	#	end
150 | 	#	count += 1
151 | 	#	v2[1:level_count - 1] = v
152 | 	#	get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa))
153 | 	# end
154 | 
155 | 	# gen_g = (i) -> (v, grad) -> begin
156 | 	#	if length(grad) > 0
157 | 	#		for j = 1:length(grad)
158 | 	#			if i == j
159 | 	#				grad[j] = 1.
160 | 	#			elseif i + 1 == j
161 | 	#				grad[j] = -1.
162 | 	#			else
163 | 	#				grad[j] = 0.
164 | 	#			end
165 | 	#		end
166 | 	#	end
167 | 	#	v[i] - v[i + 1]
168 | 	# end
169 | 
170 | 	# v = init_sizes[1:end - 1]
171 | 
172 | 	# v_L = [0. for i = 1:level_count - 1]
173 | 	# v_U = [2.e19 for i = 1:level_count - 1]
174 | 
175 | 	# opt = Opt(:LN_COBYLA, level_count - 1)
176 | 	# # opt = Opt(:LD_MMA, level_count - 1)
177 | 	# min_objective!(opt, f)
178 | 	# lower_bounds!(opt, v_L)
179 | 	# upper_bounds!(opt, v_U)
180 | 	# for i = 1:(level_count - 2)
181 | 	#	# inequality_constraint!(opt, (v, grad) -> v[i] - v[i + 1])		# <= 0
182 | 	#	inequality_constraint!(opt, gen_g(i))                      		# <= 0
183 | 	# end
184 | 	# # inequality_constraint!(opt, (v, grad) -> v[level_count - 2] - n)		# <= 0
185 | 	# ftol_abs!(opt, ftol)
186 | 	# maxtime!(opt, max_time)
187 | 	# @time (minf, minx, ret) = optimize(opt, v)
188 | 	# println("got $minf at $minx after $count iterations (returned $ret)")
189 | 
190 | 	# cat(1, minx, [n])
191 | 
192 | 	#########################
193 | 
194 | 	v2 = Array(Float64, level_count)
195 | 	v2[level_count] = n
196 | 
197 | 	count = 0
198 | 	wa_r = Array(Float64, 2 + level_count)
199 | 	wa_w = Array(Float64, 2 + level_count)
200 | 
201 | 	eval_f = (v) -> begin
202 | 		count += 1
203 | 		v2[1:level_count - 1] = v
204 | 		get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w))
205 | 	end
206 | 
207 | 	eval_grad_f = (v, grad_f) -> begin
208 | 		v2[1:level_count - 1] = v
209 | 		y = get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w))
210 | 		for i = 1:(level_count - 1)
211 | 			diff = max(v2[i] * 0.001, 1.)
212 | 			org = v2[i]
213 | 			v2[i] += diff
214 | 			grad_f[i] = (get_wa(wa_r_factor, calculate_wa!(X, log_size, l0_count, v2, wa_r, wa_w)) - y) / diff
215 | 			v2[i] = org
216 | 		end
217 | 	end
218 | 
219 | 	eval_g = (v, g) -> begin
220 | 		for i = 1:(level_count - 2)
221 | 			g[i] = v[i] - v[i + 1]
222 | 		end
223 | 	end
224 | 
225 | 	# level i's size - level i+1's size <= 0
226 | 	eval_jac_g = (v, mode, rows, cols, values) -> begin
227 | 		if mode == :Structure
228 | 			c = 1
229 | 			for i = 1:level_count - 2
230 | 				rows[c] = i
231 | 				cols[c] = i
232 | 				c += 1
233 | 				rows[c] = i
234 | 				cols[c] = i + 1
235 | 				c += 1
236 | 			end
237 | 		else
238 | 			c = 1
239 | 			for i = 1:level_count - 2
240 | 				values[c] = 1.
241 | 				c += 1
242 | 				values[c] = -1.
243 | 				c += 1
244 | 			end
245 | 		end
246 | 	end
247 | 
248 | 	v_L = [1. for i = 1:level_count - 1]
249 | 	v_U = [Float64(n) for i = 1:level_count - 1]
250 | 
251 | 	g_L = [-2.e19 for i = 1:level_count - 2]
252 | 	g_U = [0. for i = 1:level_count - 2]
253 | 
254 | 	prob = createProblem(level_count - 1, v_L, v_U,
255 | 						 level_count - 2, g_L, g_U,
256 | 						 (level_count - 2) * 2, 0,
257 | 						 eval_f, eval_g, eval_grad_f, eval_jac_g)
258 | 
259 | 	addOption(prob, "hessian_approximation", "limited-memory")
260 | 
261 | 	addOption(prob, "tol", ftol)
262 | 	addOption(prob, "max_cpu_time", max_time)
263 | 	addOption(prob, "acceptable_iter", 1000)
264 | 
265 | 	addOption(prob, "print_level", 2)
266 | 
267 | 	prob.x = init_sizes[1:end - 1]
268 | 
269 | 	@time status = solveProblem(prob)
270 | 
271 | 	ret = Ipopt.ApplicationReturnStatus[status]
272 | 	minf = prob.obj_val
273 | 	minx = prob.x
274 | 	println("got $minf at $minx after $count iterations (returned $ret)")
275 | 
276 | 	cat(1, minx, [n])
277 | end
278 | 
279 | 
280 | ###########
281 | 
282 | function calculate_random_compaction_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64})
283 | 	@assert sizes[end] == Float64(X.count)
284 | 
285 | 	# mem->log
286 | 	wa_r[1] = 0.
287 | 	wa_w[1] = 1.
288 | 
289 | 	# log->0
290 | 	wa_r[2] = 0.
291 | 	wa_w[2] = Common.unique(X, log_size) / log_size
292 | 
293 | 	## deamortized, compact each sstable in a random way
294 | 	# 0->1
295 | 	interval = log_size * l0_count
296 | 	next_interval = unique_inv(X, sizes[1])
297 | 	wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval
298 | 	wa_w[3] = Common.unique(X, interval + next_interval) / interval
299 | 	# 1->2, 2->3, ...
300 | 	for i in 1:(length(sizes) - 1)
301 | 		# we need to take the previous interval as part of this interval ("interval +")
302 | 		# because the current level temporarily has to accommodate the data from the previous level
303 | 		interval = interval + next_interval
304 | 		if i < length(sizes) - 1
305 | 			next_interval = unique_inv(X, sizes[i + 1])
306 | 			# plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range
307 | 			# TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse
308 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval
309 | 			wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval
310 | 		else
311 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval
312 | 			wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval
313 | 		end
314 | 	end
315 | 
316 | 	wa_r, wa_w
317 | end
318 | 
319 | function calculate_random_compaction_wa(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64})
320 | 	wa_r = Array(Float64, 2 + length(sizes))
321 | 	wa_w = Array(Float64, 2 + length(sizes))
322 | 	calculate_random_compaction_wa!(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r, wa_w)
323 | end
324 | 
325 | 
326 | 
327 | 
328 | ###########
329 | 
330 | 
331 | function calculate_mbuf_wa!(X::Distribution, mbuf_size::Float64, sizes::Array{Float64}, wa_r::Array{Float64}, wa_w::Array{Float64})
332 | 	@assert sizes[end] == Float64(X.count)
333 | 
334 | 	# mem->log
335 | 	wa_r[1] = 0.
336 | 	wa_w[1] = 1.
337 | 
338 | 	# log->mbuf
339 | 	wa_r[2] = 0.
340 | 	wa_w[2] = 0.
341 | 
342 | 	## deamortized, compact each sstable in a round-robin way
343 | 	# mbuf->1
344 | 	# no overflow from log because mbuf must be compacted proactively.
345 | 	interval = interval_from_density(X, mbuf_size)
346 | 	next_interval = unique_inv(X, sizes[1])
347 | 	# # however, we have to consider false overlaps since this is now incremental compaction.
348 | 	# wa[3] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval
349 | 	wa_r[3] = (Common.unique(X, log_size) * l0_count + sizes[1]) / interval
350 | 	wa_w[3] = Common.unique(X, interval + next_interval) / interval
351 | 	# 1->2, 2->3, ...
352 | 	for i in 1:(length(sizes) - 1)
353 | 		# we need to take the previous interval as part of this interval ("interval +")
354 | 		# because the current level temporarily has to accommodate the data from the previous level
355 | 		interval = interval + interval_from_density(X, sizes[i])
356 | 		if i < length(sizes) - 1
357 | 			next_interval = unique_inv(X, sizes[i + 1])
358 | 			# plus unique(X, interval) * 1 to WA because of the overlapping tables' keys that do not actually overlap the compaction key range
359 | 			# TODO: this may become less accurate with spatial locality in key range because the overlapping tables' key range may be sparse
360 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[i + 1] + Common.unique(X, interval) * 1.) / interval
361 | 			wa_w[3 + i] = (Common.unique(X, interval + next_interval) + Common.unique(X, interval) * 1.) / interval
362 | 		else
363 | 			wa_r[3 + i] = (Common.unique(X, interval) + sizes[end] + Common.unique(X, interval) * 1.) / interval
364 | 			wa_w[3 + i] = (sizes[end] + Common.unique(X, interval) * 1.) / interval
365 | 		end
366 | 	end
367 | 
368 | 	wa_r, wa_w
369 | end
370 | 
371 | function calculate_mbuf_wa(X::Distribution, mbuf_size::Float64, sizes::Array{Float64})
372 | 	wa_r = Array(Float64, 2 + length(sizes))
373 | 	wa_w = Array(Float64, 2 + length(sizes))
374 | 	calculate_mbuf_wa!(X::Distribution, mbuf_size::Float64, sizes::Array{Float64}, wa_r, wa_w)
375 | end
376 | 
377 | function optimize_mbuf_wa(X::Distribution, mbuf_size::Float64, init_sizes::Array{Float64}, wa_r_factor::Float64, ftol::Float64, max_time::Float64)
378 | 	n = X.count
379 | 	level_count = length(init_sizes)
380 | 
381 | 	v2 = Array(Float64, level_count)
382 | 	v2[level_count] = n
383 | 
384 | 	count = 0
385 | 	wa = Array(Float64, 2 + level_count)
386 | 
387 | 	eval_f = (v) -> begin
388 | 		count += 1
389 | 		v2[1:level_count - 1] = v
390 | 		get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa))
391 | 	end
392 | 
393 | 	eval_grad_f = (v, grad_f) -> begin
394 | 		v2[1:level_count - 1] = v
395 | 		y = get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa))
396 | 		for i = 1:(level_count - 1)
397 | 			diff = max(v2[i] * 0.001, 1.)
398 | 			org = v2[i]
399 | 			v2[i] += diff
400 | 			grad_f[i] = (get_wa(wa_r_factor, calculate_mbuf_wa!(X, mbuf_size, v2, wa)) - y) / diff
401 | 			v2[i] = org
402 | 		end
403 | 	end
404 | 
405 | 	eval_g = (v, g) -> begin
406 | 		for i = 1:(level_count - 2)
407 | 			g[i] = v[i] - v[i + 1]
408 | 		end
409 | 	end
410 | 
411 | 	# level i's size - level i+1's size <= 0
412 | 	eval_jac_g = (v, mode, rows, cols, values) -> begin
413 | 		if mode == :Structure
414 | 			c = 1
415 | 			for i = 1:level_count - 2
416 | 				rows[c] = i
417 | 				cols[c] = i
418 | 				c += 1
419 | 				rows[c] = i
420 | 				cols[c] = i + 1
421 | 				c += 1
422 | 			end
423 | 		else
424 | 			c = 1
425 | 			for i = 1:level_count - 2
426 | 				values[c] = 1.
427 | 				c += 1
428 | 				values[c] = -1.
429 | 				c += 1
430 | 			end
431 | 		end
432 | 	end
433 | 
434 | 	# v_L = [1. for i = 1:level_count - 1]
435 | 	v_L = [mbuf_size for i = 1:level_count - 1]
436 | 	v_U = [Float64(n) for i = 1:level_count - 1]
437 | 
438 | 	g_L = [-2.e19 for i = 1:level_count - 2]
439 | 	g_U = [0. for i = 1:level_count - 2]
440 | 
441 | 	prob = createProblem(level_count - 1, v_L, v_U,
442 | 						 level_count - 2, g_L, g_U,
443 | 						 (level_count - 2) * 2, 0,
444 | 						 eval_f, eval_g, eval_grad_f, eval_jac_g)
445 | 
446 | 	addOption(prob, "hessian_approximation", "limited-memory")
447 | 
448 | 	addOption(prob, "tol", ftol)
449 | 	addOption(prob, "max_cpu_time", max_time)
450 | 	addOption(prob, "acceptable_iter", 1000)
451 | 
452 | 	addOption(prob, "print_level", 2)
453 | 
454 | 	prob.x = init_sizes[1:end - 1]
455 | 
456 | 	@time status = solveProblem(prob)
457 | 
458 | 	ret = Ipopt.ApplicationReturnStatus[status]
459 | 	minf = prob.obj_val
460 | 	minx = prob.x
461 | 	println("got $minf at $minx after $count iterations (returned $ret)")
462 | 
463 | 	cat(1, minx, [n])
464 | end
465 | 
466 | 
467 | 
468 | 
469 | 
470 | 
471 | ###########
472 | 
473 | 
474 | 
475 | function print(X::Distribution, log_size::Float64, l0_count::Float64, sizes::Array{Float64}, wa_r_factor::Float64)
476 | 	level_count = length(sizes)
477 | 
478 | 	println("sizes = ", [iround(v) for v in sizes])
479 | 	println("(", [round(sizes[i] / sizes[i - 1] * 100.) / 100. for i in 2:length(sizes)], " X)")
480 | 	wa = calculate_wa(X, log_size, l0_count, sizes)
481 | 	println("WA (mem->log) = ", wa[2][1])
482 | 	println("WA (log->0) = ", wa[2][2])
483 | 	for i = 1:level_count; println("WA ($(i-1)->$i) = ", wa[2][i + 2]) end
484 | 	println("WA = ", get_wa(wa_r_factor, wa))
485 | end
486 | 
487 | 
488 | ## TODO: COLA and SAMT should be moved to IntervalModel
489 | 
490 | # COLA
491 | function calculate_wa_cola!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r::Array{Float64}, wa_w::Array{Float64})
492 | 	# mem->log
493 | 	wa_r[1] = 0.
494 | 	wa_w[1] = 1.
495 | 
496 | 	# mem->1, 1->2, 2->3, ...
497 | 	interval = 0.
498 | 	next_interval = log_size
499 | 	for i in 0:(L - 2)
500 | 		interval = next_interval
501 | 		next_interval = interval * r
502 | 		r_ = 0.
503 | 		w = 0.
504 | 		# a level accepts merges up to r-1 times.
505 | 		# this means that we set r (g in the COLA paper) to be (B^e + 1), which is still in Theta(B^e).
506 | 		# choosing r in that way makes the number of levels bounded by O(log_{B^e + 1} N) = O(log_r N),
507 | 		# which results in the level count we intend to obtain.
508 | 		for j in 0:(r - 2)
509 | 			if i == 0
510 | 				r_ += Common.unique(X, interval * j)
511 | 			else
512 | 				r_ += Common.unique(X, interval) + Common.unique(X, interval * j)
513 | 			end
514 | 			w += Common.unique(X, interval + interval * j)
515 | 		end
516 | 		wa_r[2 + i] += r_ / next_interval
517 | 		wa_w[2 + i] += w / next_interval
518 | 	end
519 | 
520 | 	# (L-1)->L
521 | 	interval = next_interval
522 | 	wa_r[2 + L - 1] = (Common.unique(X, interval) + X.count) / interval
523 | 	wa_w[2 + L - 1] = X.count / interval
524 | 
525 | 	wa_r, wa_w
526 | end
527 | 
528 | function calculate_wa_cola(X::Distribution, log_size::Float64, r::Int64, L::Int64)
529 | 	wa_r = Array(Float64, 1 + L)
530 | 	wa_w = Array(Float64, 1 + L)
531 | 	calculate_wa_cola!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r, wa_w)
532 | end
533 | 
534 | # SAMT
535 | function calculate_wa_samt!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r::Array{Float64}, wa_w::Array{Float64})
536 | 	# mem->log
537 | 	wa_r[1] = 0.
538 | 	wa_w[1] = 1.
539 | 
540 | 	# mem->1, 1->2, 2->3, ...
541 | 	interval = 0.
542 | 	next_interval = log_size
543 | 	for i in 0:(L - 2)
544 | 		interval = next_interval
545 | 		next_interval = interval * r
546 | 		# a level has r slots to put merges
547 | 		# actually, we do not write anything to the last slot because
548 | 		# we can merge the level into the next level, which makes
549 | 		# COLA and SAMT identical when r = 2
550 | 		#wa[2 + i] = ((r - 1) * Common.unique(X, interval)) / next_interval
551 | 		# but we choose to do maintain full r slots because the SAMT paper seems to intend it.
552 | 		# this makes SAMT more expensive (and wasteful) than COLA with r = 2.
553 | 		# however, SAMT usually uses r = 4, and the compaction only needs to do up to r-way merge
554 | 		# (not ((r-1)^l)-way in COLA), which makes more sense in a practical standpoint.
555 | 		wa_r[2 + i] = (r * Common.unique(X, interval)) / next_interval
556 | 		wa_w[2 + i] = (r * Common.unique(X, interval)) / next_interval
557 | 	end
558 | 
559 | 	# (L-1)->L
560 | 	interval = next_interval
561 | 	wa_r[2 + L - 1] = (r * Common.unique(X, interval) + X.count) / interval
562 | 	wa_w[2 + L - 1] = X.count / interval
563 | 
564 | 	wa_r, wa_w
565 | end
566 | 
567 | function calculate_wa_samt(X::Distribution, log_size::Float64, r::Int64, L::Int64)
568 | 	wa_r = Array(Float64, 1 + L)
569 | 	wa_w = Array(Float64, 1 + L)
570 | 	calculate_wa_samt!(X::Distribution, log_size::Float64, r::Int64, L::Int64, wa_r, wa_w)
571 | end
572 | 
573 | ###########
574 | 
575 | 
576 | # original SILT with major compaction from HashStore to SortedStore
577 | function calculate_wa_silt!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, wa_r::Array{Float64}, wa_w::Array{Float64})
578 | 	convert_interval = unique_inv(X, hash_size * hash_occupancy)
579 | 
580 | 	# TODO: wa_r
581 | 
582 | 	# mem->log store
583 | 	wa_r[1] = 0.
584 | 	wa_w[1] = 1.
585 | 
586 | 	# log store->hash store
587 | 	wa_r[2] = 0.
588 | 	wa_w[2] = hash_size / convert_interval
589 | 
590 | 	# hash stores->sorted store
591 | 	wa_r[3] = 0.
592 | 	wa_w[3] = X.count / (convert_interval * hash_count)
593 | 
594 | 	wa_r, wa_w
595 | end
596 | 
597 | function calculate_wa_silt(X::Distribution, log_size::Float64, hash_occupancy::Float64, hash_count::Int64)
598 | 	wa = Array(Float64, 3)
599 | 	calculate_wa_silt!(X::Distribution, log_size::Float64, hash_occupancy::Float64, hash_count::Int64, wa)
600 | end
601 | 
602 | 
603 | # SILT + minor compaction among HashStore; assume any size of HashStore can be created
604 | function calculate_wa_silt_multi!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64, wa_r::Array{Float64}, wa_w::Array{Float64})
605 | 	convert_interval = unique_inv(X, hash_size * hash_occupancy)
606 | 
607 | 	# TODO: wa_r
608 | 
609 | 	# the interval of minor compaction
610 | 	minor_compaction_interval = convert_interval * hash_count
611 | 
612 | 	# the number of minor compaction to trigger major compaction; the last minor compaction does not actually write data
613 | 	minor_compaction_count = floor(unique_inv(X, X.count * hash_threshold) / minor_compaction_interval)
614 | 	@assert minor_compaction_count >= 1.0
615 | 
616 | 	# the interval of major compaction
617 | 	major_compaction_interval = minor_compaction_interval * minor_compaction_count
618 | 
619 | 	# mem->log
620 | 	wa_r[1] = 0.
621 | 	wa_w[1] = 1.
622 | 
623 | 	# log store->hash store
624 | 	wa_r[2] = 0.
625 | 	wa_w[2] = hash_size / convert_interval
626 | 
627 | 	# hash stores->hash store
628 | 	wa_r[3] = 0.
629 | 	wa_w[3] = 0.
630 | 	if minor_compaction_count >= 2
631 | 		for j in 0:(minor_compaction_count - 2)
632 | 			wa_w[3] += (Common.unique(X, minor_compaction_interval + minor_compaction_interval * j) / hash_occupancy) / major_compaction_interval
633 | 		end
634 | 	end
635 | 
636 | 	# hash stores->sorted
637 | 	wa_r[4] = 0.
638 | 	wa_w[4] = X.count / major_compaction_interval
639 | 
640 | 	wa_r, wa_w
641 | end
642 | 
643 | function calculate_wa_silt_multi(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64)
644 | 	wa = Array(Float64, 4)
645 | 	calculate_wa_silt_multi!(X::Distribution, hash_size::Float64, hash_occupancy::Float64, hash_count::Int64, hash_threshold::Float64, wa)
646 | end
647 | 
648 | 
649 | end
650 | 
651 | 


--------------------------------------------------------------------------------
/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <cassert>
4 | #include <cstdint>
5 | #include <cstdio>
6 | #include <vector>
7 | #include <set>
8 | #include <algorithm>
9 | 


--------------------------------------------------------------------------------
/diff_leveldb.patch:
--------------------------------------------------------------------------------
  1 | diff --git a/db/dbformat.h b/db/dbformat.h
  2 | index 5d8a032..20cccec 100644
  3 | --- a/db/dbformat.h
  4 | +++ b/db/dbformat.h
  5 | @@ -25,10 +25,12 @@ static const int kNumLevels = 7;
  6 |  static const int kL0_CompactionTrigger = 4;
  7 |  
  8 |  // Soft limit on number of level-0 files.  We slow down writes at this point.
  9 | -static const int kL0_SlowdownWritesTrigger = 8;
 10 | +//static const int kL0_SlowdownWritesTrigger = 8;
 11 | +static const int kL0_SlowdownWritesTrigger = 4;
 12 |  
 13 |  // Maximum number of level-0 files.  We stop writes at this point.
 14 | -static const int kL0_StopWritesTrigger = 12;
 15 | +//static const int kL0_StopWritesTrigger = 12;
 16 | +static const int kL0_StopWritesTrigger = 4;
 17 |  
 18 |  // Maximum level to which a new compacted memtable is pushed if it
 19 |  // does not create overlap.  We try to push to level 2 to avoid the
 20 | diff --git a/db/version_set.cc b/db/version_set.cc
 21 | index aa83df5..f5d8937 100644
 22 | --- a/db/version_set.cc
 23 | +++ b/db/version_set.cc
 24 | @@ -1038,7 +1038,13 @@ void VersionSet::Finalize(Version* v) {
 25 |      } else {
 26 |        // Compute the ratio of current size to size limit.
 27 |        const uint64_t level_bytes = TotalFileSize(v->files_[level]);
 28 | -      score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
 29 | +      // score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
 30 | +	  // MSLS
 31 | +	  if (level < options_->custom_level_size_count)
 32 | +        score = static_cast<double>(level_bytes) / static_cast<double>(options_->custom_level_sizes[level]);
 33 | +	  else {
 34 | +        score = static_cast<double>(level_bytes) / MaxBytesForLevel(level);
 35 | +      }
 36 |      }
 37 |  
 38 |      if (score > best_score) {
 39 | @@ -1286,6 +1292,8 @@ Compaction* VersionSet::PickCompaction() {
 40 |      // c->inputs_[0] earlier and replace it with an overlapping set
 41 |      // which will include the picked file.
 42 |      current_->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]);
 43 | +    // MSLS - test
 44 | +    //current_->GetOverlappingInputs(0, NULL, NULL, &c->inputs_[0]);
 45 |      assert(!c->inputs_[0].empty());
 46 |    }
 47 |  
 48 | diff --git a/include/leveldb/options.h b/include/leveldb/options.h
 49 | index fdda718..a6955d4 100644
 50 | --- a/include/leveldb/options.h
 51 | +++ b/include/leveldb/options.h
 52 | @@ -135,6 +135,15 @@ struct Options {
 53 |    // Default: NULL
 54 |    const FilterPolicy* filter_policy;
 55 |  
 56 | +  // MSLS: Use custom level sizes if custom_level_size_count != 0.
 57 | +  // custom_level_size_count is the maximum level number to change the size.
 58 | +  // custom_level_sizes[i] specifies the maximum size of level-i (i < custom_level_size_count).
 59 | +  // custom_level_sizes[0] is ignored.
 60 | +  //
 61 | +  // Default: 0, NULL
 62 | +  size_t custom_level_size_count;
 63 | +  const size_t* custom_level_sizes;
 64 | +
 65 |    // Create an Options object with default values for all fields.
 66 |    Options();
 67 |  };
 68 | diff --git a/util/crc32c.cc b/util/crc32c.cc
 69 | index 6db9e77..7adb5f9 100644
 70 | --- a/util/crc32c.cc
 71 | +++ b/util/crc32c.cc
 72 | @@ -284,6 +284,9 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) {
 73 |  }
 74 |  
 75 |  uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
 76 | +  // MSLS
 77 | +  return 0;
 78 | +  /*
 79 |    const uint8_t *p = reinterpret_cast<const uint8_t *>(buf);
 80 |    const uint8_t *e = p + size;
 81 |    uint32_t l = crc ^ 0xffffffffu;
 82 | @@ -326,6 +329,7 @@ uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
 83 |  #undef STEP4
 84 |  #undef STEP1
 85 |    return l ^ 0xffffffffu;
 86 | +  */
 87 |  }
 88 |  
 89 |  }  // namespace crc32c
 90 | diff --git a/util/options.cc b/util/options.cc
 91 | index 76af5b9..98ff188 100644
 92 | --- a/util/options.cc
 93 | +++ b/util/options.cc
 94 | @@ -22,7 +22,9 @@ Options::Options()
 95 |        block_size(4096),
 96 |        block_restart_interval(16),
 97 |        compression(kSnappyCompression),
 98 | -      filter_policy(NULL) {
 99 | +      filter_policy(NULL),
100 | +      custom_level_size_count(0),
101 | +      custom_level_sizes(NULL) {
102 |  }
103 |  
104 |  
105 | 


--------------------------------------------------------------------------------
/diff_rocksdb.patch:
--------------------------------------------------------------------------------
  1 | diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc
  2 | index 02f53e5..502d98d 100644
  3 | --- a/db/compaction_picker.cc
  4 | +++ b/db/compaction_picker.cc
  5 | @@ -878,15 +878,37 @@ Compaction* LevelCompactionPicker::PickCompaction(
  6 |      assert(i == 0 || score <= vstorage->CompactionScore(i - 1));
  7 |      if (score >= 1) {
  8 |        output_level = (level == 0) ? vstorage->base_level() : level + 1;
  9 | -      if (PickCompactionBySize(vstorage, level, output_level, &inputs,
 10 | -                               &parent_index, &base_index) &&
 11 | -          ExpandWhileOverlapping(cf_name, vstorage, &inputs)) {
 12 | -        // found the compaction!
 13 | -        break;
 14 | +      // MSLS
 15 | +      //if (PickCompactionBySize(vstorage, level, output_level, &inputs,
 16 | +      //                         &parent_index, &base_index) &&
 17 | +      //    ExpandWhileOverlapping(cf_name, vstorage, &inputs)) {
 18 | +      //  // found the compaction!
 19 | +      //  break;
 20 | +      //} else {
 21 | +      //  // didn't find the compaction, clear the inputs
 22 | +      //  inputs.clear();
 23 | +      //}
 24 | +      if (!mutable_cf_options.use_leveldb_table_selection) {
 25 | +        if (PickCompactionBySize(vstorage, level, output_level, &inputs,
 26 | +                                 &parent_index, &base_index) &&
 27 | +            ExpandWhileOverlapping(cf_name, vstorage, &inputs)) {
 28 | +          // found the compaction!
 29 | +          break;
 30 | +        } else {
 31 | +          // didn't find the compaction, clear the inputs
 32 | +          inputs.clear();
 33 | +        }
 34 |        } else {
 35 | -        // didn't find the compaction, clear the inputs
 36 | -        inputs.clear();
 37 | -      }
 38 | +        if (PickCompactionLevelDB(vstorage, level, output_level, &inputs,
 39 | +                                  &parent_index, &base_index) &&
 40 | +            ExpandWhileOverlapping(cf_name, vstorage, &inputs)) {
 41 | +          // found the compaction!
 42 | +          break;
 43 | +        } else {
 44 | +          // didn't find the compaction, clear the inputs
 45 | +          inputs.clear();
 46 | +        }
 47 | +	  }
 48 |      }
 49 |    }
 50 |  
 51 | @@ -1074,6 +1096,73 @@ bool LevelCompactionPicker::PickCompactionBySize(VersionStorageInfo* vstorage,
 52 |    return inputs->size() > 0;
 53 |  }
 54 |  
 55 | +// MSLS
 56 | +bool LevelCompactionPicker::PickCompactionLevelDB(VersionStorageInfo* vstorage,
 57 | +                                                  int level, int output_level,
 58 | +                                                  CompactionInputFiles* inputs,
 59 | +                                                  int* parent_index,
 60 | +                                                  int* base_index) {
 61 | +  // level 0 files are overlapping. So we cannot pick more
 62 | +  // than one concurrent compactions at this level. This
 63 | +  // could be made better by looking at key-ranges that are
 64 | +  // being compacted at level 0.
 65 | +  if (level == 0 && !level0_compactions_in_progress_.empty()) {
 66 | +    return false;
 67 | +  }
 68 | +
 69 | +  inputs->clear();
 70 | +
 71 | +  assert(level >= 0);
 72 | +
 73 | +  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(level);
 74 | +
 75 | +  std::string last_key = vstorage->LastKey(level);
 76 | +  bool respect_last_key = true;
 77 | +
 78 | +  for (unsigned int i = 0; i < level_files.size() * 2; i++) {
 79 | +    int index = (vstorage->NextCompactionIndex(level) + i) % (int)level_files.size();
 80 | +    assert(index >= 0 && static_cast<size_t>(index) < level_files.size());
 81 | +
 82 | +    FileMetaData* f = level_files[index];
 83 | +
 84 | +    if (i != 0 && index == 0) {
 85 | +      if (respect_last_key) {
 86 | +        respect_last_key = false;
 87 | +      }
 88 | +    }
 89 | +
 90 | +    if (respect_last_key && f->smallest.Encode().ToString() < last_key) {
 91 | +      //printf("too small key\n");
 92 | +      continue;
 93 | +    }
 94 | +
 95 | +    // do not pick a file to compact if it is being compacted
 96 | +    // from n-1 level.
 97 | +    if (f->being_compacted) {
 98 | +      //printf("being compacted\n");
 99 | +      continue;
100 | +    }
101 | +
102 | +    // Do not pick this file if its parents at level+1 are being compacted.
103 | +    // Maybe we can avoid redoing this work in SetupOtherInputs
104 | +    *parent_index = -1;
105 | +    if (RangeInCompaction(vstorage, &f->smallest, &f->largest, output_level,
106 | +                          parent_index)) {
107 | +      //printf("parents being compacted\n");
108 | +      continue;
109 | +    }
110 | +    inputs->files.push_back(f);
111 | +    inputs->level = level;
112 | +    *base_index = index;
113 | +    vstorage->SetNextCompactionIndex(level, index);
114 | +    vstorage->SetLastKey(level, f->largest.Encode().ToString());
115 | +    //printf("%d %d %d/%zu %hhx %hhx\n", level, i, index, level_files.size(), last_key.c_str()[0], f->largest.Encode().ToString().c_str()[0]);
116 | +    break;
117 | +  }
118 | +
119 | +  return inputs->size() > 0;
120 | +}
121 | +
122 |  #ifndef ROCKSDB_LITE
123 |  bool UniversalCompactionPicker::NeedsCompaction(
124 |      const VersionStorageInfo* vstorage) const {
125 | diff --git a/db/compaction_picker.h b/db/compaction_picker.h
126 | index 1d1abe3..b30b3df 100644
127 | --- a/db/compaction_picker.h
128 | +++ b/db/compaction_picker.h
129 | @@ -210,6 +210,13 @@ class LevelCompactionPicker : public CompactionPicker {
130 |                                                  VersionStorageInfo* vstorage,
131 |                                                  CompactionInputFiles* inputs,
132 |                                                  int* level, int* output_level);
133 | +
134 | +  // MSLS
135 | +  // Similar to PickCompactionBySize except it chooses files
136 | +  // in a round-robin fashion in the key space, like LevelDB does.
137 | +  bool PickCompactionLevelDB(VersionStorageInfo* vstorage, int level,
138 | +                             int output_level, CompactionInputFiles* inputs,
139 | +                             int* parent_index, int* base_index);
140 |  };
141 |  
142 |  #ifndef ROCKSDB_LITE
143 | diff --git a/db/version_set.cc b/db/version_set.cc
144 | index cedaa3e..73e45b1 100644
145 | --- a/db/version_set.cc
146 | +++ b/db/version_set.cc
147 | @@ -795,7 +795,8 @@ VersionStorageInfo::VersionStorageInfo(
148 |        accumulated_num_deletions_(0),
149 |        num_samples_(0),
150 |        estimated_compaction_needed_bytes_(0),
151 | -      finalized_(false) {
152 | +      finalized_(false),
153 | +      last_key_(num_levels_) {
154 |    if (ref_vstorage != nullptr) {
155 |      accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
156 |      accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
157 | @@ -804,6 +805,7 @@ VersionStorageInfo::VersionStorageInfo(
158 |          ref_vstorage->accumulated_num_non_deletions_;
159 |      accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
160 |      num_samples_ = ref_vstorage->num_samples_;
161 | +    last_key_ = ref_vstorage->last_key_;
162 |    }
163 |  }
164 |  
165 | @@ -1148,6 +1150,8 @@ void VersionStorageInfo::ComputeCompactionScore(
166 |        } else {
167 |          score = static_cast<double>(num_sorted_runs) /
168 |                  mutable_cf_options.level0_file_num_compaction_trigger;
169 | +        // MSLS - The previous version of RocksDB used to priotize level-0 -> level-1 compaction, which caused starvation that disallows making level-2, level-3, ...
170 | +        // This has been reverted back to LevelDB's method in newer RocksDB versions.
171 |        }
172 |      } else {
173 |        // Compute the ratio of current size to size limit.
174 | @@ -1803,6 +1807,13 @@ void VersionStorageInfo::CalculateBaseBytes(const ImmutableCFOptions& ioptions,
175 |        }
176 |      }
177 |    }
178 | +
179 | +  // MSLS
180 | +  for (auto i = 0u; i < options.custom_level_size_count; i++) {
181 | +    if (i < level_max_bytes_.size()) {
182 | +      level_max_bytes_[i] = options.custom_level_sizes[i];
183 | +    }
184 | +  }
185 |  }
186 |  
187 |  uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
188 | diff --git a/db/version_set.h b/db/version_set.h
189 | index 7707bb1..825ad62 100644
190 | --- a/db/version_set.h
191 | +++ b/db/version_set.h
192 | @@ -323,6 +323,17 @@ class VersionStorageInfo {
193 |      return estimated_compaction_needed_bytes_;
194 |    }
195 |  
196 | +  // MSLS
197 | +  void SetLastKey(int level, const std::string& last_key) {
198 | +    assert(static_cast<size_t>(level) < last_key_.size());
199 | +    last_key_[level] = last_key;
200 | +  }
201 | +
202 | +  const std::string& LastKey(int level) {
203 | +    assert(static_cast<size_t>(level) < last_key_.size());
204 | +    return last_key_[level];
205 | +  }
206 | +
207 |   private:
208 |    const InternalKeyComparator* internal_comparator_;
209 |    const Comparator* user_comparator_;
210 | @@ -408,6 +419,9 @@ class VersionStorageInfo {
211 |    // No copying allowed
212 |    VersionStorageInfo(const VersionStorageInfo&) = delete;
213 |    void operator=(const VersionStorageInfo&) = delete;
214 | +
215 | +  // MSLS
216 | +  std::vector<std::string> last_key_;
217 |  };
218 |  
219 |  class Version {
220 | diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
221 | index 23b8507..294111c 100644
222 | --- a/include/rocksdb/options.h
223 | +++ b/include/rocksdb/options.h
224 | @@ -1116,6 +1116,20 @@ struct DBOptions {
225 |    // Default: nullptr (disabled)
226 |    // Not supported in ROCKSDB_LITE mode!
227 |    std::shared_ptr<Cache> row_cache;
228 | +
229 | +  // MSLS: Use custom level sizes if custom_level_size_count != 0.
230 | +  // custom_level_size_count is the maximum level number to change the size.
231 | +  // custom_level_sizes[i] specifies the maximum size of level-i (i < custom_level_size_count).
232 | +  // custom_level_sizes[0] is ignored.
233 | +  //
234 | +  // Default: 0, NULL
235 | +  size_t custom_level_size_count;
236 | +  const size_t* custom_level_sizes;
237 | +
238 | +  // MSLS: Use LevelDB-style circular table selection for compaction.
239 | +  //
240 | +  // Default: false
241 | +  bool use_leveldb_table_selection;
242 |  };
243 |  
244 |  // Options to control the behavior of a database (passed to DB::Open)
245 | diff --git a/util/crc32c.cc b/util/crc32c.cc
246 | index b8d281a..87d884d 100644
247 | --- a/util/crc32c.cc
248 | +++ b/util/crc32c.cc
249 | @@ -394,7 +394,9 @@ bool IsFastCrc32Supported() {
250 |  Function ChosenExtend = Choose_Extend();
251 |  
252 |  uint32_t Extend(uint32_t crc, const char* buf, size_t size) {
253 | -  return ChosenExtend(crc, buf, size);
254 | +  // MSLS
255 | +  return 0;
256 | +  //return ChosenExtend(crc, buf, size);
257 |  }
258 |  
259 |  }  // namespace crc32c
260 | diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h
261 | index 4110ecc..38bb271 100644
262 | --- a/util/mutable_cf_options.h
263 | +++ b/util/mutable_cf_options.h
264 | @@ -44,7 +44,10 @@ struct MutableCFOptions {
265 |          max_sequential_skip_in_iterations(
266 |              options.max_sequential_skip_in_iterations),
267 |          paranoid_file_checks(options.paranoid_file_checks),
268 | -        compaction_measure_io_stats(options.compaction_measure_io_stats)
269 | +        compaction_measure_io_stats(options.compaction_measure_io_stats),
270 | +        custom_level_size_count(options.custom_level_size_count),
271 | +        custom_level_sizes(options.custom_level_sizes),
272 | +        use_leveldb_table_selection(options.use_leveldb_table_selection)
273 |  
274 |    {
275 |      RefreshDerivedOptions(ioptions);
276 | @@ -76,7 +79,10 @@ struct MutableCFOptions {
277 |          max_subcompactions(1),
278 |          max_sequential_skip_in_iterations(0),
279 |          paranoid_file_checks(false),
280 | -        compaction_measure_io_stats(false) {}
281 | +        compaction_measure_io_stats(false),
282 | +        custom_level_size_count(0),
283 | +        custom_level_sizes(NULL),
284 | +        use_leveldb_table_selection(false) {}
285 |  
286 |    // Must be called after any change to MutableCFOptions
287 |    void RefreshDerivedOptions(const ImmutableCFOptions& ioptions);
288 | @@ -132,6 +138,11 @@ struct MutableCFOptions {
289 |    bool paranoid_file_checks;
290 |    bool compaction_measure_io_stats;
291 |  
292 | +  // MSLS
293 | +  size_t custom_level_size_count;
294 | +  const size_t* custom_level_sizes;
295 | +  bool use_leveldb_table_selection;
296 | +
297 |    // Derived options
298 |    // Per-level target file size.
299 |    std::vector<uint64_t> max_file_size;
300 | diff --git a/util/options.cc b/util/options.cc
301 | index 7f3bf75..d8ff0dd 100644
302 | --- a/util/options.cc
303 | +++ b/util/options.cc
304 | @@ -250,7 +250,10 @@ DBOptions::DBOptions()
305 |        enable_thread_tracking(false),
306 |        delayed_write_rate(1024U * 1024U),
307 |        skip_stats_update_on_db_open(false),
308 | -      wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords) {
309 | +      wal_recovery_mode(WALRecoveryMode::kTolerateCorruptedTailRecords),
310 | +      custom_level_size_count(0),
311 | +      custom_level_sizes(NULL),
312 | +      use_leveldb_table_selection(false) {
313 |  }
314 |  
315 |  DBOptions::DBOptions(const Options& options)
316 | @@ -305,7 +308,10 @@ DBOptions::DBOptions(const Options& options)
317 |        delayed_write_rate(options.delayed_write_rate),
318 |        skip_stats_update_on_db_open(options.skip_stats_update_on_db_open),
319 |        wal_recovery_mode(options.wal_recovery_mode),
320 | -      row_cache(options.row_cache) {}
321 | +      row_cache(options.row_cache),
322 | +      custom_level_size_count(options.custom_level_size_count),
323 | +      custom_level_sizes(options.custom_level_sizes),
324 | +      use_leveldb_table_selection(options.use_leveldb_table_selection) {}
325 |  
326 |  static const char* const access_hints[] = {
327 |    "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED"
328 | 


--------------------------------------------------------------------------------
/leveldb.cpp:
--------------------------------------------------------------------------------
  1 | #include "leveldb.h"
  2 | #include "util.h"
  3 | #include <fstream>
  4 | #include <sstream>
  5 | #include <iterator>
  6 | #include <string>
  7 | 
  8 | // #define REMEMBER_NEXT_FIRST_KEY
  9 | 
 10 | LevelDB::LevelDB(const LevelDBParams& params, std::vector<Stat>& stats)
 11 |     : params_(params), stats_(stats) {
 12 |   log_bytes_ = 0;
 13 |   // for log and level-0 that do not use compact()
 14 |   for (auto i = stats_.size(); i < 2; i++) stats_.push_back(Stat());
 15 | 
 16 |   levels_.push_back(sstables_t());
 17 | 
 18 |   level_bytes_.push_back(0);
 19 |   level_bytes_threshold_.push_back(
 20 |       static_cast<uint64_t>(-1));  // level-0 can accept any SSTable size
 21 | 
 22 |   if (params_.compaction_mode == LevelDBCompactionMode::kLinear)
 23 |     level_next_compaction_key_.push_back(LevelDBKeyMax);
 24 |   else if (params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst)
 25 |     level_next_compaction_key_.push_back(LevelDBKeyMin);
 26 | 
 27 |   inserts_ = 0;
 28 |   level_overflows_.push_back(0);
 29 |   level_compactions_.push_back(0);
 30 |   level_overlapping_sstables_.push_back(0);
 31 |   level_overlapping_sstables_false_.push_back(0);
 32 |   level_sweeps_.push_back(0);
 33 | 
 34 |   next_version_ = 0;
 35 | }
 36 | 
 37 | LevelDB::~LevelDB() {
 38 |   for (std::size_t level = 0; level < levels_.size(); level++)
 39 |     for (auto& sstable : levels_[level]) delete sstable;
 40 | }
 41 | 
 42 | void LevelDB::print_status() const {
 43 |   printf("log: %zu items, %lu bytes\n", log_.size(), log_bytes_);
 44 |   for (std::size_t i = 0; i < levels_.size(); i++) {
 45 |     double overlaps = 0.;
 46 |     double overlaps_false = 0.;
 47 |     if (level_compactions_[i] != 0) {
 48 |       overlaps = level_overlapping_sstables_[i] /
 49 |                  static_cast<double>(level_compactions_[i]);
 50 |       overlaps_false = level_overlapping_sstables_false_[i] /
 51 |                        static_cast<double>(level_compactions_[i]);
 52 |     }
 53 |     uint64_t interval = 0;
 54 |     if (level_sweeps_[i] > 0) interval = inserts_ / level_sweeps_[i];
 55 |     printf(
 56 |         "level-%zu: %5zu tables, %14lu bytes, %6lu overflows, %6lu "
 57 |         "compactions, %5.2lf avg overlaps (%.2lf false), %4lu sweeps "
 58 |         "(interval=%8lu)\n",
 59 |         i, levels_[i].size(), level_bytes_[i], level_overflows_[i],
 60 |         level_compactions_[i], overlaps, overlaps_false, level_sweeps_[i],
 61 |         interval);
 62 |   }
 63 | }
 64 | 
 65 | void LevelDB::dump_state(FILE* fp) const {
 66 |   // XXX: Memtable is not dumped now.
 67 |   fprintf(fp, "next_version:%lu\n", next_version_);
 68 | 
 69 |   fprintf(fp, "log:\n");
 70 |   dump_state(fp, log_);
 71 | 
 72 |   fprintf(fp, "levels:\n");
 73 |   for (std::size_t level = 0; level < levels_.size(); level++) {
 74 |     auto& sstables = levels_[level];
 75 |     fprintf(fp, "level:\n");
 76 |     for (std::size_t i = 0; i < sstables.size(); i++) {
 77 |       fprintf(fp, "sstable:\n");
 78 |       dump_state(fp, *sstables[i]);
 79 |     }
 80 |   }
 81 | }
 82 | 
 83 | void LevelDB::dump_state(FILE* fp, const sstable_t& l) {
 84 |   for (std::size_t i = 0; i < l.size(); i++) dump_state(fp, l[i]);
 85 | }
 86 | 
 87 | void LevelDB::dump_state(FILE* fp, const LevelDBItem& item) {
 88 | #ifdef LEVELDB_TRACK_VERSION
 89 |   fprintf(fp, "item:%u,%lu,%u,%s\n", item.key, item.version,
 90 |           item.size & LevelDBItemSizeMask,
 91 |           item.size == LevelDBItemDeletion ? "T" : "F");
 92 | #else
 93 |   fprintf(fp, "item:%u,0,%u,%s\n", item.key, item.size & LevelDBItemSizeMask,
 94 |           item.size == LevelDBItemDeletion ? "T" : "F");
 95 | #endif
 96 | }
 97 | 
 98 | void LevelDB::put(LevelDBKey key, uint32_t item_size) {
 99 | #ifdef LEVELDB_TRACK_VERSION
100 |   LevelDBItem item{key, item_size, next_version_++};
101 | #else
102 |   LevelDBItem item{key, item_size};
103 | #endif
104 |   inserts_++;
105 |   append_to_log(item);
106 | }
107 | 
108 | void LevelDB::del(LevelDBKey key) {
109 | #ifdef LEVELDB_TRACK_VERSION
110 |   LevelDBItem item{key, LevelDBItemDeletion, next_version_++};
111 | #else
112 |   LevelDBItem item{key, LevelDBItemDeletion};
113 | #endif
114 |   append_to_log(item);
115 | }
116 | 
117 | uint64_t LevelDB::get(LevelDBKey key) {
118 |   // TODO: Implement
119 |   (void)key;
120 |   return 0;
121 | }
122 | 
123 | void LevelDB::force_compact() {
124 |   flush_log();
125 | 
126 |   for (std::size_t level = 0; level < levels_.size() - 1; level++) {
127 |     std::vector<std::vector<std::size_t>> sstable_indices;
128 |     sstable_indices.push_back(std::vector<std::size_t>());
129 |     sstable_indices.back().push_back(0);
130 |     while (levels_[level].size() > 0) {
131 |       compact(level, sstable_indices);
132 |     }
133 |   }
134 | }
135 | 
136 | void LevelDB::append_to_log(const LevelDBItem& item) {
137 |   log_.push_back(item);
138 | 
139 |   // Update statistics.
140 |   auto new_log_bytes = log_bytes_ + item.size;
141 |   // auto log_bytes_d = log_bytes_ / 4096;
142 |   // auto new_log_bytes_d = new_log_bytes / 4096;
143 |   // if (log_bytes_d != new_log_bytes_d) {
144 |   //     // New blocks are written.
145 |   //     stat_.write((new_log_bytes_d - log_bytes_d) * 4096);
146 |   // }
147 |   stats_[0].write(item.size);
148 |   log_bytes_ = new_log_bytes;
149 | 
150 |   if (log_bytes_ > params_.log_size_threshold) flush_log();
151 | }
152 | 
153 | void LevelDB::flush_log() {
154 |   if (log_.size() == 0) return;
155 | 
156 |   // Simplified for simulation; a new SSTable is created from the memtable,
157 |   // causing no disk read.
158 |   sort_items(log_);
159 |   levels_t sstable_runs;
160 |   sstable_runs.push_back(sstables_t());
161 |   sstable_runs.back().push_back(&log_);
162 |   merge_sstables(sstable_runs, 0);
163 |   delete_log();
164 | 
165 |   // TODO: LevelDB computes the score of each level: [current table count /
166 |   // compaction trigger] (for level = 0) or [current level byte size / max level
167 |   // byte size] (for level >= 1).
168 |   //       It picks a level of the highest score in VersionSet::Finalize()
169 |   //       (db/version_set.cc).
170 |   //       Our checking is fine because compaction here is done synchronously
171 |   //       and lower levels tend to get a higher score until being compacted.
172 |   for (std::size_t level = 0; level < levels_.size(); level++)
173 |     check_compaction(level);
174 | }
175 | 
176 | void LevelDB::delete_log() {
177 |   // stat_.del(log_bytes_ / 4096 * 4096);
178 |   stats_[0].del(log_bytes_);
179 |   log_.clear();
180 |   log_bytes_ = 0;
181 | }
182 | 
183 | struct _LevelDBKeyComparer {
184 |   bool operator()(const LevelDBItem& a, const LevelDBItem& b) const {
185 |     return a.key < b.key;
186 |   }
187 | };
188 | 
189 | void LevelDB::sort_items(sstable_t& items) {
190 |   std::stable_sort(items.begin(), items.end(), _LevelDBKeyComparer());
191 | }
192 | 
193 | struct _LevelDBSSTableComparer {
194 |   LevelDB::sstable_t** sstables;
195 |   std::size_t* sstables_pos;
196 | 
197 |   bool operator()(const std::size_t& a, const std::size_t& b) const {
198 |     auto& item_a = (*sstables[a])[sstables_pos[a]];
199 |     auto& item_b = (*sstables[b])[sstables_pos[b]];
200 |     // Since std::make_heap makes a max-heap, we use a comparator with the
201 |     // opposite result.
202 |     if (item_a.key > item_b.key)
203 |       return true;
204 |     else if (item_a.key == item_b.key && a > b)
205 |       return true;
206 |     return false;
207 |   }
208 | };
209 | 
210 | void LevelDB::merge_sstables(const levels_t& sstable_runs, std::size_t level) {
211 |   // The current SSTable in each run.
212 |   std::size_t sstables_idx[sstable_runs.size()];
213 |   sstable_t* sstables[sstable_runs.size()];
214 | 
215 |   // The current item in each run's current SSTable.
216 |   std::size_t sstables_pos[sstable_runs.size()];
217 | 
218 |   for (std::size_t i = 0; i < sstable_runs.size(); i++) {
219 |     assert(sstable_runs[i].size() != 0);
220 |     sstables_idx[i] = 0;
221 |     sstables[i] = sstable_runs[i][sstables_idx[i]];
222 |     sstables_pos[i] = 0;
223 |   }
224 | 
225 |   // Initialize push.
226 |   push_state state;
227 |   push_init(state, level);
228 | 
229 |   // Initialize a heap.
230 |   std::vector<std::size_t> heap;
231 |   _LevelDBSSTableComparer comp{sstables, sstables_pos};
232 |   sequence(sstable_runs.size(), heap);
233 |   std::make_heap(heap.begin(), heap.end(), comp);
234 | 
235 |   while (heap.size() != 0) {
236 |     // Get the smallest key's SSTable index.
237 |     auto i = heap.front();
238 |     std::pop_heap(heap.begin(), heap.end(), comp);
239 |     heap.pop_back();
240 | 
241 |     // Discover how many keys we can take from this SSTable.
242 |     sstable_t* sstable = sstables[i];
243 |     std::size_t size = sstable->size();
244 | 
245 |     std::size_t start = sstables_pos[i];
246 |     std::size_t end;
247 |     if (heap.size() == 0)
248 |       // No other SSTables; we can take the remaining items in this SSTable.
249 |       end = size;
250 |     else {
251 |       // Get the next smallest key's SSTable index (besides i's).
252 |       auto j = heap.front();
253 |       LevelDBKey next_possible_key = (*sstables[j])[sstables_pos[j]].key;
254 | 
255 |       end = start + 1;
256 |       while (end < size && (*sstable)[end].key < next_possible_key) end++;
257 |     }
258 | 
259 |     push_items(state, *sstable, start, end);
260 | 
261 |     if (end < size) {
262 |       // More items in this SSTable.
263 |       sstables_pos[i] = end;
264 | 
265 |       heap.push_back(i);
266 |       std::push_heap(heap.begin(), heap.end(), comp);
267 |     } else {
268 |       // No more items in this SSTable.  Select the next SSTable in the same
269 |       // run.
270 |       sstables_idx[i]++;
271 |       if (sstables_idx[i] < sstable_runs[i].size()) {
272 |         sstables[i] = sstable_runs[i][sstables_idx[i]];
273 |         sstables_pos[i] = 0;
274 | 
275 |         heap.push_back(i);
276 |         std::push_heap(heap.begin(), heap.end(), comp);
277 |       } else {
278 |         // all SSTables in the same run have been consumed.
279 |       }
280 |     }
281 |   }
282 | 
283 |   push_flush(state);
284 | }
285 | 
286 | void LevelDB::check_compaction(std::size_t level) {
287 |   if (level == 0) {
288 |     // Compact if we have too many level-0 SSTables.
289 |     if (levels_[0].size() >= params_.level0_sstable_count_threshold) {
290 |       level_overflows_[0]++;
291 |       level_sweeps_[0]++;
292 |       std::vector<std::vector<std::size_t>> sstable_indices;
293 |       for (std::size_t i = 0; i < levels_[0].size(); i++) {
294 |         sstable_indices.push_back(std::vector<std::size_t>());
295 |         sstable_indices.back().push_back(i);
296 |       }
297 |       compact(0, sstable_indices);
298 |       assert(levels_[0].size() == 0);
299 |     }
300 |   } else {
301 |     // Compact if we have too much data in this level.
302 |     if (level_bytes_[level] > level_bytes_threshold_[level]) {
303 |       level_overflows_[level]++;
304 |       std::vector<std::vector<std::size_t>> sstable_indices;
305 |       sstable_indices.push_back(std::vector<std::size_t>());
306 | 
307 |       while (level_bytes_[level] > level_bytes_threshold_[level]) {
308 |         sstable_indices.back().clear();
309 | 
310 |         if (params_.compaction_mode == LevelDBCompactionMode::kLinear ||
311 |             params_.compaction_mode ==
312 |                 LevelDBCompactionMode::kLinearNextFirst) {
313 |           // Find the next table to compact.
314 |           auto& level_tables = levels_[level];
315 |           std::size_t count = level_tables.size();
316 |           std::size_t i;
317 |           for (i = 0; i < count; i++) {
318 |             auto& sstable = *level_tables[i];
319 | 
320 |             if (params_.compaction_mode == LevelDBCompactionMode::kLinear) {
321 |               if (sstable.front().key > level_next_compaction_key_[level])
322 |                 break;
323 |             } else if (params_.compaction_mode ==
324 |                        LevelDBCompactionMode::kLinearNextFirst) {
325 |               if (sstable.front().key >= level_next_compaction_key_[level])
326 |                 break;
327 |             }
328 |           }
329 |           if (i == count) {
330 |             i = 0;
331 |             level_sweeps_[level]++;
332 |           }
333 |           if (params_.compaction_mode == LevelDBCompactionMode::kLinear) {
334 |             level_next_compaction_key_[level] = level_tables[i]->back().key;
335 |           } else if (params_.compaction_mode ==
336 |                      LevelDBCompactionMode::kLinearNextFirst) {
337 |             if (i < count - 1)
338 |               level_next_compaction_key_[level] =
339 |                   level_tables[i + 1]->front().key;
340 |             else
341 |               level_next_compaction_key_[level] = LevelDBKeyMax;
342 |           }
343 | 
344 |           sstable_indices.back().push_back(i);
345 |         } else if (params_.compaction_mode ==
346 |                    LevelDBCompactionMode::kMostNarrow) {
347 |           auto& level_tables = levels_[level];
348 |           std::size_t count = level_tables.size();
349 | 
350 |           // TODO: This is quite slow -- O(N).  We may probably want to make it
351 |           // O(logN) with a priority queue.
352 |           std::size_t selected = count;
353 |           LevelDBKey min_width = 0;
354 |           for (std::size_t i = 0; i < count; i++) {
355 |             auto& sstable = *level_tables[i];
356 |             LevelDBKey width = sstable.back().key - sstable.front().key;
357 |             if (selected == count || min_width > width) {
358 |               min_width = width;
359 |               selected = i;
360 |             }
361 |           }
362 |           assert(selected != count);
363 |           sstable_indices.back().push_back(selected);
364 |         } else if (params_.compaction_mode ==
365 |                    LevelDBCompactionMode::kLeastOverlap) {
366 |           auto& level_tables = levels_[level];
367 |           std::size_t count = level_tables.size();
368 | 
369 |           if (level < levels_.size() - 1) {
370 |             // TODO: This is quite slow -- O(N).  We may probably want to make
371 |             // it O(logN) with some magic (this is complicated because overlaps
372 |             // change as we compact).
373 |             auto& level_tables_next = levels_[level + 1];
374 |             std::size_t selected = count;
375 |             std::size_t min_overlap = 0;
376 |             std::size_t sstable_idx_start = 0;
377 |             std::size_t sstable_idx_end = 0;
378 |             for (std::size_t i = 0; i < count; i++) {
379 |               auto& sstable = *level_tables[i];
380 |               if (sstable_idx_end > 0) sstable_idx_start = sstable_idx_end - 1;
381 |               while (sstable_idx_start < level_tables_next.size() &&
382 |                      level_tables_next[sstable_idx_start]->back().key <
383 |                          sstable.front().key)
384 |                 sstable_idx_start++;
385 |               sstable_idx_end = sstable_idx_start;
386 |               while (sstable_idx_end < level_tables_next.size() &&
387 |                      level_tables_next[sstable_idx_end]->front().key <
388 |                          sstable.back().key)
389 |                 sstable_idx_end++;
390 | 
391 |               std::size_t overlap = sstable_idx_end - sstable_idx_start;
392 |               // if (overlap != 0) {
393 |               //     printf("range: [%u,%u]\n", sstable.front().key,
394 |               //     sstable.back().key);
395 |               //     printf("overlap: %zu[%u,%u] - %zu[%u,%u]\n",
396 |               //     sstable_idx_start,
397 |               //     level_tables_next[sstable_idx_start]->front().key,
398 |               //     level_tables_next[sstable_idx_start]->back().key,
399 |               //     sstable_idx_end - 1, level_tables_next[sstable_idx_end -
400 |               //     1]->front().key, level_tables_next[sstable_idx_end -
401 |               //     1]->back().key);
402 |               // }
403 |               if (selected == count || min_overlap > overlap) {
404 |                 min_overlap = overlap;
405 |                 selected = i;
406 |               }
407 |             }
408 |             assert(selected != count);
409 |             sstable_indices.back().push_back(selected);
410 |           } else {
411 |             // We cannot use find_overlapping_tables() if the next level is not
412 |             // created yet.
413 |             sstable_indices.back().push_back(0);
414 |           }
415 |         } else if (params_.compaction_mode ==
416 |                    LevelDBCompactionMode::kLargestRatio) {
417 |           auto& level_tables = levels_[level];
418 |           std::size_t count = level_tables.size();
419 | 
420 |           if (level < levels_.size() - 1) {
421 |             // TODO: This is quite slow -- O(N).  We may probably want to make
422 |             // it O(logN) with some magic (this is complicated because overlaps
423 |             // change as we compact).
424 |             auto& level_tables_next = levels_[level + 1];
425 |             std::size_t selected = count;
426 |             double max_ratio = 0.;
427 |             std::size_t sstable_idx_start = 0;
428 |             std::size_t sstable_idx_end = 0;
429 |             for (std::size_t i = 0; i < count; i++) {
430 |               auto& sstable = *level_tables[i];
431 |               if (sstable_idx_end > 0) sstable_idx_start = sstable_idx_end - 1;
432 |               while (sstable_idx_start < level_tables_next.size() &&
433 |                      level_tables_next[sstable_idx_start]->back().key <
434 |                          sstable.front().key)
435 |                 sstable_idx_start++;
436 |               sstable_idx_end = sstable_idx_start;
437 |               while (sstable_idx_end < level_tables_next.size() &&
438 |                      level_tables_next[sstable_idx_end]->front().key <
439 |                          sstable.back().key)
440 |                 sstable_idx_end++;
441 | 
442 |               // TODO: Use LevelDBItem::size instead of the item count.
443 |               std::size_t s = 0;
444 |               for (std::size_t j = sstable_idx_start; j < sstable_idx_end; j++)
445 |                 s += level_tables_next[j]->size();
446 |               // Make division cleaner.
447 |               if (s == 0) s = 1;
448 | 
449 |               double ratio =
450 |                   static_cast<double>(sstable.size()) / static_cast<double>(s);
451 |               if (selected == count || max_ratio < ratio) {
452 |                 max_ratio = ratio;
453 |                 selected = i;
454 |               }
455 |             }
456 |             assert(selected != count);
457 |             sstable_indices.back().push_back(selected);
458 |           } else {
459 |             // We cannot use find_overlapping_tables() if the next level is not
460 |             // created yet.
461 |             sstable_indices.back().push_back(0);
462 |           }
463 |         } else if (params_.compaction_mode ==
464 |                    LevelDBCompactionMode::kWholeLevel) {
465 |           level_sweeps_[level]++;
466 |           sequence(levels_[level].size(), sstable_indices.back());
467 |         } else
468 |           assert(false);
469 | 
470 |         compact(level, sstable_indices);
471 |       }
472 |     }
473 |   }
474 | }
475 | 
476 | void LevelDB::push_init(push_state& state, std::size_t level) {
477 |   state.level = level;
478 | 
479 |   state.pending_item = nullptr;
480 | 
481 |   state.current_sstable = nullptr;
482 | 
483 |   state.current_sstable_size = 0;
484 |   state.use_split_key = false;
485 | }
486 | 
487 | void LevelDB::push_items(push_state& state, const sstable_t& sstable,
488 |                          std::size_t start, std::size_t end) {
489 |   assert(start != end);
490 | 
491 |   bool level0 = (state.level == 0);
492 |   bool last_level = (state.level == levels_.size() - 1);
493 | 
494 |   if (state.pending_item == nullptr) {
495 |     state.pending_item = &sstable[start];
496 |     start++;
497 |   }
498 | 
499 |   while (start != end) {
500 |     bool drop_pending_item = false;
501 |     if (state.pending_item->size == LevelDBItemDeletion && last_level)
502 |       drop_pending_item = true;
503 |     else if (state.pending_item->key == sstable[start].key) {
504 | #ifdef LEVELDB_TRACK_VERSION
505 |       if (state.pending_item->version >= sstable[start].version)
506 |         printf("pv %lu cv %lu level %zu start %zu end %zu\n",
507 |                state.pending_item->version, sstable[start].version, state.level,
508 |                start, end);
509 |       assert(state.pending_item->version < sstable[start].version);
510 | #endif
511 |       drop_pending_item = true;
512 |     }
513 | 
514 |     if (!drop_pending_item) {
515 |       if (state.current_sstable == nullptr)
516 |         state.current_sstable = new sstable_t();
517 | 
518 |       state.current_sstable->push_back(*state.pending_item);
519 |       state.current_sstable_size +=
520 |           state.pending_item->size & LevelDBItemSizeMask;
521 | 
522 |       if (state.current_sstable->size() == 1 && !params_.use_custom_sizes) {
523 |         // Determine the split key; the current SSTable should not contain this
524 |         // split key, otherwise it will overlap with too many SSTables in the
525 |         // next level.
526 |         if (level0 || last_level)
527 |           state.use_split_key = false;
528 |         else {
529 |           auto& level_tables = levels_[state.level + 1];
530 |           std::size_t count = level_tables.size();
531 | 
532 |           std::size_t i;
533 |           // Choose the first SSTable in the next level that can potentially
534 |           // overlap.
535 |           // TODO: Use binary search and memorization from previous run.
536 |           for (i = 0; i < count; i++) {
537 |             auto& sstable = *level_tables[i];
538 |             if (state.pending_item->key <= sstable.back().key) break;
539 |           }
540 |           // XXX: This follows LevelDB's impl.html, but the actual
541 |           // implementation uses bytes instead of the number of SSTables.
542 |           //      See kMaxGrandParentOverlapBytes (db/version_set.cc).
543 |           std::size_t end =
544 |               std::min(i + params_.sstable_overlap_threshold, count);
545 |           if (end < count) {
546 |             // Remember the split key.
547 |             state.use_split_key = true;
548 |             state.split_key = level_tables[end]->front().key;
549 |           } else {
550 |             // Splitting by key will never happen because there will be few
551 |             // overlapping tables.
552 |             state.use_split_key = false;
553 |           }
554 |         }
555 |       }
556 |     }
557 | 
558 |     state.pending_item = &sstable[start];
559 | 
560 |     bool need_new_sstable = false;
561 |     if (state.use_split_key && state.pending_item->key >= state.split_key)
562 |       need_new_sstable = true;
563 |     else {
564 |       uint64_t item_size = state.pending_item->size & LevelDBItemSizeMask;
565 |       // Level-0 generates only one SSTable per merge. Otherwise, we obey the
566 |       // maximum SSTable size.
567 |       if (!level0 &&
568 |           state.current_sstable_size + item_size >
569 |               params_.sstable_size_threshold)
570 |         need_new_sstable = true;
571 |     }
572 | 
573 |     if (need_new_sstable) {
574 |       if (state.current_sstable != nullptr) {
575 |         state.current_sstable->shrink_to_fit();
576 |         state.completed_sstables.push_back(state.current_sstable);
577 |         level_bytes_[state.level] += state.current_sstable_size;
578 |         stats_[1 + state.level].write(state.current_sstable_size);
579 | 
580 |         state.current_sstable = nullptr;
581 | 
582 |         state.current_sstable_size = 0;
583 |         state.use_split_key = false;
584 |       }
585 |     }
586 | 
587 |     start++;
588 |   }
589 | }
590 | 
591 | void LevelDB::push_flush(push_state& state) {
592 |   // printf("push_flush level %zu\n", state.level);
593 |   bool level0 = (state.level == 0);
594 |   bool last_level = (state.level == levels_.size() - 1);
595 | 
596 |   // Flush the pending item.
597 |   if (state.pending_item != nullptr) {
598 |     bool drop_pending_item = false;
599 |     if (state.pending_item->size == LevelDBItemDeletion && last_level)
600 |       drop_pending_item = true;
601 | 
602 |     if (!drop_pending_item) {
603 |       if (state.current_sstable == nullptr) {
604 |         state.current_sstable = new sstable_t();
605 |         state.current_sstable_size = 0;
606 |       }
607 | 
608 |       state.current_sstable->push_back(*state.pending_item);
609 |       state.current_sstable_size +=
610 |           state.pending_item->size & LevelDBItemSizeMask;
611 |     }
612 |   }
613 | 
614 |   // Flush the current SSTable.
615 |   if (state.current_sstable != nullptr) {
616 |     state.current_sstable->shrink_to_fit();
617 |     state.completed_sstables.push_back(state.current_sstable);
618 |     level_bytes_[state.level] += state.current_sstable_size;
619 |     stats_[1 + state.level].write(state.current_sstable_size);
620 |   }
621 | 
622 |   // Insert new SSTables into the level.
623 |   if (level0)
624 |     levels_[0].insert(levels_[0].end(), state.completed_sstables.begin(),
625 |                       state.completed_sstables.end());
626 |   else {
627 |     auto& level_tables = levels_[state.level];
628 |     std::size_t count = level_tables.size();
629 | 
630 |     std::size_t i;
631 |     for (i = 0; i < count; i++) {
632 |       auto& sstable = *level_tables[i];
633 |       if (state.pending_item->key <= sstable.back().key) break;
634 |     }
635 | 
636 |     level_tables.insert(
637 |         std::next(level_tables.begin(), static_cast<std::ptrdiff_t>(i)),
638 |         state.completed_sstables.begin(), state.completed_sstables.end());
639 |   }
640 | }
641 | 
642 | void LevelDB::find_overlapping_tables(
643 |     std::size_t level, const LevelDBKey& first, const LevelDBKey& last,
644 |     std::vector<std::size_t>& out_sstable_indices) {
645 |   assert(level >= 1);
646 |   assert(level < levels_.size());
647 | 
648 |   // TODO: Use binary search to reduce the search range.
649 | 
650 |   auto& level_tables = levels_[level];
651 |   std::size_t count = level_tables.size();
652 |   out_sstable_indices.clear();
653 | 
654 |   for (std::size_t i = 0; i < count; i++) {
655 |     auto& sstable = *level_tables[i];
656 |     if (!(last < sstable.front().key || sstable.back().key < first))
657 |       out_sstable_indices.push_back(i);
658 |   }
659 | }
660 | 
661 | void LevelDB::compact(
662 |     std::size_t level,
663 |     const std::vector<std::vector<std::size_t>>& sstable_indices) {
664 |   // printf("compact level %zu\n", level);
665 | 
666 |   // Ensure we have all necessary data structures for the next level.
667 |   if (levels_.size() <= level + 1) {
668 |     levels_.push_back(sstables_t());
669 |     level_bytes_.push_back(0);
670 | 
671 |     for (auto i = stats_.size(); i < 2 + level + 1; i++)
672 |       stats_.push_back(Stat());
673 |     level_overflows_.push_back(0);
674 |     level_compactions_.push_back(0);
675 |     level_overlapping_sstables_.push_back(0);
676 |     level_overlapping_sstables_false_.push_back(0);
677 |     level_sweeps_.push_back(0);
678 | 
679 |     // E.g., level_size for level-1  = params_.level_size_ratio
680 |     // E.g., level_size for level-2  = params_.level_size_ratio *
681 |     // params_.growth_factor
682 |     uint64_t level_size = params_.level_size_ratio;
683 |     for (std::size_t i = 1; i < level + 1; i++)
684 |       level_size *= params_.growth_factor;
685 | 
686 |     if (params_.use_custom_sizes) {
687 |       level_size = 0;
688 |       std::ifstream ifs("output_sensitivity.txt");
689 |       while (!ifs.eof()) {
690 |         std::string line;
691 |         std::getline(ifs, line);
692 | 
693 |         std::istringstream iss(line);
694 |         std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
695 |                                         std::istream_iterator<std::string>{}};
696 | 
697 |         if (tokens.size() < 4) continue;
698 |         if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" &&
699 |             tokens[0] != "sensitivity_log_size_leveldb_best_sizes")
700 |           continue;
701 |         if (static_cast<uint64_t>(atol(tokens[1].c_str())) !=
702 |             params_.hint_num_unique_keys)
703 |           continue;
704 |         if (atof(tokens[2].c_str()) != params_.hint_theta) continue;
705 |         if (static_cast<uint64_t>(atol(tokens[3].c_str())) !=
706 |             params_.log_size_threshold)
707 |           continue;
708 | 
709 |         assert(level < tokens.size() - 5);
710 |         // Assume the item size of 1000 bytes.
711 |         level_size = static_cast<uint64_t>(
712 |             atof(tokens[5 + level].c_str()) * 1000. + 0.5);
713 |         break;
714 |       }
715 |       assert(level_size != 0);
716 |     }
717 |     printf("level-%zu: max size %lu bytes\n", level + 1, level_size);
718 |     level_bytes_threshold_.push_back(level_size);
719 | 
720 |     if (params_.compaction_mode == LevelDBCompactionMode::kLinear ||
721 |         params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst)
722 |       level_next_compaction_key_.push_back(LevelDBKeyMax);
723 |   }
724 | 
725 |   // Discover SSTables to merge.
726 |   std::vector<std::size_t> sstable_indices_current;
727 |   for (auto& sstable_indices_sub : sstable_indices)
728 |     for (auto i : sstable_indices_sub) sstable_indices_current.push_back(i);
729 | 
730 |   std::vector<std::size_t> sstable_indices_next;
731 |   LevelDBKey min_key;
732 |   LevelDBKey max_key;
733 |   if (params_.compaction_mode == LevelDBCompactionMode::kLinear ||
734 |       params_.compaction_mode == LevelDBCompactionMode::kLinearNextFirst ||
735 |       params_.compaction_mode == LevelDBCompactionMode::kMostNarrow ||
736 |       params_.compaction_mode == LevelDBCompactionMode::kLeastOverlap ||
737 |       params_.compaction_mode == LevelDBCompactionMode::kLargestRatio) {
738 |     min_key = LevelDBKeyMax;
739 |     max_key = LevelDBKeyMin;
740 |     for (auto i : sstable_indices_current) {
741 |       min_key = std::min(min_key, levels_[level][i]->front().key);
742 |       max_key = std::max(max_key, levels_[level][i]->back().key);
743 |     }
744 |     find_overlapping_tables(level + 1, min_key, max_key, sstable_indices_next);
745 |   } else if (params_.compaction_mode == LevelDBCompactionMode::kWholeLevel) {
746 |     min_key = LevelDBKeyMin;
747 |     max_key = LevelDBKeyMax;
748 |     sequence(levels_[level + 1].size(), sstable_indices_next);
749 |   } else
750 |     assert(false);
751 | 
752 |   // level_compactions_[level] += sstable_indices_current.size();
753 |   // level_overlapping_sstables_[level] += sstable_indices_next.size();
754 | 
755 |   // level_compactions_[level]++;
756 |   // level_overlapping_sstables_[level] +=
757 |   // static_cast<double>(sstable_indices_next.size()) /
758 |   // static_cast<double>(sstable_indices_current.size());
759 | 
760 |   // TODO: Use LevelDBItem::size instead of the item count.
761 |   uint64_t s0 = 0;
762 |   uint64_t s1 = 0;
763 |   uint64_t s1_false = 0;
764 |   for (auto i : sstable_indices_current) s0 += levels_[level][i]->size();
765 |   for (auto i : sstable_indices_next) s1 += levels_[level + 1][i]->size();
766 |   for (auto i : sstable_indices_next)
767 |     for (auto& item : *levels_[level + 1][i])
768 |       if (item.key < min_key || item.key > max_key) s1_false++;
769 |   level_compactions_[level]++;
770 |   level_overlapping_sstables_[level] +=
771 |       static_cast<double>(s1) / static_cast<double>(s0);
772 |   level_overlapping_sstables_false_[level] +=
773 |       static_cast<double>(s1_false) / static_cast<double>(s0);
774 | 
775 |   // printf("overlapping\n");
776 |   // printf("  level %zu (%zu):", level, levels_[level].size());
777 |   // for (auto i : sstable_indices_current)
778 |   // 	printf(" %zu", i);
779 |   // printf("\n  level %zu (%zu):", level + 1, levels_[level + 1].size());
780 |   // for (auto i : sstable_indices_next)
781 |   // 	printf(" %zu", i);
782 |   // printf("\n");
783 | 
784 |   levels_t source_sstables;
785 |   if (sstable_indices_next.size() != 0) {
786 |     source_sstables.push_back(sstables_t());
787 |     for (auto i : sstable_indices_next) {
788 |       source_sstables.back().push_back(levels_[level + 1][i]);
789 | 
790 |       std::uint64_t sstable_size = 0;
791 |       for (auto& item : *source_sstables.back().back())
792 |         sstable_size += item.size & LevelDBItemSizeMask;
793 |       level_bytes_[level + 1] -= sstable_size;
794 |       stats_[1 + level + 1].read(sstable_size);
795 |       stats_[1 + level + 1].del(sstable_size);
796 |     }
797 |   }
798 |   for (auto& sstable_indices_sub : sstable_indices) {
799 |     source_sstables.push_back(sstables_t());
800 |     for (auto i : sstable_indices_sub) {
801 |       source_sstables.back().push_back(levels_[level][i]);
802 | 
803 |       std::uint64_t sstable_size = 0;
804 |       for (auto& item : *source_sstables.back().back())
805 |         sstable_size += item.size & LevelDBItemSizeMask;
806 |       level_bytes_[level] -= sstable_size;
807 |       // We are reading from level, but let level+1 have the numbers to follow
808 |       // the convention used in the analysis
809 |       // stats_[1 + level].read(sstable_size);
810 |       stats_[1 + level + 1].read(sstable_size);
811 |       stats_[1 + level].del(sstable_size);
812 |     }
813 |   }
814 | 
815 |   {
816 |     std::sort(sstable_indices_current.begin(), sstable_indices_current.end());
817 |     std::reverse(sstable_indices_current.begin(),
818 |                  sstable_indices_current.end());
819 |     for (auto i : sstable_indices_current) remove_sstable(level, i);
820 | 
821 |     std::reverse(sstable_indices_next.begin(), sstable_indices_next.end());
822 |     for (auto i : sstable_indices_next) remove_sstable(level + 1, i);
823 |   }
824 | 
825 |   merge_sstables(source_sstables, level + 1);
826 | 
827 |   // Delete old SSTables.
828 |   for (auto& sstables : source_sstables)
829 |     for (auto& sstable : sstables) delete sstable;
830 | }
831 | 
832 | LevelDB::sstable_t* LevelDB::remove_sstable(std::size_t level,
833 |                                             std::size_t idx) {
834 |   sstable_t* t = levels_[level][idx];
835 | 
836 |   for (auto j = idx; j < levels_[level].size() - 1; j++)
837 |     levels_[level][j] = levels_[level][j + 1];
838 |   levels_[level].pop_back();
839 | 
840 |   return t;
841 | }
842 | 


--------------------------------------------------------------------------------
/leveldb.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "common.h"
  4 | #include "stat.h"
  5 | #include <cstdio>
  6 | 
  7 | // #define LEVELDB_TRACK_VERSION
  8 | 
  9 | typedef uint32_t LevelDBKey;
 10 | static const uint32_t LevelDBKeyMin = 0;
 11 | static const uint32_t LevelDBKeyMax = static_cast<uint32_t>(-1);
 12 | 
 13 | enum class LevelDBCompactionMode {
 14 |   // LevelDB's default compaction; pick one SSTable and pick the next linearly.
 15 |   kLinear = 0,
 16 |   // Similar to above but remember the first key of the next available SSTable
 17 |   // instead of the last key of the compacted SSTable.
 18 |   kLinearNextFirst = 1,
 19 |   // Pick an SSTable with the narrow key range.
 20 |   kMostNarrow = 2,
 21 |   // Pick an SSTable with the least number of next-level SSTables that overlap
 22 |   // with it.
 23 |   kLeastOverlap = 3,
 24 |   // Pick an SSTable whose size ratio to next-level overlapping SSTables size
 25 |   // (potential the inverse of write amplification) is the greatest; this is
 26 |   // similar to HyperLevelDB's strategy (see VersionSet::PickCompaction() in
 27 |   // HyperLevelDB/db/version_set.cc).
 28 |   kLargestRatio = 4,
 29 |   // Always compact the whole level (like LSM-tree).
 30 |   kWholeLevel = 5,
 31 | 
 32 |   // RocksDB - Pick an SSTable whose size is the maximum (default) + 1
 33 |   // compaction thread
 34 |   kRocksDBMaxSize = 6,
 35 |   // RocksDB - Pick an SSTable in the same way as kLinear + 1 compaction thread
 36 |   kRocksDBLinear = 7,
 37 |   // RocksDB - kRocksDBMaxSize + 4 compaction threads
 38 |   kRocksDBMaxSizeMT = 8,
 39 |   // RocksDB - kRocksDBLinear + 4 compaction threads
 40 |   kRocksDBLinearMT = 9,
 41 | 
 42 |   // RocksDB - Universal Compaction
 43 |   kRocksDBUniversal = 10,
 44 | };
 45 | 
 46 | struct LevelDBParams {
 47 |   // When a log file exceeds this size, a new Level-0 SSTable is created, and a
 48 |   // new log file is created.
 49 |   uint64_t log_size_threshold;
 50 |   // When the level 0 ("young") has this many SSTables, all of them are merged
 51 |   // into the next level.
 52 |   uint64_t level0_sstable_count_threshold;
 53 |   // When an SSTable file exceeds this size, a new SSTable is created.
 54 |   uint64_t sstable_size_threshold;
 55 |   // When a level-L SSTable's key range overlaps with this many level-(L+1)
 56 |   // SSTables, a new level-L SSTable is created.
 57 |   uint64_t sstable_overlap_threshold;
 58 |   // When the level L is (growth factor)^L * (level size ratio) bytes big, an
 59 |   // level-L SSTable and all overlapping level-(L+1) SSTables are merged and
 60 |   // form new level-(L+1) SSTables.  The level-L SSTable is chosen in a
 61 |   // round-robin way.
 62 |   uint64_t growth_factor;
 63 |   // The size of level 1.
 64 |   uint64_t level_size_ratio;
 65 | 
 66 |   // The compaction mode.
 67 |   LevelDBCompactionMode compaction_mode;
 68 | 
 69 |   // Use custom level sizes.
 70 |   bool use_custom_sizes;
 71 |   // Hints used for custom_sizes
 72 |   uint64_t hint_num_unique_keys;
 73 |   double hint_theta;
 74 | 
 75 |   // Enable fsync for implementation-based tests.
 76 |   bool enable_fsync;
 77 | 
 78 |   LevelDBParams() {
 79 |     log_size_threshold =
 80 |         4 * 1048576;  // write_buffer_size (include/leveldb/options.h)
 81 |     level0_sstable_count_threshold =
 82 |         4;  // When LevelDB triggers compaction (db/dbformat.h)
 83 |     // level0_sstable_count_threshold = 8;     // When LevelDB slows down new
 84 |     // insertion
 85 |     // level0_sstable_count_threshold = 12;     // When LevelDB stops handling
 86 |     // new insertion
 87 |     sstable_size_threshold =
 88 |         2 * 1048576;  // kTargetFileSize (db/version_set.cc)
 89 |     sstable_overlap_threshold =
 90 |         10;              // kMaxGrandParentOverlapBytes (db/version_set.cc)
 91 |     growth_factor = 10;  // MaxBytesForLevel() (db/version_set.cc)
 92 |     level_size_ratio = 10 * 1048576;  // MaxBytesForLevel() (db/version_set.cc)
 93 | 
 94 |     use_custom_sizes = false;
 95 |     hint_num_unique_keys = 0;
 96 |     hint_theta = 0.;
 97 | 
 98 |     enable_fsync = false;
 99 |   }
100 | };
101 | 
102 | struct LevelDBItem {
103 |   LevelDBKey key;
104 |   uint32_t size;
105 | #ifdef LEVELDB_TRACK_VERSION
106 |   uint64_t version;
107 | #endif
108 | };
109 | 
110 | static const uint32_t LevelDBItemSizeMask = 0x7fffffffU;
111 | static const uint32_t LevelDBItemDeletion = 0x80000010U;
112 | 
113 | // A LevelDB simulation based on
114 | // https://leveldb.googlecode.com/svn/trunk/doc/impl.html
115 | class LevelDB {
116 |  public:
117 |   LevelDB(const LevelDBParams& params, std::vector<Stat>& stats);
118 |   ~LevelDB();
119 | 
120 |   // Prints the summary of the store.
121 |   void print_status() const;
122 | 
123 |   // Writes the current items in the store to the file.
124 |   void dump_state(FILE* fp) const;
125 | 
126 |   // Puts a new item in the store.
127 |   void put(LevelDBKey key, uint32_t item_size);
128 | 
129 |   // Deletes an item from the store.
130 |   void del(LevelDBKey key);
131 | 
132 |   // Gets an item from the store.
133 |   uint64_t get(LevelDBKey key);
134 | 
135 |   // Forces compaction until there is no SSTable except the last level.
136 |   void force_compact();
137 | 
138 |   typedef std::vector<LevelDBItem> sstable_t;
139 |   typedef std::vector<sstable_t*> sstables_t;
140 |   typedef std::vector<sstables_t> levels_t;
141 | 
142 |   // typedef std::vector<LevelDBItem*> item_ptr_t;
143 | 
144 |  protected:
145 |   // Adds a new item to the log.
146 |   void append_to_log(const LevelDBItem& item);
147 | 
148 |   // Flushes all in-memory data to disk.  This effectively creates new level-0
149 |   // SSTables from the Memtable.
150 |   void flush_log();
151 | 
152 |   // Deletes the log.
153 |   void delete_log();
154 | 
155 |   // Sorts items in place.
156 |   void sort_items(sstable_t& items);
157 | 
158 |   // Merges SSTables and emits SSTable in the specified level.  Items at a later
159 |   // position take precedence.
160 |   void merge_sstables(const levels_t& source_sstables, std::size_t level);
161 | 
162 |   // Check if we need new compaction.
163 |   void check_compaction(std::size_t level);
164 | 
165 |   // Pushes items to a level, creating SSTables.
166 |   struct push_state {
167 |     std::size_t level;
168 | 
169 |     const LevelDBItem* pending_item;
170 | 
171 |     sstable_t* current_sstable;
172 | 
173 |     uint64_t current_sstable_size;
174 |     bool use_split_key;
175 |     LevelDBKey split_key;
176 | 
177 |     sstables_t completed_sstables;
178 |   };
179 |   void push_init(push_state& state, std::size_t level);
180 |   void push_items(push_state& state, const sstable_t& sstable,
181 |                   std::size_t start, std::size_t end);
182 |   void push_flush(push_state& state);
183 | 
184 |   // Finds all overlapping SSTables in the level.
185 |   void find_overlapping_tables(std::size_t level, const LevelDBKey& first,
186 |                                const LevelDBKey& last,
187 |                                std::vector<std::size_t>& out_sstable_indices);
188 | 
189 |   // Performs compaction with SSTables from the level and all over overlapping
190 |   // SSTables in the next level.
191 |   void compact(std::size_t level,
192 |                const std::vector<std::vector<std::size_t>>& sstable_indices);
193 | 
194 |   // // Removes an SSTable from the level.  This does not release the memory
195 |   // used by the SSTable.
196 |   sstable_t* remove_sstable(std::size_t level, std::size_t idx);
197 | 
198 |   // Writes an item list to the file.
199 |   static void dump_state(FILE* fp, const sstable_t& l);
200 |   static void dump_state(FILE* fp, const LevelDBItem& item);
201 | 
202 |  private:
203 |   LevelDBParams params_;
204 |   std::vector<Stat>& stats_;
205 |   sstable_t log_;
206 |   uint64_t log_bytes_;
207 |   levels_t levels_;
208 |   std::vector<uint64_t> level_bytes_;
209 |   std::vector<uint64_t> level_bytes_threshold_;
210 |   // for LevelDBCompactionMode::kLinear and
211 |   // LevelDBCompactionMode::kLinearNextFirst
212 |   std::vector<LevelDBKey> level_next_compaction_key_;
213 |   uint64_t inserts_;
214 |   std::vector<uint64_t> level_overflows_;
215 |   std::vector<uint64_t> level_compactions_;
216 |   std::vector<double> level_overlapping_sstables_;
217 |   std::vector<double> level_overlapping_sstables_false_;
218 |   std::vector<uint64_t> level_sweeps_;
219 |   uint64_t next_version_;
220 | };
221 | 


--------------------------------------------------------------------------------
/leveldb_impl.cpp:
--------------------------------------------------------------------------------
  1 | #include "leveldb_impl.h"
  2 | #include "leveldb/db.h"
  3 | #include "leveldb/env.h"
  4 | #include <stdlib.h>
  5 | #include <sys/stat.h>
  6 | #include <fstream>
  7 | #include <sstream>
  8 | #include <iterator>
  9 | #include <string>
 10 | 
 11 | #define OVERRIDE override
 12 | // #define OVERRIDE
 13 | 
 14 | // A wrapper for SequentialFile that forwards the data read information to
 15 | // LevelDBImpl.
 16 | class LevelDBSequentialFile : public leveldb::SequentialFile {
 17 |  public:
 18 |   LevelDBSequentialFile(LevelDBImpl* leveldb_impl, leveldb::SequentialFile* t)
 19 |       : leveldb::SequentialFile(), leveldb_impl_(leveldb_impl), target_(t) {}
 20 | 
 21 |   virtual ~LevelDBSequentialFile() OVERRIDE { delete target_; }
 22 | 
 23 |   virtual leveldb::Status Read(size_t n, leveldb::Slice* result,
 24 |                                char* scratch) OVERRIDE {
 25 |     leveldb_impl_->Read(n);
 26 |     return target_->Read(n, result, scratch);
 27 |   }
 28 | 
 29 |   virtual leveldb::Status Skip(uint64_t n) OVERRIDE { return target_->Skip(n); }
 30 | 
 31 |  private:
 32 |   class LevelDBImpl* leveldb_impl_;
 33 |   leveldb::SequentialFile* target_;
 34 | };
 35 | 
 36 | // A wrapper for RandomAccessFile that forwards the data read information to
 37 | // LevelDBImpl.
 38 | class LevelDBRandomAccessFile : public leveldb::RandomAccessFile {
 39 |  public:
 40 |   LevelDBRandomAccessFile(LevelDBImpl* leveldb_impl,
 41 |                           leveldb::RandomAccessFile* t)
 42 |       : leveldb::RandomAccessFile(), leveldb_impl_(leveldb_impl), target_(t) {}
 43 | 
 44 |   virtual ~LevelDBRandomAccessFile() OVERRIDE { delete target_; }
 45 | 
 46 |   virtual leveldb::Status Read(uint64_t offset, size_t n,
 47 |                                leveldb::Slice* result,
 48 |                                char* scratch) const OVERRIDE {
 49 |     leveldb_impl_->Read(n);
 50 |     return target_->Read(offset, n, result, scratch);
 51 |   }
 52 | 
 53 |  private:
 54 |   class LevelDBImpl* leveldb_impl_;
 55 |   leveldb::RandomAccessFile* target_;
 56 | };
 57 | 
 58 | // A wrapper for WritableFile that forwards the data append information to
 59 | // LevelDBImpl.
 60 | class LevelDBWritableFile : public leveldb::WritableFile {
 61 |  public:
 62 |   LevelDBWritableFile(LevelDBImpl* leveldb_impl, leveldb::WritableFile* t)
 63 |       : leveldb::WritableFile(), leveldb_impl_(leveldb_impl), target_(t) {}
 64 | 
 65 |   virtual ~LevelDBWritableFile() OVERRIDE { delete target_; }
 66 | 
 67 |   virtual leveldb::Status Append(const leveldb::Slice& data) OVERRIDE {
 68 |     leveldb_impl_->Append(data.size());
 69 |     return target_->Append(data);
 70 |   }
 71 | 
 72 |   virtual leveldb::Status Close() OVERRIDE { return target_->Close(); }
 73 | 
 74 |   virtual leveldb::Status Flush() OVERRIDE { return target_->Flush(); }
 75 | 
 76 |   virtual leveldb::Status Sync() OVERRIDE {
 77 |     if (leveldb_impl_->params_.enable_fsync)
 78 |       return target_->Sync();
 79 |     else {
 80 |       // Let's ignore Sync() for faster experiments.
 81 |       return leveldb::Status::OK();
 82 |     }
 83 |   }
 84 | 
 85 |  private:
 86 |   class LevelDBImpl* leveldb_impl_;
 87 |   leveldb::WritableFile* target_;
 88 | };
 89 | 
 90 | // A wrapper for Env that forwards the file deletion information to LevelDBImpl.
 91 | class LevelDBEnv : public leveldb::EnvWrapper {
 92 |  public:
 93 |   LevelDBEnv(LevelDBImpl* leveldb_impl)
 94 |       : leveldb::EnvWrapper(leveldb::Env::Default()),
 95 |         leveldb_impl_(leveldb_impl) {}
 96 | 
 97 |   virtual ~LevelDBEnv() OVERRIDE {}
 98 | 
 99 |   virtual leveldb::Status NewSequentialFile(
100 |       const std::string& f, leveldb::SequentialFile** r) OVERRIDE {
101 |     leveldb::Status status = target()->NewSequentialFile(f, r);
102 |     if (*r != NULL) *r = new LevelDBSequentialFile(leveldb_impl_, *r);
103 |     return status;
104 |   }
105 | 
106 |   virtual leveldb::Status NewRandomAccessFile(
107 |       const std::string& f, leveldb::RandomAccessFile** r) OVERRIDE {
108 |     leveldb::Status status = target()->NewRandomAccessFile(f, r);
109 |     if (*r != NULL) *r = new LevelDBRandomAccessFile(leveldb_impl_, *r);
110 |     return status;
111 |   }
112 | 
113 |   virtual leveldb::Status NewWritableFile(const std::string& f,
114 |                                           leveldb::WritableFile** r) OVERRIDE {
115 |     leveldb::Status status = target()->NewWritableFile(f, r);
116 |     if (*r != NULL) *r = new LevelDBWritableFile(leveldb_impl_, *r);
117 |     return status;
118 |   }
119 | 
120 |   virtual leveldb::Status DeleteFile(const std::string& f) OVERRIDE {
121 |     struct stat st;
122 |     memset(&st, 0, sizeof(st));
123 |     // XXX: The file length *might* not be as large as its actual content
124 |     // because the directory metadata can be updated later than the appends.
125 |     int ret = stat(f.c_str(), &st);
126 |     if (ret == 0) leveldb_impl_->Delete(static_cast<uint64_t>(st.st_size));
127 | 
128 |     return target()->DeleteFile(f);
129 |   }
130 | 
131 |  private:
132 |   class LevelDBImpl* leveldb_impl_;
133 | };
134 | 
135 | LevelDBImpl::LevelDBImpl(const LevelDBParams& params, std::vector<Stat>& stats)
136 |     : params_(params), stats_(stats) {
137 |   stats_.push_back(Stat());
138 | 
139 |   pthread_mutex_init(&stats_mutex_, NULL);
140 |   read_ = 0;
141 |   appended_ = 0;
142 | 
143 |   // Clean up old files.
144 |   leveldb::DestroyDB("leveldb_files", leveldb::Options());
145 | 
146 |   options_ = new leveldb::Options();
147 | 
148 |   options_->create_if_missing = true;
149 | 
150 |   // Turn off Snappy.
151 |   options_->compression = leveldb::CompressionType::kNoCompression;
152 | 
153 |   // Use our Env to gather statistics.
154 |   options_->env = new LevelDBEnv(this);
155 | 
156 |   // Limit the max open file count.
157 |   options_->max_open_files = 900;
158 | 
159 |   // Configure the write buffer size.
160 |   options_->write_buffer_size = params.log_size_threshold;
161 | 
162 |   // Do not overload insert.
163 |   // These are hardcoded in leveldb/db/dbformat.h
164 |   // options_->level0_file_num_compaction_trigger = 4;
165 |   // options_->level0_slowdown_writes_trigger = 4;
166 |   // options_->level0_stop_writes_trigger = 4;
167 | 
168 |   // Use custom level sizes
169 |   if (params_.use_custom_sizes) {
170 |     std::size_t* custom_level_sizes = new std::size_t[20];
171 | 
172 |     std::ifstream ifs("output_sensitivity.txt");
173 |     while (!ifs.eof()) {
174 |       std::string line;
175 |       std::getline(ifs, line);
176 | 
177 |       std::istringstream iss(line);
178 |       std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
179 |                                       std::istream_iterator<std::string>{}};
180 | 
181 |       if (tokens.size() < 5) continue;
182 |       if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" &&
183 |           tokens[0] != "sensitivity_log_size_leveldb_best_sizes")
184 |         continue;
185 |       if (static_cast<uint64_t>(atol(tokens[1].c_str())) !=
186 |           params_.hint_num_unique_keys)
187 |         continue;
188 |       if (atof(tokens[2].c_str()) != params_.hint_theta) continue;
189 |       if (static_cast<uint64_t>(atol(tokens[3].c_str())) !=
190 |           params_.log_size_threshold)
191 |         continue;
192 | 
193 |       options_->custom_level_size_count = tokens.size() - 5 + 1;
194 | 
195 |       custom_level_sizes[0] = 0;
196 |       std::size_t level;
197 |       for (level = 1; level < options_->custom_level_size_count; level++) {
198 |         custom_level_sizes[level] = static_cast<size_t>(
199 |             atof(tokens[5 + level - 1].c_str()) * 1000. + 0.5);
200 |         printf("level-%zu: %zu\n", level, custom_level_sizes[level]);
201 |       }
202 |       // Make the last level very large and not spill.
203 |       level--;
204 |       custom_level_sizes[level] = 1000000000000000LU;
205 |       printf("level-%zu: %zu (expanded)\n", level, custom_level_sizes[level]);
206 |       printf("\n");
207 |       break;
208 |     }
209 |     assert(options_->custom_level_size_count != 0);
210 | 
211 |     options_->custom_level_sizes = custom_level_sizes;
212 |   }
213 | 
214 |   leveldb::Status status = leveldb::DB::Open(*options_, "leveldb_files", &db_);
215 |   if (!status.ok()) {
216 |     printf("%s\n", status.ToString().c_str());
217 |     assert(false);
218 |   }
219 | 
220 |   memset(value_buf_, 0, sizeof(value_buf_));
221 | }
222 | 
223 | LevelDBImpl::~LevelDBImpl() {
224 |   delete db_;
225 | 
226 |   delete options_->env;
227 |   if (params_.use_custom_sizes) delete[] options_->custom_level_sizes;
228 |   delete options_;
229 | 
230 |   pthread_mutex_destroy(&stats_mutex_);
231 | }
232 | 
233 | void LevelDBImpl::print_status() const {
234 |   // Force updating stats.
235 |   const_cast<LevelDBImpl*>(this)->Delete(0);
236 | }
237 | 
238 | void LevelDBImpl::dump_state(FILE* fp) const {
239 |   // TODO: Implement.
240 |   (void)fp;
241 | }
242 | 
243 | void LevelDBImpl::put(LevelDBKey key, uint32_t item_size) {
244 |   // LevelDB includes the full SSTable file size during calculating the level
245 |   // size;
246 |   // we consider the average space overhead per item in LevelDB so that the
247 |   // average stored size becomes similar to item_size.
248 |   const uint32_t overhead = 18;
249 | 
250 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
251 |   uint32_t value_size =
252 |       static_cast<uint32_t>(static_cast<std::size_t>(item_size) - sizeof(key)) -
253 |       overhead;
254 |   assert(value_size < sizeof(value_buf_));
255 |   leveldb::Slice s_value(value_buf_, value_size);
256 | 
257 |   leveldb::Status status = db_->Put(leveldb::WriteOptions(), s_key, s_value);
258 |   if (!status.ok()) {
259 |     printf("%s\n", status.ToString().c_str());
260 |     assert(false);
261 |   }
262 | }
263 | 
264 | void LevelDBImpl::del(LevelDBKey key) {
265 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
266 | 
267 |   leveldb::Status status = db_->Delete(leveldb::WriteOptions(), s_key);
268 |   if (!status.ok()) {
269 |     printf("%s\n", status.ToString().c_str());
270 |     assert(false);
271 |   }
272 | }
273 | 
274 | uint64_t LevelDBImpl::get(LevelDBKey key) {
275 |   leveldb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
276 |   std::string s_value;
277 |   uint64_t value;
278 | 
279 |   leveldb::Status status = db_->Get(leveldb::ReadOptions(), s_key, &s_value);
280 |   if (!status.ok()) {
281 |     printf("%s\n", status.ToString().c_str());
282 |     assert(false);
283 |   }
284 |   assert(s_value.size() >= sizeof(uint64_t));
285 |   value = *reinterpret_cast<const uint64_t*>(s_value.data());
286 |   return value;
287 | }
288 | 
289 | void LevelDBImpl::force_compact() {
290 |   db_->CompactRange(NULL, NULL);
291 | 
292 |   // Force stat update.
293 |   Delete(0);
294 | }
295 | 
296 | void LevelDBImpl::Read(std::size_t len) { __sync_fetch_and_add(&read_, len); }
297 | 
298 | void LevelDBImpl::Append(std::size_t len) {
299 |   __sync_fetch_and_add(&appended_, len);
300 | }
301 | 
302 | void LevelDBImpl::Delete(std::size_t len) {
303 |   uint64_t read = read_;
304 |   __sync_fetch_and_sub(&read_, read);
305 |   uint64_t appended = appended_;
306 |   __sync_fetch_and_sub(&appended_, appended);
307 | 
308 |   pthread_mutex_lock(&stats_mutex_);
309 |   if (read != 0) stats_.back().read(read);
310 |   if (appended != 0) stats_.back().write(appended);
311 |   if (len != 0) stats_.back().del(len);
312 |   pthread_mutex_unlock(&stats_mutex_);
313 | }
314 | 


--------------------------------------------------------------------------------
/leveldb_impl.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "leveldb.h"
 4 | #include <pthread.h>
 5 | 
 6 | namespace leveldb {
 7 | // For forward declaration.
 8 | class DB;
 9 | class Options;
10 | }
11 | 
12 | // An interface to the LevelDB implementation
13 | class LevelDBImpl {
14 |   friend class LevelDBSequentialFile;
15 |   friend class LevelDBRandomAccessFile;
16 |   friend class LevelDBWritableFile;
17 |   friend class LevelDBEnv;
18 | 
19 |  public:
20 |   LevelDBImpl(const LevelDBParams& params, std::vector<Stat>& stats);
21 |   ~LevelDBImpl();
22 | 
23 |   // Prints the summary of the store.
24 |   void print_status() const;
25 | 
26 |   // Writes the current items in the store to the file.
27 |   void dump_state(FILE* fp) const;
28 | 
29 |   // Puts a new item in the store.
30 |   void put(LevelDBKey key, uint32_t item_size);
31 | 
32 |   // Deletes an item from the store.
33 |   void del(LevelDBKey key);
34 | 
35 |   // Gets an item from the store.
36 |   uint64_t get(LevelDBKey key);
37 | 
38 |   // Forces compaction until there is no SSTable except the last level.
39 |   void force_compact();
40 | 
41 |  protected:
42 |   void Read(std::size_t len);
43 |   void Append(std::size_t len);
44 |   void Delete(std::size_t len);
45 | 
46 |  private:
47 |   LevelDBParams params_;
48 |   std::vector<Stat>& stats_;
49 | 
50 |   leveldb::Options* options_;
51 |   leveldb::DB* db_;
52 | 
53 |   pthread_mutex_t stats_mutex_;
54 |   volatile uint64_t read_;
55 |   volatile uint64_t appended_;
56 | 
57 |   char value_buf_[1024];
58 | };
59 | 


--------------------------------------------------------------------------------
/main.cpp:
--------------------------------------------------------------------------------
  1 | #include "common.h"
  2 | #include "util.h"
  3 | #include "zipf.h"
  4 | #include "leveldb.h"
  5 | #include "leveldb_impl.h"
  6 | #include "rocksdb_impl.h"
  7 | #include "meshdb.h"
  8 | #include <sys/time.h>
  9 | 
 10 | enum class ActiveKeyMode {
 11 |   kEntire = 0,
 12 |   kClustered = 1,
 13 |   kScattered = 2,
 14 | };
 15 | 
 16 | enum class DependencyMode {
 17 |   kIndependent = 0,
 18 |   kClustered = 1,
 19 |   kScattered = 2,
 20 |   kSequential = 3,
 21 | };
 22 | 
 23 | /*
 24 | class ItemLifetimeInfo : public MeshDBItemLifetimeInfo {
 25 | public:
 26 |     ItemLifetimeInfo(const zipf_gen_state& zipf_state, std::size_t
 27 | num_unique_keys, const std::vector<uint32_t>& keys) {
 28 |         item_class_.resize(num_unique_keys);
 29 |         item_lifetime_.resize(num_unique_keys);
 30 |         class_lifetime_.resize(3);
 31 | 
 32 |         std::vector<double> item_prob;
 33 |         item_prob.resize(num_unique_keys);
 34 | 
 35 |         std::vector<double> class_prob_sum;
 36 |         std::vector<std::size_t> class_count;
 37 |         class_prob_sum.resize(3);
 38 |         class_count.resize(3);
 39 | 
 40 |         double prob_sum = 0;
 41 |         for (std::size_t i = 0; i < num_unique_keys; i++) {
 42 |             uint64_t key = static_cast<uint64_t>(keys[i]);
 43 |             double prob = zipf_prob(&zipf_state, i);
 44 |             item_prob[key] = prob;
 45 |             prob_sum += prob;
 46 |         }
 47 | 
 48 |         for (std::size_t key = 0; key < num_unique_keys; key++) {
 49 |             double prob = item_prob[key];
 50 |             uint64_t lifetime = static_cast<uint64_t>(prob_sum / prob);      //
 51 | inverse of the actual probability
 52 |             // if (key < 100)
 53 |             //     printf("prob=%lf prob_sum=%lf lifetime=%zu\n", prob,
 54 | prob_sum, lifetime);
 55 | 
 56 |             std::size_t item_class = 0;
 57 |             if (lifetime < 100000)
 58 |                 item_class = 0;
 59 |             else if (lifetime < 500000)
 60 |                 item_class = 1;
 61 |             else
 62 |                 item_class = 2;
 63 | 
 64 |             item_class_[key] = item_class;
 65 |             item_lifetime_[key] = lifetime;
 66 | 
 67 |             class_prob_sum[item_class] += prob;
 68 |             class_count[item_class]++;
 69 |         }
 70 | 
 71 |         for (std::size_t i = 0; i < 3; i++) {
 72 |             class_lifetime_[i] = static_cast<uint64_t>(prob_sum /
 73 | (class_prob_sum[i] / static_cast<double>(class_count[i])));
 74 |             printf("class_count[%zu]=%zu\n", i, class_count[i]);
 75 |             printf("class_lifetime[%zu]=%zu\n", i, class_lifetime_[i]);
 76 |         }
 77 |     }
 78 | 
 79 |     virtual ~ItemLifetimeInfo() {}
 80 |     virtual std::size_t item_class(MeshDBKey key) { return item_class_[key]; }
 81 |     virtual uint64_t item_lifetime(MeshDBKey key) { return item_lifetime_[key];
 82 | }
 83 |     virtual uint64_t class_lifetime(std::size_t lifetime_class) { return
 84 | class_lifetime_[lifetime_class]; }
 85 | 
 86 | private:
 87 |     std::vector<std::size_t> item_class_;
 88 |     std::vector<uint64_t> item_lifetime_;
 89 |     std::vector<std::size_t> class_lifetime_;
 90 | };
 91 | */
 92 | 
 93 | void print_stats(std::vector<Stat>& stats, uint64_t insert_bytes) {
 94 |   double wa_r_sum = 0.;
 95 |   double wa_w_sum = 0.;
 96 |   for (std::size_t i = 0; i < stats.size(); i++) {
 97 |     if (i == 0)
 98 |       printf("<log> stats\n");
 99 |     else
100 |       printf("<level-%zu> stats\n", i - 1);
101 |     stats[i].print_status();
102 |     double wa_r = static_cast<double>(stats[i].read_bytes()) /
103 |                   static_cast<double>(insert_bytes);
104 |     double wa_w = static_cast<double>(stats[i].write_bytes()) /
105 |                   static_cast<double>(insert_bytes);
106 |     printf("WA_r: %5.2lf\n", wa_r);
107 |     printf("WA_w: %5.2lf\n", wa_w);
108 |     wa_r_sum += wa_r;
109 |     wa_w_sum += wa_w;
110 |   }
111 |   printf("WA_r sum: %5.2lf\n", wa_r_sum);
112 |   printf("WA_w sum: %5.2lf\n", wa_w_sum);
113 | }
114 | 
115 | uint64_t get_usec() {
116 |   struct timeval tv_now;
117 |   gettimeofday(&tv_now, NULL);
118 | 
119 |   return (uint64_t)tv_now.tv_sec * 1000000UL + (uint64_t)tv_now.tv_usec;
120 | }
121 | 
122 | template <class StoreType>
123 | void test(const char* store_type_name, uint32_t num_unique_keys,
124 |           ActiveKeyMode active_key_mode, DependencyMode dependency_mode,
125 |           uint64_t num_requests, double theta,
126 |           LevelDBCompactionMode compaction_mode, uint64_t wb_size,
127 |           bool enable_fsync, bool use_custom_sizes,
128 |           const std::vector<uint64_t>& dump_points) {
129 |   // The number of unique keys.
130 |   // uint32_t num_unique_keys = 2 * 1000 * 1000;
131 |   // The item size.
132 |   uint32_t item_size = 1000;
133 |   // The number of requests.
134 |   // uint64_t num_requests = 20 * 1000 * 1000;
135 |   // The skew of key popularity.  -1. = uniform no ramdom; 0. = uniform; 0.99 =
136 |   // skewed; 40. = one key
137 |   // double theta = -1.;
138 |   // double theta = 0.;
139 |   // double theta = 0.99;
140 | 
141 |   printf("store_type=%s\n", store_type_name);
142 |   printf("num_unique_keys=%u\n", num_unique_keys);
143 |   printf("active_key_mode=%u\n", active_key_mode);
144 |   printf("dependency_mode=%u\n", dependency_mode);
145 |   printf("item_size=%u\n", item_size);
146 |   printf("num_requests=%lu\n", num_requests);
147 |   printf("theta=%lf\n", theta);
148 |   printf("compaction_mode=%u\n", compaction_mode);
149 |   printf("wb_size=%lu\n", wb_size);
150 |   printf("enable_fsync=%s\n", enable_fsync ? "1" : "0");
151 |   printf("use_custom_sizes=%s\n", use_custom_sizes ? "1" : "0");
152 |   printf("\n");
153 |   fflush(stdout);
154 | 
155 |   bool verbose = true;
156 |   // bool verbose = false;
157 | 
158 |   // Generate keys.
159 |   // Uses uint32_t instead of uint64_t to reduce cache pollution.
160 |   // TODO: Use hashing instead of the shuffle key array.
161 |   std::vector<uint32_t> keys;
162 |   // assert(num_unique_keys < (1UL << 32));
163 |   sequence(num_unique_keys, keys);
164 |   // Comment this out to disable hashing.
165 |   shuffle(keys);
166 | 
167 |   // Initialize request generation.
168 |   zipf_gen_state zipf_state;
169 |   zipf_init(&zipf_state, static_cast<uint64_t>(num_unique_keys), theta, 1);
170 | 
171 |   // ItemLifetimeInfo lifetime_info(zipf_state, num_unique_keys, keys);
172 |   // for (std::size_t i = 0; i < 4; i++)
173 |   //     printf("class_lifetime(%zu)=%lu\n", i,
174 |   //     lifetime_info.class_lifetime(i));
175 |   // printf("item_class(0)=%lu\n", lifetime_info.item_class(keys[0]));
176 |   // printf("item_class(100)=%lu\n", lifetime_info.item_class(keys[100]));
177 |   // printf("item_class(10000)=%lu\n", lifetime_info.item_class(keys[10000]));
178 |   // printf("item_class(1000000)=%lu\n",
179 |   // lifetime_info.item_class(keys[1000000]));
180 | 
181 |   // Main simulation.
182 |   std::vector<Stat> stats;
183 |   LevelDBParams params;
184 |   params.compaction_mode = compaction_mode;
185 |   params.log_size_threshold = wb_size;
186 |   params.enable_fsync = enable_fsync;
187 |   params.use_custom_sizes = use_custom_sizes;
188 |   params.hint_num_unique_keys = num_unique_keys;
189 |   params.hint_theta = theta;
190 | 
191 |   StoreType store(params, stats);
192 | 
193 |   // MeshDBParams params;
194 |   // MeshDB store(params, stat, &lifetime_info);
195 | 
196 |   // std::size_t next_dump = 0;
197 |   (void)dump_points;
198 | 
199 |   // const uint64_t request_batch_size = 1000000;  // for debugging
200 |   const uint64_t request_batch_size = 10000000;
201 |   uint64_t num_processed_requests;
202 |   uint64_t start_t;
203 | 
204 |   start_t = get_usec();
205 | 
206 |   {
207 |     printf("initial insertion of %u items\n\n", num_unique_keys);
208 |     fflush(stdout);
209 | 
210 |     num_processed_requests = 0;
211 |     uint32_t key = 0;
212 |     while (num_processed_requests < static_cast<uint64_t>(num_unique_keys)) {
213 |       uint64_t this_request_batch_size = request_batch_size;
214 |       if (num_processed_requests + this_request_batch_size > num_unique_keys)
215 |         this_request_batch_size = num_unique_keys - num_processed_requests;
216 | 
217 |       for (uint64_t i = 0; i < this_request_batch_size; i++) {
218 |         // for sequential insert
219 |         store.put(key, item_size);
220 |         // for random insert
221 |         // store.put(keys[key], item_size);
222 |         key++;
223 |       }
224 |       num_processed_requests += this_request_batch_size;
225 | 
226 |       if (verbose) {
227 |         printf("key %lu/%u inserted\n", num_processed_requests,
228 |                num_unique_keys);
229 |         store.print_status();
230 |         print_stats(stats,
231 |                     num_processed_requests * static_cast<uint64_t>(item_size));
232 |         printf("\n");
233 |         fflush(stdout);
234 |       }
235 |     }
236 | 
237 |     printf("key %lu/%u inserted\n", num_processed_requests, num_unique_keys);
238 |     store.print_status();
239 |     print_stats(stats,
240 |                 num_processed_requests * static_cast<uint64_t>(item_size));
241 |     printf("\n");
242 |     fflush(stdout);
243 |   }
244 | 
245 |   printf("elapsed time: %.3lf seconds\n\n",
246 |          (double)(get_usec() - start_t) / 1000000.);
247 | 
248 |   for (auto& stat : stats) stat.reset();
249 | 
250 |   // How small fraction of keys are being used in the main transaction?
251 |   const uint32_t active_key_factor = 10;
252 | 
253 |   // How many keys are dependent to each other?
254 |   const int dependency_factor = 10;
255 | 
256 |   // Reinitialize request generation.
257 |   uint32_t num_active_keys;
258 |   switch (active_key_mode) {
259 |     case ActiveKeyMode::kEntire:
260 |       num_active_keys = num_unique_keys;
261 |       break;
262 |     case ActiveKeyMode::kClustered:
263 |       num_active_keys = num_unique_keys / active_key_factor;
264 |       sequence(num_active_keys, keys);
265 |       shuffle(keys);
266 |       break;
267 |     case ActiveKeyMode::kScattered:
268 |       num_active_keys = num_unique_keys / active_key_factor;
269 |       break;
270 |     default:
271 |       assert(false);
272 |       return;
273 |   }
274 |   zipf_init(&zipf_state, static_cast<uint64_t>(num_active_keys), theta, 2);
275 | 
276 |   start_t = get_usec();
277 | 
278 |   {
279 |     printf("main transaction of %lu requests\n\n", num_requests);
280 |     fflush(stdout);
281 | 
282 |     num_processed_requests = 0;
283 |     while (num_processed_requests < num_requests) {
284 |       uint64_t this_request_batch_size = request_batch_size;
285 |       if (num_processed_requests + this_request_batch_size > num_requests)
286 |         this_request_batch_size = num_requests - num_processed_requests;
287 | 
288 |       // Process a batch of requests.
289 |       switch (dependency_mode) {
290 |         case DependencyMode::kIndependent: {
291 |           for (uint64_t i = 0; i < this_request_batch_size; i++) {
292 |             uint32_t key = keys[zipf_next(&zipf_state)];
293 |             // uint32_t key = keys[static_cast<uint64_t>(rand()) %
294 |             // num_unique_keys];
295 |             // uint32_t key = static_cast<uint32_t>(rand() % num_unique_keys);
296 |             store.put(key, item_size);
297 | 
298 |             /*
299 |             if (next_dump < dump_points.size() && dump_points[next_dump] ==
300 |             num_processed_requests + i + 1) {
301 |                 char filename[1024];
302 |                 snprintf(filename, 1024, "output_state_%lu.txt",
303 |             dump_points[next_dump]);
304 |                 FILE* fp_state = fopen(filename, "wt");
305 |                 store.dump_state(fp_state);
306 |                 fclose(fp_state);
307 |                 next_dump++;
308 |             }
309 |             */
310 |           }
311 |         } break;
312 | 
313 |         case DependencyMode::kClustered: {
314 |           this_request_batch_size =
315 |               (this_request_batch_size + dependency_factor - 1) /
316 |               dependency_factor * dependency_factor;
317 |           for (uint64_t i = 0; i < this_request_batch_size;
318 |                i += dependency_factor) {
319 |             uint32_t key = keys[zipf_next(&zipf_state)];
320 |             key = key / dependency_factor * dependency_factor;
321 |             store.put(key, item_size);
322 | 
323 |             for (int j = 1; j < dependency_factor; j++) {
324 |               key++;
325 |               if (key >= num_unique_keys) key -= num_unique_keys;
326 |               store.put(key, item_size);
327 |             }
328 |           }
329 |         } break;
330 | 
331 |         case DependencyMode::kScattered: {
332 |           const uint32_t key_skip = num_unique_keys / dependency_factor;
333 |           this_request_batch_size =
334 |               (this_request_batch_size + dependency_factor - 1) /
335 |               dependency_factor * dependency_factor;
336 |           for (uint64_t i = 0; i < this_request_batch_size;
337 |                i += dependency_factor) {
338 |             uint32_t key = keys[zipf_next(&zipf_state)];
339 |             key = key % key_skip;
340 |             store.put(key, item_size);
341 | 
342 |             for (int j = 1; j < dependency_factor; j++) {
343 |               key += key_skip;
344 |               if (key >= num_unique_keys) key -= num_unique_keys;
345 |               store.put(key, item_size);
346 |             }
347 |           }
348 |         } break;
349 | 
350 |         case DependencyMode::kSequential: {
351 |           for (uint64_t i = 0; i < this_request_batch_size; i++) {
352 |             uint32_t key = static_cast<uint32_t>((num_processed_requests + i) %
353 |                                                  num_active_keys);
354 |             store.put(key, item_size);
355 |           }
356 |         } break;
357 | 
358 |         default:
359 |           assert(false);
360 |           return;
361 |       }
362 |       num_processed_requests += this_request_batch_size;
363 | 
364 |       if (verbose) {
365 |         printf("request %lu/%lu processed\n", num_processed_requests,
366 |                num_requests);
367 |         store.print_status();
368 |         print_stats(stats,
369 |                     num_processed_requests * static_cast<uint64_t>(item_size));
370 |         printf("\n");
371 |         fflush(stdout);
372 |       }
373 |     }
374 | 
375 |     printf("request %lu/%lu processed\n", num_processed_requests, num_requests);
376 |     store.print_status();
377 |     print_stats(stats,
378 |                 num_processed_requests * static_cast<uint64_t>(item_size));
379 |     printf("\n");
380 |     fflush(stdout);
381 |   }
382 | 
383 |   printf("elapsed time: %.3lf seconds\n\n",
384 |          (double)(get_usec() - start_t) / 1000000.);
385 | 
386 |   if (false) {
387 |     printf("forcing compaction\n");
388 |     fflush(stdout);
389 |     store.force_compact();
390 | 
391 |     store.print_status();
392 |     print_stats(stats,
393 |                 num_processed_requests * static_cast<uint64_t>(item_size));
394 |     printf("\n");
395 |     fflush(stdout);
396 |   }
397 | 
398 |   /*
399 |   {
400 |       FILE* fp_state = fopen("output_state_final.txt", "wt");
401 |       store.dump_state(fp_state);
402 |       fclose(fp_state);
403 |   }
404 | 
405 |   // Write the key probability file.
406 |   {
407 |       std::vector<double> prob;
408 |       prob.resize(num_unique_keys);
409 |       for (std::size_t i = 0; i < num_unique_keys; i++) {
410 |           uint64_t key = static_cast<uint64_t>(keys[i]);
411 |           prob[key] = zipf_prob(&zipf_state, i);
412 |       }
413 | 
414 |       FILE* fp_prob = fopen("output_prob.txt", "wt");
415 |       for (std::size_t i = 0; i < num_unique_keys; i++)
416 |           fprintf(fp_prob, "prob:%lf\n", prob[i]);
417 |       fclose(fp_prob);
418 |   }
419 |   */
420 | }
421 | 
422 | int main(int argc, const char* argv[]) {
423 |   if (argc < 11) {
424 |     printf(
425 |         "%s STORE-TYPE NUM-UNIQUE-KEYS ACTIVE-KEY-MODE DEPENDENCY-MODE "
426 |         "NUM-REQUESTS ZIPF-THETA COMPACTION-MODE WB-SIZE ENABLE-FSYNC "
427 |         "USE-CUSTOM-SIZES [DUMP-POINTS]\n",
428 |         argv[0]);
429 |     printf("STORE-TYPE: leveldb-sim, leveldb-impl, rocksdb-impl\n");
430 |     printf("NUM-UNIQUE-KEYS: 1000000, ...\n");
431 |     printf("ACTIVE-KEY-MODE: 0, 1, 2\n");
432 |     printf("DEPENDENCY-MODE: 0, 1, 2, 3\n");
433 |     printf("NUM-REQUESTS: 10000000, ...\n");
434 |     printf("ZIPF-THETA: 0.00, 0.99, ...\n");
435 |     printf("COMPACTION-MODE: 0, 1, 2, ...\n");
436 |     printf("WB-SIZE: 4194304, ...\n");
437 |     printf("ENABLE-FSYNC: 0, 1\n");
438 |     printf("USE-CUSTOM-SIZES: 0, 1\n");
439 |     return 1;
440 |   }
441 |   int store_type;
442 |   if (strcmp(argv[1], "leveldb-sim") == 0)
443 |     store_type = 0;
444 |   else if (strcmp(argv[1], "leveldb-impl") == 0)
445 |     store_type = 1;
446 |   else if (strcmp(argv[1], "rocksdb-impl") == 0)
447 |     store_type = 2;
448 |   else {
449 |     printf("invalid STORE-TYPE\n");
450 |     return 1;
451 |   }
452 | 
453 |   uint32_t num_unique_keys = static_cast<uint32_t>(atoi(argv[2]));
454 |   ActiveKeyMode active_key_mode = static_cast<ActiveKeyMode>(atoi(argv[3]));
455 |   DependencyMode dependency_mode = static_cast<DependencyMode>(atoi(argv[4]));
456 |   uint64_t num_requests = static_cast<uint64_t>(atol(argv[5]));
457 |   double theta = atof(argv[6]);
458 |   LevelDBCompactionMode compaction_mode =
459 |       static_cast<LevelDBCompactionMode>(atoi(argv[7]));
460 |   uint64_t wb_size = static_cast<uint64_t>(atol(argv[8]));
461 |   bool enable_fsync = atoi(argv[9]) != 0;
462 |   bool use_custom_sizes = atoi(argv[10]) != 0;
463 |   std::vector<uint64_t> dump_points;
464 |   for (int i = 11; i < argc; i++)
465 |     dump_points.push_back(static_cast<uint64_t>(atol(argv[i])));
466 | 
467 |   if (store_type == 0)
468 |     test<LevelDB>("leveldb-sim", num_unique_keys, active_key_mode,
469 |                   dependency_mode, num_requests, theta, compaction_mode,
470 |                   wb_size, enable_fsync, use_custom_sizes, dump_points);
471 |   else if (store_type == 1)
472 |     test<LevelDBImpl>("leveldb-impl", num_unique_keys, active_key_mode,
473 |                       dependency_mode, num_requests, theta, compaction_mode,
474 |                       wb_size, enable_fsync, use_custom_sizes, dump_points);
475 |   else if (store_type == 2)
476 |     test<RocksDBImpl>("rocksdb-impl", num_unique_keys, active_key_mode,
477 |                       dependency_mode, num_requests, theta, compaction_mode,
478 |                       wb_size, enable_fsync, use_custom_sizes, dump_points);
479 |   else
480 |     assert(false);
481 | 
482 |   return 0;
483 | }
484 | 


--------------------------------------------------------------------------------
/measure_rw.cpp:
--------------------------------------------------------------------------------
 1 | #include "common.h"
 2 | #include <sys/time.h>
 3 | 
 4 | #include <sys/types.h>
 5 | #include <sys/stat.h>
 6 | #include <unistd.h>
 7 | #include <fcntl.h>
 8 | #include <cstring>
 9 | 
10 | uint64_t get_usec() {
11 |   struct timeval tv_now;
12 |   gettimeofday(&tv_now, NULL);
13 | 
14 |   return (uint64_t)tv_now.tv_sec * 1000000UL + (uint64_t)tv_now.tv_usec;
15 | }
16 | 
17 | int main(int argc, const char* argv[]) {
18 |   if (argc < 2) {
19 |     printf("%s PATH\n", argv[0]);
20 |   }
21 | 
22 |   // write 1 GiB
23 |   const size_t stride_len = 4 * 1048576;
24 |   const size_t stride_count = 256;
25 | 
26 |   char* bytes = new char[stride_len];
27 |   char* p;
28 | 
29 |   int fd = open(argv[1], O_CREAT | O_RDWR | O_TRUNC, 0644);
30 |   double rw_cost_sum = 0.;
31 | 
32 |   for (int i = 0; i < 10; i++) {
33 |     uint64_t start_t;
34 |     double elapsed;
35 |     size_t remaining_len;
36 | 
37 |     printf("seq %d\n", i + 1);
38 |     fflush(stdout);
39 | 
40 |     memset(bytes, (i % 254) + 1, stride_len);
41 | 
42 |     // write data
43 |     start_t = get_usec();
44 |     lseek(fd, 0, SEEK_SET);
45 |     for (size_t stride = 0; stride < stride_count; stride++) {
46 |       p = bytes;
47 |       remaining_len = stride_len;
48 |       while (remaining_len > 0) {
49 |         ssize_t wrote_len = write(fd, p, stride_len);
50 |         if (wrote_len < 0) {
51 |           perror("");
52 |           close(fd);
53 |           return 0;
54 |         }
55 |         p += wrote_len;
56 |         remaining_len -= static_cast<size_t>(wrote_len);
57 |       }
58 |     }
59 |     fdatasync(fd);
60 |     posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
61 |     elapsed = (double)(get_usec() - start_t) / 1000000.;
62 |     double write_tput = (stride_len * stride_count) / elapsed;
63 | 
64 |     // read data
65 |     start_t = get_usec();
66 |     lseek(fd, 0, SEEK_SET);
67 |     for (size_t stride = 0; stride < stride_count; stride++) {
68 |       p = bytes;
69 |       remaining_len = stride_len;
70 |       while (remaining_len > 0) {
71 |         ssize_t read_len = read(fd, p, remaining_len);
72 |         if (read_len < 0) {
73 |           perror("");
74 |           close(fd);
75 |           return 0;
76 |         }
77 |         p += read_len;
78 |         remaining_len -= static_cast<size_t>(read_len);
79 |       };
80 |     }
81 |     elapsed = (double)(get_usec() - start_t) / 1000000.;
82 |     double read_tput = (stride_len * stride_count) / elapsed;
83 | 
84 |     double rw_cost = write_tput / read_tput;
85 |     rw_cost_sum += rw_cost;
86 | 
87 |     printf("write tput = %7.2lf MB/s\n", write_tput / 1000000.);
88 |     printf("read tput =  %7.2lf MB/s\n", read_tput / 1000000.);
89 |     printf("r/w cost =   %7.3lf\n", rw_cost);
90 |     printf("(avg) =      %7.3lf\n", rw_cost_sum / static_cast<double>(i + 1));
91 |     printf("\n");
92 |     fflush(stdout);
93 |   }
94 | 
95 |   close(fd);
96 |   return 0;
97 | }
98 | 


--------------------------------------------------------------------------------
/meshdb.cpp:
--------------------------------------------------------------------------------
  1 | #include "meshdb.h"
  2 | #include "util.h"
  3 | // #include <unordered_map>
  4 | 
  5 | MeshDB::MeshDB(const MeshDBParams& params, Stat& stat,
  6 |                MeshDBItemLifetimeInfo* lifetime_info)
  7 |     : params_(params), stat_(stat), lifetime_info_(lifetime_info) {
  8 |   log_bytes_ = 0;
  9 |   next_version_ = 0;
 10 |   updates_since_last_compaction_ = 0;
 11 |   next_compaction_key_ = 0;
 12 |   compaction_rand_seed_ = 0;
 13 | 
 14 |   for (std::size_t i = 0; i < 1 + num_lifetime_classes; i++)
 15 |     level_bytes_[i] = 0;
 16 | 
 17 |   // for (std::size_t i = 0; i < num_lifetime_classes; i++)
 18 |   //     compaction_weight_[i] = 1. / static_cast<double>(num_lifetime_classes);
 19 |   compaction_weight_[0] = 1.;
 20 |   for (std::size_t i = 1; i < num_lifetime_classes; i++)
 21 |     compaction_weight_[i] = compaction_weight_[i - 1] / 10;
 22 |   // compaction_weight_[i] = compaction_weight_[i - 1] / 4;
 23 | 
 24 |   compaction_weight_[0] = 1. / 18000.;
 25 |   compaction_weight_[1] = 1. / 130000.;
 26 |   compaction_weight_[2] = 1. / 1000000.;
 27 | 
 28 |   global_mutation_rate_ = 0.5;
 29 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
 30 |     level_mutation_rate_[i] = 0.5;
 31 | 
 32 |   for (std::size_t i = 0; i < num_lifetime_classes - 1; i++)
 33 |     lifetime_threshold_[i] = static_cast<double>(i + 1);
 34 | }
 35 | 
 36 | MeshDB::~MeshDB() {}
 37 | 
 38 | void MeshDB::print_status() const {
 39 |   printf("log: %zu items, %lu bytes\n", log_.size(), log_bytes_);
 40 |   for (std::size_t i = 0; i < 1 + num_lifetime_classes; i++)
 41 |     printf("level-%zu: %zu tables, %lu bytes\n", i, levels_[i].size(),
 42 |            level_bytes_[i]);
 43 | }
 44 | 
 45 | void MeshDB::dump_state(FILE* fp) const {
 46 |   // XXX: Memtable is not dumped now.
 47 |   fprintf(fp, "next_version:%lu\n", next_version_);
 48 | 
 49 |   fprintf(fp, "log:\n");
 50 |   dump_state(fp, log_);
 51 | 
 52 |   fprintf(fp, "levels:\n");
 53 |   for (std::size_t level = 0; level < 1 + num_lifetime_classes; level++) {
 54 |     auto& sstables = levels_[level];
 55 |     fprintf(fp, "level:\n");
 56 |     for (std::size_t i = 0; i < sstables.size(); i++) {
 57 |       fprintf(fp, "sstable:\n");
 58 |       dump_state(fp, *sstables[i]);
 59 |     }
 60 |   }
 61 | }
 62 | 
 63 | void MeshDB::dump_state(FILE* fp, const sstable_t& l) {
 64 |   for (std::size_t i = 0; i < l.size(); i++) dump_state(fp, l[i]);
 65 | }
 66 | 
 67 | void MeshDB::dump_state(FILE* fp, const MeshDBItem& item) {
 68 |   fprintf(fp, "item:%lu,%lu,%lu,%s\n", item.key, item.version, item.size,
 69 |           item.deletion ? "T" : "F");
 70 | }
 71 | 
 72 | void MeshDB::put(uint64_t key, uint64_t item_size) {
 73 |   MeshDBItem item{key, next_version_++, item_size, false};
 74 |   append_to_log(item);
 75 | }
 76 | 
 77 | void MeshDB::del(uint64_t key) {
 78 |   MeshDBItem item{key, next_version_++, 16, true};
 79 |   append_to_log(item);
 80 | }
 81 | 
 82 | uint64_t MeshDB::get(MeshDBKey key) {
 83 |   // TODO: Implement
 84 |   (void)key;
 85 |   return 0;
 86 | }
 87 | 
 88 | void MeshDB::force_compact() {
 89 |   flush_log();
 90 | 
 91 |   std::size_t num_steps = 10;
 92 | 
 93 |   next_compaction_key_ = 0;
 94 |   while (next_compaction_key_ < 2000000) {
 95 |     // XXX: hardcoding to set a key range
 96 |     for (std::size_t step = 0; step < num_steps; step++) {
 97 |       MeshDBKey first = next_compaction_key_;
 98 |       MeshDBKey last =
 99 |           next_compaction_key_ +
100 |           2000000 / params_.level0_sstable_count_threshold / num_steps;
101 |       next_compaction_key_ +=
102 |           2000000 / params_.level0_sstable_count_threshold / num_steps;
103 | 
104 |       compact(1 + num_lifetime_classes, first, last);
105 |     }
106 |   }
107 |   next_compaction_key_ = 0;
108 | }
109 | 
110 | void MeshDB::append_to_log(const MeshDBItem& item) {
111 |   log_.push_back(item);
112 | 
113 |   // Update statistics.
114 |   auto new_log_bytes = log_bytes_ + item.size;
115 |   // auto log_bytes_d = log_bytes_ / 4096;
116 |   // auto new_log_bytes_d = new_log_bytes / 4096;
117 |   // if (log_bytes_d != new_log_bytes_d) {
118 |   //     // New blocks are written.
119 |   //     stat_.write((new_log_bytes_d - log_bytes_d) * 4096);
120 |   // }
121 |   stat_.write(item.size);
122 |   log_bytes_ = new_log_bytes;
123 | 
124 |   updates_since_last_compaction_ += 1;
125 | 
126 |   if (log_bytes_ > params_.log_size_threshold) flush_log();
127 | }
128 | 
129 | void MeshDB::flush_log() {
130 |   if (log_.size() == 0) return;
131 | 
132 |   // Simplified for simulation; a new SSTable is created from the memtable,
133 |   // causing no disk read.
134 |   item_ptr_t items;
135 |   item_ptr_t items2;
136 |   sort_items(log_, items);
137 |   deduplicate_items(items, items2);
138 |   sstable_locs_t new_sstable_locs;
139 |   create_sstables(1, items2, new_sstable_locs);
140 |   delete_log();
141 | 
142 |   double accum_p[num_lifetime_classes];
143 | 
144 |   static int r = 0;
145 |   if (r++ % 100 == 0) {
146 |     printf("global_mutation_rate=%lf\n", global_mutation_rate_);
147 |     for (std::size_t i = 0; i < num_lifetime_classes; i++)
148 |       printf("level_mutation_rate[%zu]=%lf\n", i, level_mutation_rate_[i]);
149 |     for (std::size_t i = 0; i < num_lifetime_classes; i++)
150 |       printf("compaction_weight[%zu]=%lf\n", i, compaction_weight_[i]);
151 |     // for (std::size_t i = 0; i < num_lifetime_classes - 1; i++)
152 |     //     printf("lifetime_threshold[%zu]=%lf\n", i, lifetime_threshold_[i]);
153 |   }
154 | 
155 |   // std::size_t num_steps = 10;
156 |   std::size_t num_steps = 1;
157 | 
158 |   for (std::size_t step = 0; step < num_steps; step++) {
159 |     double accum_weight = 0.;
160 |     {
161 |       std::size_t lifetime_class = num_lifetime_classes - 1;
162 |       while (true) {
163 |         accum_weight += compaction_weight_[lifetime_class];
164 |         accum_p[lifetime_class] = accum_weight;
165 |         if (lifetime_class == 0) break;
166 |         lifetime_class--;
167 |       }
168 |     }
169 | 
170 |     double r = fast_rand_d(&compaction_rand_seed_) * accum_weight;
171 | 
172 |     std::size_t num_levels = 2;
173 |     {
174 |       std::size_t lifetime_class = num_lifetime_classes - 1;
175 |       while (true) {
176 |         if (r < accum_p[lifetime_class]) {
177 |           num_levels = 2 + lifetime_class;
178 |           break;
179 |         }
180 |         if (lifetime_class == 0) break;
181 |         lifetime_class--;
182 |       }
183 |     }
184 | 
185 |     // XXX: hardcoding to set a key range
186 |     MeshDBKey first = next_compaction_key_;
187 |     MeshDBKey last =
188 |         next_compaction_key_ +
189 |         2000000 / (params_.level0_sstable_count_threshold + 1) / num_steps;
190 |     next_compaction_key_ +=
191 |         2000000 / (params_.level0_sstable_count_threshold + 1) / num_steps;
192 |     if (next_compaction_key_ >= 2000000) next_compaction_key_ = 0;
193 | 
194 |     bool any_key_in_level0 = true;
195 |     std::vector<std::size_t> sstable_indices;
196 |     find_overlapping_tables(0, first, last, sstable_indices);
197 |     for (auto i : sstable_indices)
198 |       for (auto& item : *levels_[0][i])
199 |         if (first <= item.key && item.key <= last) {
200 |           any_key_in_level0 = true;
201 |           break;
202 |         }
203 |     if (any_key_in_level0 /*|| num_levels > 2*/) {
204 |       // printf("compact: num_levels=%zu first=%lu last=%lu\n", num_levels,
205 |       // first, last);
206 |       compact(num_levels, first, last);
207 |     }
208 |   }
209 | 
210 |   // printf("\n");
211 | }
212 | 
213 | void MeshDB::delete_log() {
214 |   // stat_.del(log_bytes_ / 4096 * 4096);
215 |   stat_.del(log_bytes_);
216 |   log_.clear();
217 |   log_bytes_ = 0;
218 | }
219 | 
220 | struct _MeshDBDereferenceComparer {
221 |   bool operator()(const MeshDBItem* a, const MeshDBItem* b) const {
222 |     auto& item_a = *a;
223 |     auto& item_b = *b;
224 |     if (item_a.key < item_b.key)
225 |       return true;
226 |     else if (item_a.key == item_b.key && item_a.version < item_b.version)
227 |       return true;
228 |     return false;
229 |   }
230 | };
231 | 
232 | void MeshDB::sort_items(sstable_t& items, item_ptr_t& out_items) {
233 |   std::size_t count = items.size();
234 |   out_items.clear();
235 |   out_items.reserve(count);
236 |   for (auto& item : items) out_items.push_back(&item);
237 |   std::sort(out_items.begin(), out_items.end(), _MeshDBDereferenceComparer());
238 | }
239 | 
240 | struct _MeshDBSSTableComparer {
241 |   const MeshDB::sstables_t& sstables;
242 |   const std::vector<std::size_t>& sstables_pos;
243 | 
244 |   bool operator()(const std::size_t& a, const std::size_t& b) const {
245 |     auto& item_a = (*sstables[a])[sstables_pos[a]];
246 |     auto& item_b = (*sstables[b])[sstables_pos[b]];
247 |     if (item_a.key > item_b.key)
248 |       return true;
249 |     else if (item_a.key == item_b.key && item_a.version > item_b.version)
250 |       return true;
251 |     return false;
252 |   }
253 | };
254 | 
255 | void MeshDB::merge_items(const sstables_t& sstables, item_ptr_t& out_items) {
256 |   std::size_t total_count = 0;
257 |   std::vector<std::size_t> heap;
258 |   std::vector<std::size_t> sstable_pos;
259 |   for (std::size_t i = 0; i < sstables.size(); i++) {
260 |     total_count += sstables[i]->size();
261 |     if (sstables[i]->size() > 0) heap.push_back(i);
262 |     sstable_pos.push_back(0);
263 |   }
264 | 
265 |   out_items.clear();
266 |   out_items.reserve(total_count);
267 | 
268 |   // Since std::make_heap makes a max-heap, we use a comparator with the
269 |   // opposite result.
270 |   _MeshDBSSTableComparer comp{sstables, sstable_pos};
271 | 
272 |   std::make_heap(heap.begin(), heap.end(), comp);
273 |   while (heap.size() > 0) {
274 |     auto sstable_index = heap.front();
275 | 
276 |     std::pop_heap(heap.begin(), heap.end(), comp);
277 |     heap.pop_back();
278 | 
279 |     auto& sstable = sstables[sstable_index];
280 |     // assert(sstable_pos[sstable_index] < sstable->size());
281 |     auto& item = (*sstable)[sstable_pos[sstable_index]++];
282 | 
283 |     out_items.push_back(&item);
284 |     // if (out_items.size() >= 2)
285 |     //     assert(out_items[out_items.size() - 2]->key <=
286 |     //     out_items[out_items.size() - 1]->key);
287 | 
288 |     if (sstable_pos[sstable_index] < sstable->size()) {
289 |       heap.push_back(sstable_index);
290 |       std::push_heap(heap.begin(), heap.end(), comp);
291 |     }
292 |   }
293 | 
294 |   // assert(out_items.size() == total_count);
295 |   // for (std::size_t i = 0; i < sstables.size(); i++)
296 |   //     assert(sstable_pos[i] == sstables[i]->size());
297 | }
298 | 
299 | void MeshDB::deduplicate_items(const item_ptr_t& items, item_ptr_t& out_items) {
300 |   std::size_t count = items.size();
301 | 
302 |   out_items.clear();
303 |   if (count == 0) return;
304 |   out_items.reserve(count);
305 | 
306 |   for (std::size_t i = 0; i < count - 1; i++) {
307 |     if (items[i]->key != items[i + 1]->key) out_items.push_back(items[i]);
308 |   }
309 |   if (count > 0) out_items.push_back(items[count - 1]);
310 | }
311 | 
312 | void MeshDB::insert_sstable(std::size_t level, sstable_t* sstable) {
313 |   assert(sstable->size() != 0);
314 | 
315 |   // TODO: Use binary search to find the insert point.
316 |   auto it = levels_[level].begin();
317 |   std::size_t idx = 0;
318 |   while (it != levels_[level].end() &&
319 |          (*it)->front().key <= sstable->front().key) {
320 |     ++it;
321 |     idx++;
322 |   }
323 |   levels_[level].insert(it, sstable);
324 |   // levels_[level].push_back(sstable);
325 | }
326 | 
327 | MeshDB::sstable_t* MeshDB::remove_sstable(std::size_t level, std::size_t idx) {
328 |   sstable_t* t = levels_[level][idx];
329 | 
330 |   for (auto j = idx; j < levels_[level].size() - 1; j++)
331 |     levels_[level][j] = levels_[level][j + 1];
332 |   levels_[level].pop_back();
333 | 
334 |   return t;
335 | }
336 | 
337 | void MeshDB::create_sstables(std::size_t num_levels, const item_ptr_t& items,
338 |                              sstable_locs_t& out_new_sstables) {
339 |   const std::size_t last_level = 1 + num_lifetime_classes - 1;
340 | 
341 |   sstable_t* sstables[num_levels];
342 |   // The current SSTable size in bytes.
343 |   uint64_t sstable_sizes[num_levels];
344 | 
345 |   for (std::size_t i = 0; i < num_levels; i++) {
346 |     sstables[i] = nullptr;
347 |     sstable_sizes[i] = 0;
348 |   }
349 | 
350 |   auto insert_f = [&](std::size_t level) {
351 |     insert_sstable(level, sstables[level]);
352 |     out_new_sstables.push_back(std::make_pair(sstables[level], level));
353 |     level_bytes_[level] += sstable_sizes[level];
354 |     stat_.write(sstable_sizes[level]);
355 |     sstables[level] = nullptr;
356 |     sstable_sizes[level] = 0;
357 |   };
358 | 
359 |   for (auto& item : items) {
360 |     std::size_t level = 1 + lifetime_info_->item_class(item->key);
361 | 
362 |     // uint64_t item_lifetime = lifetime_info_->item_lifetime(item->key);
363 |     // std::size_t item_class;
364 |     // for (item_class = 0; item_class < num_lifetime_classes - 1; item_class++)
365 |     //     if (static_cast<double>(item_lifetime) <=
366 |     //     lifetime_threshold_[item_class])
367 |     //         break;
368 |     // std::size_t level = 1 + item_class;
369 | 
370 |     // std::size_t level = num_levels - 1;
371 | 
372 |     if (level >= num_levels) level = num_levels - 1;
373 | 
374 |     // Deletion is discarded when there is no more levels.
375 |     // TODO: this leaves lots of deletion tombstones if the item's lifetime
376 |     // class is not the last one.
377 |     if (item->deletion && level == last_level) continue;
378 | 
379 |     if (sstables[level]) {
380 |       bool need_new_sstable = false;
381 |       if (sstable_sizes[level] + item->size > params_.sstable_size_threshold) {
382 |         // Stop adding new items if this SSTable become large in size.
383 |         need_new_sstable = true;
384 |       }
385 | 
386 |       if (need_new_sstable) insert_f(level);
387 |     }
388 | 
389 |     if (!sstables[level]) sstables[level] = new sstable_t();
390 |     sstables[level]->push_back(*item);
391 |     sstable_sizes[level] += item->size;
392 |   }
393 |   for (std::size_t level = 0; level < num_levels; level++) {
394 |     if (sstables[level]) {
395 |       // Add any pending SSTable in construction.
396 |       insert_f(level);
397 |     }
398 |   }
399 | }
400 | 
401 | void MeshDB::find_overlapping_tables(
402 |     std::size_t level, const MeshDBKey& first, const MeshDBKey& last,
403 |     std::vector<std::size_t>& out_sstable_indices) {
404 |   // assert(level >= 1);
405 |   // assert(level < levels_.size());
406 | 
407 |   // TODO: Use binary search to reduce the search range.
408 | 
409 |   auto& level_tables = levels_[level];
410 |   std::size_t count = level_tables.size();
411 |   out_sstable_indices.clear();
412 | 
413 |   for (std::size_t i = 0; i < count; i++) {
414 |     auto& sstable = *level_tables[i];
415 |     if (!(last < sstable.front().key || sstable.back().key < first))
416 |       out_sstable_indices.push_back(i);
417 |   }
418 | }
419 | 
420 | struct _MeshDBReverseInt {
421 |   bool operator()(const std::size_t& a, const std::size_t& b) const {
422 |     return a > b;
423 |   }
424 | };
425 | 
426 | // struct phash {
427 | //     template <typename T, typename U>
428 | //     std::size_t operator()(const std::pair<T, U>& x) const {
429 | //         return std::hash<T>()(x.first) ^ std::hash<U>()(x.second);
430 | //     }
431 | // };
432 | 
433 | void MeshDB::compact(std::size_t num_levels, const MeshDBKey& first,
434 |                      const MeshDBKey& last) {
435 |   std::vector<sstable_t*> merge_source;
436 |   std::vector<std::pair<std::size_t, std::size_t>> sstables_to_delete;
437 | 
438 |   // std::unordered_map<std::pair<MeshDBKey, uint64_t>, std::size_t, phash>
439 |   // org_level;
440 |   std::size_t org_size = 0;
441 | 
442 |   for (std::size_t level = 0; level < num_levels; level++) {
443 |     std::vector<std::size_t> sstable_indices;
444 |     find_overlapping_tables(level, first, last, sstable_indices);
445 | 
446 |     sstable_t* temp_sstable;
447 |     if (level > 0) {
448 |       temp_sstable = new sstable_t();
449 |       merge_source.push_back(temp_sstable);
450 |     }
451 | 
452 |     for (auto& i : sstable_indices) {
453 |       if (level == 0) {
454 |         temp_sstable = new sstable_t();
455 |         merge_source.push_back(temp_sstable);
456 |       }
457 | 
458 |       auto& org_sstable = *levels_[level][i];
459 | 
460 |       uint64_t sstable_size = 0;
461 |       std::size_t item_start = 0;
462 |       while (item_start < org_sstable.size() &&
463 |              org_sstable[item_start].key < first)
464 |         item_start++;
465 | 
466 |       std::size_t item_end = item_start;
467 |       while (item_end < org_sstable.size() &&
468 |              org_sstable[item_end].key <= last) {
469 |         temp_sstable->push_back(org_sstable[item_end]);
470 |         if (level == num_levels - 1) {
471 |           // org_level[std::make_pair(org_sstable[item_end].key,
472 |           // org_sstable[item_end].version)] = level;
473 |           org_size += org_sstable[item_end].size;
474 |         }
475 |         sstable_size += org_sstable[item_end].size;
476 |         item_end++;
477 |       }
478 | 
479 |       stat_.read(sstable_size);
480 |       stat_.del(sstable_size);
481 |       level_bytes_[level] -= sstable_size;
482 | 
483 |       org_sstable.erase(
484 |           org_sstable.begin() + static_cast<std::ptrdiff_t>(item_start),
485 |           org_sstable.begin() + static_cast<std::ptrdiff_t>(item_end));
486 |       if (org_sstable.size() == 0) {
487 |         sstables_to_delete.push_back(std::make_pair(level, i));
488 |         delete &org_sstable;
489 |       }
490 |     }
491 |   }
492 | 
493 |   std::reverse(sstables_to_delete.begin(), sstables_to_delete.end());
494 |   for (auto p : sstables_to_delete) remove_sstable(p.first, p.second);
495 | 
496 |   item_ptr_t items;
497 |   merge_items(merge_source, items);
498 | 
499 |   item_ptr_t items2;
500 |   deduplicate_items(items, items2);
501 | 
502 |   // Calculate the mutation rate and modify the compaction weight
503 |   std::size_t unmodified_size = 0;
504 |   if (org_size != 0) {
505 |     for (auto& item : items2) {
506 |       if (item - &merge_source.back()->front() >= 0 &&
507 |           &merge_source.back()->back() - item >= 0)
508 |         unmodified_size += item->size;
509 |     }
510 |   }
511 | 
512 |   // Create new SSTables.
513 |   sstable_locs_t new_sstable_locs;
514 |   create_sstables(num_levels, items2, new_sstable_locs);
515 | 
516 |   // Delete old SSTables.
517 |   for (auto& sstable : merge_source) delete sstable;
518 | 
519 |   // Calculate the mutation rate and modify the compaction weight
520 |   // std::size_t unmodified_size = 0;
521 |   // for (auto& p : new_sstable_locs) {
522 |   //     if (p.second != num_levels - 1)
523 |   //         continue;
524 |   //     for (auto& item : *p.first) {
525 |   //         if (org_level[std::make_pair(item.key, item.version)] == p.second)
526 |   //             unmodified_size += item.size;
527 |   //     }
528 |   // }
529 |   double mutation_rate;
530 |   if (org_size != 0)
531 |     mutation_rate =
532 |         1. -
533 |         static_cast<double>(unmodified_size) / static_cast<double>(org_size);
534 |   else
535 |     mutation_rate = 0.;
536 |   if (mutation_rate < 0.) mutation_rate = 0.;
537 |   if (mutation_rate > 1.) mutation_rate = 1.;
538 | 
539 |   // printf("num_levels=%zu mutation_rate=%lf\n", num_levels, mutation_rate);
540 |   // if (mutation_rate < params_.target_mutation_rate) {
541 |   //     // Too little mutation; decrease weight for less frequent compaction.
542 |   //     compaction_weight_[num_levels - 2] /= 1.01;
543 |   // }
544 |   // else {
545 |   //     // Too much mutation; increase weight for more frequent compaction.
546 |   //     compaction_weight_[num_levels - 2] *= 1.01;
547 |   // }
548 | 
549 |   // double weight_sum = 0.;
550 |   // for (std::size_t i = 0; i < num_lifetime_classes; i++)
551 |   //     weight_sum += compaction_weight_[i];
552 | 
553 |   // if (num_levels < 1 + num_lifetime_classes) {
554 |   //     // if (mutation_rate * compaction_weight_[num_levels - 2] <
555 |   //     level_mutation_rate_[num_levels - 1] * compaction_weight_[num_levels -
556 |   //     1])
557 |   //     if (mutation_rate < level_mutation_rate_[num_levels - 1])
558 |   //         // compaction_weight_[num_levels - 2] /= pow(1.01, (1. /
559 |   //         (compaction_weight_[num_levels - 2] / weight_sum)));
560 |   //         compaction_weight_[num_levels - 2] /= 1.01;
561 |   //     else
562 |   //         // compaction_weight_[num_levels - 2] *= pow(1.01, (1. /
563 |   //         (compaction_weight_[num_levels - 2] / weight_sum)));
564 |   //         compaction_weight_[num_levels - 2] *= 1.01;
565 |   // }
566 | 
567 |   // Normalize weights
568 |   double weight_sum = 0.;
569 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
570 |     weight_sum += compaction_weight_[i];
571 |   for (std::size_t i = 0; i < num_lifetime_classes; i++)
572 |     compaction_weight_[i] /= weight_sum;
573 | 
574 |   global_mutation_rate_ = global_mutation_rate_ * 0.99 + mutation_rate * 0.01;
575 |   level_mutation_rate_[num_levels - 2] =
576 |       level_mutation_rate_[num_levels - 2] * 0.99 + mutation_rate * 0.01;
577 | }
578 | 


--------------------------------------------------------------------------------
/meshdb.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "common.h"
  4 | #include "stat.h"
  5 | #include <cstdio>
  6 | #include <unordered_map>
  7 | 
  8 | typedef uint64_t MeshDBKey;
  9 | 
 10 | struct MeshDBParams {
 11 |   // When a log file exceeds this size, a new Level-0 SSTable is created, and a
 12 |   // new log file is created.
 13 |   uint64_t log_size_threshold;
 14 |   // When the level 0 ("young") has this many SSTables, all of them are merged
 15 |   // into the next level.
 16 |   uint64_t level0_sstable_count_threshold;
 17 |   // When an SSTable file exceeds this size, a new SSTable is created.
 18 |   uint64_t sstable_size_threshold;
 19 |   // Adjust compaction frequency to meet this mutation rate for SSTables in the
 20 |   // last level of compaction.
 21 |   double target_mutation_rate;
 22 | 
 23 |   MeshDBParams() {
 24 |     log_size_threshold = 4 * 1048576;
 25 |     level0_sstable_count_threshold = 4;
 26 |     // level0_sstable_count_threshold = 12;
 27 |     sstable_size_threshold = 2 * 1048576;
 28 |     target_mutation_rate = 0.10;
 29 |   }
 30 | };
 31 | 
 32 | struct MeshDBItem {
 33 |   MeshDBKey key;
 34 |   uint64_t version;
 35 |   uint64_t size;
 36 |   bool deletion;
 37 | };
 38 | 
 39 | class MeshDBItemLifetimeInfo {
 40 |  public:
 41 |   virtual ~MeshDBItemLifetimeInfo() {}
 42 |   virtual std::size_t item_class(MeshDBKey key) {
 43 |     (void)key;
 44 |     return 0;
 45 |   }
 46 |   virtual uint64_t item_lifetime(MeshDBKey key) {
 47 |     (void)key;
 48 |     return 1;
 49 |   }
 50 |   virtual uint64_t class_lifetime(std::size_t lifetime_class) {
 51 |     (void)lifetime_class;
 52 |     return 1;
 53 |   }
 54 | };
 55 | 
 56 | // A MeshDB
 57 | class MeshDB {
 58 |  public:
 59 |   static const std::size_t num_lifetime_classes =
 60 |       3;  // TODO: Allow custom class count.
 61 |   // static const std::size_t num_lifetime_classes = 5;      // TODO: Allow
 62 |   // custom class count.
 63 | 
 64 |   MeshDB(const MeshDBParams& params, Stat& stat,
 65 |          MeshDBItemLifetimeInfo* lifetime_info);
 66 |   ~MeshDB();
 67 | 
 68 |   // Prints the summary of the store.
 69 |   void print_status() const;
 70 | 
 71 |   // Writes the current items in the store to the file.
 72 |   void dump_state(FILE* fp) const;
 73 | 
 74 |   // Puts a new item in the store.
 75 |   void put(MeshDBKey key, uint64_t item_size);
 76 | 
 77 |   // Deletes an item from the store.
 78 |   void del(MeshDBKey key);
 79 | 
 80 |   // Gets an item from the store.
 81 |   uint64_t get(MeshDBKey key);
 82 | 
 83 |   // Forces compaction until there is no successor SSTable.
 84 |   void force_compact();
 85 | 
 86 |   typedef std::vector<MeshDBItem> sstable_t;
 87 |   typedef std::vector<sstable_t*> sstables_t;
 88 |   typedef std::vector<std::pair<sstable_t*, std::size_t>> sstable_locs_t;
 89 | 
 90 |   typedef std::vector<MeshDBItem*> item_ptr_t;
 91 | 
 92 |  protected:
 93 |   // Adds a new item to the log.
 94 |   void append_to_log(const MeshDBItem& item);
 95 | 
 96 |   // Flushes all in-memory data to disk.  This effectively creates new level-0
 97 |   // SSTables from the Memtable.
 98 |   void flush_log();
 99 | 
100 |   // Deletes the log.
101 |   void delete_log();
102 | 
103 |   // Sorts items.
104 |   void sort_items(sstable_t& items, item_ptr_t& out_items);
105 | 
106 |   // Merges SSTables.
107 |   void merge_items(const sstables_t& sstables, item_ptr_t& out_items);
108 | 
109 |   // Removes duplicate items and takes the latter ones.  The items must be
110 |   // sorted by key.
111 |   void deduplicate_items(const item_ptr_t& items, item_ptr_t& out_items);
112 | 
113 |   // Creates new SSTables from the items designated by the indices.
114 |   void create_sstables(std::size_t num_levels, const item_ptr_t& items,
115 |                        sstable_locs_t& out_new_sstables);
116 | 
117 |   // Finds all overlapping SSTables in the level.
118 |   void find_overlapping_tables(std::size_t level, const MeshDBKey& first,
119 |                                const MeshDBKey& last,
120 |                                std::vector<std::size_t>& out_sstable_indices);
121 | 
122 |   // Performs compaction with SSTables from the level and all over overlapping
123 |   // SSTables in the next level.
124 |   void compact(std::size_t num_levels, const MeshDBKey& first,
125 |                const MeshDBKey& last);
126 | 
127 |   // Inserts a new SSTable into the level.
128 |   void insert_sstable(std::size_t level, sstable_t* sstable);
129 | 
130 |   // Removes an SSTable from the level.  This does not release the memory used
131 |   // by the SSTable.
132 |   sstable_t* remove_sstable(std::size_t level, std::size_t idx);
133 | 
134 |   // Writes an item list to the file.
135 |   static void dump_state(FILE* fp, const sstable_t& l);
136 |   static void dump_state(FILE* fp, const MeshDBItem& item);
137 | 
138 |  private:
139 |   MeshDBParams params_;
140 |   Stat& stat_;
141 |   MeshDBItemLifetimeInfo* lifetime_info_;
142 |   sstable_t log_;
143 |   uint64_t log_bytes_;
144 |   sstables_t levels_[1 + num_lifetime_classes];
145 |   uint64_t level_bytes_[1 + num_lifetime_classes];
146 |   uint64_t next_version_;
147 |   uint64_t updates_since_last_compaction_;
148 |   MeshDBKey next_compaction_key_;
149 |   uint64_t compaction_rand_seed_;
150 |   double compaction_weight_[num_lifetime_classes];
151 |   double global_mutation_rate_;
152 |   double level_mutation_rate_[num_lifetime_classes];
153 |   double lifetime_threshold_[num_lifetime_classes - 1];
154 | };
155 | 


--------------------------------------------------------------------------------
/rocksdb_impl.cpp:
--------------------------------------------------------------------------------
  1 | #include "rocksdb_impl.h"
  2 | #pragma GCC diagnostic push
  3 | #pragma GCC diagnostic ignored "-Wunused-parameter"
  4 | #pragma GCC diagnostic ignored "-Winline"
  5 | #include "rocksdb/db.h"
  6 | #include "rocksdb/env.h"
  7 | #pragma GCC diagnostic pop
  8 | #include <stdlib.h>
  9 | #include <sys/stat.h>
 10 | #include <fstream>
 11 | #include <sstream>
 12 | #include <iterator>
 13 | #include <string>
 14 | 
 15 | #define OVERRIDE override
 16 | // #define OVERRIDE
 17 | 
 18 | // A wrapper for SequentialFile that forwards the data read information to
 19 | // RocksDBImpl.
 20 | class RocksDBSequentialFile : public rocksdb::SequentialFile {
 21 |  public:
 22 |   RocksDBSequentialFile(RocksDBImpl* rocksdb_impl,
 23 |                         std::unique_ptr<rocksdb::SequentialFile>* t)
 24 |       : rocksdb::SequentialFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
 25 | 
 26 |   virtual ~RocksDBSequentialFile() OVERRIDE { delete target_; }
 27 | 
 28 |   virtual rocksdb::Status Read(size_t n, rocksdb::Slice* result,
 29 |                                char* scratch) OVERRIDE {
 30 |     rocksdb_impl_->Read(n);
 31 |     return (*target_)->Read(n, result, scratch);
 32 |   }
 33 | 
 34 |   virtual rocksdb::Status Skip(uint64_t n) OVERRIDE {
 35 |     return (*target_)->Skip(n);
 36 |   }
 37 | 
 38 |   virtual rocksdb::Status InvalidateCache(size_t offset,
 39 |                                           size_t length) OVERRIDE {
 40 |     return (*target_)->InvalidateCache(offset, length);
 41 |   }
 42 | 
 43 |  private:
 44 |   class RocksDBImpl* rocksdb_impl_;
 45 |   std::unique_ptr<rocksdb::SequentialFile>* target_;
 46 | };
 47 | 
 48 | // A wrapper for RandomAccessFile that forwards the data read information to
 49 | // RocksDBImpl.
 50 | class RocksDBRandomAccessFile : public rocksdb::RandomAccessFile {
 51 |  public:
 52 |   RocksDBRandomAccessFile(RocksDBImpl* rocksdb_impl,
 53 |                           std::unique_ptr<rocksdb::RandomAccessFile>* t)
 54 |       : rocksdb::RandomAccessFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
 55 | 
 56 |   virtual ~RocksDBRandomAccessFile() OVERRIDE { delete target_; }
 57 | 
 58 |   virtual rocksdb::Status Read(uint64_t offset, size_t n,
 59 |                                rocksdb::Slice* result,
 60 |                                char* scratch) const OVERRIDE {
 61 |     rocksdb_impl_->Read(n);
 62 |     return (*target_)->Read(offset, n, result, scratch);
 63 |   }
 64 | 
 65 |   virtual size_t GetUniqueId(char* id, size_t max_size) const OVERRIDE {
 66 |     return (*target_)->GetUniqueId(id, max_size);
 67 |   }
 68 | 
 69 |   virtual void Hint(AccessPattern pattern) OVERRIDE {
 70 |     (*target_)->Hint(pattern);
 71 |   }
 72 | 
 73 |   virtual rocksdb::Status InvalidateCache(size_t offset,
 74 |                                           size_t length) OVERRIDE {
 75 |     return (*target_)->InvalidateCache(offset, length);
 76 |   }
 77 | 
 78 |  private:
 79 |   class RocksDBImpl* rocksdb_impl_;
 80 |   std::unique_ptr<rocksdb::RandomAccessFile>* target_;
 81 | };
 82 | 
 83 | // A wrapper for WritableFile that forwards the data append information to
 84 | // RocksDBImpl.
 85 | class RocksDBWritableFile : public rocksdb::WritableFile {
 86 |  public:
 87 |   RocksDBWritableFile(RocksDBImpl* rocksdb_impl,
 88 |                       std::unique_ptr<rocksdb::WritableFile>* t)
 89 |       : rocksdb::WritableFile(), rocksdb_impl_(rocksdb_impl), target_(t) {}
 90 | 
 91 |   virtual ~RocksDBWritableFile() OVERRIDE { delete target_; }
 92 | 
 93 |   virtual rocksdb::Status Append(const rocksdb::Slice& data) OVERRIDE {
 94 |     rocksdb_impl_->Append(data.size());
 95 |     return (*target_)->Append(data);
 96 |   }
 97 | 
 98 |   virtual rocksdb::Status Close() OVERRIDE { return (*target_)->Close(); }
 99 | 
100 |   virtual rocksdb::Status Flush() OVERRIDE { return (*target_)->Flush(); }
101 | 
102 |   virtual rocksdb::Status Sync() OVERRIDE {
103 |     if (rocksdb_impl_->params_.enable_fsync)
104 |       return (*target_)->Sync();
105 |     else {
106 |       // Let's ignore Sync() for faster experiments.
107 |       return rocksdb::Status::OK();
108 |     }
109 |   }
110 | 
111 |   virtual rocksdb::Status Fsync() OVERRIDE {
112 |     if (rocksdb_impl_->params_.enable_fsync)
113 |       return (*target_)->Fsync();
114 |     else {
115 |       // Let's ignore Fsync() for faster experiments.
116 |       return rocksdb::Status::OK();
117 |     }
118 |   }
119 | 
120 |   virtual bool IsSyncThreadSafe() const OVERRIDE {
121 |     return (*target_)->IsSyncThreadSafe();
122 |   }
123 | 
124 |   virtual void SetIOPriority(rocksdb::Env::IOPriority pri) OVERRIDE {
125 |     (*target_)->SetIOPriority(pri);
126 |   }
127 | 
128 |   virtual rocksdb::Env::IOPriority GetIOPriority() OVERRIDE {
129 |     return (*target_)->GetIOPriority();
130 |   }
131 | 
132 |   virtual uint64_t GetFileSize() OVERRIDE { return (*target_)->GetFileSize(); }
133 | 
134 |   virtual void GetPreallocationStatus(size_t* block_size,
135 |                                       size_t* last_allocated_block) OVERRIDE {
136 |     (*target_)->GetPreallocationStatus(block_size, last_allocated_block);
137 |   }
138 | 
139 |   virtual size_t GetUniqueId(char* id, size_t max_size) const OVERRIDE {
140 |     return (*target_)->GetUniqueId(id, max_size);
141 |   }
142 | 
143 |   virtual rocksdb::Status InvalidateCache(size_t offset,
144 |                                           size_t length) OVERRIDE {
145 |     return (*target_)->InvalidateCache(offset, length);
146 |   }
147 | 
148 |  private:
149 |   class RocksDBImpl* rocksdb_impl_;
150 |   std::unique_ptr<rocksdb::WritableFile>* target_;
151 | };
152 | 
153 | class RocksDBDirectory : public rocksdb::Directory {
154 |  public:
155 |   RocksDBDirectory(RocksDBImpl* rocksdb_impl,
156 |                    std::unique_ptr<rocksdb::Directory>* t)
157 |       : rocksdb::Directory(), rocksdb_impl_(rocksdb_impl), target_(t) {}
158 | 
159 |   virtual ~RocksDBDirectory() OVERRIDE { delete target_; }
160 | 
161 |   virtual rocksdb::Status Fsync() OVERRIDE {
162 |     if (rocksdb_impl_->params_.enable_fsync)
163 |       return (*target_)->Fsync();
164 |     else {
165 |       // Let's ignore Fsync() for faster experiments.
166 |       return rocksdb::Status::OK();
167 |     }
168 |   }
169 | 
170 |  private:
171 |   class RocksDBImpl* rocksdb_impl_;
172 |   std::unique_ptr<rocksdb::Directory>* target_;
173 | };
174 | 
175 | // A wrapper for Env that forwards the file deletion information to RocksDBImpl.
176 | class RocksDBEnv : public rocksdb::EnvWrapper {
177 |  public:
178 |   RocksDBEnv(RocksDBImpl* rocksdb_impl)
179 |       : rocksdb::EnvWrapper(rocksdb::Env::Default()),
180 |         rocksdb_impl_(rocksdb_impl) {}
181 | 
182 |   virtual ~RocksDBEnv() OVERRIDE {}
183 | 
184 |   virtual rocksdb::Status NewSequentialFile(
185 |       const std::string& f, std::unique_ptr<rocksdb::SequentialFile>* r,
186 |       const rocksdb::EnvOptions& options) OVERRIDE {
187 |     std::unique_ptr<rocksdb::SequentialFile>* r2 =
188 |         new std::unique_ptr<rocksdb::SequentialFile>();
189 |     rocksdb::Status status = target()->NewSequentialFile(f, r2, options);
190 |     if (*r2 != NULL)
191 |       r->reset(new RocksDBSequentialFile(rocksdb_impl_, r2));
192 |     else
193 |       delete r2;
194 |     return status;
195 |   }
196 | 
197 |   virtual rocksdb::Status NewRandomAccessFile(
198 |       const std::string& f, std::unique_ptr<rocksdb::RandomAccessFile>* r,
199 |       const rocksdb::EnvOptions& options) OVERRIDE {
200 |     std::unique_ptr<rocksdb::RandomAccessFile>* r2 =
201 |         new std::unique_ptr<rocksdb::RandomAccessFile>();
202 |     rocksdb::Status status = target()->NewRandomAccessFile(f, r2, options);
203 |     if (*r2 != NULL)
204 |       r->reset(new RocksDBRandomAccessFile(rocksdb_impl_, r2));
205 |     else
206 |       delete r2;
207 |     return status;
208 |   }
209 | 
210 |   virtual rocksdb::Status NewWritableFile(
211 |       const std::string& f, std::unique_ptr<rocksdb::WritableFile>* r,
212 |       const rocksdb::EnvOptions& options) OVERRIDE {
213 |     std::unique_ptr<rocksdb::WritableFile>* r2 =
214 |         new std::unique_ptr<rocksdb::WritableFile>();
215 |     rocksdb::Status status = target()->NewWritableFile(f, r2, options);
216 |     if (*r2 != NULL)
217 |       r->reset(new RocksDBWritableFile(rocksdb_impl_, r2));
218 |     else
219 |       delete r2;
220 |     return status;
221 |   }
222 | 
223 |   virtual rocksdb::Status NewDirectory(
224 |       const std::string& f, std::unique_ptr<rocksdb::Directory>* r) OVERRIDE {
225 |     std::unique_ptr<rocksdb::Directory>* r2 =
226 |         new std::unique_ptr<rocksdb::Directory>();
227 |     rocksdb::Status status = target()->NewDirectory(f, r2);
228 |     if (*r2 != NULL)
229 |       r->reset(new RocksDBDirectory(rocksdb_impl_, r2));
230 |     else
231 |       delete r2;
232 |     return status;
233 |   }
234 | 
235 |   virtual rocksdb::Status DeleteFile(const std::string& f) OVERRIDE {
236 |     struct stat st;
237 |     memset(&st, 0, sizeof(st));
238 |     // XXX: The file length *might* not be as large as its actual content
239 |     // because the directory metadata can be updated later than the appends.
240 |     int ret = stat(f.c_str(), &st);
241 |     if (ret == 0) rocksdb_impl_->Delete(static_cast<uint64_t>(st.st_size));
242 | 
243 |     return target()->DeleteFile(f);
244 |   }
245 | 
246 |  private:
247 |   class RocksDBImpl* rocksdb_impl_;
248 | };
249 | 
250 | RocksDBImpl::RocksDBImpl(const LevelDBParams& params, std::vector<Stat>& stats)
251 |     : params_(params), stats_(stats) {
252 |   stats_.push_back(Stat());
253 | 
254 |   pthread_mutex_init(&stats_mutex_, NULL);
255 |   read_ = 0;
256 |   appended_ = 0;
257 | 
258 |   // Clean up old files.
259 |   rocksdb::DestroyDB("rocksdb_files", rocksdb::Options());
260 | 
261 |   options_ = new rocksdb::Options();
262 | 
263 |   options_->create_if_missing = true;
264 | 
265 |   // Turn off Snappy.
266 |   options_->compression = rocksdb::CompressionType::kNoCompression;
267 | 
268 |   // Use our Env to gather statistics.
269 |   options_->env = new RocksDBEnv(this);
270 | 
271 |   // Limit the max open file count.
272 |   options_->max_open_files = 900;
273 | 
274 |   // Configure the write buffer size.
275 |   options_->write_buffer_size = params.log_size_threshold;
276 | 
277 |   // Do not overload insert.
278 |   options_->level0_file_num_compaction_trigger = 4;
279 |   options_->level0_slowdown_writes_trigger = 4;
280 |   options_->level0_stop_writes_trigger = 4;
281 | 
282 |   // Use LevelDB-style table selection.
283 |   if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSize ||
284 |       params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSizeMT)
285 |     options_->use_leveldb_table_selection = false;
286 |   else if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinear ||
287 |            params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinearMT)
288 |     options_->use_leveldb_table_selection = true;
289 |   else if (params_.compaction_mode ==
290 |            LevelDBCompactionMode::kRocksDBUniversal) {
291 |     options_->use_leveldb_table_selection =
292 |         false;  // This will be ignored anyway.
293 |     options_->compaction_style = rocksdb::kCompactionStyleUniversal;
294 |     // Use a bit more level-0 files
295 |     // options_->level0_file_num_compaction_trigger = 8;
296 |     options_->level0_file_num_compaction_trigger = 12;
297 |     // We have to adjust the maximum level-0 file count because RocksDB is stuck
298 |     // with a deadlock otherwise.
299 |     options_->level0_slowdown_writes_trigger =
300 |         options_->level0_file_num_compaction_trigger + 2;
301 |     options_->level0_stop_writes_trigger =
302 |         options_->level0_file_num_compaction_trigger + 2;
303 | 
304 |     // Adjust size_ratio to handle skewed workloads gracefully without having to
305 |     // increase the file count much.
306 |     // options.compaction_options_universal.size_ratio = 10;
307 |   } else
308 |     assert(false);
309 | 
310 |   // Use multiple threads if requested.
311 |   if (params_.compaction_mode == LevelDBCompactionMode::kRocksDBMaxSizeMT ||
312 |       params_.compaction_mode == LevelDBCompactionMode::kRocksDBLinearMT) {
313 |     // 1 thread is dedicated as a "background flush" thread
314 |     // (DBOptions::IncreaseParallelism() in rocksdb/util/options.cc)
315 |     options_->IncreaseParallelism(4 + 1);
316 |   }
317 | 
318 |   // Turn off checksumming for faster experiments (even though we already
319 |   // disabled crc32c).
320 |   // options_->verify_checksums_in_compaction = false;
321 | 
322 |   // Use custom level sizes
323 |   if (params_.use_custom_sizes) {
324 |     std::size_t* custom_level_sizes = new std::size_t[20];
325 | 
326 |     std::ifstream ifs("output_sensitivity.txt");
327 |     while (!ifs.eof()) {
328 |       std::string line;
329 |       std::getline(ifs, line);
330 | 
331 |       std::istringstream iss(line);
332 |       std::vector<std::string> tokens{std::istream_iterator<std::string>{iss},
333 |                                       std::istream_iterator<std::string>{}};
334 | 
335 |       if (tokens.size() < 5) continue;
336 |       if (tokens[0] != "sensitivity_item_count_leveldb_best_sizes" &&
337 |           tokens[0] != "sensitivity_log_size_leveldb_best_sizes")
338 |         continue;
339 |       if (static_cast<uint64_t>(atol(tokens[1].c_str())) !=
340 |           params_.hint_num_unique_keys)
341 |         continue;
342 |       if (atof(tokens[2].c_str()) != params_.hint_theta) continue;
343 |       if (static_cast<uint64_t>(atol(tokens[3].c_str())) !=
344 |           params_.log_size_threshold)
345 |         continue;
346 | 
347 |       options_->custom_level_size_count = tokens.size() - 5 + 1;
348 | 
349 |       custom_level_sizes[0] = 0;
350 |       std::size_t level;
351 |       for (level = 1; level < options_->custom_level_size_count; level++) {
352 |         custom_level_sizes[level] = static_cast<size_t>(
353 |             atof(tokens[5 + level - 1].c_str()) * 1000. + 0.5);
354 |         printf("level-%zu: %zu\n", level, custom_level_sizes[level]);
355 |       }
356 |       // Make the last level very large and not spill.
357 |       level--;
358 |       custom_level_sizes[level] = 1000000000000000LU;
359 |       printf("level-%zu: %zu (expanded)\n", level, custom_level_sizes[level]);
360 |       printf("\n");
361 |       break;
362 |     }
363 |     assert(options_->custom_level_size_count != 0);
364 | 
365 |     options_->custom_level_sizes = custom_level_sizes;
366 |   }
367 | 
368 |   rocksdb::Status status = rocksdb::DB::Open(*options_, "rocksdb_files", &db_);
369 |   if (!status.ok()) {
370 |     printf("%s\n", status.ToString().c_str());
371 |     assert(false);
372 |   }
373 | 
374 |   memset(value_buf_, 0, sizeof(value_buf_));
375 | }
376 | 
377 | RocksDBImpl::~RocksDBImpl() {
378 |   delete db_;
379 | 
380 |   delete options_->env;
381 |   if (params_.use_custom_sizes) delete[] options_->custom_level_sizes;
382 |   delete options_;
383 | 
384 |   pthread_mutex_destroy(&stats_mutex_);
385 | }
386 | 
387 | void RocksDBImpl::print_status() const {
388 |   // Force updating stats.
389 |   const_cast<RocksDBImpl*>(this)->Delete(0);
390 | }
391 | 
392 | void RocksDBImpl::dump_state(FILE* fp) const {
393 |   // TODO: Implement.
394 |   (void)fp;
395 | }
396 | 
397 | void RocksDBImpl::put(LevelDBKey key, uint32_t item_size) {
398 |   // LevelDB includes the full SSTable file size during calculating the level
399 |   // size;
400 |   // we consider the average space overhead per item in LevelDB so that the
401 |   // average stored size becomes similar to item_size.
402 |   const uint32_t overhead = 18;
403 | 
404 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
405 |   uint32_t value_size =
406 |       static_cast<uint32_t>(static_cast<std::size_t>(item_size) - sizeof(key)) -
407 |       overhead;
408 |   assert(value_size < sizeof(value_buf_));
409 |   rocksdb::Slice s_value(value_buf_, value_size);
410 | 
411 |   rocksdb::Status status = db_->Put(rocksdb::WriteOptions(), s_key, s_value);
412 |   if (!status.ok()) {
413 |     printf("%s\n", status.ToString().c_str());
414 |     assert(false);
415 |   }
416 | }
417 | 
418 | void RocksDBImpl::del(LevelDBKey key) {
419 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
420 | 
421 |   rocksdb::Status status = db_->Delete(rocksdb::WriteOptions(), s_key);
422 |   if (!status.ok()) {
423 |     printf("%s\n", status.ToString().c_str());
424 |     assert(false);
425 |   }
426 | }
427 | 
428 | uint64_t RocksDBImpl::get(LevelDBKey key) {
429 |   rocksdb::Slice s_key(reinterpret_cast<const char*>(&key), sizeof(key));
430 |   std::string s_value;
431 |   uint64_t value;
432 | 
433 |   rocksdb::Status status = db_->Get(rocksdb::ReadOptions(), s_key, &s_value);
434 |   if (!status.ok()) {
435 |     printf("%s\n", status.ToString().c_str());
436 |     assert(false);
437 |   }
438 |   assert(s_value.size() >= sizeof(uint64_t));
439 |   value = *reinterpret_cast<const uint64_t*>(s_value.data());
440 |   return value;
441 | }
442 | 
443 | void RocksDBImpl::force_compact() {
444 |   rocksdb::CompactRangeOptions options;
445 |   options.change_level = false;
446 |   options.target_level = -1;
447 |   options.target_path_id = 0;
448 | 
449 |   db_->CompactRange(options, NULL, NULL);
450 | 
451 |   // Force stat update.
452 |   Delete(0);
453 | }
454 | 
455 | void RocksDBImpl::Read(std::size_t len) { __sync_fetch_and_add(&read_, len); }
456 | 
457 | void RocksDBImpl::Append(std::size_t len) {
458 |   __sync_fetch_and_add(&appended_, len);
459 | }
460 | 
461 | void RocksDBImpl::Delete(std::size_t len) {
462 |   uint64_t read = read_;
463 |   __sync_fetch_and_sub(&read_, read);
464 |   uint64_t appended = appended_;
465 |   __sync_fetch_and_sub(&appended_, appended);
466 | 
467 |   pthread_mutex_lock(&stats_mutex_);
468 |   if (read != 0) stats_.back().read(read);
469 |   if (appended != 0) stats_.back().write(appended);
470 |   if (len != 0) stats_.back().del(len);
471 |   pthread_mutex_unlock(&stats_mutex_);
472 | }
473 | 


--------------------------------------------------------------------------------
/rocksdb_impl.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "leveldb.h"
 4 | #include <pthread.h>
 5 | 
 6 | namespace rocksdb {
 7 | // For forward declaration.
 8 | class DB;
 9 | class Options;
10 | }
11 | 
12 | // An interface to the RocksDB implementation
13 | class RocksDBImpl {
14 |   friend class RocksDBSequentialFile;
15 |   friend class RocksDBRandomAccessFile;
16 |   friend class RocksDBWritableFile;
17 |   friend class RocksDBDirectory;
18 |   friend class RocksDBEnv;
19 | 
20 |  public:
21 |   RocksDBImpl(const LevelDBParams& params, std::vector<Stat>& stats);
22 |   ~RocksDBImpl();
23 | 
24 |   // Prints the summary of the store.
25 |   void print_status() const;
26 | 
27 |   // Writes the current items in the store to the file.
28 |   void dump_state(FILE* fp) const;
29 | 
30 |   // Puts a new item in the store.
31 |   void put(LevelDBKey key, uint32_t item_size);
32 | 
33 |   // Deletes an item from the store.
34 |   void del(LevelDBKey key);
35 | 
36 |   // Gets an item from the store.
37 |   uint64_t get(LevelDBKey key);
38 | 
39 |   // Forces compaction until there is no SSTable except the last level.
40 |   void force_compact();
41 | 
42 |  protected:
43 |   void Read(std::size_t len);
44 |   void Append(std::size_t len);
45 |   void Delete(std::size_t len);
46 | 
47 |  private:
48 |   LevelDBParams params_;
49 |   std::vector<Stat>& stats_;
50 | 
51 |   rocksdb::Options* options_;
52 |   rocksdb::DB* db_;
53 | 
54 |   pthread_mutex_t stats_mutex_;
55 |   volatile uint64_t read_;
56 |   volatile uint64_t appended_;
57 | 
58 |   char value_buf_[1024];
59 | };
60 | 


--------------------------------------------------------------------------------
/stat.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.h"
 4 | 
 5 | class Stat {
 6 |  public:
 7 |   Stat() { reset_all(); }
 8 | 
 9 |   void reset() {
10 |     read_count_ = 0;
11 |     read_bytes_ = 0;
12 |     write_count_ = 0;
13 |     write_bytes_ = 0;
14 |     delete_count_ = 0;
15 |     delete_bytes_ = 0;
16 |   }
17 | 
18 |   void reset_all() {
19 |     reset();
20 |     current_bytes_ = 0;
21 |   }
22 | 
23 |   void read(uint64_t num_bytes) {
24 |     read_count_++;
25 |     read_bytes_ += static_cast<int64_t>(num_bytes);
26 |   }
27 | 
28 |   void write(uint64_t num_bytes) {
29 |     write_count_++;
30 |     write_bytes_ += static_cast<int64_t>(num_bytes);
31 |     current_bytes_ += static_cast<int64_t>(num_bytes);
32 |   }
33 | 
34 |   void overwrite(uint64_t num_bytes) {
35 |     write_count_++;
36 |     write_bytes_ += static_cast<int64_t>(num_bytes);
37 |   }
38 | 
39 |   void del(uint64_t num_bytes) {
40 |     delete_count_++;
41 |     delete_bytes_ += static_cast<int64_t>(num_bytes);
42 |     current_bytes_ -= static_cast<int64_t>(num_bytes);
43 |   }
44 | 
45 |   int64_t read_count() const { return read_count_; }
46 |   int64_t read_bytes() const { return read_bytes_; }
47 |   int64_t write_count() const { return write_count_; }
48 |   int64_t write_bytes() const { return write_bytes_; }
49 |   int64_t delete_count() const { return delete_count_; }
50 |   int64_t delete_bytes() const { return delete_bytes_; }
51 |   int64_t current_bytes() const { return current_bytes_; }
52 | 
53 |   void print_status() const {
54 |     printf("Read: %ld times, %ld bytes\n", read_count_, read_bytes_);
55 |     printf("Write: %ld times, %ld bytes\n", write_count_, write_bytes_);
56 |     printf("Delete: %ld times, %ld bytes\n", delete_count_, delete_bytes_);
57 |     printf("Current size: %ld bytes\n", current_bytes_);
58 |   }
59 | 
60 |  private:
61 |   int64_t read_count_;
62 |   int64_t read_bytes_;
63 |   int64_t write_count_;
64 |   int64_t write_bytes_;
65 |   int64_t delete_count_;
66 |   int64_t delete_bytes_;
67 |   int64_t current_bytes_;
68 | };
69 | 


--------------------------------------------------------------------------------
/util.cpp:
--------------------------------------------------------------------------------
 1 | #include "util.h"
 2 | #include <cstdlib>
 3 | #include <cstdio>
 4 | 
 5 | template <typename T>
 6 | void sequence(T n, std::vector<T>& out) {
 7 |   out.clear();
 8 |   out.reserve(static_cast<std::size_t>(n));
 9 |   for (T i = 0; i < n; i++) out.push_back(i);
10 | }
11 | 
12 | template <typename T>
13 | void shuffle(std::vector<T>& v) {
14 |   unsigned int seed =
15 |       0;  // std::chrono::system_clock::now().time_since_epoch().count();
16 |   std::shuffle(v.begin(), v.end(), std::default_random_engine(seed));
17 |   // std::size_t count = v.size();
18 |   // for (std::size_t i = 0; i < count; i++) {
19 |   //	std::size_t j = i + (rand() % (count - i));
20 |   //	std::swap(v[i], v[j]);
21 |   // }
22 | }
23 | 
24 | template void sequence(uint64_t n, std::vector<uint64_t>& out);
25 | template void shuffle(std::vector<uint64_t>& v);
26 | template void sequence(uint32_t n, std::vector<uint32_t>& out);
27 | template void shuffle(std::vector<uint32_t>& v);
28 | 
29 | void uniform_pdf(uint64_t n, std::vector<double>& out_pdf) {
30 |   out_pdf.clear();
31 |   out_pdf.reserve(n);
32 |   for (uint64_t i = 0; i < n; i++) out_pdf.push_back(1.);
33 | }
34 | 
35 | void pdf_to_cdf(const std::vector<double>& pdf, std::vector<double>& out_cdf) {
36 |   std::size_t count = pdf.size();
37 |   out_cdf.clear();
38 |   out_cdf.reserve(count);
39 |   double s = 0.;
40 |   for (std::size_t i = 0; i < count; i++) {
41 |     s += pdf[i];
42 |     out_cdf.push_back(s);
43 |   }
44 | }
45 | 
46 | // def sample(cdf, count):
47 | //     """Gets samples from CDF."""
48 | //     r = random.random
49 | //     b = bisect.bisect_left
50 | //     s = cdf[-1]
51 | //     result = [0] * count
52 | //     for i in range(count):
53 | //         v = r() * s
54 | //         k = b(cdf, v)
55 | //         result[i] = k
56 | //     return result
57 | 


--------------------------------------------------------------------------------
/util.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include "common.h"
 4 | #include <random>
 5 | #include <array>
 6 | 
 7 | // A sequence of increasing numbers.
 8 | template <typename T>
 9 | void sequence(T n, std::vector<T>& out);
10 | 
11 | // In-place shuffling.
12 | template <typename T>
13 | void shuffle(std::vector<T>& v);
14 | 
15 | // PDF of uniform distribution
16 | void uniform_pdf(uint64_t n, std::vector<double>& out_pdf);
17 | 
18 | // Convert PDF to CDF.
19 | void pdf_to_cdf(const std::vector<double>& pdf, std::vector<double>& out_cdf);
20 | 
21 | // Fast random number generators.
22 | static uint32_t fast_rand(uint64_t* state) {
23 |   // same as Java's
24 |   *state = (*state * 0x5deece66dUL + 0xbUL) & ((1UL << 48) - 1);
25 |   return (uint32_t)(*state >> (48 - 32));
26 | }
27 | 
28 | static double fast_rand_d(uint64_t* state) {
29 |   *state = (*state * 0x5deece66dUL + 0xbUL) & ((1UL << 48) - 1);
30 |   return (double)*state / (double)((1UL << 48) - 1);
31 | }
32 | 


--------------------------------------------------------------------------------
/zipf.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include "common.h"
  4 | #include "util.h"
  5 | #include <cstdlib>
  6 | #include <cstring>
  7 | #include <cmath>
  8 | #include <cstdio>
  9 | 
 10 | struct zipf_gen_state {
 11 |   uint64_t n;    // number of items (input)
 12 |   double theta;  // skewness (input) in (0, 1); or, 0 = uniform, 1 = always zero
 13 |   double alpha;  // only depends on theta
 14 |   double thres;  // only depends on theta
 15 |   uint64_t last_n;  // last n used to calculate the following
 16 |   double dbl_n;
 17 |   double zetan;
 18 |   double eta;
 19 |   // unsigned short rand_state[3];		// prng state
 20 |   uint64_t rand_state;
 21 | };
 22 | 
 23 | static double pow_approx(double a, double b) {
 24 |   // from
 25 |   // http://martin.ankerl.com/2012/01/25/optimized-approximative-pow-in-c-and-cpp/
 26 | 
 27 |   // calculate approximation with fraction of the exponent
 28 |   int e = (int)b;
 29 |   union {
 30 |     double d;
 31 |     int x[2];
 32 |   } u = {a};
 33 |   u.x[1] = (int)((b - (double)e) * (double)(u.x[1] - 1072632447) + 1072632447.);
 34 |   u.x[0] = 0;
 35 | 
 36 |   // exponentiation by squaring with the exponent's integer part
 37 |   // double r = u.d makes everything much slower, not sure why
 38 |   // TODO: use popcount?
 39 |   double r = 1.;
 40 |   while (e) {
 41 |     if (e & 1) r *= a;
 42 |     a *= a;
 43 |     e >>= 1;
 44 |   }
 45 | 
 46 |   return r * u.d;
 47 | }
 48 | 
 49 | static void zipf_init(struct zipf_gen_state* state, uint64_t n, double theta,
 50 |                       uint64_t rand_seed) {
 51 |   assert(n > 0);
 52 |   if (theta > 0.992 && theta < 1)
 53 |     fprintf(stderr, "theta > 0.992 will be inaccurate due to approximation\n");
 54 |   if (theta >= 1. && theta < 40.) {
 55 |     fprintf(stderr, "theta in [1., 40.) is not supported\n");
 56 |     assert(false);
 57 |   }
 58 |   assert(theta == -1. || (theta >= 0. && theta < 1.) || theta >= 40.);
 59 |   assert(rand_seed < (1UL << 48));
 60 |   memset(state, 0, sizeof(struct zipf_gen_state));
 61 |   state->n = n;
 62 |   state->theta = theta;
 63 |   if (theta == -1.)
 64 |     rand_seed = rand_seed % n;
 65 |   else if (theta > 0. && theta < 1.) {
 66 |     state->alpha = 1. / (1. - theta);
 67 |     state->thres = 1. + pow_approx(0.5, theta);
 68 |   } else {
 69 |     state->alpha = 0.;  // unused
 70 |     state->thres = 0.;  // unused
 71 |   }
 72 |   state->last_n = 0;
 73 |   state->zetan = 0.;
 74 |   // state->rand_state[0] = (unsigned short)(rand_seed >> 0);
 75 |   // state->rand_state[1] = (unsigned short)(rand_seed >> 16);
 76 |   // state->rand_state[2] = (unsigned short)(rand_seed >> 32);
 77 |   state->rand_state = rand_seed;
 78 | }
 79 | 
 80 | static void zipf_init_copy(struct zipf_gen_state* state,
 81 |                            const struct zipf_gen_state* src_state,
 82 |                            uint64_t rand_seed) {
 83 |   assert(rand_seed < (1UL << 48));
 84 |   memcpy(state, src_state, sizeof(struct zipf_gen_state));
 85 |   // state->rand_state[0] = (unsigned short)(rand_seed >> 0);
 86 |   // state->rand_state[1] = (unsigned short)(rand_seed >> 16);
 87 |   // state->rand_state[2] = (unsigned short)(rand_seed >> 32);
 88 |   state->rand_state = rand_seed;
 89 | }
 90 | 
 91 | static void zipf_change_n(struct zipf_gen_state* state, uint64_t n) {
 92 |   state->n = n;
 93 | }
 94 | 
 95 | static double zeta(uint64_t last_n, double last_sum, uint64_t n, double theta) {
 96 |   if (last_n > n) {
 97 |     last_n = 0;
 98 |     last_sum = 0.;
 99 |   }
100 |   while (last_n < n) {
101 |     last_sum += 1. / pow_approx((double)last_n + 1., theta);
102 |     last_n++;
103 |   }
104 |   return last_sum;
105 | }
106 | 
107 | static uint64_t zipf_next(struct zipf_gen_state* state) {
108 |   if (state->last_n != state->n) {
109 |     if (state->theta > 0. && state->theta < 1.) {
110 |       state->zetan = zeta(state->last_n, state->zetan, state->n, state->theta);
111 |       state->eta = (1. - pow_approx(2. / (double)state->n, 1. - state->theta)) /
112 |                    (1. - zeta(0, 0., 2, state->theta) / state->zetan);
113 |     }
114 |     state->last_n = state->n;
115 |     state->dbl_n = (double)state->n;
116 |   }
117 | 
118 |   if (state->theta == -1.) {
119 |     uint64_t v = state->rand_state;
120 |     if (++state->rand_state >= state->n) state->rand_state = 0;
121 |     return v;
122 |   } else if (state->theta == 0.) {
123 |     double u = fast_rand_d(&state->rand_state);
124 |     return (uint64_t)(state->dbl_n * u);
125 |   } else if (state->theta >= 40.) {
126 |     return 0UL;
127 |   } else {
128 |     // from J. Gray et al. Quickly generating billion-record synthetic
129 |     // databases. In SIGMOD, 1994.
130 | 
131 |     // double u = erand48(state->rand_state);
132 |     double u = fast_rand_d(&state->rand_state);
133 |     double uz = u * state->zetan;
134 |     if (uz < 1.)
135 |       return 0UL;
136 |     else if (uz < state->thres)
137 |       return 1UL;
138 |     else
139 |       return (uint64_t)(state->dbl_n *
140 |                         pow_approx(state->eta * (u - 1.) + 1., state->alpha));
141 |   }
142 | }
143 | 
144 | static double zipf_prob(const struct zipf_gen_state* state, uint64_t i) {
145 |   // this must be called after at least one zipf_next() invocation
146 |   if (state->theta == -1.)
147 |     return 1.;
148 |   else if (state->theta == 0.)
149 |     return 1.;
150 |   else if (state->theta >= 40.) {
151 |     if (i == 0)
152 |       return 1.;
153 |     else
154 |       return 0.;
155 |   } else {
156 |     return 1. / pow_approx((double)i + 1., state->theta);
157 |   }
158 | }
159 | 
160 | static void test_zipf(double theta) {
161 |   double zetan = 0.;
162 |   const uint64_t n = 1000000UL;
163 |   uint64_t i;
164 | 
165 |   for (i = 0; i < n; i++) zetan += 1. / pow((double)i + 1., theta);
166 | 
167 |   struct zipf_gen_state state;
168 |   if (theta < 1. || theta >= 40.) zipf_init(&state, n, theta, 0);
169 | 
170 |   uint64_t num_key0 = 0;
171 |   const uint64_t num_samples = 10000000UL;
172 |   if (theta < 1. || theta >= 40.) {
173 |     for (i = 0; i < num_samples; i++)
174 |       if (zipf_next(&state) == 0) num_key0++;
175 |   }
176 | 
177 |   printf("theta = %lf; using pow(): %.10lf", theta, 1. / zetan);
178 |   if (theta < 1. || theta >= 40.)
179 |     printf(", using approx-pow(): %.10lf",
180 |            (double)num_key0 / (double)num_samples);
181 |   printf("\n");
182 | }
183 | 


--------------------------------------------------------------------------------