├── .cproject
├── .gitignore
├── .project
├── .settings
└── language.settings.xml
├── Makefile
├── README.txt
├── TODO.txt
├── src
├── common.h
├── compress.c
├── list.c
├── list.h
├── stream.c
├── stream.h
├── symbol.c
└── symbol.h
├── test.sh
└── validate.sh
/.cproject:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /Debug/
2 | /gmon.out
3 | /Release/
4 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | compress
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder
10 | clean,full,incremental,
11 |
12 |
13 |
14 |
15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
16 | full,incremental,
17 |
18 |
19 |
20 |
21 |
22 | org.eclipse.cdt.core.cnature
23 | org.eclipse.cdt.managedbuilder.core.managedBuildNature
24 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
25 |
26 |
27 |
--------------------------------------------------------------------------------
/.settings/language.settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
PROG=Release/compress

# Both targets only run commands and never create a file of that name,
# so both are declared phony (a stray file named 'test_se' would otherwise
# silently disable the test)

.PHONY: test test_se

# Loopback test of the 'se' algorithm:
# compress each sample, expand the result and compare with the original

test_se:
	$(PROG) -c -m se data.bin test_out.bin
	$(PROG) -e -m se test_out.bin test_in.bin
	diff data.bin test_in.bin
	$(PROG) -c -m se code.bin test_out.bin
	$(PROG) -e -m se test_out.bin test_in.bin
	diff code.bin test_in.bin
	$(PROG) -c -m se ash.bin test_out.bin
	$(PROG) -e -m se test_out.bin test_in.bin
	diff ash.bin test_in.bin

test: test_se
18 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | PURPOSE
2 |
3 | This project features a generic compressor / decompressor, in standard C language
4 | for best performance and portability.
5 |
6 | The compressor is intended to run on a host with standard resources (development
7 | PC). The decompressor is in turn intended to run on a target with limited
8 | resources (embedded, IoT).
9 |
10 | The main goal is to save storage space on the target, by compressing the
11 | read-only data as much as possible on the host, and to decompress it on the
12 | target at the lowest cost, for a limited impact on the load time.
13 |
14 | A secondary goal is to compress and decompress on the target some limited amount
15 | of read-write data, keeping the lowest cost but having a valuable ratio.
16 |
17 | Inspired by the famous & venerable Exomizer:
18 | https://github.com/bitshifters/exomizer
19 |
20 |
21 | DESIGN
22 |
23 | Because of small data sizes on the target, compression is performed on the
24 | whole initial sequence of base symbols (= characters as byte codes). This gives
25 | a better symbol ratio, but requires more computation than the algorithms using
26 | a sliding window (these are better suited for long data streams).
27 |
28 | The compressor repeatedly scans the sequence to find elementary patterns as
29 | symbol pairs, then replaces the most frequent & asymmetric pair by a secondary
30 | symbol, thus building a binary tree of symbols and a reduced final sequence.
31 |
32 | When no more asymmetric pair is duplicated, the compressor reduces the tree,
33 | (including the repeated symbols), then serializes that tree as an indexed table
34 | of words (= dictionary), plus the final sequence.
35 |
36 | As this dictionary is static, preceding or embedded in the sequence, it saves
37 | the cost of dynamically rebuilding it at decompression.
38 |
39 | The table and the sequence are encoded as a bit stream. Base symbols are
40 | serialized as byte codes, while secondary ones are serialized using indexes.
41 |
42 | Prefixed coding is preferred to Huffman or arithmetic coding to keep the
43 | decompression cost low, even if it is less optimal.
44 |
45 | Decompression is much simpler. It decodes the bit stream, rebuilds the symbol
46 | tree from the table, iterates on the sequence and recursively walks the tree.
47 |
48 |
49 | STATUS
50 |
51 | WORK IN PROGRESS
52 |
53 | Already implemented:
54 | - symbol listing
55 | - asymmetric pairing
56 | - repeated symbol in sequence
57 | - tree walking
58 | - bit coding & streaming
59 | - external loopback test
60 |
61 | Result:
62 | - already good symbol ratio
63 | - already good decompression time
64 | - acceptable compression time
65 | - but still bad compression ratio
66 |
67 | See TODO.txt for next steps.
68 |
69 |
70 | BENCHMARK
71 |
72 | Samples from ELKS project:
73 | https://github.com/jbruchon/elks
74 |
75 | - data: kernel data only
76 | - code: kernel code only
77 | - ash: shell (mixed code & data)
78 |
79 | Compression ratio:
80 |
81 | ENCODING DATA CODE ASH
82 |
83 | Initial 6151 43584 51216
84 | B(ase) 6151 43584 51216 Just for testing
85 | R(epeat)B 5650 48716 55948 Not efficient for code
86 | P(refix)B 4840 41659 48955
87 | RPB 4752 43472 50479 Less efficient for code
88 | S(ymbol)E 4835 31821 38006
89 | SI 4547 30853 36307
90 | RSE 3875 35903 41736 Less efficient for code
91 | RSI x x x
92 | PS x x x
93 | RPS x x x
94 |
95 | gzip -1 3084 30322 34807
96 | gzip 2999 29230 33660
97 | gzip -9 2999 29216 33652
98 |
99 | exomizer 2956 29073 33192
100 |
101 |
102 | Compression time for ASH (ms):
103 |
104 | ENCODING COMPRESS EXPAND
105 |
106 | B(ase) 6 2
107 | R(epeat)B - -
108 | P(refix)B 9 3
109 | RPB - -
110 | S(ymbol)E 3885 6
111 | SI 2240 2
112 | RSE 2395 2
113 | RSI x x
114 | PS x x
115 | RPS x x
116 |
117 | gzip -1 4 2
118 | gzip 6 2
119 | gzip -9 6 2
120 |
121 | exomizer 2146 3
122 |
--------------------------------------------------------------------------------
/TODO.txt:
--------------------------------------------------------------------------------
1 | TODO LIST
2 |
3 | Needed:
4 | - quick sort on list ?
5 |
6 | Improvements:
7 | - encoding option
8 | - precompute definition length before serializing ?
9 | - merge table and sequence in RS (= RSI)
10 | - repeat symbol in tree
11 | - coding cost computation
12 | - self optimization based on cost
13 | - automatic benchmarking
14 |
15 | Huffman coding experiment:
16 | - binary tree structure
17 | - binary tree coding
18 | - binary tree decoding
19 | - adaptive tree
20 | - symbol coding
21 | - length and count coding ?
22 |
--------------------------------------------------------------------------------
/src/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
// Short aliases for the unsigned types used across the project

typedef unsigned char uchar_t;
typedef unsigned char uchar;
typedef unsigned int uint_t;
typedef unsigned int uint;

// Get a pointer to the enclosing structure from a pointer to one of its members
//   type:    type of the enclosing structure
//   member:  name of the member within that structure
//   pointer: address of that member
// The user must include <stddef.h> for offsetof ().
// There must be no space between the macro name and '(' in the definition:
// with a space the preprocessor defines an object-like macro and any use of
// structof (...) fails to compile.

#define structof(type, member, pointer) ( \
	(type *) ((char *) (pointer) - offsetof (type, member)))
10 |
--------------------------------------------------------------------------------
/src/compress.c:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------------------------
2 | // Compressor
3 | //------------------------------------------------------------------------------
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "common.h"
12 | #include "list.h"
13 | #include "stream.h"
14 | #include "symbol.h"
15 |
16 |
17 | // Element definition
18 | // Used for decompression
19 |
20 | struct elem_s
21 | {
22 | uint_t base;
23 | uint_t size;
24 | };
25 |
26 | typedef struct elem_s elem_t;
27 |
28 | static elem_t elements [SYMBOL_MAX];
29 | static uint_t elem_count;
30 |
31 | static uint_t patterns [FRAME_MAX];
32 | static uint_t patt_len;
33 |
34 |
// Program options

// Algorithm identifiers stored in opt_algo (set by the '-m' option in main)

#define ALGO_DEF 0       // value of opt_algo when '-m' is not given (zero-initialized)
#define ALGO_BASE 1      // -m b
#define ALGO_REP_BASE 2  // -m rb
#define ALGO_PREF 3      // -m pb
#define ALGO_REP_PREF 4  // -m rpb
#define ALGO_SYM_EXT 5   // -m se
#define ALGO_SYM_INT 6   // -m si
#define ALGO_REP_SE 7    // -m rse

// Option flags, set to 1 by main when the matching switch is given

uchar_t opt_algo;      // one of ALGO_*
uchar_t opt_compress;  // -c
uchar_t opt_expand;    // -e
uchar_t opt_sym;       // -s (list symbols)
uchar_t opt_verb;      // -v (verbose)
51 |
52 |
53 | //------------------------------------------------------------------------------
54 | // Algorithms
55 | //------------------------------------------------------------------------------
56 |
57 | // Compression with "base" (no compression)
58 | // Just for testing
59 |
60 | static void compress_b ()
61 | {
62 | list_t * node = pos_root.next;
63 | while (node != &pos_root)
64 | {
65 | position_t * pos = (position_t *) node; // node as first member
66 | symbol_t * sym = pos->sym;
67 | out_byte (sym->code);
68 |
69 | node = node->next;
70 | }
71 | }
72 |
73 |
74 | // Decompression with "base" (no decompression)
75 | // Just for testing
76 |
77 | static void expand_b ()
78 | {
79 | for (uint_t i = 0; i < size_in; i++)
80 | {
81 | out_byte (in_byte ());
82 | }
83 | }
84 |
85 |
86 | // Compression with "repeated base"
87 | // Just for testing
88 |
89 | static void compress_rb ()
90 | {
91 | crunch_rep ();
92 |
93 | out_pref_odd (pos_count - 1);
94 |
95 | list_t * node = pos_root.next;
96 | while (node != &pos_root)
97 | {
98 | position_t * pos = (position_t *) node; // node as first member
99 | symbol_t * sym = pos->sym;
100 |
101 | if (sym->rep_count > 1)
102 | {
103 | out_bit (1); // repeat flag
104 |
105 | out_pref_odd (sym->rep_count - 2);
106 |
107 | sym = sym->left;
108 | }
109 | else
110 | {
111 | out_bit (0);
112 | }
113 |
114 | out_code (sym->code, 8);
115 |
116 | node = node->next;
117 | }
118 |
119 | out_pad ();
120 | }
121 |
122 |
123 | // Decompression with "repeated base"
124 | // Just for testing
125 |
126 | static void expand_rb ()
127 | {
128 | uint_t count = 1 + in_pref_odd ();
129 |
130 | for (uint_t p = 0; p < count; p++)
131 | {
132 | if (!in_bit ()) // code flag
133 | {
134 | out_byte (in_code (8));
135 | }
136 | else
137 | {
138 | uint_t rep = 2 + in_pref_odd ();
139 | uchar_t code = in_code (8);
140 | while (rep--) out_byte (code);
141 | }
142 | }
143 | }
144 |
145 |
146 | // Compression with "prefixed base"
147 | // Just for testing
148 |
149 | static void compress_pb ()
150 | {
151 | if (!opt_sym) sym_sort (SORT_ALL);
152 |
153 | // No more than 6 prefixed bits to save space
154 | // so no more than 14 indexed symbols
155 |
156 | uint_t count = (sym_count < 14) ? sym_count : 14;
157 |
158 | out_pref_odd (count - 1);
159 |
160 | for (uint_t i = 0; i < count; i++)
161 | {
162 | index_sym_t * index = index_sym + i;
163 | symbol_t * sym = index->sym;
164 | out_code (sym->code, 8);
165 | }
166 |
167 | out_pref_odd (pos_count - 1);
168 |
169 | list_t * node = pos_root.next;
170 | while (node != &pos_root)
171 | {
172 | position_t * pos = (position_t *) node; // node as first member
173 | symbol_t * sym = pos->sym;
174 |
175 | // Use index only when space gain
176 |
177 | if (sym->index < count)
178 | {
179 | out_bit (1); // index flag
180 | out_pref_even (sym->index);
181 | }
182 | else
183 | {
184 | out_bit (0); // code flag
185 | out_code (sym->code, 8);
186 | }
187 |
188 | node = node->next;
189 | }
190 |
191 | out_pad ();
192 | }
193 |
194 |
195 | // Decompression with "prefixed base"
196 | // Just for testing
197 |
198 | static void expand_pb ()
199 | {
200 | list_init (&sym_root);
201 |
202 | uint_t count = 1 + in_pref_odd ();
203 |
204 | for (uint_t i = 0; i < count; i++)
205 | {
206 | index_sym_t * index = index_sym + i;
207 | symbol_t * sym = sym_add ();
208 | sym->code = in_code (8);
209 | index->sym = sym;
210 | }
211 |
212 | count = 1 + in_pref_odd ();
213 |
214 | for (uint_t p = 0; p < count; p++)
215 | {
216 | if (in_bit ()) // index flag
217 | {
218 | uint_t i = in_pref_even ();
219 | index_sym_t * index = index_sym + i;
220 | symbol_t * sym = index->sym;
221 | out_byte (sym->code);
222 | }
223 | else
224 | {
225 | out_byte (in_code (8));
226 | }
227 | }
228 | }
229 |
230 |
231 | // Compression with "repeated prefixed base"
232 | // Just for testing
233 |
234 | static void compress_rpb ()
235 | {
236 | crunch_rep ();
237 |
238 | uint_t count = sym_sort (SORT_REP);
239 |
240 | // No more than 6 prefixed bits to save space
241 | // so no more than 14 indexed symbols
242 |
243 | count = (count < 14) ? count : 14;
244 |
245 | out_pref_odd (count - 1);
246 |
247 | for (uint_t i = 0; i < count; i++)
248 | {
249 | index_sym_t * index = index_sym + i;
250 | symbol_t * sym = index->sym;
251 | out_code (sym->code, 8);
252 | }
253 |
254 | out_pref_odd (pos_count - 1);
255 |
256 | list_t * node = pos_root.next;
257 | while (node != &pos_root)
258 | {
259 | position_t * pos = (position_t *) node; // node as first member
260 | symbol_t * sym = pos->sym;
261 |
262 | uchar_t rep = 0;
263 |
264 | if (sym->rep_count > 1)
265 | {
266 | out_bit (1); // repeat word
267 | out_bit (0);
268 |
269 | out_pref_odd (sym->rep_count - 2);
270 |
271 | sym = sym->left;
272 |
273 | rep = 1;
274 | }
275 |
276 | if (sym->index < 14)
277 | {
278 | out_bit (1); // index flag or word
279 | if (!rep) out_bit (1);
280 |
281 | out_pref_even (sym->index);
282 | }
283 | else
284 | {
285 | out_bit (0); // code flag and word
286 | out_code (sym->code, 8);
287 | }
288 |
289 | node = node->next;
290 | }
291 |
292 | out_pad ();
293 | }
294 |
295 |
296 | // Decompression with "repeated prefixed base"
297 | // Just for testing
298 |
299 | static void expand_rpb ()
300 | {
301 | list_init (&sym_root);
302 |
303 | uint_t count = 1 + in_pref_odd ();
304 |
305 | for (uint_t i = 0; i < count; i++)
306 | {
307 | index_sym_t * index = index_sym + i;
308 | symbol_t * sym = sym_add ();
309 | sym->code = in_code (8);
310 | index->sym = sym;
311 | }
312 |
313 | count = 1 + in_pref_odd ();
314 |
315 | for (uint_t p = 0; p < count; p++)
316 | {
317 | if (!in_bit ()) // code word
318 | {
319 | out_byte (in_code (8));
320 | }
321 | else
322 | {
323 | uint_t rep = 1;
324 |
325 | if (!in_bit ()) // repeat word
326 | {
327 | rep = 2 + in_pref_odd ();
328 |
329 | if (in_bit ()) // index flag
330 | {
331 | uint_t i = in_pref_even ();
332 | index_sym_t * index = index_sym + i;
333 | symbol_t * sym = index->sym;
334 | while (rep--) out_byte (sym->code);
335 | }
336 | else
337 | {
338 | uchar_t code = in_code (8);
339 | while (rep--) out_byte (code);
340 | }
341 | }
342 | else
343 | {
344 | uint_t i = in_pref_even ();
345 | index_sym_t * index = index_sym + i;
346 | symbol_t * sym = index->sym;
347 | out_byte (sym->code);
348 | }
349 | }
350 | }
351 | }
352 |
353 |
354 | // Walking the symbol tree
355 |
356 | static uint_t walk_sym_len (symbol_t * sym);
357 |
358 | static uint_t walk_child_len (symbol_t * sym)
359 | {
360 | uint_t len;
361 |
362 | if (!sym->keep)
363 | {
364 | len = walk_sym_len (sym);
365 | }
366 | else
367 | {
368 | len = 1; // reference
369 | }
370 |
371 | return len;
372 | }
373 |
374 | static uint_t walk_sym_len (symbol_t * sym)
375 | {
376 | if (!sym->len)
377 | {
378 | if (sym->size == 1)
379 | {
380 | sym->len = 1; // base code
381 | }
382 | else
383 | {
384 | sym->len = walk_child_len (sym->left);
385 | sym->len += walk_child_len (sym->right);
386 | }
387 | }
388 |
389 | return sym->len;
390 | }
391 |
392 |
393 | // Walk tree to compute cost
394 |
395 | static uint_t walk_def_cost (symbol_t * sym, uchar_t bit_len);
396 |
397 | static uint_t walk_use_cost (symbol_t * sym, uchar_t bit_len)
398 | {
399 | uint_t cost;
400 |
401 | if (!sym->keep)
402 | {
403 | cost = walk_def_cost (sym, bit_len);
404 | }
405 | else
406 | {
407 | // '1' for 'reference' + size of 'index'
408 | cost = 1 + bit_len;
409 | }
410 |
411 | return cost;
412 | }
413 |
414 | static uint_t walk_def_cost (symbol_t * sym, uchar_t bit_len)
415 | {
416 | uint_t cost;
417 |
418 | if (sym->size == 1)
419 | {
420 | // '0' for 'base' + 8 for base code
421 | cost = 1 + 8;
422 | }
423 | else
424 | {
425 | cost = walk_use_cost (sym->left, bit_len);
426 | cost += walk_use_cost (sym->right, bit_len);
427 | }
428 |
429 | return cost;
430 | }
431 |
432 |
433 | static void walk_def_out (symbol_t * sym, uchar_t bit_len);
434 |
435 | static void walk_use_out (symbol_t * sym, uchar_t bit_len)
436 | {
437 | if (!sym->keep)
438 | {
439 | walk_def_out (sym, bit_len);
440 | }
441 | else
442 | {
443 | out_bit (1); // index
444 | out_code (sym->index, bit_len);
445 | }
446 | }
447 |
448 | static void walk_def_out (symbol_t * sym, uchar_t bit_len)
449 | {
450 | if (sym->size == 1)
451 | {
452 | out_bit (0); // code
453 | out_code (sym->code, 8);
454 | }
455 | else
456 | {
457 | walk_use_out (sym->left, bit_len);
458 | walk_use_out (sym->right, bit_len);
459 | }
460 | }
461 |
462 |
463 | static void walk_sym_i (symbol_t * sym, uchar_t bit_len);
464 |
465 | static void walk_child_i (symbol_t * sym, uchar_t bit_len)
466 | {
467 | if (sym->size == 1 || (sym->sym_count == 1 && sym->pos_count == 0 && sym->rep_count != 1))
468 | {
469 | walk_sym_i (sym, bit_len);
470 | }
471 | else
472 | {
473 | if (!sym->len)
474 | {
475 | out_bit (1); // definition
476 | out_bit (0);
477 |
478 | out_pref_odd (walk_sym_len (sym) - 2);
479 | walk_sym_i (sym, bit_len);
480 |
481 | sym->index = index_count++;
482 | }
483 | else
484 | {
485 | out_bit (1); // reference
486 | out_bit (1);
487 |
488 | out_code (sym->index, bit_len);
489 | }
490 | }
491 | }
492 |
493 |
494 | static void walk_sym_i (symbol_t * sym, uchar_t bit_len)
495 | {
496 | if (sym->size == 1)
497 | {
498 | out_bit (0); // code
499 | out_code (sym->code, 8);
500 | }
501 | else
502 | {
503 | walk_child_i (sym->left, bit_len);
504 | walk_child_i (sym->right, bit_len);
505 | }
506 | }
507 |
508 |
509 | // Walk the element tree
510 |
511 | #define PATTERN_MAX (32768)
512 |
513 | static uint last_elem = 0;
514 | static uint depth = 0;
515 |
516 | static void walk_elem (uint_t i)
517 | {
518 | depth++;
519 | if (last_elem == i && depth > 1)
520 | {
521 | puts ("HELP !");
522 | }
523 |
524 | elem_t * elem = elements + i;
525 | uint_t base = elem->base;
526 |
527 | for (uint_t j = 0; j < elem->size; j++)
528 | {
529 | uint_t patt = patterns [base++];
530 | if (patt & PATTERN_MAX)
531 | {
532 | last_elem = i;
533 | walk_elem (patt & (PATTERN_MAX - 1));
534 | }
535 | else
536 | out_byte (patt);
537 |
538 | }
539 | depth--;
540 | }
541 |
542 |
// Compression with "symbol"
// Prepended dictionary (external)
//
// Steps:
// 1. crunch the frame into words, then get the initial set of kept symbols
// 2. repeatedly evaluate the encoding cost with the current set and drop
//    the kept symbol with the lowest gain, remembering the definition
//    count that gave the lowest total cost
// 3. restore the best set, index it, output the dictionary then the frame
//
// Stream layout: definition count - 1, then for each definition its
// length - 1 and its content, then the frame until the end of the stream
// (no position count is written; expand_se reads until in_eof)

static void compress_se ()
{
	crunch_word ();

	if (opt_sym)
	{
		sym_sort (SORT_DUP);
		sym_list (LIST_ALL);
	}

	// Initial symbol filtering

	uint_t def_count = filter_init ();
	uchar_t bit_len;

	uint min_cost = UINT_MAX;
	uint min_def = def_count;

	// NOTE(review): if filter_init () can return 0, 'def_count - 1' wraps
	// below and the loop cannot terminate properly - confirm it is >= 1

	while (1)
	{
		// Size of an index for the current number of definitions
		bit_len = log2u (def_count - 1);

		// Compute tree cost

		uint_t tree_cost = 0;
		list_t * node = sym_root.next;
		for (uint_t i = 0; i < sym_count; i++)
		{
			symbol_t * sym = (symbol_t *) node; // node as first member
			walk_sym_len (sym);
			if (sym->keep)
			{
				// cost0 = content only, cost1 = content + length prefix
				uint_t cost0 = walk_def_cost (sym, bit_len);
				uint_t cost1 = cost0;
				if (sym->len > 1) cost1 += cost_pref_odd (sym->len);

				// Gain = sym_count inline expansions
				// minus one definition and sym_count references
				sym->tree_gain = cost0 * sym->sym_count - cost1 - (1 + bit_len) * sym->sym_count;
				tree_cost += cost1;
			}

			node = node->next;
		}

		// Compute frame cost

		uint_t pos_cost = 0;
		node = pos_root.next;
		while (node != &pos_root)
		{
			position_t * pos = (position_t *) node; // node as first member
			symbol_t * sym = pos->sym;

			// cost0 = cost as currently encoded, cost1 = cost when expanded
			uint_t cost0 = walk_use_cost (sym, bit_len);
			uint_t cost1 = walk_def_cost (sym, bit_len);

			// NOTE(review): pos_gain is accumulated here but is not
			// reset in this function between passes, unlike tree_gain
			// that is reassigned - confirm this is intended
			sym->pos_gain += cost1 - cost0;
			pos_cost += cost0;

			node = node->next;
		}

		// Compute total cost
		// and get the gain loser

		int gain_min = INT_MAX;
		symbol_t * sym_min = NULL;

		node = sym_root.next;
		for (uint_t i = 0; i < sym_count; i++)
		{
			symbol_t * sym = (symbol_t *) node; // node as first member
			if (sym->keep)
			{
				sym->all_gain = sym->tree_gain + sym->pos_gain;
				if (sym->all_gain < gain_min)
				{
					gain_min = sym->all_gain;
					sym_min = sym;
				}
			}

			node = node->next;
		}

		// Remember the definition count giving the lowest cost so far

		uint_t all_cost = tree_cost + pos_cost;
		if (all_cost < min_cost)
		{
			min_cost = all_cost;
			min_def = def_count;
		}

		// Drop the loser and record the pass at which it was dropped
		// NOTE(review): sym_min stays NULL if no kept symbol was found
		// above - assumes at least one kept symbol per pass

		sym_min->keep = 0;
		sym_min->pass = def_count;

		// Reset previous calculation

		node = sym_root.next;
		while (node != &sym_root)
		{
			symbol_t * sym = (symbol_t *) node; // node as first member
			sym->len = 0;
			node = node->next;
		}

		if (--def_count == 0) break;
	}

	if (opt_verb)
	{
		printf ("Minimal encoding cost = %u\n", min_cost);
		printf ("Best definition count = %u\n\n", min_def);
	}

	// Go back to the best definition count

	def_count = min_def;
	bit_len = log2u (def_count - 1);

	// Index the symbols
	// A symbol dropped at pass <= def_count was still kept
	// when the best cost was measured, so it is restored

	uint_t index = 0;
	list_t * node = sym_root.next;
	while (node != &sym_root)
	{
		symbol_t * sym = (symbol_t *) node; // node as first member
		if (sym->pass && sym->pass <= def_count)
		{
			sym->keep = 1;
			sym->index = index++;
		}

		node = node->next;
	}

	// Output symbol dictionary

	out_pref_odd (def_count - 1);

	node = sym_root.next;
	while (node != &sym_root)
	{
		symbol_t * sym = (symbol_t *) node; // node as first member
		if (sym->keep)
		{
			// NOTE(review): expand_se always reads a length prefix,
			// so a kept symbol is assumed to have a length > 1
			uint_t len = walk_sym_len (sym);
			if (len > 1) out_pref_odd (len - 1);
			walk_def_out (sym, bit_len);
		}

		node = node->next;
	}

	// Output frame

	node = pos_root.next;
	while (node != &pos_root)
	{
		position_t * pos = (position_t *) node; // node as first member
		symbol_t * sym = pos->sym;

		walk_use_out (sym, bit_len);

		node = node->next;
	}

	out_pad ();
}
709 |
710 |
711 | // Decompression with "symbol"
712 | // Prepended dictionary (external)
713 |
714 | static void expand_se ()
715 | {
716 | uint_t def_count = 1 + in_pref_odd ();
717 | uchar_t bit_len = log2u (def_count - 1);
718 |
719 | for (uint_t i = 0; i < def_count; i++)
720 | {
721 | elem_t * elem = elements + i;
722 |
723 | uint_t size = 1 + in_pref_odd ();
724 |
725 | elem->size = size;
726 | elem->base = patt_len;
727 |
728 | if (size == 1)
729 | patterns [patt_len++] = in_code (8);
730 | else
731 | for (uint_t j = 0; j < size; j++)
732 | if (in_bit ()) // index
733 | patterns [patt_len++] = PATTERN_MAX | in_code (bit_len);
734 | else
735 | patterns [patt_len++] = in_code (8);
736 |
737 | }
738 |
739 | while (1)
740 | {
741 | if (in_eof ()) break;
742 |
743 | if (in_bit ()) // index
744 | {
745 | uint_t i = in_code (bit_len);
746 | walk_elem (i);
747 | }
748 | else
749 | {
750 | uchar_t code = in_code (8);
751 | out_byte (code);
752 | }
753 | }
754 | }
755 |
756 |
757 | // Compression with "symbol"
758 | // Embedded dictionary (internal)
759 |
760 | static void compress_si ()
761 | {
762 | crunch_word ();
763 |
764 | uint_t def_count = sym_sort (SORT_DUP);
765 | uchar_t bit_len = log2u (def_count);
766 |
767 | out_pref_odd (bit_len - 1);
768 | out_pref_odd (pos_count - 1);
769 |
770 | list_t * node = pos_root.next;
771 | while (node != &pos_root)
772 | {
773 | position_t * pos = (position_t *) node; // node as first member
774 | symbol_t * sym = pos->sym;
775 |
776 | walk_child_i (sym, bit_len);
777 |
778 | node = node->next;
779 | }
780 |
781 | out_pad ();
782 | }
783 |
784 |
785 | // Decompression with "symbol"
786 | // Embedded dictionary (internal)
787 |
788 | static uint_t in_elem (uchar_t bit_len)
789 | {
790 | uint_t size;
791 |
792 | if (!in_bit ()) // byte code
793 | {
794 | uchar_t code = in_code (8);
795 | out_byte (code);
796 | size = 1;
797 | }
798 | else
799 | {
800 | if (in_bit ()) // reference
801 | {
802 | uint_t i = in_code (bit_len);
803 | elem_t * elem = elements + i;
804 | size = elem->size;
805 |
806 | memcpy (frame_out + size_out, frame_out + elem->base, size);
807 |
808 | size_out += size;
809 | }
810 | else
811 | {
812 | // definition
813 |
814 | uint_t base = size_out;
815 | size = 0;
816 |
817 | uint_t len = 2 + in_pref_odd ();
818 |
819 | for (uint_t i = 0; i < len; i++)
820 | size += in_elem (bit_len);
821 |
822 | // Parent element created after child
823 |
824 | elem_t * elem = elements + elem_count++;
825 | elem->base = base;
826 | elem->size = size;
827 | }
828 | }
829 |
830 | return size;
831 | }
832 |
833 |
834 | static void expand_si ()
835 | {
836 | uchar_t bit_len = 1 + in_pref_odd ();
837 |
838 | uint_t pos_count = 1 + in_pref_odd ();
839 |
840 | for (uint_t p = 0; p < pos_count; p++)
841 | in_elem (bit_len);
842 |
843 | }
844 |
845 |
846 | // Compression with "repeated symbol"
847 | // Prepended dictionary (external)
848 |
849 | static void compress_rse ()
850 | {
851 | crunch_word ();
852 | crunch_rep ();
853 |
854 | uint_t count = sym_sort (SORT_DUP);
855 | uchar_t len = log2u (count);
856 |
857 | out_pref_odd (count - 1);
858 |
859 | for (uint_t i = 0; i < count; i++)
860 | {
861 | index_sym_t * index = index_sym + i;
862 | symbol_t * sym = index->sym;
863 |
864 | out_pref_odd (walk_sym_len (sym) - 2);
865 | walk_def_out (sym, len);
866 | }
867 |
868 | out_pref_odd (pos_count - 1);
869 |
870 | list_t * node = pos_root.next;
871 | while (node != &pos_root)
872 | {
873 | position_t * pos = (position_t *) node; // node as first member
874 | symbol_t * sym = pos->sym;
875 |
876 | uint_t rep = 1;
877 | if (sym->rep_count > 1)
878 | {
879 | rep = sym->rep_count;
880 | sym = sym->left;
881 |
882 | // TODO: use code, repeat and insert symbols
883 |
884 | out_bit (1); // repeat
885 | out_pref_odd (rep - 2);
886 | }
887 | else
888 | {
889 | out_bit (0); // no repeat
890 | }
891 |
892 | walk_use_out (sym, len);
893 |
894 | node = node->next;
895 | }
896 |
897 | out_pad ();
898 | }
899 |
900 |
901 | // Decompression with "repeated symbol"
902 | // Prepended dictionary (external)
903 |
904 | static void expand_rse ()
905 | {
906 | uint_t count = 1 + in_pref_odd ();
907 | uchar_t len = log2u (count);
908 |
909 | for (uint_t i = 0; i < count; i++)
910 | {
911 | elem_t * elem = elements + i;
912 |
913 | uint_t size = 2 + in_pref_odd ();
914 |
915 | elem->size = size;
916 | elem->base = patt_len;
917 |
918 | for (uint_t j = 0; j < size; j++)
919 | {
920 | if (in_bit ()) // index
921 | patterns [patt_len++] = 32768 | in_code (len);
922 | else
923 | patterns [patt_len++] = in_code (8);
924 |
925 | }
926 | }
927 |
928 | count = 1 + in_pref_odd ();
929 |
930 | for (uint_t p = 0; p < count; p++)
931 | {
932 | uint_t rep = 1;
933 | if (in_bit ()) // repeat
934 | rep = 2 + in_pref_odd ();
935 |
936 | if (in_bit ()) // index
937 | {
938 | uint_t i = in_code (len);
939 | while (rep--) walk_elem (i);
940 | }
941 | else
942 | {
943 | uchar_t code = in_code (8);
944 | while (rep--) out_byte (code);
945 | }
946 | }
947 | }
948 |
949 |
950 | //------------------------------------------------------------------------------
951 | // Main entry point
952 | //------------------------------------------------------------------------------
953 |
954 | int main (int argc, char * argv [])
955 | {
956 | clock_t clock_begin = clock ();
957 |
958 | while (1)
959 | {
960 | char opt;
961 |
962 | while (1)
963 | {
964 | opt = getopt (argc, argv, "cem:sv");
965 | if (opt < 0 || opt == '?') break;
966 |
967 | switch (opt)
968 | {
969 | case 'c': // compress
970 | opt_compress = 1;
971 | break;
972 |
973 | case 'e': // expand
974 | opt_expand = 1;
975 | break;
976 |
977 | case 'm': // algorithm
978 | if (!strcmp (optarg, "b"))
979 | opt_algo = ALGO_BASE;
980 | else if (!strcmp (optarg, "rb"))
981 | opt_algo = ALGO_REP_BASE;
982 | else if (!strcmp (optarg, "pb"))
983 | opt_algo = ALGO_PREF;
984 | else if (!strcmp (optarg, "rpb"))
985 | opt_algo = ALGO_REP_PREF;
986 | else if (!strcmp (optarg, "se"))
987 | opt_algo = ALGO_SYM_EXT;
988 | else if (!strcmp (optarg, "si"))
989 | opt_algo = ALGO_SYM_INT;
990 | else if (!strcmp (optarg, "rse"))
991 | opt_algo = ALGO_REP_SE;
992 | else
993 | error (1, 0, "unknown algorithm");
994 |
995 | break;
996 |
997 | case 's': // list symbols
998 | opt_sym = 1;
999 | break;
1000 |
1001 | case 'v': // verbose
1002 | opt_verb = 1;
1003 | break;
1004 |
1005 | }
1006 | }
1007 |
1008 | if (opt == '?' || optind != argc - 2 || (opt_compress == opt_expand))
1009 | {
1010 | printf ("usage: %s (-c | -d) [-sv] [-m ]