├── .cproject ├── .gitignore ├── .project ├── .settings └── language.settings.xml ├── Makefile ├── README.txt ├── TODO.txt ├── src ├── common.h ├── compress.c ├── list.c ├── list.h ├── stream.c ├── stream.h ├── symbol.c └── symbol.h ├── test.sh └── validate.sh /.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 27 | 28 | 34 | 35 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 80 | 81 | 85 | 86 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /Debug/ 2 | /gmon.out 3 | /Release/ 4 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | compress 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 10 | clean,full,incremental, 11 | 12 | 13 | 14 | 15 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 16 | full,incremental, 17 | 18 | 19 | 20 | 21 | 22 | org.eclipse.cdt.core.cnature 23 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 24 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 25 | 26 | 27 | -------------------------------------------------------------------------------- /.settings/language.settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 
| 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | PROG=Release/compress 3 | 4 | .PHONY: test 5 | 6 | test_se: 7 | $(PROG) -c -m se data.bin test_out.bin 8 | $(PROG) -e -m se test_out.bin test_in.bin 9 | diff data.bin test_in.bin 10 | $(PROG) -c -m se code.bin test_out.bin 11 | $(PROG) -e -m se test_out.bin test_in.bin 12 | diff code.bin test_in.bin 13 | $(PROG) -c -m se ash.bin test_out.bin 14 | $(PROG) -e -m se test_out.bin test_in.bin 15 | diff ash.bin test_in.bin 16 | 17 | test: test_se 18 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | PURPOSE 2 | 3 | This project features a generic compressor / decompressor, in standard C language 4 | for best performance and portability. 5 | 6 | The compressor is intended to run on a host with standard resources (development 7 | PC). The decompressor is in turn intended to run on a target with limited 8 | resources (embedded, IoT). 9 | 10 | The main goal is to save storage space on the target, by compressing the 11 | read-only data on the host as much as possible, and to decompress on the target at the lowest cost, 12 | for a limited impact on the load time. 13 | 14 | A secondary goal is to compress and decompress on the target some limited amount 15 | of read-write data, keeping the lowest cost but having a valuable ratio. 16 | 17 | Inspired by the famous & venerable Exomizer: 18 | https://github.com/bitshifters/exomizer 19 | 20 | 21 | DESIGN 22 | 23 | Because of small data sizes on the target, compression is performed on the 24 | whole initial sequence of base symbols (= characters as byte codes). 
This gives 25 | a better symbol ratio, but requires more computation than the algorithms using 26 | a sliding window (these are better suited for long data streams). 27 | 28 | The compressor repeatedly scans the sequence to find elementary patterns as 29 | symbol pairs, then replaces the most frequent & asymmetric pair by a secondary 30 | symbol, thus building a binary tree of symbols and a reduced final sequence. 31 | 32 | When no more asymmetric pair is duplicated, the compressor reduces the tree 33 | (including the repeated symbols), then serializes that tree as an indexed table 34 | of words (= dictionary), plus the final sequence. 35 | 36 | As this dictionary is static, preceding or embedded in the sequence, it saves 37 | the cost of dynamically rebuilding it at decompression. 38 | 39 | The table and the sequence are encoded as a bit stream. Base symbols are 40 | serialized as byte codes, while secondary ones are serialized using indexes. 41 | 42 | Prefixed coding is preferred to Huffman or arithmetic coding to keep the 43 | decompression cost low, even if less optimal. 44 | 45 | Decompression is much simpler. It decodes the bit stream, rebuilds the symbol 46 | tree from the table, iterates on the sequence and recursively walks the tree. 47 | 48 | 49 | STATUS 50 | 51 | WORK IN PROGRESS 52 | 53 | Already implemented: 54 | - symbol listing 55 | - asymmetric pairing 56 | - repeated symbol in sequence 57 | - tree walking 58 | - bit coding & streaming 59 | - external loopback test 60 | 61 | Result: 62 | - already good symbol ratio 63 | - already good decompression time 64 | - acceptable compression time 65 | - but still bad compression ratio 66 | 67 | See TODO.txt for next steps. 
68 | 69 | 70 | BENCHMARK 71 | 72 | Samples from ELKS project: 73 | https://github.com/jbruchon/elks 74 | 75 | - data: kernel data only 76 | - code: kernel code only 77 | - ash: shell (mixed code & data) 78 | 79 | Compression ratio: 80 | 81 | ENCODING DATA CODE ASH 82 | 83 | Initial 6151 43584 51216 84 | B(ase) 6151 43584 51216 Just for testing 85 | R(epeat)B 5650 48716 55948 Not efficient for code 86 | P(refix)B 4840 41659 48955 87 | RPB 4752 43472 50479 Less efficient for code 88 | S(ymbol)E 4835 31821 38006 89 | SI 4547 30853 36307 90 | RSE 3875 35903 41736 Less efficient for code 91 | RSI x x x 92 | PS x x x 93 | RPS x x x 94 | 95 | gzip -1 3084 30322 34807 96 | gzip 2999 29230 33660 97 | gzip -9 2999 29216 33652 98 | 99 | exomizer 2956 29073 33192 100 | 101 | 102 | Compression time for ASH (ms): 103 | 104 | ENCODING COMPRESS EXPAND 105 | 106 | B(ase) 6 2 107 | R(epeat)B - - 108 | P(refix)B 9 3 109 | RPB - - 110 | S(ymbol)E 3885 6 111 | SI 2240 2 112 | RSE 2395 2 113 | RSI x x 114 | PS x x 115 | RPS x x 116 | 117 | gzip -1 4 2 118 | gzip 6 2 119 | gzip -9 6 2 120 | 121 | exomizer 2146 3 122 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | TODO LIST 2 | 3 | Needed: 4 | - quick sort on list ? 5 | 6 | Improvements: 7 | - encoding option 8 | - precompute definition length before serializing ? 9 | - merge table and sequence in RS (= RSI) 10 | - repeat symbol in tree 11 | - coding cost computation 12 | - self optimization based on cost 13 | - automatic benchmarking 14 | 15 | Huffman coding experiment: 16 | - binary tree structure 17 | - binary tree coding 18 | - binary tree decoding 19 | - adaptive tree 20 | - symbol coding 21 | - length and count coding ? 
22 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | typedef unsigned char uchar_t; 4 | typedef unsigned char uchar; 5 | typedef unsigned int uint_t; 6 | typedef unsigned int uint; 7 | 8 | #define structof (type, member, pointer) ( \ 9 | (type *) ((char *) pointer - offsetof (type, member))) 10 | -------------------------------------------------------------------------------- /src/compress.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Compressor 3 | //------------------------------------------------------------------------------ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "common.h" 12 | #include "list.h" 13 | #include "stream.h" 14 | #include "symbol.h" 15 | 16 | 17 | // Element definition 18 | // Used for decompression 19 | 20 | struct elem_s 21 | { 22 | uint_t base; 23 | uint_t size; 24 | }; 25 | 26 | typedef struct elem_s elem_t; 27 | 28 | static elem_t elements [SYMBOL_MAX]; 29 | static uint_t elem_count; 30 | 31 | static uint_t patterns [FRAME_MAX]; 32 | static uint_t patt_len; 33 | 34 | 35 | // Program options 36 | 37 | #define ALGO_DEF 0 38 | #define ALGO_BASE 1 39 | #define ALGO_REP_BASE 2 40 | #define ALGO_PREF 3 41 | #define ALGO_REP_PREF 4 42 | #define ALGO_SYM_EXT 5 43 | #define ALGO_SYM_INT 6 44 | #define ALGO_REP_SE 7 45 | 46 | uchar_t opt_algo; 47 | uchar_t opt_compress; 48 | uchar_t opt_expand; 49 | uchar_t opt_sym; 50 | uchar_t opt_verb; 51 | 52 | 53 | //------------------------------------------------------------------------------ 54 | // Algorithms 55 | //------------------------------------------------------------------------------ 56 | 57 | // Compression with "base" (no compression) 58 | // Just for testing 59 | 60 | 
static void compress_b () 61 | { 62 | list_t * node = pos_root.next; 63 | while (node != &pos_root) 64 | { 65 | position_t * pos = (position_t *) node; // node as first member 66 | symbol_t * sym = pos->sym; 67 | out_byte (sym->code); 68 | 69 | node = node->next; 70 | } 71 | } 72 | 73 | 74 | // Decompression with "base" (no decompression) 75 | // Just for testing 76 | 77 | static void expand_b () 78 | { 79 | for (uint_t i = 0; i < size_in; i++) 80 | { 81 | out_byte (in_byte ()); 82 | } 83 | } 84 | 85 | 86 | // Compression with "repeated base" 87 | // Just for testing 88 | 89 | static void compress_rb () 90 | { 91 | crunch_rep (); 92 | 93 | out_pref_odd (pos_count - 1); 94 | 95 | list_t * node = pos_root.next; 96 | while (node != &pos_root) 97 | { 98 | position_t * pos = (position_t *) node; // node as first member 99 | symbol_t * sym = pos->sym; 100 | 101 | if (sym->rep_count > 1) 102 | { 103 | out_bit (1); // repeat flag 104 | 105 | out_pref_odd (sym->rep_count - 2); 106 | 107 | sym = sym->left; 108 | } 109 | else 110 | { 111 | out_bit (0); 112 | } 113 | 114 | out_code (sym->code, 8); 115 | 116 | node = node->next; 117 | } 118 | 119 | out_pad (); 120 | } 121 | 122 | 123 | // Decompression with "repeated base" 124 | // Just for testing 125 | 126 | static void expand_rb () 127 | { 128 | uint_t count = 1 + in_pref_odd (); 129 | 130 | for (uint_t p = 0; p < count; p++) 131 | { 132 | if (!in_bit ()) // code flag 133 | { 134 | out_byte (in_code (8)); 135 | } 136 | else 137 | { 138 | uint_t rep = 2 + in_pref_odd (); 139 | uchar_t code = in_code (8); 140 | while (rep--) out_byte (code); 141 | } 142 | } 143 | } 144 | 145 | 146 | // Compression with "prefixed base" 147 | // Just for testing 148 | 149 | static void compress_pb () 150 | { 151 | if (!opt_sym) sym_sort (SORT_ALL); 152 | 153 | // No more than 6 prefixed bits to save space 154 | // so no more than 14 indexed symbols 155 | 156 | uint_t count = (sym_count < 14) ? 
sym_count : 14; 157 | 158 | out_pref_odd (count - 1); 159 | 160 | for (uint_t i = 0; i < count; i++) 161 | { 162 | index_sym_t * index = index_sym + i; 163 | symbol_t * sym = index->sym; 164 | out_code (sym->code, 8); 165 | } 166 | 167 | out_pref_odd (pos_count - 1); 168 | 169 | list_t * node = pos_root.next; 170 | while (node != &pos_root) 171 | { 172 | position_t * pos = (position_t *) node; // node as first member 173 | symbol_t * sym = pos->sym; 174 | 175 | // Use index only when space gain 176 | 177 | if (sym->index < count) 178 | { 179 | out_bit (1); // index flag 180 | out_pref_even (sym->index); 181 | } 182 | else 183 | { 184 | out_bit (0); // code flag 185 | out_code (sym->code, 8); 186 | } 187 | 188 | node = node->next; 189 | } 190 | 191 | out_pad (); 192 | } 193 | 194 | 195 | // Decompression with "prefixed base" 196 | // Just for testing 197 | 198 | static void expand_pb () 199 | { 200 | list_init (&sym_root); 201 | 202 | uint_t count = 1 + in_pref_odd (); 203 | 204 | for (uint_t i = 0; i < count; i++) 205 | { 206 | index_sym_t * index = index_sym + i; 207 | symbol_t * sym = sym_add (); 208 | sym->code = in_code (8); 209 | index->sym = sym; 210 | } 211 | 212 | count = 1 + in_pref_odd (); 213 | 214 | for (uint_t p = 0; p < count; p++) 215 | { 216 | if (in_bit ()) // index flag 217 | { 218 | uint_t i = in_pref_even (); 219 | index_sym_t * index = index_sym + i; 220 | symbol_t * sym = index->sym; 221 | out_byte (sym->code); 222 | } 223 | else 224 | { 225 | out_byte (in_code (8)); 226 | } 227 | } 228 | } 229 | 230 | 231 | // Compression with "repeated prefixed base" 232 | // Just for testing 233 | 234 | static void compress_rpb () 235 | { 236 | crunch_rep (); 237 | 238 | uint_t count = sym_sort (SORT_REP); 239 | 240 | // No more than 6 prefixed bits to save space 241 | // so no more than 14 indexed symbols 242 | 243 | count = (count < 14) ? 
count : 14; 244 | 245 | out_pref_odd (count - 1); 246 | 247 | for (uint_t i = 0; i < count; i++) 248 | { 249 | index_sym_t * index = index_sym + i; 250 | symbol_t * sym = index->sym; 251 | out_code (sym->code, 8); 252 | } 253 | 254 | out_pref_odd (pos_count - 1); 255 | 256 | list_t * node = pos_root.next; 257 | while (node != &pos_root) 258 | { 259 | position_t * pos = (position_t *) node; // node as first member 260 | symbol_t * sym = pos->sym; 261 | 262 | uchar_t rep = 0; 263 | 264 | if (sym->rep_count > 1) 265 | { 266 | out_bit (1); // repeat word 267 | out_bit (0); 268 | 269 | out_pref_odd (sym->rep_count - 2); 270 | 271 | sym = sym->left; 272 | 273 | rep = 1; 274 | } 275 | 276 | if (sym->index < 14) 277 | { 278 | out_bit (1); // index flag or word 279 | if (!rep) out_bit (1); 280 | 281 | out_pref_even (sym->index); 282 | } 283 | else 284 | { 285 | out_bit (0); // code flag and word 286 | out_code (sym->code, 8); 287 | } 288 | 289 | node = node->next; 290 | } 291 | 292 | out_pad (); 293 | } 294 | 295 | 296 | // Decompression with "repeated prefixed base" 297 | // Just for testing 298 | 299 | static void expand_rpb () 300 | { 301 | list_init (&sym_root); 302 | 303 | uint_t count = 1 + in_pref_odd (); 304 | 305 | for (uint_t i = 0; i < count; i++) 306 | { 307 | index_sym_t * index = index_sym + i; 308 | symbol_t * sym = sym_add (); 309 | sym->code = in_code (8); 310 | index->sym = sym; 311 | } 312 | 313 | count = 1 + in_pref_odd (); 314 | 315 | for (uint_t p = 0; p < count; p++) 316 | { 317 | if (!in_bit ()) // code word 318 | { 319 | out_byte (in_code (8)); 320 | } 321 | else 322 | { 323 | uint_t rep = 1; 324 | 325 | if (!in_bit ()) // repeat word 326 | { 327 | rep = 2 + in_pref_odd (); 328 | 329 | if (in_bit ()) // index flag 330 | { 331 | uint_t i = in_pref_even (); 332 | index_sym_t * index = index_sym + i; 333 | symbol_t * sym = index->sym; 334 | while (rep--) out_byte (sym->code); 335 | } 336 | else 337 | { 338 | uchar_t code = in_code (8); 339 | while 
(rep--) out_byte (code); 340 | } 341 | } 342 | else 343 | { 344 | uint_t i = in_pref_even (); 345 | index_sym_t * index = index_sym + i; 346 | symbol_t * sym = index->sym; 347 | out_byte (sym->code); 348 | } 349 | } 350 | } 351 | } 352 | 353 | 354 | // Walking the symbol tree 355 | 356 | static uint_t walk_sym_len (symbol_t * sym); 357 | 358 | static uint_t walk_child_len (symbol_t * sym) 359 | { 360 | uint_t len; 361 | 362 | if (!sym->keep) 363 | { 364 | len = walk_sym_len (sym); 365 | } 366 | else 367 | { 368 | len = 1; // reference 369 | } 370 | 371 | return len; 372 | } 373 | 374 | static uint_t walk_sym_len (symbol_t * sym) 375 | { 376 | if (!sym->len) 377 | { 378 | if (sym->size == 1) 379 | { 380 | sym->len = 1; // base code 381 | } 382 | else 383 | { 384 | sym->len = walk_child_len (sym->left); 385 | sym->len += walk_child_len (sym->right); 386 | } 387 | } 388 | 389 | return sym->len; 390 | } 391 | 392 | 393 | // Walk tree to compute cost 394 | 395 | static uint_t walk_def_cost (symbol_t * sym, uchar_t bit_len); 396 | 397 | static uint_t walk_use_cost (symbol_t * sym, uchar_t bit_len) 398 | { 399 | uint_t cost; 400 | 401 | if (!sym->keep) 402 | { 403 | cost = walk_def_cost (sym, bit_len); 404 | } 405 | else 406 | { 407 | // '1' for 'reference' + size of 'index' 408 | cost = 1 + bit_len; 409 | } 410 | 411 | return cost; 412 | } 413 | 414 | static uint_t walk_def_cost (symbol_t * sym, uchar_t bit_len) 415 | { 416 | uint_t cost; 417 | 418 | if (sym->size == 1) 419 | { 420 | // '0' for 'base' + 8 for base code 421 | cost = 1 + 8; 422 | } 423 | else 424 | { 425 | cost = walk_use_cost (sym->left, bit_len); 426 | cost += walk_use_cost (sym->right, bit_len); 427 | } 428 | 429 | return cost; 430 | } 431 | 432 | 433 | static void walk_def_out (symbol_t * sym, uchar_t bit_len); 434 | 435 | static void walk_use_out (symbol_t * sym, uchar_t bit_len) 436 | { 437 | if (!sym->keep) 438 | { 439 | walk_def_out (sym, bit_len); 440 | } 441 | else 442 | { 443 | out_bit (1); // 
index 444 | out_code (sym->index, bit_len); 445 | } 446 | } 447 | 448 | static void walk_def_out (symbol_t * sym, uchar_t bit_len) 449 | { 450 | if (sym->size == 1) 451 | { 452 | out_bit (0); // code 453 | out_code (sym->code, 8); 454 | } 455 | else 456 | { 457 | walk_use_out (sym->left, bit_len); 458 | walk_use_out (sym->right, bit_len); 459 | } 460 | } 461 | 462 | 463 | static void walk_sym_i (symbol_t * sym, uchar_t bit_len); 464 | 465 | static void walk_child_i (symbol_t * sym, uchar_t bit_len) 466 | { 467 | if (sym->size == 1 || (sym->sym_count == 1 && sym->pos_count == 0 && sym->rep_count != 1)) 468 | { 469 | walk_sym_i (sym, bit_len); 470 | } 471 | else 472 | { 473 | if (!sym->len) 474 | { 475 | out_bit (1); // definition 476 | out_bit (0); 477 | 478 | out_pref_odd (walk_sym_len (sym) - 2); 479 | walk_sym_i (sym, bit_len); 480 | 481 | sym->index = index_count++; 482 | } 483 | else 484 | { 485 | out_bit (1); // reference 486 | out_bit (1); 487 | 488 | out_code (sym->index, bit_len); 489 | } 490 | } 491 | } 492 | 493 | 494 | static void walk_sym_i (symbol_t * sym, uchar_t bit_len) 495 | { 496 | if (sym->size == 1) 497 | { 498 | out_bit (0); // code 499 | out_code (sym->code, 8); 500 | } 501 | else 502 | { 503 | walk_child_i (sym->left, bit_len); 504 | walk_child_i (sym->right, bit_len); 505 | } 506 | } 507 | 508 | 509 | // Walk the element tree 510 | 511 | #define PATTERN_MAX (32768) 512 | 513 | static uint last_elem = 0; 514 | static uint depth = 0; 515 | 516 | static void walk_elem (uint_t i) 517 | { 518 | depth++; 519 | if (last_elem == i && depth > 1) 520 | { 521 | puts ("HELP !"); 522 | } 523 | 524 | elem_t * elem = elements + i; 525 | uint_t base = elem->base; 526 | 527 | for (uint_t j = 0; j < elem->size; j++) 528 | { 529 | uint_t patt = patterns [base++]; 530 | if (patt & PATTERN_MAX) 531 | { 532 | last_elem = i; 533 | walk_elem (patt & (PATTERN_MAX - 1)); 534 | } 535 | else 536 | out_byte (patt); 537 | 538 | } 539 | depth--; 540 | } 541 | 542 | 543 | 
// Compression with "symbol" 544 | // Prepended dictionary (external) 545 | 546 | static void compress_se () 547 | { 548 | crunch_word (); 549 | 550 | if (opt_sym) 551 | { 552 | sym_sort (SORT_DUP); 553 | sym_list (LIST_ALL); 554 | } 555 | 556 | // Initial symbol filtering 557 | 558 | uint_t def_count = filter_init (); 559 | uchar_t bit_len; 560 | 561 | uint min_cost = UINT_MAX; 562 | uint min_def = def_count; 563 | 564 | while (1) 565 | { 566 | bit_len = log2u (def_count - 1); 567 | 568 | // Compute tree cost 569 | 570 | uint_t tree_cost = 0; 571 | list_t * node = sym_root.next; 572 | for (uint_t i = 0; i < sym_count; i++) 573 | { 574 | symbol_t * sym = (symbol_t *) node; // node as first member 575 | walk_sym_len (sym); 576 | if (sym->keep) 577 | { 578 | uint_t cost0 = walk_def_cost (sym, bit_len); 579 | uint_t cost1 = cost0; 580 | if (sym->len > 1) cost1 += cost_pref_odd (sym->len); 581 | sym->tree_gain = cost0 * sym->sym_count - cost1 - (1 + bit_len) * sym->sym_count; 582 | tree_cost += cost1; 583 | } 584 | 585 | node = node->next; 586 | } 587 | 588 | // Compute frame cost 589 | 590 | uint_t pos_cost = 0; 591 | node = pos_root.next; 592 | while (node != &pos_root) 593 | { 594 | position_t * pos = (position_t *) node; // node as first member 595 | symbol_t * sym = pos->sym; 596 | 597 | uint_t cost0 = walk_use_cost (sym, bit_len); 598 | uint_t cost1 = walk_def_cost (sym, bit_len); 599 | sym->pos_gain += cost1 - cost0; 600 | pos_cost += cost0; 601 | 602 | node = node->next; 603 | } 604 | 605 | // Compute total cost 606 | // and get the gain looser 607 | 608 | int gain_min = INT_MAX; 609 | symbol_t * sym_min = NULL; 610 | 611 | node = sym_root.next; 612 | for (uint_t i = 0; i < sym_count; i++) 613 | { 614 | symbol_t * sym = (symbol_t *) node; // node as first member 615 | if (sym->keep) 616 | { 617 | sym->all_gain = sym->tree_gain + sym->pos_gain; 618 | if (sym->all_gain < gain_min) 619 | { 620 | gain_min = sym->all_gain; 621 | sym_min = sym; 622 | } 623 | } 624 | 
625 | node = node->next; 626 | } 627 | 628 | uint_t all_cost = tree_cost + pos_cost; 629 | if (all_cost < min_cost) 630 | { 631 | min_cost = all_cost; 632 | min_def = def_count; 633 | } 634 | 635 | sym_min->keep = 0; 636 | sym_min->pass = def_count; 637 | 638 | // Reset previous calculation 639 | 640 | node = sym_root.next; 641 | while (node != &sym_root) 642 | { 643 | symbol_t * sym = (symbol_t *) node; // node as first member 644 | sym->len = 0; 645 | node = node->next; 646 | } 647 | 648 | if (--def_count == 0) break; 649 | } 650 | 651 | if (opt_verb) 652 | { 653 | printf ("Minimal encoding cost = %u\n", min_cost); 654 | printf ("Best definition count = %u\n\n", min_def); 655 | } 656 | 657 | def_count = min_def; 658 | bit_len = log2u (def_count - 1); 659 | 660 | // Index the symbols 661 | 662 | uint_t index = 0; 663 | list_t * node = sym_root.next; 664 | while (node != &sym_root) 665 | { 666 | symbol_t * sym = (symbol_t *) node; // node as first member 667 | if (sym->pass && sym->pass <= def_count) 668 | { 669 | sym->keep = 1; 670 | sym->index = index++; 671 | } 672 | 673 | node = node->next; 674 | } 675 | 676 | // Output symbol dictionary 677 | 678 | out_pref_odd (def_count - 1); 679 | 680 | node = sym_root.next; 681 | while (node != &sym_root) 682 | { 683 | symbol_t * sym = (symbol_t *) node; // node as first member 684 | if (sym->keep) 685 | { 686 | uint_t len = walk_sym_len (sym); 687 | if (len > 1) out_pref_odd (len - 1); 688 | walk_def_out (sym, bit_len); 689 | } 690 | 691 | node = node->next; 692 | } 693 | 694 | // Output frame 695 | 696 | node = pos_root.next; 697 | while (node != &pos_root) 698 | { 699 | position_t * pos = (position_t *) node; // node as first member 700 | symbol_t * sym = pos->sym; 701 | 702 | walk_use_out (sym, bit_len); 703 | 704 | node = node->next; 705 | } 706 | 707 | out_pad (); 708 | } 709 | 710 | 711 | // Decompression with "symbol" 712 | // Prepended dictionary (external) 713 | 714 | static void expand_se () 715 | { 716 | uint_t 
def_count = 1 + in_pref_odd (); 717 | uchar_t bit_len = log2u (def_count - 1); 718 | 719 | for (uint_t i = 0; i < def_count; i++) 720 | { 721 | elem_t * elem = elements + i; 722 | 723 | uint_t size = 1 + in_pref_odd (); 724 | 725 | elem->size = size; 726 | elem->base = patt_len; 727 | 728 | if (size == 1) 729 | patterns [patt_len++] = in_code (8); 730 | else 731 | for (uint_t j = 0; j < size; j++) 732 | if (in_bit ()) // index 733 | patterns [patt_len++] = PATTERN_MAX | in_code (bit_len); 734 | else 735 | patterns [patt_len++] = in_code (8); 736 | 737 | } 738 | 739 | while (1) 740 | { 741 | if (in_eof ()) break; 742 | 743 | if (in_bit ()) // index 744 | { 745 | uint_t i = in_code (bit_len); 746 | walk_elem (i); 747 | } 748 | else 749 | { 750 | uchar_t code = in_code (8); 751 | out_byte (code); 752 | } 753 | } 754 | } 755 | 756 | 757 | // Compression with "symbol" 758 | // Embedded dictionary (internal) 759 | 760 | static void compress_si () 761 | { 762 | crunch_word (); 763 | 764 | uint_t def_count = sym_sort (SORT_DUP); 765 | uchar_t bit_len = log2u (def_count); 766 | 767 | out_pref_odd (bit_len - 1); 768 | out_pref_odd (pos_count - 1); 769 | 770 | list_t * node = pos_root.next; 771 | while (node != &pos_root) 772 | { 773 | position_t * pos = (position_t *) node; // node as first member 774 | symbol_t * sym = pos->sym; 775 | 776 | walk_child_i (sym, bit_len); 777 | 778 | node = node->next; 779 | } 780 | 781 | out_pad (); 782 | } 783 | 784 | 785 | // Decompression with "symbol" 786 | // Embedded dictionary (internal) 787 | 788 | static uint_t in_elem (uchar_t bit_len) 789 | { 790 | uint_t size; 791 | 792 | if (!in_bit ()) // byte code 793 | { 794 | uchar_t code = in_code (8); 795 | out_byte (code); 796 | size = 1; 797 | } 798 | else 799 | { 800 | if (in_bit ()) // reference 801 | { 802 | uint_t i = in_code (bit_len); 803 | elem_t * elem = elements + i; 804 | size = elem->size; 805 | 806 | memcpy (frame_out + size_out, frame_out + elem->base, size); 807 | 808 | 
size_out += size; 809 | } 810 | else 811 | { 812 | // definition 813 | 814 | uint_t base = size_out; 815 | size = 0; 816 | 817 | uint_t len = 2 + in_pref_odd (); 818 | 819 | for (uint_t i = 0; i < len; i++) 820 | size += in_elem (bit_len); 821 | 822 | // Parent element created after child 823 | 824 | elem_t * elem = elements + elem_count++; 825 | elem->base = base; 826 | elem->size = size; 827 | } 828 | } 829 | 830 | return size; 831 | } 832 | 833 | 834 | static void expand_si () 835 | { 836 | uchar_t bit_len = 1 + in_pref_odd (); 837 | 838 | uint_t pos_count = 1 + in_pref_odd (); 839 | 840 | for (uint_t p = 0; p < pos_count; p++) 841 | in_elem (bit_len); 842 | 843 | } 844 | 845 | 846 | // Compression with "repeated symbol" 847 | // Prepended dictionary (external) 848 | 849 | static void compress_rse () 850 | { 851 | crunch_word (); 852 | crunch_rep (); 853 | 854 | uint_t count = sym_sort (SORT_DUP); 855 | uchar_t len = log2u (count); 856 | 857 | out_pref_odd (count - 1); 858 | 859 | for (uint_t i = 0; i < count; i++) 860 | { 861 | index_sym_t * index = index_sym + i; 862 | symbol_t * sym = index->sym; 863 | 864 | out_pref_odd (walk_sym_len (sym) - 2); 865 | walk_def_out (sym, len); 866 | } 867 | 868 | out_pref_odd (pos_count - 1); 869 | 870 | list_t * node = pos_root.next; 871 | while (node != &pos_root) 872 | { 873 | position_t * pos = (position_t *) node; // node as first member 874 | symbol_t * sym = pos->sym; 875 | 876 | uint_t rep = 1; 877 | if (sym->rep_count > 1) 878 | { 879 | rep = sym->rep_count; 880 | sym = sym->left; 881 | 882 | // TODO: use code, repeat and insert symbols 883 | 884 | out_bit (1); // repeat 885 | out_pref_odd (rep - 2); 886 | } 887 | else 888 | { 889 | out_bit (0); // no repeat 890 | } 891 | 892 | walk_use_out (sym, len); 893 | 894 | node = node->next; 895 | } 896 | 897 | out_pad (); 898 | } 899 | 900 | 901 | // Decompression with "repeated symbol" 902 | // Prepended dictionary (external) 903 | 904 | static void expand_rse () 905 | { 
906 | uint_t count = 1 + in_pref_odd (); 907 | uchar_t len = log2u (count); 908 | 909 | for (uint_t i = 0; i < count; i++) 910 | { 911 | elem_t * elem = elements + i; 912 | 913 | uint_t size = 2 + in_pref_odd (); 914 | 915 | elem->size = size; 916 | elem->base = patt_len; 917 | 918 | for (uint_t j = 0; j < size; j++) 919 | { 920 | if (in_bit ()) // index 921 | patterns [patt_len++] = 32768 | in_code (len); 922 | else 923 | patterns [patt_len++] = in_code (8); 924 | 925 | } 926 | } 927 | 928 | count = 1 + in_pref_odd (); 929 | 930 | for (uint_t p = 0; p < count; p++) 931 | { 932 | uint_t rep = 1; 933 | if (in_bit ()) // repeat 934 | rep = 2 + in_pref_odd (); 935 | 936 | if (in_bit ()) // index 937 | { 938 | uint_t i = in_code (len); 939 | while (rep--) walk_elem (i); 940 | } 941 | else 942 | { 943 | uchar_t code = in_code (8); 944 | while (rep--) out_byte (code); 945 | } 946 | } 947 | } 948 | 949 | 950 | //------------------------------------------------------------------------------ 951 | // Main entry point 952 | //------------------------------------------------------------------------------ 953 | 954 | int main (int argc, char * argv []) 955 | { 956 | clock_t clock_begin = clock (); 957 | 958 | while (1) 959 | { 960 | char opt; 961 | 962 | while (1) 963 | { 964 | opt = getopt (argc, argv, "cem:sv"); 965 | if (opt < 0 || opt == '?') break; 966 | 967 | switch (opt) 968 | { 969 | case 'c': // compress 970 | opt_compress = 1; 971 | break; 972 | 973 | case 'e': // expand 974 | opt_expand = 1; 975 | break; 976 | 977 | case 'm': // algorithm 978 | if (!strcmp (optarg, "b")) 979 | opt_algo = ALGO_BASE; 980 | else if (!strcmp (optarg, "rb")) 981 | opt_algo = ALGO_REP_BASE; 982 | else if (!strcmp (optarg, "pb")) 983 | opt_algo = ALGO_PREF; 984 | else if (!strcmp (optarg, "rpb")) 985 | opt_algo = ALGO_REP_PREF; 986 | else if (!strcmp (optarg, "se")) 987 | opt_algo = ALGO_SYM_EXT; 988 | else if (!strcmp (optarg, "si")) 989 | opt_algo = ALGO_SYM_INT; 990 | else if (!strcmp 
(optarg, "rse")) 991 | opt_algo = ALGO_REP_SE; 992 | else 993 | error (1, 0, "unknown algorithm"); 994 | 995 | break; 996 | 997 | case 's': // list symbols 998 | opt_sym = 1; 999 | break; 1000 | 1001 | case 'v': // verbose 1002 | opt_verb = 1; 1003 | break; 1004 | 1005 | } 1006 | } 1007 | 1008 | if (opt == '?' || optind != argc - 2 || (opt_compress == opt_expand)) 1009 | { 1010 | printf ("usage: %s (-c | -d) [-sv] [-m ] \n\n", argv [0]); 1011 | puts (" -c compress"); 1012 | puts (" -e expand"); 1013 | puts (" -m algorithm"); 1014 | puts (" -s list symbols"); 1015 | puts (" -v verbose"); 1016 | puts (""); 1017 | puts ("algorithms:"); 1018 | puts (" b base (no compression)"); 1019 | puts (" rb repeat base"); 1020 | puts (" pb prefixed base"); 1021 | puts (" rpb repeat prefixed base"); 1022 | puts (" se symbol external (prepended dictionary)"); 1023 | puts (" si symbol internal (embedded dictionary)"); 1024 | puts (" rse repeat symbol external (default)"); 1025 | puts (""); 1026 | break; 1027 | } 1028 | 1029 | in_frame (argv [optind]); 1030 | 1031 | if (opt_compress) 1032 | { 1033 | if (size_in < 3) 1034 | error (1, 0, "frame too short"); 1035 | 1036 | scan_base (); 1037 | 1038 | if (opt_verb) 1039 | { 1040 | puts ("INITIAL"); 1041 | printf ("Frame length: %u\n", size_in); 1042 | printf ("Base symbol count: %u\n\n", sym_count); 1043 | } 1044 | 1045 | if (opt_sym) 1046 | { 1047 | sym_sort (SORT_ALL); 1048 | sym_list (LIST_ALL); 1049 | } 1050 | 1051 | if (opt_verb) puts ("Compressing...\n"); 1052 | 1053 | switch (opt_algo) 1054 | { 1055 | case ALGO_BASE: 1056 | compress_b (); 1057 | break; 1058 | 1059 | case ALGO_REP_BASE: 1060 | compress_rb (); 1061 | break; 1062 | 1063 | case ALGO_PREF: 1064 | compress_pb (); 1065 | break; 1066 | 1067 | case ALGO_REP_PREF: 1068 | compress_rpb (); 1069 | break; 1070 | 1071 | case ALGO_SYM_EXT: 1072 | compress_se (); 1073 | break; 1074 | 1075 | case ALGO_SYM_INT: 1076 | compress_si (); 1077 | break; 1078 | 1079 | case ALGO_REP_SE: 1080 
| compress_rse (); 1081 | break; 1082 | 1083 | default: 1084 | compress_se (); 1085 | break; 1086 | 1087 | } 1088 | 1089 | if (opt_verb) 1090 | { 1091 | puts ("FINAL"); 1092 | printf ("Frame length: %u\n", pos_count); 1093 | printf ("Frame size: %u\n", size_out); 1094 | 1095 | double ratio = (double) size_out / size_in; 1096 | printf ("Compression ratio: %f\n\n", ratio); 1097 | } 1098 | 1099 | out_frame (argv [optind + 1]); 1100 | break; 1101 | } 1102 | 1103 | if (opt_expand) 1104 | { 1105 | if (opt_verb) printf ("Expanding..."); 1106 | 1107 | switch (opt_algo) 1108 | { 1109 | case ALGO_BASE: 1110 | expand_b (); 1111 | break; 1112 | 1113 | case ALGO_REP_BASE: 1114 | expand_rb (); 1115 | break; 1116 | 1117 | case ALGO_PREF: 1118 | expand_pb (); 1119 | break; 1120 | 1121 | case ALGO_REP_PREF: 1122 | expand_rpb (); 1123 | break; 1124 | 1125 | case ALGO_SYM_EXT: 1126 | expand_se (); 1127 | break; 1128 | 1129 | case ALGO_SYM_INT: 1130 | expand_si (); 1131 | break; 1132 | 1133 | case ALGO_REP_SE: 1134 | expand_rse (); 1135 | break; 1136 | 1137 | default: 1138 | expand_se (); 1139 | break; 1140 | 1141 | } 1142 | 1143 | if (opt_verb) puts (" DONE\n"); 1144 | 1145 | out_frame (argv [optind + 1]); 1146 | break; 1147 | } 1148 | 1149 | break; 1150 | } 1151 | 1152 | clock_t clock_end = clock (); 1153 | if (opt_verb) printf ("elapsed=%lf msecs\n\n", (clock_end - clock_begin) * 1000.0 / CLOCKS_PER_SEC); 1154 | 1155 | return 0; 1156 | } 1157 | 1158 | 1159 | //------------------------------------------------------------------------------ 1160 | -------------------------------------------------------------------------------- /src/list.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Double-linked list 3 | //------------------------------------------------------------------------------ 4 | 5 | #include "list.h" 6 | 7 | 8 | void list_init (list_t * root) 9 | { 
10 | root->prev = root; 11 | root->next = root; 12 | } 13 | 14 | 15 | #define LIST_LINK \ 16 | prev->next = node; \ 17 | node->prev = prev; \ 18 | next->prev = node; \ 19 | node->next = next; \ 20 | /**/ 21 | 22 | 23 | void insert_before (list_t * next, list_t * node) 24 | { 25 | list_t * prev = next->prev; 26 | LIST_LINK 27 | } 28 | 29 | void insert_after (list_t * prev, list_t * node) 30 | { 31 | list_t * next = prev->next; 32 | LIST_LINK 33 | } 34 | 35 | 36 | void list_add_tail (list_t * root, list_t * node) 37 | { 38 | insert_before (root, node); 39 | } 40 | 41 | void list_add_head (list_t * root, list_t * node) 42 | { 43 | insert_after (root, node); 44 | } 45 | 46 | 47 | void list_remove (list_t * node) 48 | { 49 | list_t * prev = node->prev; 50 | list_t * next = node->next; 51 | prev->next = next; 52 | next->prev = prev; 53 | } 54 | 55 | 56 | //------------------------------------------------------------------------------ 57 | -------------------------------------------------------------------------------- /src/list.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Double-linked list 3 | //------------------------------------------------------------------------------ 4 | 5 | #pragma once 6 | 7 | 8 | struct list_s 9 | { 10 | struct list_s * prev; 11 | struct list_s * next; 12 | }; 13 | 14 | typedef struct list_s list_t; 15 | 16 | 17 | void list_init (list_t * root); 18 | 19 | void insert_before (list_t * next, list_t * node); 20 | void insert_after (list_t * prev, list_t * node); 21 | 22 | void list_add_tail (list_t * root, list_t * node); 23 | void list_add_head (list_t * root, list_t * node); 24 | 25 | void list_remove (list_t * node); 26 | 27 | 28 | //------------------------------------------------------------------------------ 29 | -------------------------------------------------------------------------------- /src/stream.c: 
-------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Bit stream 3 | //------------------------------------------------------------------------------ 4 | 5 | #include "stream.h" 6 | 7 | #include 8 | #include 9 | 10 | 11 | // Global data 12 | 13 | uchar_t frame_in [FRAME_MAX]; 14 | uchar_t frame_out [FRAME_MAX]; 15 | 16 | uint_t size_in; 17 | uint_t size_out; 18 | 19 | 20 | // Local data 21 | 22 | static uint_t pos_in; 23 | 24 | static uchar_t byte_in; 25 | static uchar_t byte_out; 26 | 27 | static uchar_t shift_in; 28 | static uchar_t shift_out; 29 | 30 | 31 | // Frame load & store 32 | 33 | void in_frame (const char * name) 34 | { 35 | FILE * file = fopen (name, "r"); 36 | if (!file) error (1, errno, "open failed"); 37 | 38 | size_t size = fread (frame_in, sizeof (uchar_t), FRAME_MAX, file); 39 | if (ferror (file)) error (1, errno, "load failed"); 40 | 41 | size_in = size; 42 | fclose (file); 43 | } 44 | 45 | 46 | void out_frame (const char * name) 47 | { 48 | FILE * file = fopen (name, "w"); 49 | if (!file) error (1, errno, "open failed"); 50 | 51 | size_t size = fwrite (frame_out, sizeof (uchar_t), size_out, file); 52 | if ((size != size_out) || ferror (file)) error (1, errno, "store failed"); 53 | 54 | fclose (file); 55 | } 56 | 57 | 58 | // Byte code 59 | 60 | void out_byte (uchar_t val) 61 | { 62 | if (size_out >= FRAME_MAX) 63 | { 64 | puts ("HELP!"); 65 | error (1, 0, "out overflow"); 66 | } 67 | 68 | frame_out [size_out++] = val; 69 | } 70 | 71 | 72 | uchar in_eof () 73 | { 74 | return pos_in == size_in; 75 | } 76 | 77 | 78 | uchar_t in_byte () 79 | { 80 | if (pos_in >= FRAME_MAX) 81 | error (1, 0, "in overflow"); 82 | 83 | return frame_in [pos_in++]; 84 | } 85 | 86 | 87 | // Bit code 88 | 89 | void out_bit (uchar_t val) 90 | { 91 | byte_out = byte_out | (val ? 
128 : 0); // 2^(8-1) 92 | if (++shift_out == 8) 93 | { 94 | out_byte (byte_out); 95 | 96 | shift_out = 0; 97 | byte_out = 0; 98 | return; 99 | } 100 | 101 | byte_out >>= 1; 102 | } 103 | 104 | 105 | uchar_t in_bit () 106 | { 107 | if (!shift_in) 108 | { 109 | byte_in = in_byte (); 110 | shift_in = 8; 111 | } 112 | 113 | uchar_t val = byte_in & 1; 114 | byte_in >>= 1; 115 | shift_in--; 116 | 117 | return val; 118 | } 119 | 120 | 121 | void out_pad () 122 | { 123 | if (shift_out > 0) 124 | { 125 | byte_out >>= (7 - shift_out); 126 | 127 | if (size_out >= FRAME_MAX) 128 | error (1, 0, "out overflow"); 129 | 130 | frame_out [size_out++] = byte_out; 131 | 132 | shift_out = 0; 133 | byte_out = 0; 134 | } 135 | } 136 | 137 | 138 | // Basic code 139 | 140 | void out_code (uint_t code, uchar_t len) 141 | { 142 | if (!len) return; 143 | if (len > 16) 144 | error (1, 0, "code too long"); 145 | 146 | uint_t i = 0; 147 | 148 | while (1) 149 | { 150 | out_bit (code & 1); 151 | if (++i == len) break; 152 | code >>= 1; 153 | } 154 | } 155 | 156 | 157 | uint_t in_code (uchar_t len) 158 | { 159 | if (!len) return 0; 160 | if (len > 16) 161 | error (1, 0, "code too long"); 162 | 163 | uint_t code = 0; 164 | uint_t i = 0; 165 | 166 | while (1) 167 | { 168 | code |= in_bit () ? 
32768 : 0; // 2^(16-1) 169 | if (++i == len) break; 170 | code >>= 1; 171 | } 172 | 173 | code >>= (16 - len); 174 | return code; 175 | } 176 | 177 | 178 | // Prefixed code 179 | 180 | void out_len (uchar_t len) 181 | { 182 | while (len-- > 0) out_bit (1); 183 | out_bit (0); 184 | } 185 | 186 | uchar_t in_len () 187 | { 188 | uchar_t len = 0; 189 | 190 | while (in_bit ()) len++; 191 | 192 | return len; 193 | } 194 | 195 | 196 | uint cost_pref_odd (uint val) 197 | { 198 | uchar prefix = 0; 199 | uint base = 1; 200 | 201 | while (val >= base) 202 | { 203 | base = (base << 1) | 1; 204 | prefix++; 205 | } 206 | 207 | return 1 + prefix * 2; 208 | } 209 | 210 | void out_pref_odd (uint_t val) 211 | { 212 | uchar_t prefix = 0; 213 | uint_t base0 = 0; 214 | uint_t base1 = 1; 215 | 216 | while (val >= base1) 217 | { 218 | base0 = base1; 219 | base1 = (base1 << 1) | 1; 220 | prefix++; 221 | } 222 | 223 | out_len (prefix); 224 | 225 | if (prefix) out_code (val - base0, prefix); 226 | } 227 | 228 | 229 | uint_t in_pref_odd () 230 | { 231 | uchar_t suffix = 0; 232 | uint_t base = 0; 233 | 234 | while (in_bit ()) 235 | { 236 | suffix++; 237 | base = (base << 1) | 1; 238 | } 239 | 240 | uint_t val = 0; 241 | if (suffix) val = base + in_code (suffix); 242 | return val; 243 | } 244 | 245 | 246 | void out_pref_even (uint_t val) 247 | { 248 | uchar_t prefix = 0; 249 | uint_t base0 = 0; 250 | uint_t base1 = 2; 251 | 252 | while (val >= base1) 253 | { 254 | base0 = base1; 255 | base1 = (base1 << 1) | 2; 256 | prefix++; 257 | } 258 | 259 | out_len (prefix); 260 | 261 | out_code (val - base0, 1 + prefix); 262 | } 263 | 264 | 265 | uint_t in_pref_even () 266 | { 267 | uchar_t suffix = 0; 268 | uint_t base = 0; 269 | 270 | while (in_bit ()) 271 | { 272 | suffix++; 273 | base = (base << 1) | 2; 274 | } 275 | 276 | return (base + in_code (1 + suffix)); 277 | } 278 | 279 | 280 | uchar_t log2u (uint_t val) 281 | { 282 | if (!val) return 0; 283 | uchar_t log = 1; 284 | while (val >>= 1) log++; 
285 | return log; 286 | } 287 | 288 | 289 | //------------------------------------------------------------------------------ 290 | -------------------------------------------------------------------------------- /src/stream.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Bit stream 3 | //------------------------------------------------------------------------------ 4 | 5 | #pragma once 6 | 7 | #include 8 | 9 | #include "common.h" 10 | 11 | 12 | #define CODE_MAX 256 // 8 bits 13 | #define FRAME_MAX 65536 // 64K 14 | 15 | 16 | // Global data 17 | 18 | extern uchar_t frame_in [FRAME_MAX]; 19 | extern uchar_t frame_out [FRAME_MAX]; 20 | 21 | extern uint_t size_in; 22 | extern uint_t size_out; 23 | 24 | 25 | // Global functions 26 | 27 | void in_frame (); 28 | void out_frame (); 29 | 30 | void out_byte (uchar_t val); 31 | uchar in_eof (); 32 | uchar_t in_byte (); 33 | 34 | void out_bit (uchar_t val); 35 | uchar_t in_bit (); 36 | 37 | void out_pad (); 38 | 39 | void out_code (uint_t code, uchar_t len); 40 | uint_t in_code (uchar_t len); 41 | 42 | void out_len (uchar_t val); 43 | uchar_t in_len (); 44 | 45 | uint cost_pref_odd (uint val); 46 | void out_pref_odd (uint_t val); 47 | uint_t in_pref_odd (); 48 | 49 | void out_pref_even (uint_t val); 50 | uint_t in_pref_even (); 51 | 52 | uchar_t log2u (uint_t val); 53 | 54 | 55 | //------------------------------------------------------------------------------ 56 | -------------------------------------------------------------------------------- /src/symbol.c: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Symbols 3 | //------------------------------------------------------------------------------ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "list.h" 
11 | #include "stream.h" 12 | #include "symbol.h" 13 | 14 | 15 | // Global data 16 | 17 | list_t sym_root; 18 | uint_t sym_count; 19 | 20 | list_t pos_root; 21 | uint_t pos_count; 22 | 23 | index_sym_t index_sym [SYMBOL_MAX]; 24 | uint_t index_count; 25 | 26 | 27 | // Local data 28 | 29 | static list_t pair_root; 30 | static list_t hole_root; 31 | 32 | 33 | // Symbol helpers 34 | 35 | static int sym_comp (const void * v1, const void * v2) 36 | { 37 | uint_t k1 = ((index_sym_t *) v1)->key; 38 | uint_t k2 = ((index_sym_t *) v2)->key; 39 | 40 | return (k1 < k2) ? 1 : ((k1 > k2) ? -1 : 0); 41 | } 42 | 43 | 44 | symbol_t * sym_add () 45 | { 46 | if (sym_count >= SYMBOL_MAX) 47 | error (1, 0, "too many symbols"); 48 | 49 | symbol_t * sym = malloc (sizeof (symbol_t)); 50 | memset (sym, 0, sizeof (symbol_t)); 51 | 52 | list_add_tail (&sym_root, (list_t *) sym); // node as first member 53 | sym_count++; 54 | 55 | return sym; 56 | } 57 | 58 | 59 | // Build index and sort 60 | // TODO: build key in callback 61 | 62 | uint_t sym_sort (uint_t kind) 63 | { 64 | list_t * node = sym_root.next; 65 | for (uint_t i = 0; i < sym_count; i++) 66 | { 67 | index_sym_t * index = index_sym + i; 68 | symbol_t * sym = (symbol_t *) node; // node as first member 69 | 70 | sym->dup_count = sym->sym_count + sym->pos_count; 71 | 72 | switch (kind) 73 | { 74 | case SORT_ALL: 75 | index->key = sym->dup_count + (sym->rep_count > 1 ? 
sym->rep_count : 0) * sym->size; 76 | break; 77 | 78 | case SORT_REP: 79 | if (sym->rep_count > 1) 80 | { 81 | index->key = 0; 82 | break; 83 | } 84 | 85 | index->key = sym->dup_count; 86 | break; 87 | 88 | case SORT_DUP: 89 | index->key = sym->dup_count; 90 | break; 91 | 92 | default: 93 | error (1, 0, "unknown sorting"); 94 | } 95 | 96 | index->sym = sym; 97 | 98 | node = node->next; 99 | } 100 | 101 | qsort (index_sym, sym_count, sizeof (index_sym_t), sym_comp); 102 | 103 | // Set symbol indexes 104 | 105 | for (uint_t i = 0; i < sym_count; i++) 106 | { 107 | index_sym_t * index = index_sym + i; 108 | symbol_t * sym = index->sym; 109 | sym->index = i; 110 | } 111 | 112 | return 0; 113 | } 114 | 115 | 116 | // Initial symbol filtering 117 | 118 | uint_t filter_init () 119 | { 120 | uint_t filt_count = 0; 121 | 122 | list_t * node = sym_root.next; 123 | for (uint_t i = 0; i < sym_count; i++) 124 | { 125 | symbol_t * sym = (symbol_t *) node; // node as first member 126 | 127 | sym->dup_count = sym->pos_count + sym->sym_count; 128 | if (sym->dup_count > 1) 129 | { 130 | // Duplicated symbols are presumed valuable 131 | // until cost computation confirms or not 132 | sym->keep = 1; 133 | filt_count++; 134 | } 135 | 136 | node = node->next; 137 | } 138 | 139 | return filt_count; 140 | } 141 | 142 | 143 | // List the used symbols 144 | 145 | void sym_list (uint_t filter) 146 | { 147 | double entropy = 0.0; 148 | uint use_count = 0; 149 | 150 | puts ("\nSYMBOLS"); 151 | 152 | for (uint_t i = 0; i < sym_count; i++) 153 | { 154 | index_sym_t * index = index_sym + i; 155 | symbol_t * sym = index->sym; 156 | 157 | if (filter == LIST_KEEP && !sym->keep) continue; 158 | 159 | uint_t sym_dup = sym->pos_count + sym->sym_count; 160 | use_count += sym_dup; 161 | 162 | double p = (double) sym_dup / (pos_count + sym_count); 163 | entropy += -p * log2 (p); 164 | 165 | printf ("[%u] base=%x", i, sym->base); 166 | 167 | if (sym->size == 1) 168 | printf (" code=%hx", sym->code); 169 | 
else 170 | printf (" size=%u", sym->size); 171 | 172 | if (sym->rep_count > 1) 173 | printf (" rep=%u", sym->rep_count); 174 | else 175 | printf (" pos=%u", sym->pos_count); 176 | 177 | printf (" tree=%u\n", sym->sym_count); 178 | } 179 | 180 | printf ("\nEntropy: %f\n", entropy); 181 | printf ("Limit: %f\n\n", entropy * use_count); 182 | } 183 | 184 | 185 | static void hole_add (position_t * pos) 186 | { 187 | hole_t * hole = malloc (sizeof (hole_t)); 188 | hole->pos = pos; 189 | list_add_tail (&hole_root, (list_t *) hole); // node as first member 190 | } 191 | 192 | 193 | // Scan frame for all base symbols 194 | 195 | void scan_base () 196 | { 197 | list_init (&sym_root); 198 | list_init (&pos_root); 199 | 200 | list_init (&hole_root); 201 | 202 | memset (index_sym, 0, sizeof (index_sym_t) * CODE_MAX); 203 | 204 | // Count symbol occurrences 205 | 206 | for (uint_t i = 0; i < size_in; i++) 207 | { 208 | index_sym_t * index = index_sym + frame_in [i]; 209 | symbol_t * sym = index->sym; 210 | 211 | if (!sym) 212 | { 213 | sym = sym_add (); 214 | 215 | sym->code = frame_in [i]; 216 | sym->base = i; 217 | sym->size = 1; 218 | 219 | index->sym = sym; 220 | } 221 | 222 | position_t * pos = malloc (sizeof (position_t)); 223 | list_add_tail (&pos_root, (list_t *) pos); // node as first member 224 | 225 | pos->base = i; 226 | pos->sym = sym; 227 | pos->pair = NULL; 228 | 229 | // Record hole (= position without pair) 230 | 231 | if (i + 1 < size_in) hole_add (pos); 232 | 233 | sym->pos_count++; 234 | } 235 | 236 | pos_count = size_in; 237 | } 238 | 239 | 240 | // Scan frame holes for new pairs 241 | 242 | static void scan_pair () 243 | { 244 | list_t * hole_left = hole_root.next; 245 | while (hole_left != &hole_root) 246 | { 247 | position_t * pos_left = ((hole_t *) hole_left)->pos; // node as first member 248 | position_t * pos_left_next = (position_t *) (pos_left->node.next); 249 | 250 | // Add new pair 251 | 252 | pair_t * pair = malloc (sizeof (pair_t)); 253 | 
list_add_tail (&pair_root, (list_t *) pair); // node as first member 254 | 255 | pair->count = 1; 256 | 257 | pair->left = pos_left->sym; 258 | pair->right = pos_left_next->sym; 259 | 260 | pos_left->pair = pair; // pair now found there 261 | 262 | // Scan for pair duplicates 263 | 264 | list_t * hole_right = hole_left->next; 265 | while (hole_right != &hole_root) 266 | { 267 | list_t * hole_right_next = hole_right->next; 268 | 269 | position_t * pos_right = ((hole_t *) hole_right)->pos; // node as first member 270 | position_t * pos_right_next = (position_t *) (pos_right->node.next); 271 | 272 | if ((pair->left == pos_right->sym) && 273 | (pair->right == pos_right_next->sym)) 274 | { 275 | pair->count++; 276 | 277 | pos_right->pair = pair; // pair now found there 278 | list_remove (hole_right); 279 | memset (hole_right, 0, sizeof (hole_t)); // invalidate pointers 280 | free ((hole_t *) hole_right); // node as first member 281 | } 282 | 283 | hole_right = hole_right_next; 284 | } 285 | 286 | list_t * hole_left_next = hole_left->next; 287 | list_remove (hole_left); 288 | memset (hole_left, 0, sizeof (hole_t)); // invalidate pointers 289 | free ((hole_t *) hole_left); // node as first member 290 | hole_left = hole_left_next; 291 | } 292 | } 293 | 294 | 295 | static void dec_pair (pair_t * pair) 296 | { 297 | pair->count--; 298 | 299 | if (!pair->count) 300 | { 301 | list_remove ((list_t *) pair); // node as first member 302 | memset (pair, 0, sizeof (pair_t)); // invalidate pointers 303 | free (pair); 304 | } 305 | } 306 | 307 | 308 | // Crunch all occurrences of one pair 309 | 310 | static int crunch_pair (pair_t * pair) 311 | { 312 | int shrink = 0; // no shrink 313 | 314 | // Check all pair occurrences 315 | 316 | symbol_t * sym = NULL; 317 | 318 | list_t * node = pos_root.next; 319 | list_t * node_prev = &pos_root; 320 | 321 | while ((node != pos_root.prev) && (node != &pos_root)) 322 | { 323 | list_t * node_next = node->next; 324 | 325 | position_t * pos = 
(position_t *) node; // node as first member 326 | if (pos->pair == pair) 327 | { 328 | // Consider previous pair 329 | 330 | if (node_prev != &pos_root) 331 | { 332 | position_t * pos_prev = (position_t *) node_prev; // node as first member 333 | if (pos_prev->pair) // can be the pair just processed before 334 | { 335 | dec_pair (pos_prev->pair); 336 | pos_prev->pair = NULL; 337 | hole_add (pos_prev); 338 | } 339 | } 340 | 341 | // Consider next pair 342 | 343 | if ((node_next != pos_root.prev) && (node_next != &pos_root)) 344 | { 345 | position_t * pos_next = (position_t *) node_next; // node as first member 346 | dec_pair (pos_next->pair); 347 | pos_next->pair = NULL; 348 | //hole_add (pos_next); // position will be crunched later 349 | } 350 | 351 | // Replace current pair by new symbol 352 | 353 | symbol_t * sym_left = pair->left; 354 | symbol_t * sym_right = pair->right; 355 | 356 | if (!sym) 357 | { 358 | sym = sym_add (); 359 | 360 | sym->base = pos->base; 361 | sym->size = sym_left->size + sym_right->size; 362 | 363 | sym->left = sym_left; 364 | sym_left->sym_count++; 365 | 366 | sym->right = sym_right; 367 | sym_right->sym_count++; 368 | } 369 | 370 | sym_left->pos_count--; 371 | sym_right->pos_count--; 372 | sym->pos_count++; 373 | 374 | pos->sym = sym; 375 | 376 | dec_pair (pair); 377 | pos->pair = NULL; 378 | hole_add (pos); 379 | 380 | // Shift frame end to left 381 | 382 | list_t * node_after = node_next->next; 383 | list_remove (node_next); 384 | free ((position_t *) node_next); // node as first member 385 | node_next = node_after; 386 | 387 | pos_count--; 388 | 389 | shrink = 1; 390 | } 391 | 392 | node_prev = node; 393 | node = node_next; 394 | } 395 | 396 | return shrink; 397 | } 398 | 399 | 400 | // Crunch all pairs 401 | // Performed alone or before repeat crunch 402 | 403 | void crunch_word () 404 | { 405 | // Iterate on pair scan & crunch 406 | 407 | list_init (&pair_root); 408 | 409 | while (1) 410 | { 411 | scan_pair (); 412 | 413 | // Look 
for the pair the most duplicated 414 | 415 | uint_t count_max = 0; 416 | pair_t * pair_max = NULL; 417 | 418 | list_t * node = pair_root.next; 419 | while (node != &pair_root) 420 | { 421 | pair_t * pair = (pair_t *) node; // node as first member 422 | 423 | // Skip any symmetric pair 424 | 425 | if (pair->left != pair->right) 426 | { 427 | uint_t count = pair->count; 428 | if (count > count_max) 429 | { 430 | count_max = count; 431 | pair_max = pair; 432 | } 433 | } 434 | 435 | node = node->next; 436 | } 437 | 438 | if (count_max < 2) break; 439 | 440 | if (!crunch_pair (pair_max)) break; 441 | } 442 | } 443 | 444 | 445 | // Crunch all repeated symbols 446 | // Performed alone or after word crunch 447 | 448 | void crunch_rep () 449 | { 450 | list_t * node_left = pos_root.next; 451 | while ((node_left != pos_root.prev) && (node_left != &pos_root)) 452 | { 453 | position_t * pos_left = (position_t *) node_left; // node as first member 454 | 455 | symbol_t * sym_left = pos_left->sym; 456 | symbol_t * sym_rep = NULL; 457 | 458 | list_t * node_right = node_left->next; 459 | while (1) 460 | { 461 | if (node_right == &pos_root) break; 462 | 463 | position_t * pos_right = (position_t *) node_right; // node as first member 464 | if (sym_left != pos_right->sym) break; 465 | 466 | // Replace current symbol by repeat 467 | 468 | if (!sym_rep) 469 | { 470 | dec_pair (pos_left->pair); 471 | pos_left->pair = NULL; 472 | 473 | sym_rep = sym_add (); 474 | 475 | pos_left->sym = sym_rep; 476 | sym_rep->pos_count = 1; 477 | sym_left->pos_count--; 478 | 479 | sym_rep->rep_count = 1; 480 | sym_left->rep_count = 1; // repeated 481 | 482 | sym_rep->base = pos_left->base; 483 | sym_rep->size = sym_left->size; 484 | 485 | sym_rep->left = sym_left; 486 | sym_left->sym_count++; 487 | } 488 | 489 | sym_left->pos_count--; 490 | 491 | sym_rep->rep_count++; 492 | 493 | dec_pair (pos_right->pair); 494 | pos_right->pair = NULL; 495 | pos_right->sym = NULL; 496 | 497 | // Shift frame end to left 
498 | 499 | list_t * node_after = node_right->next; 500 | list_remove (node_right); 501 | free ((position_t *) node_right); // node as first member 502 | node_right = node_after; 503 | 504 | pos_count--; 505 | } 506 | 507 | node_left = node_right; 508 | } 509 | } 510 | 511 | 512 | //------------------------------------------------------------------------------ 513 | -------------------------------------------------------------------------------- /src/symbol.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Symbols 3 | //------------------------------------------------------------------------------ 4 | 5 | #pragma once 6 | 7 | 8 | // Symbol definitions 9 | 10 | #define SYMBOL_MAX 65536 // 64K 11 | 12 | struct symbol_s 13 | { 14 | list_t node; // must be the first member 15 | 16 | uint_t pos_count; // number of occurrences in the frame 17 | uint_t sym_count; // number of occurrences in the tree 18 | uint_t rep_count; // number of repetitions (1 = repeated) 19 | uint_t dup_count; // total number of occurences (= pos_count + tree_count) 20 | 21 | uchar_t keep; // define this symbol in the dictionary 22 | uint_t index; // index in the dictionary 23 | uint_t len; // length of symbol when defined or used 24 | uint pass; // pass number that discarded this symbol 25 | 26 | int tree_gain; // gain in tree when defined 27 | int pos_gain; // gain in frame when defined 28 | int all_gain; // overall gain when defined 29 | 30 | uchar_t code; // byte code of base symbol 31 | uint_t base; // offset of first occurrence in input frame 32 | uint_t size; // size in byte codes 33 | 34 | // For secondary symbol 35 | 36 | struct symbol_s * left; // left or repeated child 37 | struct symbol_s * right; // right child 38 | }; 39 | 40 | typedef struct symbol_s symbol_t; 41 | 42 | extern list_t sym_root; 43 | extern uint_t sym_count; 44 | 45 | 46 | // Pair definitions 47 
| 48 | struct pair_s 49 | { 50 | list_t node; // must be the first member 51 | 52 | uint_t count; // number of occurrences in the frame 53 | 54 | struct symbol_s * left; 55 | struct symbol_s * right; 56 | }; 57 | 58 | typedef struct pair_s pair_t; 59 | 60 | 61 | // Position definitions 62 | 63 | struct position_s 64 | { 65 | list_t node; // must be the first member 66 | 67 | uint_t base; 68 | 69 | symbol_t * sym; 70 | pair_t * pair; 71 | }; 72 | 73 | typedef struct position_s position_t; 74 | 75 | extern list_t pos_root; 76 | extern uint_t pos_count; 77 | 78 | 79 | // Hole definitions 80 | 81 | struct hole_s 82 | { 83 | list_t node; // must be the first member 84 | 85 | position_t * pos; 86 | }; 87 | 88 | typedef struct hole_s hole_t; 89 | 90 | 91 | // Kind of sorting 92 | 93 | #define SORT_ALL 0 // sort by impact (occurences * size) 94 | #define SORT_REP 1 // filter repeat out 95 | #define SORT_DUP 2 // sort by occurences 96 | 97 | struct index_sym_s 98 | { 99 | uint_t key; 100 | symbol_t * sym; 101 | }; 102 | 103 | typedef struct index_sym_s index_sym_t; 104 | 105 | extern index_sym_t index_sym [SYMBOL_MAX]; 106 | extern uint_t index_count; 107 | 108 | 109 | // Listing filters 110 | 111 | #define LIST_ALL 0 // list all symbols 112 | #define LIST_KEEP 1 // list only defined 113 | 114 | 115 | // Global functions 116 | 117 | symbol_t * sym_add (); 118 | uint_t sym_sort (uint_t kind); 119 | void sym_list (uint_t filter); 120 | 121 | uint_t filter_init (); 122 | 123 | void scan_base (); 124 | 125 | void crunch_word (); 126 | void crunch_rep (); 127 | 128 | 129 | //------------------------------------------------------------------------------ 130 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | if [ $# -ne 2 ]; then 2 | echo "Missing input file and algorithm !" 
3 | exit 1 4 | fi 5 | 6 | echo "1- COMPRESS" 7 | Release/compress -cvs $1 -m $2 test_out.bin | tee test.txt 8 | 9 | echo "2- EXPAND" 10 | Release/compress -evs -m $2 test_out.bin test_in.bin | tee -a test.txt 11 | 12 | echo "3- COMPARE" 13 | dump $1 > $1.txt 14 | dump test_in.bin > test_in.txt 15 | diff --color $1.txt test_in.txt 16 | -------------------------------------------------------------------------------- /validate.sh: -------------------------------------------------------------------------------- 1 | cp test.txt test.0.txt 2 | --------------------------------------------------------------------------------