├── LICENSE ├── README.md └── wrtpre.cpp /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2008 Matt Mahoney, Serge Osnach, Alexander Ratushnyak, 2 | Bill Pettis, Przemyslaw Skibinski, Matthew Fite, wowtiger, Andrew Paterson, 3 | Jan Ondrus, Andreas Morphis, Pavel L. Holoborodko, KZ., Simon Berger, 4 | Neill Corlett 5 | 6 | LICENSE 7 | 8 | This program is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU General Public License as 10 | published by the Free Software Foundation; either version 2 of 11 | the License, or (at your option) any later version. 12 | 13 | This program is distributed in the hope that it will be useful, but 14 | WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | General Public License for more details at 17 | Visit . -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | paq8pxd file compressor/archiver. Release by Kaido Orav, Aug. 14, 2013 2 | 3 | # COMMAND LINE INTERFACE 4 | 5 | - To install, put paq8pxd.exe somewhere in your PATH. 6 | - To compress: paq8pxd [-N] file1 [file2...] 7 | - To decompress: paq8pxd [-d] file1.paq8pxd [dir2] 8 | - To view contents: more < file1.paq8pxd 9 | 10 | The compressed output file is named by adding ".paq8pxd" extension to 11 | the first named file (file1.paq8pxd). Each file that exists will be 12 | added to the archive and its name will be stored without a path. 13 | The option -N specifies a compression level ranging from -0 14 | (fastest) to -8 (smallest). The default is -5. If there is 15 | no option and only one file, then the program will pause when 16 | finished until you press the ENTER key (to support drag and drop). 17 | If file1.paq8pxd exists then it is overwritten. 
18 | 19 | If the first named file ends in ".paq8pxd" then it is assumed to be 20 | an archive and the files within are extracted to the same directory 21 | as the archive unless a different directory (dir2) is specified. 22 | The -d option forces extraction even if there is not a ".paq8pxd" 23 | extension. If any output file already exists, then it is compared 24 | with the archive content and the first byte that differs is reported. 25 | No files are overwritten or deleted. If there is only one argument 26 | (no -d or dir2) then the program will pause when finished until 27 | you press ENTER. 28 | 29 | For compression, if any named file is actually a directory, then all 30 | files and subdirectories are compressed, preserving the directory 31 | structure, except that empty directories are not stored, and file 32 | attributes (timestamps, permissions, etc.) are not preserved. 33 | During extraction, directories are created as needed. For example: 34 | 35 | paq8pxd -4 c:\tmp\foo bar 36 | 37 | compresses foo and bar (if they exist) to c:\tmp\foo.paq8pxd at level 4. 38 | 39 | paq8pxd -d c:\tmp\foo.paq8pxd . 40 | 41 | extracts foo and compares bar in the current directory. If foo and bar 42 | are directories then their contents are extracted/compared. 43 | 44 | There are no commands to update an existing archive or to extract 45 | part of an archive. Files and archives larger than 2GB are not 46 | supported (but might work on 64-bit machines, not tested). 47 | File names with nonprintable characters are not supported (spaces 48 | are OK). 49 | 50 | 51 | # TO COMPILE 52 | 53 | There are 2 files: paq8pxd.cpp (C++) and paq7asm.asm (NASM/YASM). 54 | paq7asm.asm is the same as in paq7 and paq8x. 
paq8pxd.cpp recognizes the 55 | following compiler options: 56 | 57 | * -DWINDOWS (to compile in Windows) 58 | * -DUNIX (to compile in Unix, Linux, Solaris, MacOS/Darwin, etc) 59 | * -DNOASM (to replace paq7asm.asm with equivalent C++) 60 | * -DDEFAULT_OPTION=N (to change the default compression level from 5 to N). 61 | 62 | If you compile without -DWINDOWS or -DUNIX, you can still compress files, 63 | but you cannot compress directories or create them during extraction. 64 | You can extract directories if you manually create the empty directories 65 | first. 66 | 67 | Use -DDEFAULT_OPTION=N to change the default compression level to support 68 | drag and drop on machines with less than 256 MB of memory. Use 69 | -DDEFAULT_OPTION=4 for 128 MB, 3 for 64 MB, 2 for 32 MB, etc. 70 | 71 | Use -DNOASM for non x86-32 machines, or older than a Pentium-MMX (about 72 | 1997), or if you don't have NASM or YASM to assemble paq7asm.asm. The 73 | program will still work but it will be slower. For NASM in Windows, 74 | use the options "--prefix _" and either "-f win32" or "-f obj" depending 75 | on your C++ compiler. In Linux, use "-f elf". 76 | 77 | Recommended compiler commands and optimizations: 78 | 79 | UNIX/Linux (PC): 80 | g++ paq8pxd.cpp -DUNIX -O3 81 | 82 | (MinGW produces faster executables than Borland or Mars, but Intel 9 83 | is about 4% faster than MinGW). 84 | 85 | 86 | # ARCHIVE FILE FORMAT 87 | 88 | An archive has the following format. It is intended to be both 89 | human and machine readable. The header ends with CTRL-Z (Windows EOF) 90 | so that the binary compressed data is not displayed on the screen. 91 | 92 | paq8pxd -N CR LF 93 | size TAB filename CR LF 94 | size TAB filename CR LF 95 | ... 96 | CTRL-Z 97 | compressed binary data 98 | 99 | -N is the option (-0 to -9), even if a default was used. 100 | Plain file names are stored without a path. 
Files in compressed 101 | directories are stored with path relative to the compressed directory 102 | (using UNIX style forward slashes "/"). For example, given these files: 103 | 104 | 123 C:\dir1\file1.txt 105 | 456 C:\dir2\file2.txt 106 | 107 | Then 108 | 109 | paq8pxd archive \dir1\file1.txt \dir2 110 | 111 | will create archive.paq8pxd with the header: 112 | 113 | paq8pxd -5 114 | 123 file1.txt 115 | 456 dir2/file2.txt 116 | 117 | The command: 118 | 119 | paq8pxd archive.paq8pxd C:\dir3 120 | 121 | will create the files: 122 | 123 | C:\dir3\file1.txt 124 | C:\dir3\dir2\file2.txt 125 | 126 | Decompression will fail if the first 9 bytes are not "paq8pxd -". Sizes 127 | are stored as decimal numbers. CR, LF, TAB, CTRL-Z are ASCII codes 128 | 13, 10, 9, 26 respectively. 129 | 130 | 131 | # ARITHMETIC CODING 132 | 133 | The binary data is arithmetic coded as the shortest base 256 fixed point 134 | number x = SUM_i x_i 256^-1-i such that p(= 16. 211 | 212 | The primary output is t_i := stretch(sm(n0,n1,h)), where sm(.) is 213 | a stationary map with K = 1/256, initialized to 214 | sm(n0,n1,h) = (n1+(1/64))/(n+2/64). Four additional inputs are also 215 | computed to improve compression slightly: 216 | 217 | p1_i = sm(n0,n1,h) 218 | p0_i = 1 - p1_i 219 | t_i := stretch(p_1) 220 | t_i+1 := K1 (p1_i - p0_i) 221 | t_i+2 := K2 stretch(p1) if n0 = 0, -K2 stretch(p1) if n1 = 0, else 0 222 | t_i+3 := K3 (-p0_i if n1 = 0, p1_i if n0 = 0, else 0) 223 | t_i+4 := K3 (-p0_i if n0 = 0, p1_i if n1 = 0, else 0) 224 | 225 | where K1..K4 are ad-hoc constants. 226 | 227 | h is updated as follows: 228 | If n < 4, append y_j to h. 229 | Else if n <= 16, set h := y_j. 230 | Else h = 0. 231 | 232 | The update rule is biased toward newer data in a way that allows 233 | n0 or n1, but not both, to grow large by discarding counts of the 234 | opposite bit. Large counts are incremented probabilistically. 
235 | Specifically, when y_j = 0 then the update rule is: 236 | 237 | n0 := n0 + 1, n < 29 238 | n0 + 1 with probability 2^(27-n0)/2 else n0, 29 <= n0 < 41 239 | n0, n = 41. 240 | n1 := n1, n1 <= 5 241 | round(8/3 lg n1), if n1 > 5 242 | 243 | swapping (n0,n1) when y_j = 1. 244 | 245 | Furthermore, to allow an 8 bit representation for (n0,n1,h), states 246 | exceeding the following values of n0 or n1 are replaced with the 247 | state with the closest ratio n0:n1 obtained by decrementing the 248 | smaller count: (41,0,h), (40,1,h), (12,2,h), (5,3,h), (4,4,h), 249 | (3,5,h), (2,12,h), (1,40,h), (0,41,h). For example: 250 | (12,2,1) 0-> (7,1,0) because there is no state (13,2,0). 251 | 252 | - Match Model. The state is (c,b), initially (0,0), where c is 1 if 253 | the context was previously seen, else 0, and b is the next bit in 254 | this context. The prediction is: 255 | 256 | t_i := (2b - 1)Kc log(m + 1) 257 | 258 | where m is the length of the context. The update rule is c := 1, 259 | b := y_j. A match model can be implemented efficiently by storing 260 | input in a buffer and storing pointers into the buffer into a hash 261 | table indexed by context. Then c is indicated by a hash table entry 262 | and b can be retrieved from the buffer. 263 | 264 | 265 | # CONTEXTS 266 | 267 | High compression is achieved by combining a large number of contexts. 268 | Most (not all) contexts start on a byte boundary and end on the bit 269 | immediately preceding the predicted bit. The contexts below are 270 | modeled with both a run map and a nonstationary map unless indicated. 271 | 272 | - Order n. The last n bytes, up to about 16. For general purpose data. 273 | Most of the compression occurs here for orders up to about 6. 274 | An order 0 context includes only the 0-7 bits of the partially coded 275 | byte and the number of these bits (255 possible values). 276 | 277 | - Sparse. 
Usually 1 or 2 of the last 8 bytes preceding the byte containing 278 | the predicted bit, e.g. (2), (3),..., (8), (1,3), (1,4), (1,5), (1,6), 279 | (2,3), (2,4), (3,6), (4,8). The ordinary order 1 and 2 context, (1) 280 | or (1,2) are included above. Useful for binary data. 281 | 282 | - Text. Contexts consist of whole words (a-z, converted to lower case 283 | and skipping other values). Contexts may be sparse, e.g. (0,2) meaning 284 | the current (partially coded) word and the second word preceding the 285 | current one. Useful contexts are (0), (0,1), (0,1,2), (0,2), (0,3), 286 | (0,4). The preceding byte may or may not be included as context in the 287 | current word. 288 | 289 | - Formatted text. The column number (determined by the position of 290 | the last linefeed) is combined with other contexts: the character to 291 | the left and the character above it. 292 | 293 | - Fixed record length. The record length is determined by searching for 294 | byte sequences with a uniform stride length. Once this is found, then 295 | the record length is combined with the context of the bytes immediately 296 | preceding it and the corresponding byte locations in the previous 297 | one or two records (as with formatted text). 298 | 299 | - Context gap. The distance to the previous occurrence of the order 1 300 | or order 2 context is combined with other low order (1-2) contexts. 301 | 302 | - FAX. For 2-level bitmapped images. Contexts are the surrounding 303 | pixels already seen. Image width is assumed to be 1728 bits (as 304 | in calgary/pic). 305 | 306 | - Image. For uncompressed 24-bit color BMP, TIFF and TGA images. Contexts 307 | are the high order bits of the surrounding pixels and linear 308 | combinations of those pixels, including other color planes. The 309 | image width is detected from the file header. When an image is 310 | detected, other models are turned off to improve speed. 311 | 312 | - JPEG. 
Files are further compressed by partially uncompressing back 313 | to the DCT coefficients to provide context for the next Huffman code. 314 | Only baseline DCT-Huffman coded files are modeled. (This is about 315 | 90% of images, the others are usually progressive coded). JPEG images 316 | embedded in other files (quite common) are detected by headers. The 317 | baseline JPEG coding process is: 318 | - Convert to grayscale and 2 chroma colorspace. 319 | - Sometimes downsample the chroma images 2:1 or 4:1 in X and/or Y. 320 | - Divide each of the 3 images into 8x8 blocks. 321 | - Convert using 2-D discrete cosine transform (DCT) to 64 12-bit signed 322 | coefficients. 323 | - Quantize the coefficients by integer division (lossy). 324 | - Split the image into horizontal slices coded independently, separated 325 | by restart codes. 326 | - Scan each block starting with the DC (0,0) coefficient in zigzag order 327 | to the (7,7) coefficient, interleaving the 3 color components in 328 | order to scan the whole image left to right starting at the top. 329 | - Subtract the previous DC component from the current in each color. 330 | - Code the coefficients using RS codes, where R is a run of R zeros (0-15) 331 | and S indicates 0-11 bits of a signed value to follow. (There is a 332 | special RS code (EOB) to indicate the rest of the 64 coefficients are 0). 333 | - Huffman code the RS symbol, followed by S literal bits. 334 | The most useful contexts are the current partially coded Huffman code 335 | (including S following bits) combined with the coefficient position 336 | (0-63), color (0-2), and last few RS codes. 337 | 338 | - Match. When a context match of 400 bytes or longer is detected, 339 | the next bit of the match is predicted and other models are turned 340 | off to improve speed. 341 | 342 | - Exe. 
When an x86 file (.exe, .obj, .dll) is detected, sparse contexts 343 | with gaps of 1-12 selecting only the prefix, opcode, and the bits 344 | of the modR/M byte that are relevant to parsing are selected. 345 | This model is turned off otherwise. 346 | 347 | - Indirect. The history of the last 1-3 bytes in the context of the 348 | last 1-2 bytes is combined with this 1-2 byte context. 349 | 350 | - DMC. A bitwise n-th order context is built from a state machine using 351 | DMC, described in http://plg.uwaterloo.ca/~ftp/dmc/dmc.c 352 | The effect is to extend a single context, one bit at a time and predict 353 | the next bit based on the history in this context. The model here differs 354 | in that two predictors are used. One is a pair of counts as in the original 355 | DMC. The second predictor is a bit history state mapped adaptively to 356 | a probability as in a Nonstationary Map. 357 | 358 | # ARCHITECTURE 359 | 360 | The context models are mixed by several (out of several hundred) neural networks 361 | selected by a low-order context. The outputs of these networks are 362 | combined using a second neural network, then fed through several stages of 363 | adaptive probability maps (APM) before arithmetic coding. 364 | 365 | For images, only one neural network is used and its context is fixed. 366 | 367 | An APM is a stationary map combining a context and an input probability. 368 | The input probability is stretched and divided into 32 segments to 369 | combine with other contexts. The output is interpolated between two 370 | adjacent quantized values of stretch(p1). There are 2 APM stages in series: 371 | 372 | p1 := (p1 + 3 APM(order 0, p1)) / 4. 373 | p1 := (APM(order 1, p1) + 2 APM(order 2, p1) + APM(order 3, p1)) / 4. 374 | 375 | # PREPROCESSING 376 | 377 | paq8pxd uses preprocessing transforms on certain data types to improve 378 | compression. 
To improve reliability, the decoding transform is 379 | tested during compression to ensure that the input file can be 380 | restored. If the decoder output is not identical to the input file 381 | due to a bug, then the transform is abandoned and the data is compressed 382 | without a transform so that it will still decompress correctly. 383 | 384 | The input is split into blocks with the format `<type> <decoded size> <data>` 385 | where `<type>` is 1 byte (0 = no transform), `<decoded size>` is the size 386 | of the data after decoding, which may be different than the size of `<data>`. 387 | Blocks do not span file boundaries, and have a maximum size of 4MB to 388 | 2GB depending on compression level. Large files are split into blocks 389 | of this size. The preprocessor has 3 parts: 390 | 391 | - Detector. Splits the input into smaller blocks depending on data type. 392 | 393 | - Coder. Input is a block to be compressed. Output is a temporary 394 | file. The coder determines whether a transform is to be applied 395 | based on file type, and if so, which one. A coder may use lots 396 | of resources (memory, time) and make multiple passes through the 397 | input file. The file type is stored (as one byte) during compression. 398 | 399 | - Decoder. Performs the inverse transform of the coder. It uses few 400 | resources (fast, low memory) and runs in a single pass (stream oriented). 401 | It takes input either from a file or the arithmetic decoder. Each call 402 | to the decoder returns a single decoded byte. 403 | 404 | The following transforms are used: 405 | 406 | - EXE: CALL (0xE8) and JMP (0xE9) address operands are converted from 407 | relative to absolute address. The transform is to replace the sequence 408 | E8/E9 xx xx xx 00/FF by adding file offset modulo 2^25 (signed range, 409 | little-endian format). 
Data to transform is identified by trying the 410 | transform and applying a crude compression test: testing whether the 411 | byte following the E8/E9 (LSB of the address) occurred more recently 412 | in the transformed data than the original and within 4KB 4 times in 413 | a row. The block ends when this does not happen for 4KB. 414 | 415 | - JPEG: detected by SOI and SOF and ending with EOI or any nondecodable 416 | data. No transform is applied. The purpose is to separate images 417 | embedded in executables to block the EXE transform, and for a future 418 | place to insert a transform. 419 | 420 | 421 | # IMPLEMENTATION 422 | 423 | Hash tables are designed to minimize cache misses, which consume most 424 | of the CPU time. 425 | 426 | Most of the memory is used by the nonstationary context models. 427 | Contexts are represented by 32 bits, possibly a hash. These are 428 | mapped to a bit history, represented by 1 byte. The hash table is 429 | organized into 64-byte buckets on cache line boundaries. Each bucket 430 | contains 7 x 7 bit histories, 7 16-bit checksums, and a 2 element LRU 431 | queue packed into one byte. Each 7 byte element represents 7 histories 432 | for a context ending on a 3-bit boundary plus 0-2 more bits. One 433 | element (for bits 0-1, which have 4 unused bytes) also contains a run model 434 | consisting of the last byte seen and a count (as 1 byte each). 435 | 436 | Run models use 4 byte hash elements consisting of a 2 byte checksum, a 437 | repeat count (0-255) and the byte value. The count also serves as 438 | a priority. 439 | 440 | Stationary models are most appropriate for small contexts, so the 441 | context is used as a direct table lookup without hashing. 442 | 443 | The match model maintains a pointer to the last match until a mismatching 444 | bit is found. At the start of the next byte, the hash table is referenced 445 | to find another match. The hash table of pointers is updated after each 446 | whole byte. 
There is no checksum. Collisions are detected by comparing 447 | the current and matched context in a rotating buffer. 448 | 449 | The inner loops of the neural network prediction (1) and training (2) 450 | algorithms are implemented in MMX assembler, which computes 4 elements 451 | at a time. Using assembler is 8 times faster than C++ for this code 452 | and 1/3 faster overall. (However I found that SSE2 code on an AMD-64, 453 | which computes 8 elements at a time, is not any faster). 454 | 455 | 456 | # DIFFERENCES FROM PAQ8PXD_V5 457 | 458 | * changes in wrt, use 0-9 in dict if count is larger than a-zA-Z 459 | * 8-bit image model changes 460 | * base64 changes 461 | * contextmap from Tangelo 462 | * fixes in DMC model 463 | * cleanup of unused variables 464 | * fixes in wordmodel 465 | etc. 466 | -------------------------------------------------------------------------------- /wrtpre.cpp: -------------------------------------------------------------------------------- 1 | 2 | // based on "XWRT 3.2 (29.10.2007) - XML compressor by P.Skibinski, inikep@gmail.com" 3 | // 4 | #include 5 | #include 6 | #pragma warning(disable:4786) 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #if defined WIN32 || defined WIN64 13 | #include 14 | #include 15 | #endif 16 | 17 | #define PRINT_CHARS(data) ;//printf data 18 | #define PRINT_CODEWORDS(data); // printf data 19 | #define PRINT_STACK(data) ;//printf data; 20 | #define PRINT_DICT(data) ;//printf data; 21 | #define PRINT_CONTAINERS(data) ;//printf data 22 | //#define PRINT_STATUS(data) printf data; 23 | 24 | #pragma warning(disable:4244) // '=' : conversion from ... to ..., possible loss of data 25 | #pragma warning(disable:4786) // STL warnings 26 | #pragma warning(disable:4996) // '_getch' was declared deprecated 27 | #pragma warning(disable:4503) // STL 28 | #pragma warning(disable:4390) // empty controlled statement found; is this the intent? 
29 | #pragma warning(disable:4018) // signed/unsigned mismatch 30 | #define _CRT_SECURE_NO_DEPRECATE // VC++ 2005 deprecate warnings 31 | 32 | 33 | #if defined WIN32 || defined WIN64 34 | #define getch _getch 35 | #else 36 | #define getch getchar 37 | #endif 38 | 39 | #define CHAR_FIRSTUPPER 1 // for encode lower word with first capital letter 40 | #define CHAR_UPPERWORD 2 // for encode upper word 41 | #define CHAR_ESCAPE 3 // for encode reserved chars (CHAR_ESCAPE,CHAR_FIRSTUPPER,...) 42 | #define BINARY_FIRST 128 43 | #define BINARY_LAST 255 44 | 45 | #define OPTION_TRY_SHORTER_WORD 4 46 | 47 | 48 | #if !defined min 49 | #define min(a,b) (((a)>(b))?(b):(a)) 50 | #endif 51 | #define IF_OPTION(option) (preprocFlag & option) //, printf("%d",option) 52 | #define OPTION(option) (xml_wrt.preprocFlag & option) 53 | #define TURN_OFF(option) {if (preprocFlag & option) preprocFlag-=option;} 54 | #define TURN_ON(option) {if ((preprocFlag & option)==0) preprocFlag+=option;} 55 | #define RESET_OPTIONS (preprocFlag=0) 56 | 57 | #define WORD_MIN_SIZE 2 58 | #define WORD_AVG_SIZE 8 59 | #define WORD_MAX_SIZE 48 60 | #define STRING_MAX_SIZE 255 // 1-byte for container.size() 61 | 62 | #define MAX_DYNAMIC_DICT_COUNT (65536*256) 63 | #define HASH_TABLE_SIZE (1<<20) //1MB*4 64 | 65 | //#define BYTES_TO_DETECT (50*1024) 66 | 67 | //#define NUM_BASE 256 68 | #define HASH_DOUBLE_MULT 37 69 | #define HASH_MULT 23 70 | //#define CHARSET_COUNT 6 71 | 72 | 73 | enum EWordType { LOWERWORD, FIRSTUPPER, UPPERWORD, VARWORD, NUMBER}; 74 | enum ELetterType { LOWERCHAR, UPPERCHAR, UNKNOWNCHAR, RESERVEDCHAR, NUMBERCHAR }; 75 | #define OUT_OF_MEMORY() \ 76 | { \ 77 | printf("Not enough memory!\n");\ 78 | exit(0); \ 79 | } 80 | FILE* XWRT_file; 81 | FILE* XWRT_fileout; 82 | unsigned char** dict=NULL; 83 | int* dictfreq=NULL; 84 | unsigned char* dictlen=NULL; 85 | int wrtnum=0; 86 | #define PUTC(c) { putc(c,XWRT_fileout); } 87 | #define GETC(c) { c=getc(XWRT_file); } 88 | size_t fread_fast(unsigned 
char* dst, int len, FILE* file); 89 | size_t fwrite_fast(unsigned char* dst, int len, FILE* file); 90 | 91 | 92 | ///////////////////////////////////////////////////////// 93 | 94 | 95 | #define OUTPUT_BUFFER_MIN_SIZE 10240 96 | 97 | 98 | // Input/Output using dynamic memory allocation 99 | class CMemoryBuffer 100 | { 101 | public: 102 | CMemoryBuffer(std::string mname=""); 103 | ~CMemoryBuffer(); 104 | 105 | void OutTgtByte( unsigned char c ); 106 | int InpSrcByte( void ); 107 | inline int Size(); 108 | inline int Allocated(); 109 | inline void AllocSrcBuf( unsigned int len ); 110 | inline void Clear(); 111 | 112 | static unsigned int memsize; 113 | unsigned char* TargetBuf; 114 | unsigned char* SourceBuf; 115 | unsigned int SrcLen, TgtLen; 116 | unsigned int SrcPtr, TgtPtr; 117 | std::string name; 118 | 119 | private: 120 | inline void AllocTgtBuf( unsigned int len = OUTPUT_BUFFER_MIN_SIZE ); 121 | inline void ReallocTgtBuf(unsigned int len); 122 | }; 123 | 124 | class CContainers 125 | { 126 | public: 127 | CContainers(); 128 | void prepareMemBuffers(); 129 | void writeMemBuffers(int preprocFlag); 130 | void readMemBuffers(int preprocFlag, int maxMemSize); 131 | void freeMemBuffers(bool freeMem); 132 | 133 | CMemoryBuffer *memout; 134 | unsigned char *bigBuffer; 135 | 136 | private: 137 | std::vector mem_stack; 138 | std::map memmap; 139 | }; 140 | 141 | unsigned int CMemoryBuffer::memsize=0; 142 | 143 | CMemoryBuffer::CMemoryBuffer(std::string mname) 144 | { 145 | name=mname; 146 | Clear(); 147 | AllocTgtBuf(); 148 | }; 149 | 150 | CMemoryBuffer::~CMemoryBuffer() 151 | { 152 | if (TargetBuf) 153 | free(TargetBuf-3); 154 | 155 | if (SourceBuf) 156 | free(SourceBuf); 157 | }; 158 | 159 | inline void CMemoryBuffer::Clear() 160 | { 161 | TargetBuf=NULL; SourceBuf=NULL; SrcPtr=0; TgtPtr=0; SrcLen=0; TgtLen=0; 162 | } 163 | 164 | inline int CMemoryBuffer::Size() 165 | { 166 | return TgtPtr; 167 | } 168 | 169 | inline int CMemoryBuffer::Allocated() 170 | { 171 | 
return TgtLen; 172 | } 173 | 174 | void CMemoryBuffer::OutTgtByte( unsigned char c ) 175 | { 176 | memsize++; 177 | 178 | *(TargetBuf+(TgtPtr++))=c; 179 | if (TgtPtr>TgtLen-1){ 180 | if (TgtLen > (1<<19)) // 512 KB 181 | ReallocTgtBuf(TgtLen+(1<<19)); 182 | else 183 | ReallocTgtBuf(TgtLen*2); 184 | } 185 | } 186 | 187 | int CMemoryBuffer::InpSrcByte( void ) 188 | { 189 | memsize++; 190 | 191 | if (SrcPtr>=SrcLen) 192 | return EOF; 193 | 194 | return *(SourceBuf+(SrcPtr++)); 195 | } 196 | 197 | inline void CMemoryBuffer::AllocSrcBuf( unsigned int len ){ 198 | SrcLen = len; 199 | SourceBuf = (unsigned char*) malloc(SrcLen); 200 | if (SourceBuf==NULL) 201 | OUT_OF_MEMORY(); 202 | } 203 | 204 | inline void CMemoryBuffer::AllocTgtBuf( unsigned int len ){ 205 | TgtLen = len; 206 | TargetBuf = (unsigned char*) malloc(len+6); 207 | if (TargetBuf==NULL) 208 | OUT_OF_MEMORY(); 209 | TargetBuf += 3; 210 | } 211 | /* Replace the target buffer with a new one of capacity `len`, copying the min(TgtPtr,len) bytes written so far. TargetBuf points 3 bytes into a malloc(len+6) block, so the old block is released with free(TargetBuf-3), the same way the destructor releases it. */ 212 | inline void CMemoryBuffer::ReallocTgtBuf(unsigned int len){ 213 | unsigned char* NewTargetBuf = (unsigned char*) malloc(len+6); 214 | 215 | if (NewTargetBuf==NULL) 216 | OUT_OF_MEMORY(); 217 | 218 | NewTargetBuf += 3; 219 | memcpy(NewTargetBuf,TargetBuf,min(TgtPtr,len)); 220 | TgtLen = len; 221 | free(TargetBuf-3); /* was delete(): memory obtained from malloc() must be released with free() */ 222 | TargetBuf=NewTargetBuf; 223 | } 224 | 225 | CContainers::CContainers() : bigBuffer(NULL) {}; 226 | 227 | void CContainers::prepareMemBuffers(){ 228 | memout=new CMemoryBuffer(); 229 | std::pair p("!data",memout); 230 | memmap.insert(p); 231 | } 232 | 233 | void CContainers::writeMemBuffers(int preprocFlag){ 234 | std::map::iterator it; 235 | 236 | int fileLen=0; 237 | //int len=0; 238 | //int lenCompr=0; 239 | //int allocated=0; 240 | 241 | for (it=memmap.begin(); it!=memmap.end(); it++) 242 | { 243 | CMemoryBuffer* b=it->second; 244 | fileLen=b->Size(); 245 | 246 | PRINT_CONTAINERS(("cont=%s fileLen=%d\n",it->first.c_str(),fileLen)); 247 | 248 | if (fileLen>0) 249 | { 250 | // allocated+=b->Allocated(); 251 | // len+=fileLen; 252 | 253 | 
PUTC((int)it->first.size()); 254 | for (int i=0; i<(int)it->first.size(); i++) 255 | PUTC(it->first[i]); 256 | 257 | PUTC(fileLen>>24); 258 | PUTC(fileLen>>16); 259 | PUTC(fileLen>>8); 260 | PUTC(fileLen); 261 | 262 | fwrite_fast(it->second->TargetBuf,it->second->TgtPtr,XWRT_fileout); 263 | // lenCompr+=fileLen; 264 | } 265 | } 266 | PUTC(0) 267 | //PRINT_DICT(("dataSize=%d compr=%d allocated=%d\n",len,lenCompr,allocated)); 268 | 269 | freeMemBuffers(true); 270 | prepareMemBuffers(); 271 | } 272 | 273 | void CContainers::readMemBuffers(int preprocFlag, int maxMemSize){ 274 | //unsigned char* buf=NULL; 275 | // unsigned int bufLen=0; 276 | unsigned int fileLen; 277 | // unsigned int ui; 278 | // int len=0; 279 | // int lenCompr=0; 280 | int i,c; 281 | unsigned char s[STRING_MAX_SIZE]; 282 | 283 | freeMemBuffers(true); 284 | prepareMemBuffers(); 285 | CMemoryBuffer* memout_tmp=NULL; 286 | 287 | while (true){ 288 | GETC(i); 289 | 290 | if (i<=0) 291 | break; 292 | 293 | for (c=0; c p(str,memout_tmp); 307 | memmap.insert(p); 308 | } 309 | 310 | int c; 311 | for (i=0, fileLen=0; i<4; i++){ 312 | GETC(c); 313 | fileLen=fileLen*256+c; 314 | } 315 | 316 | //len+=fileLen; 317 | //lenCompr+=fileLen; 318 | memout_tmp->AllocSrcBuf(fileLen); 319 | 320 | fread_fast(memout_tmp->SourceBuf,memout_tmp->SrcLen,XWRT_file); 321 | 322 | //printStatus(fileLen,0,false); 323 | } 324 | //PRINT_DICT(("readMemBuffers() dataSize=%d compr=%d allocated=%d\n",len,lenCompr,maxMemSize+10240)); 325 | } 326 | 327 | void CContainers::freeMemBuffers(bool freeMem){ 328 | mem_stack.clear(); 329 | 330 | std::map::iterator it; 331 | 332 | for (it=memmap.begin(); it!=memmap.end(); it++) 333 | { 334 | if (!freeMem) 335 | it->second->Clear(); 336 | delete(it->second); 337 | } 338 | 339 | memmap.clear(); 340 | } 341 | 342 | ///////////////////////////////////////////////////////// 343 | 344 | ///////////////////////////////////////////////////////// 345 | 346 | class XWRT_Common 347 | { 348 | public: 349 | 
XWRT_Common(int fileBufferSize=17); // 128 kb 350 | ~XWRT_Common(); 351 | 352 | void defaultSettings(int n); 353 | unsigned int flen( FILE* &f ); 354 | 355 | CContainers cont; 356 | int preprocFlag; 357 | 358 | protected: 359 | 360 | inline void stringHash(const unsigned char *ptr, int len,int& hash); 361 | int addWord(unsigned char* &mem,int &i); 362 | unsigned char* loadDynamicDictionary(unsigned char* mem,unsigned char* mem_end); 363 | void initializeLetterSet(); 364 | void initializeCodeWords(int word_count,bool initMem=true); 365 | bool initialize(bool encoding); 366 | void WRT_deinitialize(); 367 | 368 | void WRT_print_options(); 369 | int minSpacesFreq(); 370 | 371 | int* word_hash; 372 | bool decoding,fileCorrupted,detect,firstWarn; 373 | int maxDynDictBuf,minWordFreq,maxDictSize; 374 | int tryShorterBound,spaces,fileLenMB,beforeWord; 375 | int spacesCodeword[256]; 376 | int spacesCont[256]; 377 | std::vector sortedDict; 378 | 379 | ELetterType letterType; 380 | ELetterType letterSet[256]; 381 | 382 | int sizeDict,sizeDynDict; 383 | unsigned char* dictmem; 384 | unsigned char* dictmem_end; 385 | unsigned char* mem; 386 | 387 | int addSymbols[256]; // reserved symbols in output alphabet 388 | int reservedSet[256]; // reserved symbols in input alphabet 389 | int outputSet[256]; 390 | int wordSet[256]; 391 | int sym2codeword[256]; 392 | int codeword2sym[256]; 393 | 394 | int dictionary,dict1size,dict2size,dict3size,dict4size,dict1plus2plus3,dict1plus2; 395 | int bound4,bound3,dict123size,dict12size,collision,quoteOpen,quoteClose,detectedSym; 396 | int maxMemSize; 397 | int sortedDictSize; 398 | 399 | 400 | public: 401 | }; 402 | 403 | XWRT_Common::XWRT_Common(int fileBufferSize) : dictmem(NULL), 404 | detect(false), dictmem_end(NULL),fileCorrupted(false) 405 | { 406 | if (fileBufferSize<10) 407 | fileBufferSize=10; // 1 KB 408 | if (fileBufferSize>23) 409 | fileBufferSize=23; // 8 MB 410 | word_hash=new int[HASH_TABLE_SIZE]; 411 | if (!word_hash) 412 | 
OUT_OF_MEMORY(); 413 | } 414 | XWRT_Common::~XWRT_Common(){ 415 | if (word_hash) 416 | delete[] word_hash; // word_hash comes from new int[HASH_TABLE_SIZE], so it needs the array form of delete 417 | WRT_deinitialize(); 418 | } 419 | int XWRT_Common::minSpacesFreq(){ 420 | return 300+200*(fileLenMB/5); 421 | } 422 | 423 | // make hash from string: multiply-and-add over len bytes, reduced to a word_hash index. Accumulated in unsigned so the multiply wraps instead of overflowing a signed int; the masked low bits are unchanged. 424 | inline void XWRT_Common::stringHash(const unsigned char *ptr, int len,int& hash){ 425 | unsigned int h=0; for (; len>0; len--, ptr++){ 426 | h *= HASH_MULT; 427 | h += *ptr; 428 | } 429 | hash=(int)(h&(HASH_TABLE_SIZE-1)); 430 | } 431 | int XWRT_Common::addWord(unsigned char* &mem,int &i){ 432 | int c,j; 433 | if (i<=1 || sizeDict>=dictionary) 434 | return -1; 435 | 436 | dictlen[sizeDict]=i; 437 | dict[sizeDict]=mem; 438 | 439 | mem[i]=0; 440 | stringHash(mem,i,j); 441 | 442 | if (word_hash[j]!=0) 443 | { 444 | if (dictlen[sizeDict]!=dictlen[word_hash[j]] || memcmp(dict[sizeDict],dict[word_hash[j]],dictlen[sizeDict])!=0) 445 | { 446 | c=(j+i*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1); 447 | if (word_hash[c]!=0) 448 | { 449 | if (dictlen[sizeDict]!=dictlen[word_hash[c]] || memcmp(dict[sizeDict],dict[word_hash[c]],dictlen[sizeDict])!=0) 450 | { 451 | c=(j+i*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1); 452 | if (word_hash[c]!=0) 453 | { 454 | collision++; 455 | return -1; 456 | } 457 | else 458 | { 459 | word_hash[c]=sizeDict++; 460 | } 461 | } 462 | else 463 | return -1; // word already exists 464 | } 465 | else 466 | { 467 | word_hash[c]=sizeDict++; 468 | } 469 | } 470 | else 471 | return -1; // word already exists 472 | } 473 | else 474 | { 475 | word_hash[j]=sizeDict++; 476 | } 477 | return 1; 478 | } 479 | unsigned char* XWRT_Common::loadDynamicDictionary(unsigned char* mem,unsigned char* mem_end){ 480 | int i; 481 | for (i=0; i<256; i++) 482 | spacesCodeword[i]=0; 483 | int count=sortedDictSize; 484 | for (i=0; imem_end) 493 | break; 494 | } 495 | /*if (mem127 || letterSet[c]==LOWERCHAR || letterSet[c]==UPPERCHAR || (letterSet[c]==NUMBERCHAR && wrtnum==1) ||c==' ' /*|| c=='\''*/) // || c=='&') 522 | 
wordSet[c]=1; 523 | else 524 | wordSet[c]=0; 525 | } 526 | void XWRT_Common::initializeCodeWords(int word_count,bool initMem){ 527 | int c,charsUsed,i; 528 | detectedSym=0; 529 | for (c=0; c<256; c++){ 530 | addSymbols[c]=0; 531 | codeword2sym[c]=0; 532 | sym2codeword[c]=0; 533 | reservedSet[c]=0; 534 | outputSet[c]=0; 535 | } 536 | for (c=0; c<256; c++){ 537 | if (c==CHAR_ESCAPE || c==CHAR_FIRSTUPPER || c==CHAR_UPPERWORD ) 538 | { 539 | reservedSet[c]=1; 540 | addSymbols[c]=0; 541 | } 542 | } 543 | for (c=0; c<256; c++) 544 | if (addSymbols[c]) 545 | reservedSet[c]=1; 546 | initializeLetterSet(); 547 | for (c=BINARY_FIRST; c<=BINARY_LAST; c++) 548 | addSymbols[c]=1; 549 | 550 | for (c=0; c<256; c++){ 551 | if (reservedSet[c] || addSymbols[c]) 552 | outputSet[c]=1; 553 | } 554 | charsUsed=0; 555 | for (c=0; c<256; c++){ 556 | if (addSymbols[c]){ 557 | codeword2sym[c]=charsUsed; 558 | sym2codeword[charsUsed]=c; 559 | charsUsed++; 560 | { 561 | if (c<128+64) 562 | dict1size=charsUsed; 563 | if (c<128+64+32) 564 | dict2size=charsUsed; 565 | if (c<128+64+32+16) 566 | dict3size=charsUsed; 567 | if (c<128+64+32+16+16) 568 | dict4size=charsUsed; 569 | } 570 | } 571 | } 572 | c=word_count; 573 | 574 | dict4size-=dict3size; 575 | dict3size-=dict2size; 576 | dict2size-=dict1size; 577 | if (dict1size<4 || dict2size<4 || dict3size<4 || dict4size<4){ 578 | dict2size=dict3size=dict4size=charsUsed/4; 579 | dict1size=charsUsed-dict4size*3; 580 | for (i=0; ic){ 582 | dict1size=charsUsed-i*3; 583 | dict2size=i; 584 | dict3size=i; 585 | dict4size=i; 586 | break; 587 | } 588 | } 589 | } 590 | 591 | dictionary=(dict1size*dict2size*dict3size*dict4size+dict1size*dict2size*dict3size+dict1size*dict2size+dict1size); 592 | bound4=dict1size*dict2size*dict3size+dict1size*dict2size+dict1size; 593 | bound3=dict1size*dict2size+dict1size; 594 | dict123size=dict1size*dict2size*dict3size; 595 | dict12size=dict1size*dict2size; 596 | 597 | dict1plus2=dict1size+dict2size; 598 | 
dict1plus2plus3=dict1size+dict2size+dict3size; 599 | if (initMem){ 600 | dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1); 601 | dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1); 602 | if (!dict || !dictlen) 603 | OUT_OF_MEMORY(); 604 | } 605 | PRINT_DICT((" %d %d %d %d(%d) charsUsed=%d sizeDict=%d\n",dict1size,dict2size,dict3size,dict4size,dictionary,charsUsed,sizeDict)); 606 | } 607 | // read dictionary from files to arrays 608 | bool XWRT_Common::initialize(bool encoding){ 609 | // int fileLen; 610 | // FILE* file; 611 | WRT_deinitialize(); 612 | memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0])); 613 | 614 | 615 | dict123size=sortedDictSize; 616 | if (dict123size<20) 617 | dict123size=20; 618 | initializeCodeWords(dict123size); 619 | int dicsize=dictionary*WORD_AVG_SIZE*2; 620 | dictmem=(unsigned char*)calloc(dicsize,1); 621 | dictmem_end=dictmem+dicsize-256; 622 | PRINT_DICT(("allocated memory=%d\n",dicsize)); 623 | if (!dictmem) 624 | OUT_OF_MEMORY(); 625 | sizeDict=1; 626 | mem=loadDynamicDictionary(dictmem,dictmem_end); 627 | 628 | 629 | return true; 630 | } 631 | void XWRT_Common::WRT_deinitialize(){ 632 | if (dict){ 633 | free(dict); 634 | dict=NULL; 635 | } 636 | if (dictlen){ 637 | free(dictlen); 638 | dictlen=NULL; 639 | } 640 | if (dictmem){ 641 | free(dictmem); 642 | dictmem=NULL; 643 | } 644 | if (dictfreq){ 645 | free(dictfreq); 646 | dictfreq=NULL; 647 | } 648 | sizeDict=0; 649 | } 650 | 651 | void XWRT_Common::defaultSettings(int n){ 652 | RESET_OPTIONS; 653 | TURN_ON(OPTION_TRY_SHORTER_WORD); 654 | maxMemSize=8*1024*1024; 655 | maxDynDictBuf=8*4; 656 | maxDictSize=65535*32700; 657 | tryShorterBound=3;//4 658 | minWordFreq=7*2; //7*2 64; 659 | wrtnum=n; 660 | //printf("WRT: num: %d",wrtnum); 661 | //maxDictSize= //e 662 | //minWordFreq= // f 663 | //maxMemSize maxMemSize*=1024*1024;//m 664 | //maxDynDictBuf //b 665 | } 666 | 667 | size_t fread_fast(unsigned char* dst, int len, FILE* file){ 668 
| return fread(dst,1,len,file); 669 | /* int rd; 670 | size_t sum=0; 671 | while (len > 1<<17) // 128 kb 672 | { 673 | rd=fread(dst,1,1<<17,file); 674 | dst+=rd; 675 | len-=rd; 676 | sum+=rd; 677 | } 678 | sum+=fread(dst,1,len,file); 679 | return sum;*/ 680 | } 681 | size_t fwrite_fast(unsigned char* dst, int len, FILE* file){ 682 | return fwrite(dst,1,len,file); 683 | /*int wt; 684 | size_t sum=0; 685 | while (len > 1<<17) // 128 kb 686 | { 687 | wt=fwrite(dst,1,1<<17,file); 688 | dst+=wt; 689 | len-=wt; 690 | sum+=wt; 691 | } 692 | sum+=fwrite(dst,1,len,file); 693 | return sum;*/ 694 | } 695 | ////////////////////////////////////////// 696 | 697 | class XWRT_Decoder : public XWRT_Common 698 | { 699 | public: 700 | 701 | XWRT_Decoder(); 702 | ~XWRT_Decoder(); 703 | 704 | int WRT_start_decoding(FILE* in); 705 | int WRT_decode(); 706 | private: 707 | 708 | inline void toUpper(unsigned char* s,int &s_size); 709 | void read_dict(); 710 | inline int decodeCodeWord(unsigned char* &s,int& c); 711 | 712 | enum EUpperType { UFALSE, UTRUE, FORCE }; 713 | 714 | int s_size,WRTd_c; 715 | int last_c; 716 | bool WRTd_upper; 717 | bool WRTd_initialized; 718 | unsigned char WRTd_data[STRING_MAX_SIZE]; 719 | unsigned char *WRTd_s; 720 | EUpperType upperWord; 721 | 722 | public: 723 | }; // end class 724 | 725 | XWRT_Decoder::XWRT_Decoder() : WRTd_s(&WRTd_data[0]) 726 | { 727 | 728 | }; 729 | 730 | XWRT_Decoder::~XWRT_Decoder(){ 731 | if (cont.bigBuffer) 732 | { 733 | free(cont.bigBuffer); 734 | cont.bigBuffer=NULL; 735 | cont.freeMemBuffers(false); 736 | } 737 | else 738 | cont.freeMemBuffers(true); 739 | } 740 | 741 | #define DECODE_GETC(c)\ 742 | {\ 743 | if (cont.memout->memsize>maxMemSize) \ 744 | { \ 745 | PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \ 746 | cont.readMemBuffers(preprocFlag,maxMemSize); \ 747 | cont.memout->memsize=0; \ 748 | } \ 749 | \ 750 | c=cont.memout->InpSrcByte(); \ 751 | } 752 | 753 | // decode word using dictionary 754 | #define 
DECODE_WORD(dictNo,i)\ 755 | {\ 756 | i++;\ 757 | if (i>0 && i0){ 914 | s_sizep=1; 915 | rchar=WRTd_s[0];; 916 | last_c=rchar; 917 | return rchar; 918 | } 919 | } 920 | 921 | if (WRTd_c>='0' && WRTd_c<='9'){ 922 | //unsigned int no,mult; 923 | //int c,i; 924 | //no=0; 925 | //mult=1; 926 | //static int wType=0; 927 | rchar=WRTd_c; 928 | DECODE_GETC(WRTd_c); 929 | last_c=rchar; 930 | return rchar; 931 | } 932 | 933 | PRINT_CHARS(("other c=%d (%d) upperWord=%d\n",fileLenMB,upperWord)); 934 | 935 | if (upperWord!=UFALSE){ 936 | if (upperWord==FORCE) 937 | upperWord=UTRUE; 938 | 939 | if (WRTd_c>='a' && WRTd_c<='z') 940 | WRTd_c=toupper(WRTd_c); 941 | else 942 | upperWord=UFALSE; 943 | } 944 | else 945 | if (WRTd_upper){ 946 | WRTd_upper=false; 947 | WRTd_c=toupper(WRTd_c); 948 | } 949 | rchar=WRTd_c; 950 | last_c=rchar; 951 | DECODE_GETC(WRTd_c); 952 | return rchar; 953 | } 954 | } 955 | 956 | void XWRT_Decoder::read_dict(){ 957 | int i,c,count; 958 | unsigned char* bound=(unsigned char*)&word_hash[0] + HASH_TABLE_SIZE*sizeof(word_hash[0]) - 6; 959 | 960 | unsigned char* bufferData=(unsigned char*)&word_hash[0] + 3; 961 | 962 | for (i=0, count=0; i<3; i++){ 963 | GETC(c); 964 | count=count*256+c; 965 | } 966 | 967 | fread_fast(bufferData,count,XWRT_file); 968 | 969 | 970 | count=bufferData[0]; bufferData++; 971 | count+=256*bufferData[0]; bufferData++; 972 | count+=65536*bufferData[0]; bufferData++; 973 | 974 | sortedDict.clear(); 975 | 976 | PRINT_DICT(("count=%d\n",count)); 977 | 978 | std::string s; 979 | std::string last_s; 980 | for (i=0; i=128){ 982 | s.append(last_s.c_str(),bufferData[0]-128); 983 | bufferData++; 984 | } 985 | 986 | while (bufferData[0]!=10){ 987 | s.append(1,bufferData[0]); 988 | bufferData++; 989 | 990 | if (s.size()>WORD_MAX_SIZE || bufferData>bound) 991 | { 992 | //printf("File corrupted (s.size()>WORD_MAX_SIZE)!\n"); 993 | OUT_OF_MEMORY(); 994 | } 995 | } 996 | bufferData++; 997 | 998 | sortedDict.push_back(s); 999 | last_s=s; 1000 | 
s.erase(); 1001 | } 1002 | 1003 | sortedDictSize=(int)sortedDict.size(); 1004 | PRINT_DICT(("read_dict count2=%d\n",count)); 1005 | 1006 | } 1007 | 1008 | 1009 | int XWRT_Decoder::WRT_start_decoding(FILE* in){ 1010 | int c; 1011 | XWRT_file=in; 1012 | last_c=0; 1013 | WRTd_upper=false; 1014 | upperWord=UFALSE; 1015 | s_size=0; 1016 | collision=0; 1017 | 1018 | defaultSettings(0); 1019 | GETC(maxMemSize); 1020 | maxMemSize*=1024*1024; 1021 | int fileLen; 1022 | 1023 | GETC(c); 1024 | fileLen=c; 1025 | GETC(c); 1026 | fileLen=fileLen|(c<<8); 1027 | GETC(c); 1028 | fileLen=fileLen|(c<<16); 1029 | GETC(c); 1030 | fileLen=fileLen|(c<<24); 1031 | fileLenMB=fileLen/(1024*1024); 1032 | if (fileLenMB>255*256) 1033 | fileLenMB=255*256; 1034 | 1035 | PRINT_DICT(("maxMemSize=%d fileLenMB=%d\n",maxMemSize,fileLenMB)); 1036 | read_dict(); 1037 | 1038 | cont.readMemBuffers(preprocFlag,maxMemSize); 1039 | cont.memout->memsize=0; 1040 | 1041 | WRT_deinitialize(); 1042 | 1043 | decoding=true; 1044 | if (!initialize(false)) 1045 | return 0; 1046 | 1047 | DECODE_GETC(WRTd_c); 1048 | PRINT_CHARS(("WRT_start_decoding WRTd_c=%d ftell=%d\n",WRTd_c,ftell(XWRT_file))); 1049 | 1050 | return fileLen; 1051 | } 1052 | 1053 | 1054 | 1055 | ///////////////////////////////////////////////////////////////////// 1056 | /////////// 1057 | ////////////////////////////////////////////////////////////////////// 1058 | 1059 | 1060 | class XWRT_Encoder : public XWRT_Common 1061 | { 1062 | public: 1063 | 1064 | XWRT_Encoder(); 1065 | ~XWRT_Encoder(); 1066 | 1067 | void WRT_start_encoding(FILE* in, FILE* out,unsigned int fileLen,bool type_detected); 1068 | 1069 | private: 1070 | 1071 | void WRT_encode( int filelen); 1072 | inline void encodeCodeWord(int &i); 1073 | inline void encodeSpaces(); 1074 | inline void encodeWord(unsigned char* s,int s_size,EWordType wordType,int& c); 1075 | inline void encodeAsText(unsigned char* &s,int &s_size,EWordType wordType); 1076 | inline int findShorterWord(unsigned char* 
&s,int &s_size); 1077 | inline void toLower(unsigned char* s,int &s_size); 1078 | inline void toUpper(unsigned char* s,int &s_size); 1079 | inline void checkWord(unsigned char* &s,int &s_size,int& c); 1080 | 1081 | inline void checkHashExactly(unsigned char* &s,int &s_size,int& i); 1082 | inline int checkHash(unsigned char* &s,int &s_size,int h); 1083 | inline void stringHash(const unsigned char *ptr, int len,int& hash); 1084 | 1085 | //void encodeMixed(unsigned char* s,int s_size,int& c); 1086 | void sortDict(int size); 1087 | 1088 | void write_dict(); 1089 | int WRT_detectFileType(int filelen); 1090 | void WRT_detectFinish(); 1091 | 1092 | int s_size; 1093 | int last_c_bak,last_c,last_last_c; 1094 | int filelento; 1095 | 1096 | 1097 | unsigned char* dynmem; 1098 | unsigned char *dictbound; 1099 | 1100 | public: 1101 | }; // end class 1102 | 1103 | int compare_freq( const void *arg1, const void *arg2 ); 1104 | 1105 | 1106 | XWRT_Encoder::XWRT_Encoder() : last_c_bak(0),filelento(0) 1107 | { 1108 | }; 1109 | XWRT_Encoder::~XWRT_Encoder(){ 1110 | 1111 | } 1112 | #define ENCODE_PUTC(c)\ 1113 | { \ 1114 | if (!detect) \ 1115 | { \ 1116 | if (cont.memout->memsize>maxMemSize) \ 1117 | { \ 1118 | PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \ 1119 | cont.writeMemBuffers(preprocFlag); \ 1120 | cont.memout->memsize=0; \ 1121 | } \ 1122 | \ 1123 | PRINT_CHARS(("output=%d (%c)\n",c,c)); \ 1124 | cont.memout->OutTgtByte(c); \ 1125 | } \ 1126 | } 1127 | 1128 | #define ENCODE_GETC(c) \ 1129 | { \ 1130 | last_last_c=last_c; \ 1131 | last_c=last_c_bak; \ 1132 | \ 1133 | c=getc(XWRT_file); \ 1134 | filelento++;\ 1135 | last_c_bak=c; \ 1136 | } 1137 | 1138 | 1139 | // encode word (should be lower case) using n-gram array (when word doesn't exist in the dictionary) 1140 | inline void XWRT_Encoder::encodeAsText(unsigned char* &s,int &s_size,EWordType wordType){ 1141 | int i=0; 1142 | for (i=0; i=bound4){ 1154 | first-=bound4; 1155 | fourth=first/dict123size; 
1156 | first=first%dict123size; 1157 | third=first/dict12size; 1158 | first=first%dict12size; 1159 | second=first/dict1size; 1160 | first=first%dict1size; 1161 | ENCODE_PUTC(sym2codeword[dict1plus2plus3+fourth]); 1162 | PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1plus2plus3+fourth])); 1163 | ENCODE_PUTC(sym2codeword[dict1plus2+third]); 1164 | PRINT_CODEWORDS(("2nd=%d ",sym2codeword[dict1plus2+third])); 1165 | ENCODE_PUTC(sym2codeword[dict1size+second]); 1166 | PRINT_CODEWORDS(("3rd=%d ",sym2codeword[dict1size+second])); 1167 | ENCODE_PUTC(sym2codeword[first]); 1168 | PRINT_CODEWORDS(("4th=%d ",sym2codeword[first])); 1169 | } 1170 | else 1171 | if (first>=bound3){ 1172 | first-=bound3; 1173 | third=first/dict12size; 1174 | first=first%dict12size; 1175 | second=first/dict1size; 1176 | first=first%dict1size; 1177 | ENCODE_PUTC(sym2codeword[dict1plus2+third]); 1178 | PRINT_CODEWORDS(("1st=%d(%d) ",sym2codeword[dict1plus2+third],third)); 1179 | ENCODE_PUTC(sym2codeword[dict1size+second]); 1180 | PRINT_CODEWORDS(("2nd=%d(%d) ",sym2codeword[dict1size+second],second)); 1181 | ENCODE_PUTC(sym2codeword[first]); 1182 | PRINT_CODEWORDS(("3rd=%d(%d) ",sym2codeword[first],first)); 1183 | } 1184 | else 1185 | if (first>=dict1size){ 1186 | first-=dict1size; 1187 | second=first/dict1size; 1188 | first=first%dict1size; 1189 | ENCODE_PUTC(sym2codeword[dict1size+second]); 1190 | PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1size+second])); 1191 | 1192 | ENCODE_PUTC(sym2codeword[first]); 1193 | PRINT_CODEWORDS(("2nd=%d ",sym2codeword[first])); 1194 | } 1195 | else{ 1196 | ENCODE_PUTC(sym2codeword[first]); 1197 | PRINT_CODEWORDS(("1st=%d ",sym2codeword[first])); 1198 | } 1199 | 1200 | } 1201 | 1202 | inline void XWRT_Encoder::encodeSpaces(){ 1203 | if (spaces==1){ 1204 | ENCODE_PUTC(' '); 1205 | } 1206 | else 1207 | if (spaces>0){ 1208 | while (spaces>0){ 1209 | int sp=spaces; 1210 | if (spaces>=256) 1211 | sp=255; 1212 | 1213 | while (sp>0 && spacesCodeword[sp]==0) sp--; 1214 | if 
(spacesCodeword[sp]) { 1215 | encodeCodeWord(spacesCodeword[sp]); 1216 | spaces-=sp; 1217 | } 1218 | else{ 1219 | ENCODE_PUTC(' '); 1220 | spaces--; 1221 | } 1222 | } 1223 | } 1224 | spaces=0; 1225 | } 1226 | // make hash from string 1227 | inline void XWRT_Encoder::stringHash(const unsigned char *ptr, int len,int& hash){ 1228 | for (hash = 0; len>0; len--, ptr++){ 1229 | hash *= HASH_MULT; 1230 | hash += *ptr; 1231 | } 1232 | hash=hash&(HASH_TABLE_SIZE-1); 1233 | } 1234 | // check if word "s" does exist in the dictionary 1235 | inline void XWRT_Encoder::checkHashExactly(unsigned char* &s,int &s_size,int& i){ 1236 | int h; 1237 | stringHash(s,s_size,h); 1238 | i=word_hash[h]; 1239 | if (i>0){ 1240 | if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0){ 1241 | i=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)]; 1242 | if (i>0){ 1243 | if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0){ 1244 | i=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)]; 1245 | if (i>0){ 1246 | if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0) 1247 | i=-1; 1248 | } 1249 | else 1250 | i=-1; 1251 | } 1252 | } 1253 | else 1254 | i=-1; 1255 | } 1256 | } 1257 | else 1258 | i=-1; 1259 | if (i>=dictionary) 1260 | i=-1; 1261 | } 1262 | // check if word "s" (prefix of original word) does exist in the dictionary using hash "h" 1263 | inline int XWRT_Encoder::checkHash(unsigned char* &s,int &s_size,int h){ 1264 | int i=word_hash[h]; 1265 | if (i>0){ 1266 | if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0){ 1267 | i=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)]; 1268 | if (i>0){ 1269 | if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0){ 1270 | i=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)]; 1271 | if (i>0){ 1272 | if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0) 1273 | i=-1; 1274 | } 1275 | else 1276 | i=-1; 1277 | } 1278 | } 1279 | else 1280 | i=-1; 1281 | } 1282 | } 1283 | else 1284 | i=-1; 1285 | if 
(i>=dictionary) 1286 | i=-1; 1287 | return i; 1288 | } 1289 | // check if word "s" or prefix of word "s" does exist in the dictionary using hash "h" 1290 | inline int XWRT_Encoder::findShorterWord(unsigned char* &s,int &s_size){ 1291 | int ret; 1292 | int i; 1293 | int best; 1294 | unsigned int hash; 1295 | hash = 0; 1296 | for (i=0; i=0) 1303 | best=ret; 1304 | hash = HASH_MULT*hash + s[i]; 1305 | } 1306 | return best; 1307 | } 1308 | // convert lower string to upper 1309 | inline void XWRT_Encoder::toUpper(unsigned char* s,int &s_size){ 1310 | for (int i=0; i=s_size) 1338 | break; 1339 | 1340 | start=ptr; 1341 | do 1342 | { 1343 | c=s[ptr++]; 1344 | letterType=letterSet[c]; 1345 | } 1346 | while (ptr0 && wordType!=3){ 1361 | //s[s_size+1]=0; 1362 | //printf("%s %d %d\n",s,s_size,wordType); 1363 | toLower(s,s_size); 1364 | } 1365 | if ((s_size<3 && s[0]>='0' && s[0]<='9') && wrtnum==1) return; //??? 1366 | // if (s_size==4 && ((s[0]=='1' && s[1]=='9') || (s[0]=='2' && s[1]=='0'))) return; 1367 | 1368 | checkWord(s,s_size,c); 1369 | return; 1370 | } 1371 | if (s_size<1){ 1372 | encodeSpaces(); 1373 | return; 1374 | } 1375 | int i=-1; 1376 | int size=0; 1377 | int flagToEncode=-1; 1378 | // bool justAdded=false; 1379 | 1380 | if (s_size>=WORD_MIN_SIZE){ 1381 | checkHashExactly(s,s_size,i); 1382 | PRINT_CODEWORDS(("i=%d/%d %s(%d)\n",i,sizeDynDict,s,s_size)); 1383 | 1384 | if (i>=0)// && codeWordSize(i)<=s_size) 1385 | wordType=LOWERWORD; 1386 | 1387 | if (i<0){ 1388 | if (wordType==FIRSTUPPER || wordType==UPPERWORD){ 1389 | if (wordType==FIRSTUPPER){ 1390 | flagToEncode=CHAR_FIRSTUPPER; 1391 | s[0]=tolower(s[0]); 1392 | } 1393 | else // wordType==UPPERWORD 1394 | { 1395 | flagToEncode=CHAR_UPPERWORD; 1396 | toLower(s,s_size); 1397 | } 1398 | checkHashExactly(s,s_size,i); 1399 | PRINT_CODEWORDS(("checkHashExactly i=%d %d=%s\n",i,s_size,s)); 1400 | } 1401 | 1402 | 1403 | if (i<0 ){ 1404 | // try to find shorter version of word in dictionary 1405 | 
i=findShorterWord(s,s_size); 1406 | PRINT_CODEWORDS(("findShorterWord i=%d\n",i)); 1407 | //s[s_size+1]=0; 1408 | //if (i>0 ) printf("findShorterWord i=%d %s\n",i, s); 1409 | if (i>=0){ 1410 | size=dictlen[i]; 1411 | if (wordType==UPPERWORD){ 1412 | int ss=s_size-size; 1413 | toUpper(s+size,ss); 1414 | } 1415 | } 1416 | } 1417 | } 1418 | } 1419 | if (i>=0){ 1420 | encodeSpaces(); 1421 | if (wordType==FIRSTUPPER || wordType==UPPERWORD){ 1422 | ENCODE_PUTC(flagToEncode); 1423 | } 1424 | encodeCodeWord(i); 1425 | if (size>0){ 1426 | if (wordType==FIRSTUPPER) 1427 | wordType=LOWERWORD; 1428 | unsigned char* s2=s+size; 1429 | int s_size2=s_size-size; 1430 | i=-1; 1431 | if (s_size2>(tryShorterBound+1)){ //try remainig word 1432 | // try to find shorter version of word in dictionary 1433 | i=findShorterWord(s2,s_size2); 1434 | PRINT_CODEWORDS(("findShorterWord i=%d\n",i)); 1435 | } 1436 | if (i>=0 && wordType!=UPPERWORD){ 1437 | size=dictlen[i]; 1438 | //encodeSpaces(); 1439 | encodeCodeWord(i); 1440 | s2=s2+size; 1441 | s_size2=s_size2-size; 1442 | if (s_size2>0) encodeAsText(s2,s_size2,wordType); 1443 | } 1444 | else encodeAsText(s2,s_size2,wordType); 1445 | } 1446 | } 1447 | else 1448 | { 1449 | if (wordType==FIRSTUPPER) 1450 | s[0]=toupper(s[0]); 1451 | else if (wordType==UPPERWORD) 1452 | toUpper(s,s_size); 1453 | encodeSpaces(); 1454 | encodeAsText(s,s_size,wordType); 1455 | } 1456 | return; 1457 | } 1458 | // process the file 1459 | void XWRT_Encoder::WRT_encode(int filelen){ 1460 | unsigned char s[STRING_MAX_SIZE]; 1461 | EWordType wordType; 1462 | int c; 1463 | spaces=0; 1464 | s_size=0; 1465 | last_c=0; 1466 | filelento=-1; 1467 | wordType=LOWERWORD; 1468 | ENCODE_GETC(c); 1469 | while (true) 1470 | { 1471 | if (filelento==filelen) 1472 | break; 1473 | PRINT_CHARS(("c=%c (%d) last=%c \n",c,c,last_c)); 1474 | 1475 | if (detect){ 1476 | letterType=letterSet[c]; 1477 | } 1478 | else 1479 | { 1480 | if (c==13){ 1481 | encodeWord(s,s_size,wordType,c); 1482 | 
s_size=0; 1483 | ENCODE_GETC(c); 1484 | if (addSymbols[13]) 1485 | ENCODE_PUTC(CHAR_ESCAPE); 1486 | ENCODE_PUTC(13); 1487 | continue; 1488 | } 1489 | 1490 | letterType=letterSet[c]; 1491 | 1492 | if (letterType==RESERVEDCHAR){ 1493 | PRINT_CHARS(("reservedSet[c] c=%d (%c)\n",c,c)); 1494 | 1495 | encodeWord(s,s_size,wordType,c); 1496 | s_size=0; 1497 | 1498 | PRINT_CHARS(("out CHAR_ESCAPE=%d\n",CHAR_ESCAPE)); 1499 | ENCODE_PUTC(CHAR_ESCAPE); 1500 | ENCODE_PUTC(c); 1501 | 1502 | ENCODE_GETC(c); 1503 | continue; 1504 | } 1505 | 1506 | 1507 | if (letterType==NUMBERCHAR && wrtnum==0){ 1508 | encodeWord(s,s_size,wordType,c); 1509 | s_size=0; 1510 | ENCODE_PUTC(c); 1511 | ENCODE_GETC(c); 1512 | // wordType=LOWERWORD; 1513 | continue; 1514 | } 1515 | 1516 | } 1517 | if (wordSet[c]){ 1518 | if (c!=' '){ 1519 | if (s_size==0){ 1520 | if (last_c!=' ') 1521 | beforeWord=last_c; 1522 | else 1523 | beforeWord=last_last_c; 1524 | if (letterType==LOWERCHAR) 1525 | wordType=LOWERWORD; 1526 | else 1527 | if (letterType==UPPERCHAR) 1528 | wordType=FIRSTUPPER; 1529 | else 1530 | wordType=VARWORD; 1531 | } 1532 | else 1533 | { 1534 | switch (wordType) 1535 | { 1536 | case LOWERWORD: 1537 | if (letterType!=LOWERCHAR) 1538 | wordType=VARWORD; 1539 | break; 1540 | case UPPERWORD: 1541 | if (letterType!=UPPERCHAR) 1542 | wordType=VARWORD; 1543 | break; 1544 | case FIRSTUPPER: 1545 | if (letterType!=LOWERCHAR) 1546 | { 1547 | if (s_size==1 && letterType==UPPERCHAR) 1548 | wordType=UPPERWORD; 1549 | else 1550 | wordType=VARWORD; 1551 | } 1552 | break; 1553 | } 1554 | } 1555 | } 1556 | else 1557 | { 1558 | encodeWord(s,s_size,wordType,c); 1559 | s_size=0; 1560 | spaces++; 1561 | while (true){ 1562 | ENCODE_GETC(c); 1563 | if (c!=' ') 1564 | break; 1565 | spaces++; 1566 | } 1567 | continue; 1568 | } 1569 | //detect words like and split. 
HiTerraMonda 1570 | if(s_size>2 && letterType==UPPERCHAR && letterSet[last_c]==LOWERCHAR){ 1571 | if (s_size>2 && wordType==VARWORD){ 1572 | if (letterSet[s[0]]==UPPERCHAR){ 1573 | wordType=FIRSTUPPER; 1574 | for(int i=1;i<=s_size;i++){ 1575 | if (letterSet[s[i]]==UPPERCHAR){ 1576 | wordType=VARWORD; 1577 | break; 1578 | } 1579 | } 1580 | } 1581 | } 1582 | encodeWord(s,s_size,wordType,c); 1583 | //s[s_size++]=0; 1584 | //printf("%s %d %d\n",s,s_size,wordType); 1585 | s_size=0; 1586 | 1587 | continue; 1588 | } 1589 | s[s_size++]=c; 1590 | if (s_size>=STRING_MAX_SIZE-2){ 1591 | encodeWord(s,s_size,wordType,c); 1592 | s_size=0; 1593 | } 1594 | ENCODE_GETC(c); 1595 | continue; 1596 | } 1597 | encodeWord(s,s_size,wordType,c); 1598 | s_size=0; 1599 | ENCODE_PUTC(c); 1600 | ENCODE_GETC(c); 1601 | } 1602 | encodeWord(s,s_size,wordType,c); 1603 | s_size=0; 1604 | } 1605 | inline int common(const char* offset1,const char* offset2, int bound){ 1606 | int lp=0; 1607 | while (offset1[lp]==offset2[lp] && lp0) 1625 | // cmn=common(sortedDict[i-1].c_str(),sortedDict[i].c_str(),min(sortedDict[i].size(),sortedDict[i-1].size())); 1626 | if ((cmn>0 || (unsigned char)(sortedDict[i][0])>=128)) 1627 | bufferData+=sprintf((char*)bufferData,"%c%s\n",128+cmn,sortedDict[i].c_str()+cmn); 1628 | else 1629 | bufferData+=sprintf((char*)bufferData,"%s\n",sortedDict[i].c_str()); 1630 | if (bufferData>bound) 1631 | break; 1632 | } 1633 | sortedDictSize=(int)i; // i<=count 1634 | PRINT_DICT(("sortedDictCount=%d\n",sortedDictSize)); 1635 | count_header[0]=sortedDictSize%256; 1636 | count_header[1]=(sortedDictSize/256)%256; 1637 | count_header[2]=sortedDictSize/65536; 1638 | count=(int)(bufferData-(writeBuffer+3)); 1639 | PRINT_DICT(("write_dict count=%d\n",count)); 1640 | PUTC(count>>16); 1641 | PUTC(count>>8); 1642 | PUTC(count); 1643 | fwrite_fast((unsigned char*)writeBuffer+3,count,XWRT_fileout); 1644 | } 1645 | 1646 | void XWRT_Encoder::WRT_start_encoding(FILE* in, FILE* out,unsigned int 
fileLen,bool type_detected){ 1647 | collision=0; 1648 | XWRT_file=in; 1649 | XWRT_fileout=out; 1650 | 1651 | fileLenMB=fileLen/(1024*1024); 1652 | if (fileLenMB>255*256) 1653 | fileLenMB=255*256; 1654 | 1655 | cont.prepareMemBuffers(); 1656 | cont.memout->memsize=0; 1657 | //if (fileLenMB>64) minWordFreq-=4,tryShorterBound=2; 1658 | 1659 | /* if (fileLenMB>64) minWordFreq-=16; 1660 | if (fileLenMB<16) 1661 | minWordFreq-=7; 1662 | //if (fileLenMB<6) 1663 | // minWordFreq=minWordFreq+15;*/ 1664 | if (fileLenMB<1) 1665 | minWordFreq=minWordFreq*6; 1666 | //if (fileLenMB<1) minWordFreq=9,tryShorterBound=4; 1667 | //if (fileLen<256*1024) minWordFreq=7,tryShorterBound=3; 1668 | int pos=ftell(XWRT_file); 1669 | if (!type_detected) 1670 | WRT_detectFileType(fileLen); 1671 | 1672 | fseek(XWRT_file, pos, SEEK_SET ); 1673 | 1674 | PUTC(maxMemSize/(1024*1024)); 1675 | PUTC(fileLen&0xFF); 1676 | PUTC((fileLen>>8)&0xFF); 1677 | PUTC((fileLen>>16)&0xFF); 1678 | PUTC((fileLen>>24)&0xFF); 1679 | 1680 | PRINT_DICT(("maxMemSize=%d fileLenMB=%d\n",maxMemSize,fileLenMB)); 1681 | write_dict(); // przed initialize() 1682 | decoding=false; 1683 | WRT_deinitialize(); 1684 | if (!initialize(true)) 1685 | return; 1686 | WRT_encode(fileLen); 1687 | cont.writeMemBuffers(preprocFlag); 1688 | cont.freeMemBuffers(true); 1689 | } 1690 | 1691 | inline void XWRT_Encoder::checkWord(unsigned char* &s,int &s_size,int& c){ 1692 | if (s_size<1){ 1693 | spaces=0; 1694 | return; 1695 | } 1696 | if (s_size>WORD_MAX_SIZE) 1697 | s_size=WORD_MAX_SIZE; 1698 | 1699 | spaces=0; 1700 | if (s_sizedictbound){ 1708 | if (firstWarn){ 1709 | //printf("warning: dictionary too big\n"); //-b option 1710 | firstWarn=false; 1711 | } 1712 | return; 1713 | } 1714 | memcpy(dynmem,s,s_size); 1715 | if (addWord(dynmem,s_size)==1){ 1716 | dynmem+=(s_size/4+1)*4; 1717 | dictfreq[sizeDict-1]=1; 1718 | } 1719 | } 1720 | else 1721 | { 1722 | dictfreq[i]++; 1723 | } 1724 | } 1725 | int XWRT_Encoder::WRT_detectFileType(int filelen){ 
1726 | detect=true; 1727 | //memset(value,0,sizeof(value)); 1728 | memset(addSymbols,0,sizeof(addSymbols)); 1729 | memset(reservedSet,0,sizeof(reservedSet)); 1730 | memset(spacesCont,0,sizeof(spacesCont)); 1731 | spaces=0; 1732 | firstWarn=true; 1733 | sizeDict=1; 1734 | PRINT_DICT(("maxDynDictBuf=%d maxMemSize=%d\n",maxDynDictBuf,maxMemSize)); 1735 | dictionary=maxDynDictBuf*(MAX_DYNAMIC_DICT_COUNT/256); // 512k, dblp=372k 1736 | dictmem=(unsigned char*)calloc(dictionary*WORD_AVG_SIZE,1); 1737 | dictbound=dictmem+dictionary*WORD_AVG_SIZE-WORD_MAX_SIZE; 1738 | dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1); 1739 | dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1); 1740 | dictfreq=(int*)calloc(sizeof(int)*(dictionary+1),1); 1741 | memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0])); 1742 | dynmem=dictmem; 1743 | PRINT_DICT(("maxDict=%d allocatedMemory=%d hashTable=%d\n",dictionary,dictionary*WORD_AVG_SIZE+sizeof(unsigned char*)*(dictionary+1)+sizeof(unsigned char)*(dictionary+1)+sizeof(int)*(dictionary+1),HASH_TABLE_SIZE*sizeof(word_hash[0]))); 1744 | if (dictmem && dict && dictlen && dictfreq){ 1745 | initializeLetterSet(); 1746 | WRT_encode(filelen); 1747 | WRT_detectFinish(); 1748 | } 1749 | WRT_deinitialize(); 1750 | if (collision>0) 1751 | PRINT_DICT(("warning: hash collisions=%d\n",collision)); 1752 | detect=false; 1753 | return preprocFlag; 1754 | } 1755 | int compare_str( const void *arg1, const void *arg2 ){ 1756 | int a=*(int*)arg1; 1757 | int b=*(int*)arg2; 1758 | return strcmp((char*)dict[a],(char*)dict[b]); 1759 | } 1760 | int compare_str_rev( const void *arg1, const void *arg2 ){ 1761 | int a=*(int*)arg1; 1762 | int b=*(int*)arg2; 1763 | int minv=min(dictlen[a],dictlen[b]); 1764 | for (int i=1; i<=minv; i++){ 1765 | if (dict[a][dictlen[a]-i]!=dict[b][dictlen[b]-i]) 1766 | return dict[a][dictlen[a]-i] - dict[b][dictlen[b]-i]; 1767 | } 1768 | return dictlen[a] - dictlen[b]; 1769 | } 1770 | int 
compare_freq( const void *arg1, const void *arg2 ){ 1771 | int a=*(int*)arg1; 1772 | int b=*(int*)arg2; 1773 | return dictfreq[b]-dictfreq[a]; 1774 | } 1775 | void XWRT_Encoder::sortDict(int size){ 1776 | int i,add; 1777 | size--; 1778 | if (size<20) 1779 | return; 1780 | initializeCodeWords(size,false); 1781 | add=0; 1782 | dict1size-=add; 1783 | bound3-=add; 1784 | bound4-=add; 1785 | int* inttable=new int[size]; 1786 | if (!inttable) 1787 | OUT_OF_MEMORY(); 1788 | for (i=0; idict1size) 1794 | qsort(&inttable[dict1size],min(size,bound3)-dict1size,sizeof(inttable[0]),compare_str);//compare_str 1795 | if (size>bound3) 1796 | qsort(&inttable[bound3],min(size,bound4)-bound3,sizeof(inttable[0]),compare_str);//compare_str 1797 | if (size>bound4) 1798 | qsort(&inttable[bound4],size-bound4,sizeof(inttable[0]),compare_str);//compare_str 1799 | 1800 | for (i=0; i=minWordFreq || (num>=minWordFreq2 && (dictlen[i]>=7))) 1819 | ; 1820 | else 1821 | dictfreq[i]=0; 1822 | } 1823 | for (i=1, j=sizeDict-2; i0) 1825 | continue; 1826 | while (j>0 && dictfreq[j]==0) j--; 1827 | if (i>j) 1828 | break; 1829 | dict[i]=dict[j]; 1830 | dictlen[i]=dictlen[j]; 1831 | dictfreq[i]=dictfreq[j]; 1832 | dictfreq[j--]=0; 1833 | } 1834 | sizeDict=i; 1835 | if (sizeDict>maxDictSize) 1836 | sizeDict=maxDictSize; 1837 | PRINT_DICT(("reduced to %d words (freq>=%d)\n",sizeDict,minWordFreq)); 1838 | sortDict(sizeDict); 1839 | } 1840 | --------------------------------------------------------------------------------