├── LICENSE
├── README.md
└── wrtpre.cpp


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2008 Matt Mahoney, Serge Osnach, Alexander Ratushnyak,
 2 | Bill Pettis, Przemyslaw Skibinski, Matthew Fite, wowtiger, Andrew Paterson,
 3 | Jan Ondrus, Andreas Morphis, Pavel L. Holoborodko, KZ., Simon Berger,
 4 | Neill Corlett
 5 | 
 6 | LICENSE
 7 | 
 8 | This program is free software; you can redistribute it and/or
 9 | modify it under the terms of the GNU General Public License as
10 | published by the Free Software Foundation; either version 2 of
11 | the License, or (at your option) any later version.
12 | 
13 | This program is distributed in the hope that it will be useful, but
14 | WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 | General Public License for more details at
17 | Visit <http://www.gnu.org/copyleft/gpl.html>.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | paq8pxd file compressor/archiver.  Release by Kaido Orav, Aug. 14, 2013
  2 | 
  3 | # COMMAND LINE INTERFACE
  4 | 
  5 | - To install, put paq8pxd.exe somewhere in your PATH.
  6 | - To compress:      paq8pxd [-N] file1 [file2...]
  7 | - To decompress:    paq8pxd [-d] file1.paq8pxd [dir2]
  8 | - To view contents: more < file1.paq8pxd
  9 | 
 10 | The compressed output file is named by adding ".paq8pxd" extension to
 11 | the first named file (file1.paq8pxd).  Each file that exists will be
 12 | added to the archive and its name will be stored without a path.
 13 | The option -N specifies a compression level ranging from -0
 14 | (fastest) to -8 (smallest).  The default is -5.  If there is
 15 | no option and only one file, then the program will pause when
 16 | finished until you press the ENTER key (to support drag and drop).
 17 | If file1.paq8pxd exists then it is overwritten.
 18 | 
 19 | If the first named file ends in ".paq8pxd" then it is assumed to be
 20 | an archive and the files within are extracted to the same directory
 21 | as the archive unless a different directory (dir2) is specified.
 22 | The -d option forces extraction even if there is not a ".paq8pxd"
 23 | extension.  If any output file already exists, then it is compared
 24 | with the archive content and the first byte that differs is reported.
 25 | No files are overwritten or deleted.  If there is only one argument
 26 | (no -d or dir2) then the program will pause when finished until
 27 | you press ENTER.
 28 | 
 29 | For compression, if any named file is actually a directory, then all
 30 | files and subdirectories are compressed, preserving the directory
 31 | structure, except that empty directories are not stored, and file
 32 | attributes (timestamps, permissions, etc.) are not preserved.
 33 | During extraction, directories are created as needed.  For example:
 34 | 
 35 |     paq8pxd -4 c:\tmp\foo bar
 36 | 
 37 | compresses foo and bar (if they exist) to c:\tmp\foo.paq8pxd at level 4.
 38 | 
 39 |     paq8pxd -d c:\tmp\foo.paq8pxd .
 40 | 
 41 | extracts foo and compares bar in the current directory.  If foo and bar
 42 | are directories then their contents are extracted/compared.
 43 | 
 44 | There are no commands to update an existing archive or to extract
 45 | part of an archive.  Files and archives larger than 2GB are not
 46 | supported (but might work on 64-bit machines, not tested).
 47 | File names with nonprintable characters are not supported (spaces
 48 | are OK).
 49 | 
 50 | 
 51 | # TO COMPILE
 52 | 
 53 | There are 2 files: paq8pxd.cpp (C++) and paq7asm.asm (NASM/YASM).
 54 | paq7asm.asm is the same as in paq7 and paq8x.  paq8pxd.cpp recognizes the
 55 | following compiler options:
 56 | 
 57 | *  -DWINDOWS           (to compile in Windows)
 58 | *  -DUNIX              (to compile in Unix, Linux, Solaris, MacOS/Darwin, etc)
 59 | *  -DNOASM             (to replace paq7asm.asm with equivalent C++)
 60 | *  -DDEFAULT_OPTION=N  (to change the default compression level from 5 to N).
 61 | 
 62 | If you compile without -DWINDOWS or -DUNIX, you can still compress files,
 63 | but you cannot compress directories or create them during extraction.
 64 | You can extract directories if you manually create the empty directories
 65 | first.
 66 | 
 67 | Use -DDEFAULT_OPTION=N to change the default compression level to support
 68 | drag and drop on machines with less than 256 MB of memory.  Use
 69 | -DDEFAULT_OPTION=4 for 128 MB, 3 for 64 MB, 2 for 32 MB, etc.
 70 | 
 71 | Use -DNOASM for non x86-32 machines, or older than a Pentium-MMX (about
 72 | 1997), or if you don't have NASM or YASM to assemble paq7asm.asm.  The
 73 | program will still work but it will be slower.  For NASM in Windows,
 74 | use the options "--prefix _" and either "-f win32" or "-f obj" depending
 75 | on your C++ compiler.  In Linux, use "-f elf".
 76 | 
 77 | Recommended compiler commands and optimizations:
 78 | 
 79 | UNIX/Linux (PC):
 80 |     g++ paq8pxd.cpp -DUNIX -O3
 81 | 
 82 | MinGW produces faster executables than Borland or Mars, but Intel 9
 83 | is about 4% faster than MinGW.
 84 | 
 85 | 
 86 | # ARCHIVE FILE FORMAT
 87 | 
 88 | An archive has the following format.  It is intended to be both
 89 | human and machine readable.  The header ends with CTRL-Z (Windows EOF)
 90 | so that the binary compressed data is not displayed on the screen.
 91 | 
 92 |     paq8pxd -N CR LF
 93 |     size TAB filename CR LF
 94 |     size TAB filename CR LF
 95 |     ...
 96 |     CTRL-Z
 97 |     compressed binary data
 98 | 
 99 | -N is the option (-0 to -9), even if a default was used.
100 | Plain file names are stored without a path.  Files in compressed
101 | directories are stored with path relative to the compressed directory
102 | (using UNIX style forward slashes "/").  For example, given these files:
103 | 
104 |     123 C:\dir1\file1.txt
105 |     456 C:\dir2\file2.txt
106 | 
107 | Then
108 | 
109 |     paq8pxd archive \dir1\file1.txt \dir2
110 | 
111 | will create archive.paq8pxd with the header:
112 | 
113 |     paq8pxd -5
114 |     123     file1.txt
115 |     456     dir2/file2.txt
116 | 
117 | The command:
118 | 
119 |     paq8pxd archive.paq8pxd C:\dir3
120 | 
121 | will create the files:
122 | 
123 |     C:\dir3\file1.txt
124 |     C:\dir3\dir2\file2.txt
125 | 
126 | Decompression will fail if the first 9 bytes are not "paq8pxd -".  Sizes
127 | are stored as decimal numbers.  CR, LF, TAB, CTRL-Z are ASCII codes
128 | 13, 10, 9, 26 respectively.
129 | 
130 | 
131 | # ARITHMETIC CODING
132 | 
133 | The binary data is arithmetic coded as the shortest base 256 fixed point
134 | number x = SUM_i x_i 256^-1-i such that p(<y) <= x < p(<=y), where y is the
135 | input string, x_i is the i'th coded byte, p(<y) (and p(<=y)) means the
136 | probability that a string is lexicographically less than (less than
137 | or equal to) y according to the model, _ denotes subscript, and ^ denotes
138 | exponentiation.
139 | 
140 | The model p(y) for y is a conditional bit stream,
141 | p(y) = PROD_j p(y_j | y_0..j-1) where y_0..j-1 denotes the first j
142 | bits of y, and y_j is the next bit.  Compression depends almost entirely
143 | on the ability to predict the next bit accurately.
144 | 
145 | 
146 | # MODEL MIXING
147 | 
148 | paq8pxd uses a neural network to combine a large number of models.  The
149 | i'th model independently predicts
150 | p1_i = p(y_j = 1 | y_0..j-1), p0_i = 1 - p1_i.
151 | The network computes the next bit probability
152 | 
153 |     p1 = squash(SUM_i w_i t_i), p0 = 1 - p1                        (1)
154 | 
155 | where t_i = stretch(p1_i) is the i'th input, p1_i is the prediction of
156 | the i'th model, p1 is the output prediction, stretch(p) = ln(p/(1-p)),
157 | and squash(s) = 1/(1+exp(-s)).  Note that squash() and stretch() are
158 | inverses of each other.
159 | 
160 | After bit y_j (0 or 1) is received, the network is trained:
161 | 
162 |     w_i := w_i + eta t_i (y_j - p1)                                (2)
163 | 
164 | where eta is an ad-hoc learning rate, t_i is the i'th input, (y_j - p1)
165 | is the prediction error for the j'th input bit, and w_i is the i'th
166 | weight.  Note that this differs from back propagation:
167 | 
168 |     w_i := w_i + eta t_i (y_j - p1) p0 p1                          (3)
169 | 
170 | which is a gradient descent in weight space to minimize root mean square
171 | error.  Rather, the goal in compression is to minimize coding cost,
172 | which is -log(p1) if y = 1 or -log(p0) if y = 0.  Taking
173 | the partial derivative of cost with respect to w_i yields (2).
174 | 
175 | 
176 | # MODELS
177 | 
178 | Most models are context models.  A function of the context (last few
179 | bytes) is mapped by a lookup table or hash table to a state which depends
180 | on the bit history (prior sequence of 0 and 1 bits seen in this context).
181 | The bit history is then mapped to p1_i by a fixed or adaptive function.
182 | There are several types of bit history states:
183 | 
184 | - Run Map. The state is (b,n) where b is the last bit seen (0 or 1) and
185 |   n is the number of consecutive times this value was seen.  The initial
186 |   state is (0,0).  The output is computed directly:
187 | 
188 |     t_i = (2b - 1)K log(n + 1).
189 | 
190 |   where K is ad-hoc, around 4 to 10.  When bit y_j is seen, the state
191 |   is updated:
192 | 
193 |     (b,n) := (b,n+1) if y_j = b, else (y_j,1).
194 | 
195 | - Stationary Map.  The state is p, initially 1/2.  The output is
196 |   t_i = stretch(p).  The state is updated at ad-hoc rate K (around 0.01):
197 | 
198 |     p := p + K(y_j - p)
199 | 
200 | - Nonstationary Map.  This is a compromise between a stationary map, which
201 |   assumes uniform statistics, and a run map, which adapts quickly by
202 |   discarding old statistics.  An 8 bit state represents (n0,n1,h), initially
203 |   (0,0,0) where:
204 | 
205 |     n0 is the number of 0 bits seen "recently".
206 |     n1 is the number of 1 bits seen "recently".
207 |     n = n0 + n1.
208 |     h is the full bit history for 0 <= n <= 4,
209 |       the last bit seen (0 or 1) if 5 <= n <= 15,
210 |       0 for n >= 16.
211 | 
212 |   The primary output is t_i := stretch(sm(n0,n1,h)), where sm(.) is
213 |   a stationary map with K = 1/256, initialized to
214 |   sm(n0,n1,h) = (n1+(1/64))/(n+2/64).  Four additional inputs are also
215 |   computed to improve compression slightly:
216 | 
217 |     p1_i = sm(n0,n1,h)
218 |     p0_i = 1 - p1_i
219 |     t_i   := stretch(p_1)
220 |     t_i+1 := K1 (p1_i - p0_i)
221 |     t_i+2 := K2 stretch(p1) if n0 = 0, -K2 stretch(p1) if n1 = 0, else 0
222 |     t_i+3 := K3 (-p0_i if n1 = 0, p1_i if n0 = 0, else 0)
223 |     t_i+4 := K3 (-p0_i if n0 = 0, p1_i if n1 = 0, else 0)
224 | 
225 |   where K1..K4 are ad-hoc constants.
226 | 
227 |   h is updated as follows:
228 |     If n < 4, append y_j to h.
229 |     Else if n <= 16, set h := y_j.
230 |     Else h = 0.
231 | 
232 |   The update rule is biased toward newer data in a way that allows
233 |   n0 or n1, but not both, to grow large by discarding counts of the
234 |   opposite bit.  Large counts are incremented probabilistically.
235 |   Specifically, when y_j = 0 then the update rule is:
236 | 
237 |     n0 := n0 + 1, n < 29
238 |           n0 + 1 with probability 2^(27-n0)/2 else n0, 29 <= n0 < 41
239 |           n0, n = 41.
240 |     n1 := n1, n1 <= 5
241 |           round(8/3 lg n1), if n1 > 5
242 | 
243 |   swapping (n0,n1) when y_j = 1.
244 | 
245 |   Furthermore, to allow an 8 bit representation for (n0,n1,h), states
246 |   exceeding the following values of n0 or n1 are replaced with the
247 |   state with the closest ratio n0:n1 obtained by decrementing the
248 |   smaller count: (41,0,h), (40,1,h), (12,2,h), (5,3,h), (4,4,h),
249 |   (3,5,h), (2,12,h), (1,40,h), (0,41,h).  For example:
250 |   (12,2,1) 0-> (7,1,0) because there is no state (13,2,0).
251 | 
252 | - Match Model.  The state is (c,b), initially (0,0), where c is 1 if
253 |   the context was previously seen, else 0, and b is the next bit in
254 |   this context.  The prediction is:
255 | 
256 |     t_i := (2b - 1)Kc log(m + 1)
257 | 
258 |   where m is the length of the context.  The update rule is c := 1,
259 |   b := y_j.  A match model can be implemented efficiently by storing
260 |   input in a buffer and storing pointers into the buffer into a hash
261 |   table indexed by context.  Then c is indicated by a hash table entry
262 |   and b can be retrieved from the buffer.
263 | 
264 | 
265 | # CONTEXTS
266 | 
267 | High compression is achieved by combining a large number of contexts.
268 | Most (not all) contexts start on a byte boundary and end on the bit
269 | immediately preceding the predicted bit.  The contexts below are
270 | modeled with both a run map and a nonstationary map unless indicated.
271 | 
272 | - Order n.  The last n bytes, up to about 16.  For general purpose data.
273 |   Most of the compression occurs here for orders up to about 6.
274 |   An order 0 context includes only the 0-7 bits of the partially coded
275 |   byte and the number of these bits (255 possible values).
276 | 
277 | - Sparse.  Usually 1 or 2 of the last 8 bytes preceding the byte containing
278 |   the predicted bit, e.g (2), (3),..., (8), (1,3), (1,4), (1,5), (1,6),
279 |   (2,3), (2,4), (3,6), (4,8).  The ordinary order 1 and 2 context, (1)
280 |   or (1,2) are included above.  Useful for binary data.
281 | 
282 | - Text.  Contexts consists of whole words (a-z, converted to lower case
283 |   and skipping other values).  Contexts may be sparse, e.g (0,2) meaning
284 |   the current (partially coded) word and the second word preceding the
285 |   current one.  Useful contexts are (0), (0,1), (0,1,2), (0,2), (0,3),
286 |   (0,4).  The preceding byte may or may not be included as context in the
287 |   current word.
288 | 
289 | - Formatted text.  The column number (determined by the position of
290 |   the last linefeed) is combined with other contexts: the character to
291 |   the left and the character above it.
292 | 
293 | - Fixed record length.  The record length is determined by searching for
294 |   byte sequences with a uniform stride length.  Once this is found, then
295 |   the record length is combined with the context of the bytes immediately
296 |   preceding it and the corresponding byte locations in the previous
297 |   one or two records (as with formatted text).
298 | 
299 | - Context gap.  The distance to the previous occurrence of the order 1
300 |   or order 2 context is combined with other low order (1-2) contexts.
301 | 
302 | - FAX.  For 2-level bitmapped images.  Contexts are the surrounding
303 |   pixels already seen.  Image width is assumed to be 1728 bits (as
304 |   in calgary/pic).
305 | 
306 | - Image.  For uncompressed 24-bit color BMP, TIFF and TGA images.  Contexts
307 |   are the high order bits of the surrounding pixels and linear
308 |   combinations of those pixels, including other color planes.  The
309 |   image width is detected from the file header.  When an image is
310 |   detected, other models are turned off to improve speed.
311 | 
312 | - JPEG.  Files are further compressed by partially uncompressing back
313 |   to the DCT coefficients to provide context for the next Huffman code.
314 |   Only baseline DCT-Huffman coded files are modeled.  (This is about
315 |   90% of images, the others are usually progressive coded).  JPEG images
316 |   embedded in other files (quite common) are detected by headers.  The
317 |   baseline JPEG coding process is:
318 |   - Convert to grayscale and 2 chroma colorspace.
319 |   - Sometimes downsample the chroma images 2:1 or 4:1 in X and/or Y.
320 |   - Divide each of the 3 images into 8x8 blocks.
321 |   - Convert using 2-D discrete cosine transform (DCT) to 64 12-bit signed
322 |     coefficients.
323 |   - Quantize the coefficients by integer division (lossy).
324 |   - Split the image into horizontal slices coded independently, separated
325 |     by restart codes.
326 |   - Scan each block starting with the DC (0,0) coefficient in zigzag order
327 |     to the (7,7) coefficient, interleaving the 3 color components in
328 |     order to scan the whole image left to right starting at the top.
329 |   - Subtract the previous DC component from the current in each color.
330 |   - Code the coefficients using RS codes, where R is a run of R zeros (0-15)
331 |     and S indicates 0-11 bits of a signed value to follow.  (There is a
332 |     special RS code (EOB) to indicate the rest of the 64 coefficients are 0).
333 |   - Huffman code the RS symbol, followed by S literal bits.
334 |   The most useful contexts are the current partially coded Huffman code
335 |   (including S following bits) combined with the coefficient position
336 |   (0-63), color (0-2), and last few RS codes.
337 | 
338 | - Match.  When a context match of 400 bytes or longer is detected,
339 |   the next bit of the match is predicted and other models are turned
340 |   off to improve speed.
341 | 
342 | - Exe.  When a x86 file (.exe, .obj, .dll) is detected, sparse contexts
343 |   with gaps of 1-12 selecting only the prefix, opcode, and the bits
344 |   of the modR/M byte that are relevant to parsing are selected.
345 |   This model is turned off otherwise.
346 | 
347 | - Indirect.  The history of the last 1-3 bytes in the context of the
348 |   last 1-2 bytes is combined with this 1-2 byte context.
349 | 
350 | - DMC. A bitwise n-th order context is built from a state machine using
351 |   DMC, described in http://plg.uwaterloo.ca/~ftp/dmc/dmc.c
352 |   The effect is to extend a single context, one bit at a time and predict
353 |   the next bit based on the history in this context.  The model here differs
354 |   in that two predictors are used.  One is a pair of counts as in the original
355 |   DMC.  The second predictor is a bit history state mapped adaptively to
356 |   a probability as in a Nonstationary Map.
357 | 
358 | # ARCHITECTURE
359 | 
360 | The context models are mixed by several of several hundred neural networks
361 | selected by a low-order context.  The outputs of these networks are
362 | combined using a second neural network, then fed through several stages of
363 | adaptive probability maps (APM) before arithmetic coding.
364 | 
365 | For images, only one neural network is used and its context is fixed.
366 | 
367 | An APM is a stationary map combining a context and an input probability.
368 | The input probability is stretched and divided into 32 segments to
369 | combine with other contexts.  The output is interpolated between two
370 | adjacent quantized values of stretch(p1).  There are 2 APM stages in series:
371 | 
372 |     p1 := (p1 + 3 APM(order 0, p1)) / 4.
373 |     p1 := (APM(order 1, p1) + 2 APM(order 2, p1) + APM(order 3, p1)) / 4.
374 | 
375 | # PREPROCESSING
376 | 
377 | paq8pxd uses preprocessing transforms on certain data types to improve
378 | compression.  To improve reliability, the decoding transform is
379 | tested during compression to ensure that the input file can be
380 | restored.  If the decoder output is not identical to the input file
381 | due to a bug, then the transform is abandoned and the data is compressed
382 | without a transform so that it will still decompress correctly.
383 | 
384 | The input is split into blocks with the format <type> <decoded size> <data>
385 | where <type> is 1 byte (0 = no transform), <decoded size> is the size
386 | of the data after decoding, which may be different than the size of <data>.
387 | Blocks do not span file boundaries, and have a maximum size of 4MB to
388 | 2GB depending on compression level.  Large files are split into blocks
389 | of this size.  The preprocessor has 3 parts:
390 | 
391 | - Detector.  Splits the input into smaller blocks depending on data type.
392 | 
393 | - Coder.  Input is a block to be compressed.  Output is a temporary
394 |   file.  The coder determines whether a transform is to be applied
395 |   based on file type, and if so, which one.  A coder may use lots
396 |   of resources (memory, time) and make multiple passes through the
397 |   input file.  The file type is stored (as one byte) during compression.
398 | 
399 | - Decoder.  Performs the inverse transform of the coder.  It uses few
400 |   resources (fast, low memory) and runs in a single pass (stream oriented).
401 |   It takes input either from a file or the arithmetic decoder.  Each call
402 |   to the decoder returns a single decoded byte.
403 | 
404 | The following transforms are used:
405 | 
406 | - EXE:  CALL (0xE8) and JMP (0xE9) address operands are converted from
407 |   relative to absolute address.  The transform is to replace the sequence
408 |   E8/E9 xx xx xx 00/FF by adding file offset modulo 2^25 (signed range,
409 |   little-endian format).  Data to transform is identified by trying the
410 |   transform and applying a crude compression test: testing whether the
411 |   byte following the E8/E9 (LSB of the address) occurred more recently
412 |   in the transformed data than the original and within 4KB 4 times in
413 |   a row.  The block ends when this does not happen for 4KB.
414 | 
415 | - JPEG: detected by SOI and SOF and ending with EOI or any nondecodable
416 |   data.  No transform is applied.  The purpose is to separate images
417 |   embedded in executables to block the EXE transform, and for a future
418 |   place to insert a transform.
419 | 
420 | 
421 | # IMPLEMENTATION
422 | 
423 | Hash tables are designed to minimize cache misses, which consume most
424 | of the CPU time.
425 | 
426 | Most of the memory is used by the nonstationary context models.
427 | Contexts are represented by 32 bits, possibly a hash.  These are
428 | mapped to a bit history, represented by 1 byte.  The hash table is
429 | organized into 64-byte buckets on cache line boundaries.  Each bucket
430 | contains 7 x 7 bit histories, 7 16-bit checksums, and a 2 element LRU
431 | queue packed into one byte.  Each 7 byte element represents 7 histories
432 | for a context ending on a 3-bit boundary plus 0-2 more bits.  One
433 | element (for bits 0-1, which have 4 unused bytes) also contains a run model
434 | consisting of the last byte seen and a count (as 1 byte each).
435 | 
436 | Run models use 4 byte hash elements consisting of a 2 byte checksum, a
437 | repeat count (0-255) and the byte value.  The count also serves as
438 | a priority.
439 | 
440 | Stationary models are most appropriate for small contexts, so the
441 | context is used as a direct table lookup without hashing.
442 | 
443 | The match model maintains a pointer to the last match until a mismatching
444 | bit is found.  At the start of the next byte, the hash table is referenced
445 | to find another match.  The hash table of pointers is updated after each
446 | whole byte.  There is no checksum.  Collisions are detected by comparing
447 | the current and matched context in a rotating buffer.
448 | 
449 | The inner loops of the neural network prediction (1) and training (2)
450 | algorithms are implemented in MMX assembler, which computes 4 elements
451 | at a time.  Using assembler is 8 times faster than C++ for this code
452 | and 1/3 faster overall.  (However I found that SSE2 code on an AMD-64,
453 | which computes 8 elements at a time, is not any faster).
454 | 
455 | 
456 | # DIFFERENCES FROM PAQ8PXD_V5
457 | 
458 | * changes in wrt, use 0-9 ind dict if count is larger than a-zA-Z
459 | * 8-bit image model changes
460 | * base64 changes
461 | * contextmap from Tangelo
462 | * fixes in DMC model
463 | * cleanup of unused variables
464 | * fixes in wordmodel
465 | etc.
466 | 


--------------------------------------------------------------------------------
/wrtpre.cpp:
--------------------------------------------------------------------------------
   1 | 
   2 | // based on  "XWRT 3.2 (29.10.2007) - XML compressor by P.Skibinski, inikep@gmail.com"
   3 | //
   4 | #include <stdlib.h> 
   5 | #include <memory.h>
   6 | #pragma warning(disable:4786)
   7 | #include <stdio.h>
   8 | #include <vector>
   9 | #include <string>
  10 | #include <map>
  11 | #include <string.h>
  12 | #if defined WIN32 || defined WIN64
  13 | #include <windows.h>
  14 | #include <conio.h>
  15 | #endif
  16 | 
  17 | #define PRINT_CHARS(data) ;//printf data
  18 | #define PRINT_CODEWORDS(data); // printf data
  19 | #define PRINT_STACK(data) ;//printf data;
  20 | #define PRINT_DICT(data) ;//printf data;
  21 | #define PRINT_CONTAINERS(data) ;//printf data
  22 | //#define PRINT_STATUS(data) printf data;
  23 | 
  24 | #pragma warning(disable:4244) //  '=' : conversion from ... to ..., possible loss of data
  25 | #pragma warning(disable:4786) // STL warnings
  26 | #pragma warning(disable:4996) // '_getch' was declared deprecated
  27 | #pragma warning(disable:4503) // STL
  28 | #pragma warning(disable:4390) // empty controlled statement found; is this the intent?
  29 | #pragma warning(disable:4018) // signed/unsigned mismatch
  30 | #define _CRT_SECURE_NO_DEPRECATE // VC++ 2005 deprecate warnings
  31 | 
  32 | 
  33 | #if defined WIN32 || defined WIN64
  34 | #define getch _getch
  35 | #else
  36 | #define getch getchar
  37 | #endif
  38 | 
  39 | #define CHAR_FIRSTUPPER		1	// for encode lower word with first capital letter
  40 | #define CHAR_UPPERWORD		2	// for encode upper word
  41 | #define CHAR_ESCAPE			3	// for encode reserved chars (CHAR_ESCAPE,CHAR_FIRSTUPPER,...)
  42 | #define BINARY_FIRST		128
  43 | #define BINARY_LAST			255
  44 | 
  45 | #define OPTION_TRY_SHORTER_WORD				4
  46 | 
  47 | 
  48 | #if !defined min
  49 | #define min(a,b) (((a)>(b))?(b):(a))
  50 | #endif
  51 | #define IF_OPTION(option) (preprocFlag & option) //, printf("%d",option)
  52 | #define OPTION(option) (xml_wrt.preprocFlag & option)
  53 | #define TURN_OFF(option) {if (preprocFlag & option) preprocFlag-=option;}
  54 | #define TURN_ON(option) {if ((preprocFlag & option)==0) preprocFlag+=option;}
  55 | #define RESET_OPTIONS (preprocFlag=0)
  56 | 
  57 | #define WORD_MIN_SIZE		2
  58 | #define WORD_AVG_SIZE		8
  59 | #define WORD_MAX_SIZE		48
  60 | #define STRING_MAX_SIZE		255  // 1-byte for container.size()
  61 | 
  62 | #define MAX_DYNAMIC_DICT_COUNT	(65536*256)
  63 | #define HASH_TABLE_SIZE			(1<<20) //1MB*4
  64 | 
  65 | //#define BYTES_TO_DETECT			(50*1024)
  66 | 
  67 | //#define NUM_BASE			256
  68 | #define HASH_DOUBLE_MULT	37
  69 | #define HASH_MULT			23
  70 | //#define CHARSET_COUNT		6
  71 | 
  72 | 
  73 | enum EWordType { LOWERWORD, FIRSTUPPER, UPPERWORD, VARWORD, NUMBER};
  74 | enum ELetterType { LOWERCHAR, UPPERCHAR, UNKNOWNCHAR, RESERVEDCHAR, NUMBERCHAR };
  75 | #define OUT_OF_MEMORY() \
  76 | 	{ \
  77 | 		printf("Not enough memory!\n");\
  78 | 		exit(0); \
  79 | 	}
  80 | FILE* XWRT_file;
  81 | FILE* XWRT_fileout;
  82 | unsigned char** dict=NULL;
  83 | int* dictfreq=NULL;
  84 | unsigned char* dictlen=NULL;
  85 | int wrtnum=0;
  86 | #define PUTC(c) { putc(c,XWRT_fileout); }
  87 | #define GETC(c) { c=getc(XWRT_file); }
  88 | size_t fread_fast(unsigned char* dst, int len, FILE* file);
  89 | size_t fwrite_fast(unsigned char* dst, int len, FILE* file);
  90 | 
  91 | 
  92 | /////////////////////////////////////////////////////////
  93 | 
  94 | 
  95 | #define OUTPUT_BUFFER_MIN_SIZE 10240
  96 | 
  97 | 
  98 | // Input/Output using dynamic memory allocation
  99 | class CMemoryBuffer
 100 | {
 101 | public:
 102 | 	CMemoryBuffer(std::string mname="");
 103 | 	~CMemoryBuffer();
 104 | 
 105 | 	void OutTgtByte( unsigned char c );
 106 | 	int InpSrcByte( void );
 107 | 	inline int Size();
 108 | 	inline int Allocated(); 
 109 | 	inline void AllocSrcBuf( unsigned int len );
 110 | 	inline void Clear();
 111 | 
 112 | 	static unsigned int memsize;
 113 | 	unsigned char* TargetBuf;
 114 | 	unsigned char* SourceBuf;
 115 | 	unsigned int SrcLen, TgtLen;
 116 | 	unsigned int SrcPtr, TgtPtr;
 117 | 	std::string name;
 118 | 
 119 | private:
 120 | 	inline void AllocTgtBuf( unsigned int len = OUTPUT_BUFFER_MIN_SIZE );
 121 | 	inline void ReallocTgtBuf(unsigned int len);
 122 | };
 123 | 
 124 | class CContainers
 125 | {
 126 | public:
 127 | 	CContainers();
 128 | 	void prepareMemBuffers();
 129 | 	void writeMemBuffers(int preprocFlag);
 130 | 	void readMemBuffers(int preprocFlag, int maxMemSize);
 131 | 	void freeMemBuffers(bool freeMem);
 132 | 
 133 | 	CMemoryBuffer *memout;
 134 | 	unsigned char *bigBuffer;	
 135 | 
 136 | private:
 137 | 	std::vector<CMemoryBuffer*> mem_stack;
 138 | 	std::map<std::string,CMemoryBuffer*> memmap;
 139 | };
 140 | 
 141 | unsigned int CMemoryBuffer::memsize=0;
 142 | 
 143 | CMemoryBuffer::CMemoryBuffer(std::string mname) 
 144 | { 
 145 | 	name=mname;
 146 | 	Clear(); 
 147 | 	AllocTgtBuf(); 
 148 | };
 149 | 
 150 | CMemoryBuffer::~CMemoryBuffer() 
 151 | { 
 152 | 	if (TargetBuf)
 153 | 	free(TargetBuf-3);
 154 | 	
 155 | 	if (SourceBuf)
 156 | 	free(SourceBuf);
 157 | };
 158 | 
 159 | inline void CMemoryBuffer::Clear()
 160 | {
 161 | 	TargetBuf=NULL; SourceBuf=NULL; SrcPtr=0; TgtPtr=0; SrcLen=0; TgtLen=0;
 162 | }
 163 | 
 164 | inline int CMemoryBuffer::Size() 
 165 | {
 166 | 	return TgtPtr;
 167 | }
 168 | 
 169 | inline int CMemoryBuffer::Allocated() 
 170 | {
 171 | 	return TgtLen;
 172 | }
 173 | 
 174 | void CMemoryBuffer::OutTgtByte( unsigned char c ) 
 175 | { 
 176 | 	memsize++;
 177 | 
 178 | 	*(TargetBuf+(TgtPtr++))=c; 
 179 | 	if (TgtPtr>TgtLen-1){
 180 | 		if (TgtLen > (1<<19))  // 512 KB
 181 | 		ReallocTgtBuf(TgtLen+(1<<19));
 182 | 		else
 183 | 		ReallocTgtBuf(TgtLen*2);
 184 | 	}
 185 | }
 186 | 
 187 | int CMemoryBuffer::InpSrcByte( void ) 
 188 | {
 189 | 	memsize++;
 190 | 
 191 | 	if (SrcPtr>=SrcLen)
 192 | 	return EOF;
 193 | 
 194 | 	return *(SourceBuf+(SrcPtr++)); 
 195 | }
 196 | 
 197 | inline void CMemoryBuffer::AllocSrcBuf( unsigned int len ){
 198 | 	SrcLen = len;
 199 | 	SourceBuf = (unsigned char*) malloc(SrcLen);
 200 | 	if (SourceBuf==NULL)
 201 | 	OUT_OF_MEMORY();
 202 | }
 203 | 
 204 | inline void CMemoryBuffer::AllocTgtBuf( unsigned int len ){
 205 | 	TgtLen = len;
 206 | 	TargetBuf = (unsigned char*) malloc(len+6);
 207 | 	if (TargetBuf==NULL)
 208 | 	OUT_OF_MEMORY();
 209 | 	TargetBuf += 3;
 210 | }
 211 | 
 212 | inline void CMemoryBuffer::ReallocTgtBuf(unsigned int len){
 213 | 	unsigned char* NewTargetBuf = (unsigned char*) malloc(len+6);
 214 | 
 215 | 	if (NewTargetBuf==NULL)
 216 | 	OUT_OF_MEMORY();
 217 | 
 218 | 	NewTargetBuf += 3;
 219 | 	memcpy(NewTargetBuf,TargetBuf,min(TgtPtr,len));
 220 | 	TgtLen = len;
 221 | 	delete(TargetBuf-3);
 222 | 	TargetBuf=NewTargetBuf;
 223 | }
 224 | 
 225 | CContainers::CContainers() : bigBuffer(NULL) {};
 226 | 
 227 | void CContainers::prepareMemBuffers(){
 228 | 	memout=new CMemoryBuffer();
 229 | 	std::pair<std::string,CMemoryBuffer*> p("!data",memout);
 230 | 	memmap.insert(p);	
 231 | }
 232 | 
 233 | void CContainers::writeMemBuffers(int preprocFlag){
 234 | 	std::map<std::string,CMemoryBuffer*>::iterator it;
 235 | 
 236 | 	int fileLen=0;
 237 | 	//int len=0;
 238 | 	//int lenCompr=0;
 239 | 	//int allocated=0;
 240 | 
 241 | 	for (it=memmap.begin(); it!=memmap.end(); it++)
 242 | 	{
 243 | 		CMemoryBuffer* b=it->second;
 244 | 		fileLen=b->Size();
 245 | 		
 246 | 		PRINT_CONTAINERS(("cont=%s fileLen=%d\n",it->first.c_str(),fileLen));
 247 | 
 248 | 		if (fileLen>0)
 249 | 		{
 250 | //			allocated+=b->Allocated();
 251 | 	//		len+=fileLen;
 252 | 			
 253 | 			PUTC((int)it->first.size());
 254 | 			for (int i=0; i<(int)it->first.size(); i++)
 255 | 			PUTC(it->first[i]);
 256 | 			
 257 | 			PUTC(fileLen>>24);
 258 | 			PUTC(fileLen>>16);
 259 | 			PUTC(fileLen>>8);
 260 | 			PUTC(fileLen);
 261 | 
 262 | 			fwrite_fast(it->second->TargetBuf,it->second->TgtPtr,XWRT_fileout);
 263 | //			lenCompr+=fileLen;
 264 | 		}
 265 | 	}
 266 | 	PUTC(0)
 267 | 	//PRINT_DICT(("dataSize=%d compr=%d allocated=%d\n",len,lenCompr,allocated));
 268 | 
 269 | 	freeMemBuffers(true);
 270 | 	prepareMemBuffers();
 271 | }
 272 | 
 273 | void CContainers::readMemBuffers(int preprocFlag, int maxMemSize){
 274 | 	//unsigned char* buf=NULL;
 275 | //	unsigned int bufLen=0;
 276 | 	unsigned int fileLen;
 277 | //	unsigned int ui;
 278 | //	int len=0;
 279 | //	int lenCompr=0;
 280 | 	int i,c;
 281 | 	unsigned char s[STRING_MAX_SIZE];
 282 | 
 283 | 	freeMemBuffers(true);
 284 | 	prepareMemBuffers();
 285 | 	CMemoryBuffer* memout_tmp=NULL;
 286 | 
 287 | 	while (true){			
 288 | 		GETC(i);
 289 | 
 290 | 		if (i<=0)
 291 | 		break;
 292 | 
 293 | 		for (c=0; c<i; c++)
 294 | 		GETC(s[c]);
 295 | 		
 296 | 		std::string str;
 297 | 		str.append((char*)s,i);
 298 | 
 299 | 		PRINT_CONTAINERS(("cont=%s\n",str.c_str()));
 300 | 
 301 | 		if (str=="!data")
 302 | 		memout_tmp=memout;
 303 | 		else
 304 | 		{
 305 | 			memout_tmp=new CMemoryBuffer(str);
 306 | 			std::pair<std::string,CMemoryBuffer*> p(str,memout_tmp);
 307 | 			memmap.insert(p);
 308 | 		}
 309 | 
 310 | 		int c;
 311 | 		for (i=0, fileLen=0; i<4; i++){
 312 | 			GETC(c);
 313 | 			fileLen=fileLen*256+c;
 314 | 		}
 315 | 		
 316 | 		//len+=fileLen;
 317 | 		//lenCompr+=fileLen;
 318 | 		memout_tmp->AllocSrcBuf(fileLen);
 319 | 
 320 | 		fread_fast(memout_tmp->SourceBuf,memout_tmp->SrcLen,XWRT_file);
 321 | 
 322 | 		//printStatus(fileLen,0,false);
 323 | 	}
 324 | 	//PRINT_DICT(("readMemBuffers() dataSize=%d compr=%d allocated=%d\n",len,lenCompr,maxMemSize+10240));
 325 | }
 326 | 
 327 | void CContainers::freeMemBuffers(bool freeMem){
 328 | 	mem_stack.clear();
 329 | 
 330 | 	std::map<std::string,CMemoryBuffer*>::iterator it;
 331 | 
 332 | 	for (it=memmap.begin(); it!=memmap.end(); it++)
 333 | 	{
 334 | 		if (!freeMem)
 335 | 		it->second->Clear();
 336 | 		delete(it->second);			
 337 | 	}
 338 | 
 339 | 	memmap.clear();
 340 | }
 341 | 
 342 | /////////////////////////////////////////////////////////
 343 | 
 344 | /////////////////////////////////////////////////////////
 345 | 
// State and helpers shared by XWRT_Encoder and XWRT_Decoder (both derive from
// this class): dictionary bookkeeping, the word hash table, the codeword
// alphabet tables and the tunable settings.
class XWRT_Common
{
public:
	XWRT_Common(int fileBufferSize=17); // 128 kb
	~XWRT_Common();

	// Resets the options and loads the built-in tuning values; n is stored in wrtnum.
	void defaultSettings(int n);
	unsigned int flen( FILE* &f );
	
	// Named in-memory buffers; DECODE_GETC / ENCODE_PUTC go through cont.memout.
	CContainers cont;
	int preprocFlag;

protected:
	
	// Hashes len bytes at ptr into an index of word_hash (result in hash).
	inline void stringHash(const unsigned char *ptr, int len,int& hash);
	// Registers the word mem[0..i) in the dictionary; returns 1 when added, -1 otherwise.
	int addWord(unsigned char* &mem,int &i);
	// Copies the words of sortedDict into [mem, mem_end) and registers them; returns the next free position.
	unsigned char* loadDynamicDictionary(unsigned char* mem,unsigned char* mem_end);
	// Fills letterSet[] and wordSet[] from reservedSet[] and wrtnum.
	void initializeLetterSet();
	// Builds the codeword alphabet tables and dictionary size limits.
	void initializeCodeWords(int word_count,bool initMem=true);
	// Rebuilds the dictionary structures from sortedDict; always returns true.
	bool initialize(bool encoding);
	// Frees dict, dictlen, dictmem and dictfreq and resets sizeDict to 0.
	void WRT_deinitialize();

	void WRT_print_options();
	// Returns 300+200*(fileLenMB/5).
	int minSpacesFreq();

	// Hash table of HASH_TABLE_SIZE ints allocated in the constructor; entries
	// are dictionary indices, 0 meaning "empty".
	int* word_hash;
	bool decoding,fileCorrupted,detect,firstWarn;
	int maxDynDictBuf,minWordFreq,maxDictSize;
	int tryShorterBound,spaces,fileLenMB,beforeWord;
	int spacesCodeword[256];
	int spacesCont[256];
	std::vector<std::string> sortedDict;

	ELetterType letterType;
	ELetterType letterSet[256];

	// sizeDict: next free dictionary index (index 0 is never used for a word).
	int sizeDict,sizeDynDict;
	// dictmem: storage for the dictionary words; dictmem_end lies 256 bytes
	// before the end of that allocation (see initialize()).
	unsigned char* dictmem;
	unsigned char* dictmem_end;
	unsigned char* mem;
	
	int addSymbols[256]; // reserved symbols in output alphabet 
	int reservedSet[256]; // reserved symbols in input alphabet
	int outputSet[256];
	int wordSet[256]; 
	int sym2codeword[256]; 
	int codeword2sym[256]; 

	int dictionary,dict1size,dict2size,dict3size,dict4size,dict1plus2plus3,dict1plus2;
	int bound4,bound3,dict123size,dict12size,collision,quoteOpen,quoteClose,detectedSym;
	int maxMemSize;
	int sortedDictSize;
	

public:
};
 402 | 
 403 | XWRT_Common::XWRT_Common(int fileBufferSize) :  dictmem(NULL),
 404 | detect(false), dictmem_end(NULL),fileCorrupted(false)
 405 | { 
 406 | 	if (fileBufferSize<10)
 407 | 	fileBufferSize=10; // 1 KB
 408 | 	if (fileBufferSize>23)
 409 | 	fileBufferSize=23; // 8 MB
 410 | 	word_hash=new int[HASH_TABLE_SIZE];
 411 | 	if (!word_hash)
 412 | 	OUT_OF_MEMORY();
 413 | }
 414 | XWRT_Common::~XWRT_Common(){
 415 | 	if (word_hash)
 416 | 	delete(word_hash);
 417 | 	WRT_deinitialize(); 
 418 | }
 419 | int XWRT_Common::minSpacesFreq(){
 420 | 	return 300+200*(fileLenMB/5);
 421 | }
 422 | 
 423 | // make hash from string
 424 | inline void XWRT_Common::stringHash(const unsigned char *ptr, int len,int& hash){
 425 | 	for (hash = 0; len>0; len--, ptr++){
 426 | 		hash *= HASH_MULT;
 427 | 		hash += *ptr;
 428 | 	}
 429 | 	hash=hash&(HASH_TABLE_SIZE-1);
 430 | }
 431 | int XWRT_Common::addWord(unsigned char* &mem,int &i){
 432 | 	int c,j;
 433 | 	if (i<=1 || sizeDict>=dictionary)
 434 | 	return -1;
 435 | 	
 436 | 	dictlen[sizeDict]=i;
 437 | 	dict[sizeDict]=mem;
 438 | 	
 439 | 	mem[i]=0;
 440 | 	stringHash(mem,i,j);
 441 | 	
 442 | 	if (word_hash[j]!=0)
 443 | 	{
 444 | 		if (dictlen[sizeDict]!=dictlen[word_hash[j]] || memcmp(dict[sizeDict],dict[word_hash[j]],dictlen[sizeDict])!=0)
 445 | 		{
 446 | 			c=(j+i*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1);
 447 | 			if (word_hash[c]!=0)
 448 | 			{
 449 | 				if (dictlen[sizeDict]!=dictlen[word_hash[c]] || memcmp(dict[sizeDict],dict[word_hash[c]],dictlen[sizeDict])!=0)
 450 | 				{
 451 | 					c=(j+i*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1);
 452 | 					if (word_hash[c]!=0)
 453 | 					{
 454 | 						collision++;
 455 | 						return -1;
 456 | 					}
 457 | 					else
 458 | 					{
 459 | 						word_hash[c]=sizeDict++;
 460 | 					}
 461 | 				}
 462 | 				else
 463 | 				return -1; // word already exists
 464 | 			}
 465 | 			else
 466 | 			{
 467 | 				word_hash[c]=sizeDict++;
 468 | 			}
 469 | 		}
 470 | 		else
 471 | 		return -1; // word already exists
 472 | 	}
 473 | 	else
 474 | 	{
 475 | 		word_hash[j]=sizeDict++;
 476 | 	}
 477 | 	return 1;
 478 | }
 479 | unsigned char* XWRT_Common::loadDynamicDictionary(unsigned char* mem,unsigned char* mem_end){
 480 | 	int i;
 481 | 	for (i=0; i<256; i++)
 482 | 	spacesCodeword[i]=0;
 483 | 	int count=sortedDictSize;
 484 | 	for (i=0; i<count; i++)
 485 | 	{
 486 | 		//std::string s=sortedDict[i];
 487 | 		int len=(int)sortedDict[i].size();
 488 | 		memcpy(mem,sortedDict[i].c_str(),len+1);
 489 | 		if (addWord(mem,len)==0)
 490 | 		break;
 491 | 		mem+=(len/4+1)*4;
 492 | 		if (mem>mem_end)
 493 | 		break;
 494 | 	}
 495 | 	/*if (mem<mem_end)
 496 | 	{
 497 | 		i=strlen("http://www.");
 498 | 		memcpy(mem,"http://www.",i);
 499 | 		if (addWord(mem,i)!=0)
 500 | 			mem+=(i/4+1)*4;
 501 | 	}*/
 502 | 	PRINT_DICT(("count=%d sortedDict.size()=%d\n",count,sortedDictSize));
 503 | 	sizeDynDict=sizeDict;
 504 | 	return mem;
 505 | }
 506 | 
 507 | void XWRT_Common::initializeLetterSet(){
 508 | 	int c;
 509 | 	for (c=0; c<256; c++)
 510 | 	letterSet[c]=UNKNOWNCHAR;
 511 | 	for (c='0'; c<='9'; c++)
 512 | 	letterSet[c]=NUMBERCHAR;
 513 | 	for (c='A'; c<='Z'; c++)
 514 | 	letterSet[c]=UPPERCHAR;
 515 | 	for (c='a'; c<='z'; c++)
 516 | 	letterSet[c]=LOWERCHAR;
 517 | 	for (c=0; c<256; c++)
 518 | 	if (reservedSet[c])
 519 | 	letterSet[c]=RESERVEDCHAR;
 520 | 	for (c=0; c<256; c++)  //                                                - _ . , :
 521 | 	if (c>127 || letterSet[c]==LOWERCHAR || letterSet[c]==UPPERCHAR || (letterSet[c]==NUMBERCHAR && wrtnum==1) ||c==' ' /*|| c=='\''*/) // || c=='&') 
 522 | 	wordSet[c]=1;
 523 | 	else
 524 | 	wordSet[c]=0;
 525 | }
// Build the codeword alphabet and derive the dictionary capacity.
//
// word_count: number of words the codeword space should be able to address;
//             only consulted by the fallback split below.
// initMem:    when true, allocate the zero-filled dict and dictlen arrays with
//             dictionary+1 entries each (OUT_OF_MEMORY() on failure).
//
// On return the alphabet tables (addSymbols, reservedSet, outputSet,
// sym2codeword, codeword2sym), the per-position alphabet sizes
// dict1size..dict4size and the derived limits (dictionary, bound3, bound4,
// dict12size, dict123size, dict1plus2, dict1plus2plus3) are set.
void XWRT_Common::initializeCodeWords(int word_count,bool initMem){
	int c,charsUsed,i;
	detectedSym=0;
	// start from empty tables
	for (c=0; c<256; c++){
		addSymbols[c]=0;
		codeword2sym[c]=0;
		sym2codeword[c]=0;
		reservedSet[c]=0;
		outputSet[c]=0;
	}
	// the three control bytes are reserved in the input alphabet
	for (c=0; c<256; c++){
		if (c==CHAR_ESCAPE || c==CHAR_FIRSTUPPER || c==CHAR_UPPERWORD )
		{
			reservedSet[c]=1;
			addSymbols[c]=0;
		}
	}
	// addSymbols[] is still all zero here, so this loop currently changes nothing
	for (c=0; c<256; c++)
	if (addSymbols[c])
	reservedSet[c]=1;
	// letter classification depends on reservedSet[] as set up above
	initializeLetterSet();
	// bytes BINARY_FIRST..BINARY_LAST serve as codeword symbols
	for (c=BINARY_FIRST; c<=BINARY_LAST; c++)
	addSymbols[c]=1;
	
	// outputSet = reserved bytes plus codeword symbols
	for (c=0; c<256; c++){
		if (reservedSet[c] || addSymbols[c])
		outputSet[c]=1;
	}
	// Number the codeword symbols consecutively. While doing so, dictNsize
	// records the running count of symbols below 192, 224, 240 and 256.
	charsUsed=0;
	for (c=0; c<256; c++){
		if (addSymbols[c]){
			codeword2sym[c]=charsUsed;
			sym2codeword[charsUsed]=c;
			charsUsed++;
			{
				if (c<128+64)
				dict1size=charsUsed;
				if (c<128+64+32)
				dict2size=charsUsed;
				if (c<128+64+32+16)
				dict3size=charsUsed;
				if (c<128+64+32+16+16)
				dict4size=charsUsed;
			}
		}
	}
	c=word_count;
	
	// turn the cumulative counts into the size of each class
	dict4size-=dict3size;
	dict3size-=dict2size;
	dict2size-=dict1size;
	// Fallback when any class has fewer than 4 symbols: give classes 2-4 an
	// equal share, preferring the smallest share i for which
	// i*i*i*(charsUsed-3*i) exceeds word_count.
	if (dict1size<4 || dict2size<4 || dict3size<4 || dict4size<4){
		dict2size=dict3size=dict4size=charsUsed/4;
		dict1size=charsUsed-dict4size*3;
		for (i=0; i<charsUsed/4; i++){
			if (i*i*i*(charsUsed-i*3)>c){
				dict1size=charsUsed-i*3;
				dict2size=i;
				dict3size=i;
				dict4size=i;
				break;
			}
		}
	}	
	
	// dictionary: number of distinct codewords of length 1 to 4;
	// bound3 / bound4: counts of codewords shorter than 3 / 4 bytes
	dictionary=(dict1size*dict2size*dict3size*dict4size+dict1size*dict2size*dict3size+dict1size*dict2size+dict1size);
	bound4=dict1size*dict2size*dict3size+dict1size*dict2size+dict1size;
	bound3=dict1size*dict2size+dict1size;
	dict123size=dict1size*dict2size*dict3size;
	dict12size=dict1size*dict2size;

	dict1plus2=dict1size+dict2size;
	dict1plus2plus3=dict1size+dict2size+dict3size;
	if (initMem){
		dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1);
		dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1);
		if (!dict || !dictlen)
		OUT_OF_MEMORY();
	}
	PRINT_DICT((" %d %d %d %d(%d) charsUsed=%d sizeDict=%d\n",dict1size,dict2size,dict3size,dict4size,dictionary,charsUsed,sizeDict));
}
 607 | // read dictionary from files to arrays
 608 | bool XWRT_Common::initialize(bool encoding){
 609 | //	int fileLen;
 610 | //	FILE* file;
 611 | 	WRT_deinitialize();
 612 | 	memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0]));
 613 | 	
 614 | 	
 615 | 	dict123size=sortedDictSize;
 616 | 	if (dict123size<20)
 617 | 	dict123size=20;
 618 | 	initializeCodeWords(dict123size);
 619 | 	int dicsize=dictionary*WORD_AVG_SIZE*2;
 620 | 	dictmem=(unsigned char*)calloc(dicsize,1);
 621 | 	dictmem_end=dictmem+dicsize-256;
 622 | 	PRINT_DICT(("allocated memory=%d\n",dicsize));
 623 | 	if (!dictmem)
 624 | 	OUT_OF_MEMORY();
 625 | 	sizeDict=1;
 626 | 	mem=loadDynamicDictionary(dictmem,dictmem_end);
 627 | 	
 628 | 	
 629 | 	return true;
 630 | }
 631 | void XWRT_Common::WRT_deinitialize(){
 632 | 	if (dict){
 633 | 		free(dict);
 634 | 		dict=NULL;
 635 | 	}
 636 | 	if (dictlen){
 637 | 		free(dictlen);
 638 | 		dictlen=NULL;
 639 | 	}
 640 | 	if (dictmem){
 641 | 		free(dictmem);
 642 | 		dictmem=NULL;
 643 | 	}
 644 | 	if (dictfreq){
 645 | 		free(dictfreq);
 646 | 		dictfreq=NULL;
 647 | 	}
 648 | 	sizeDict=0;
 649 | }
 650 | 
 651 | void XWRT_Common::defaultSettings(int n){
 652 | 	RESET_OPTIONS;
 653 | 	TURN_ON(OPTION_TRY_SHORTER_WORD);
 654 | 	maxMemSize=8*1024*1024;
 655 | 	maxDynDictBuf=8*4;
 656 | 	maxDictSize=65535*32700;
 657 | 	tryShorterBound=3;//4
 658 | 	minWordFreq=7*2; //7*2 64;
 659 | 	wrtnum=n;
 660 | 	//printf("WRT: num: %d",wrtnum);
 661 | 	//maxDictSize=	//e
 662 | 	//minWordFreq=  // f
 663 | 	//maxMemSize  maxMemSize*=1024*1024;//m
 664 | 	//maxDynDictBuf //b
 665 | }
 666 | 
 667 | size_t fread_fast(unsigned char* dst, int len, FILE* file){
 668 | 	return fread(dst,1,len,file);
 669 | /*	int rd;
 670 | 	size_t sum=0;
 671 | 	while (len > 1<<17) // 128 kb
 672 | 	{
 673 | 		rd=fread(dst,1,1<<17,file);
 674 | 		dst+=rd;
 675 | 		len-=rd;
 676 | 		sum+=rd;
 677 | 	}
 678 | 	sum+=fread(dst,1,len,file);
 679 | 	return sum;*/
 680 | }
 681 | size_t fwrite_fast(unsigned char* dst, int len, FILE* file){
 682 | 	return fwrite(dst,1,len,file);
 683 | 	/*int wt;
 684 | 	size_t sum=0;
 685 | 	while (len > 1<<17) // 128 kb
 686 | 	{
 687 | 		wt=fwrite(dst,1,1<<17,file);
 688 | 		dst+=wt;
 689 | 		len-=wt;
 690 | 		sum+=wt;
 691 | 	}
 692 | 	sum+=fwrite(dst,1,len,file);
 693 | 	return sum;*/
 694 | }
 695 | //////////////////////////////////////////
 696 | 
// Decoder side of the word-replacing transform: reads the dictionary and the
// containers from a stream and reproduces the original bytes one at a time.
class XWRT_Decoder : public XWRT_Common
{
public:

	XWRT_Decoder();
	~XWRT_Decoder();

	// Reads the stream header, the dictionary and the first containers from
	// `in`; returns the file length stored in the header.
	int WRT_start_decoding(FILE* in);
	// Returns the next decoded byte, or -1 once fileCorrupted is set.
	int WRT_decode();
private:

	inline void toUpper(unsigned char* s,int &s_size);
	// Parses the stored word list into sortedDict.
	void read_dict();
	// Expands the codeword starting with byte c into s; returns the word length.
	inline int decodeCodeWord(unsigned char* &s,int& c);

	// Upper-casing state for the next word: FORCE is set when CHAR_UPPERWORD is read.
	enum EUpperType { UFALSE, UTRUE, FORCE };

	// s_size: length of the word currently held in WRTd_s;
	// WRTd_c: the look-ahead input byte.
	int s_size,WRTd_c;
	int last_c;
	bool WRTd_upper;
	bool WRTd_initialized;
	unsigned char WRTd_data[STRING_MAX_SIZE];
	unsigned char *WRTd_s; // points at WRTd_data (set in the constructor)
	EUpperType upperWord;

public:
}; // end class 
 724 | 
 725 | XWRT_Decoder::XWRT_Decoder() : WRTd_s(&WRTd_data[0]) 
 726 | { 	
 727 | 
 728 | };
 729 | 
 730 | XWRT_Decoder::~XWRT_Decoder(){ 
 731 | 	if (cont.bigBuffer)
 732 | 	{
 733 | 		free(cont.bigBuffer);
 734 | 		cont.bigBuffer=NULL;
 735 | 		cont.freeMemBuffers(false);
 736 | 	}
 737 | 	else
 738 | 	cont.freeMemBuffers(true);
 739 | }
 740 | 
// Fetch the next input byte of the main container (cont.memout) into c.
// If more than maxMemSize bytes have been requested from the current buffer,
// the next group of containers is first loaded with cont.readMemBuffers() and
// the counter is reset. c receives whatever InpSrcByte() returns, i.e. EOF
// once the buffer is exhausted.
#define DECODE_GETC(c)\
	{\
		if (cont.memout->memsize>maxMemSize) \
		{ \
			PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \
			cont.readMemBuffers(preprocFlag,maxMemSize); \
			cont.memout->memsize=0; \
		} \
		\
		c=cont.memout->InpSrcByte(); \
	}
 752 | 
// decode word using dictionary
// i is the zero-based codeword number; it is incremented to obtain the
// dictionary index. For a valid index (0 < i < sizeDict) the word and its
// terminating byte are copied from dict[i] to s and s_size is set to its
// length. Otherwise s_size becomes 0 and fileCorrupted is set.
// The dictNo argument is not used by the expansion. The macro expects `s` and
// `s_size` to be in scope at the point of use.
#define DECODE_WORD(dictNo,i)\
	{\
		i++;\
		if (i>0 && i<sizeDict)\
		{\
			PRINT_CODEWORDS(("i=%d ",i)); \
			s_size=dictlen[i];\
			memcpy(s,dict[i],s_size+1);\
			PRINT_CODEWORDS(("%s\n",dict[i])); \
		}\
		else\
		{\
			s_size=0; \
			/*printf("File is corrupted %d/%d!\n",i,sizeDict);*/\
			fileCorrupted=true;\
		}\
	}
 771 | 
 772 | 
 773 | // convert lower string to upper
 774 | inline void XWRT_Decoder::toUpper(unsigned char* s,int &s_size){
 775 | 	for (int i=0; i<s_size; i++)
 776 | 	s[i]=toupper(s[i]); 
 777 | }
 778 | 
 779 | inline int XWRT_Decoder::decodeCodeWord(unsigned char* &s,int& c){
 780 | 	int i,s_size;
 781 | 
 782 | 	if (codeword2sym[c]<dict1size){
 783 | 		i=codeword2sym[c];
 784 | 		DECODE_WORD(dictNo, i);
 785 | 		return s_size;
 786 | 	}
 787 | 	else
 788 | 	if (codeword2sym[c]<dict1plus2)
 789 | 	i=dict1size*(codeword2sym[c]-dict1size);
 790 | 	else
 791 | 	if (codeword2sym[c]<dict1plus2plus3){
 792 | 		PRINT_CODEWORDS(("DC1b c=%d\n",codeword2sym[c]-dict1plus2));
 793 | 		i=dict12size*(codeword2sym[c]-dict1plus2);
 794 | 	}
 795 | 	else
 796 | 	i=dict123size*(codeword2sym[c]-dict1plus2plus3);
 797 | 
 798 | 	DECODE_GETC(c);
 799 | 	PRINT_CODEWORDS(("DC1 c=%d i=%d\n",c,i));
 800 | 
 801 | 	if (codeword2sym[c]<dict1size){
 802 | 		i+=codeword2sym[c];
 803 | 		i+=dict1size; //dictNo=2;
 804 | 		DECODE_WORD(dictNo, i);
 805 | 		return s_size;
 806 | 	}
 807 | 	else
 808 | 	if (codeword2sym[c]<dict1plus2){
 809 | 		PRINT_CODEWORDS(("DC2b c=%d\n",codeword2sym[c]-dict1size));
 810 | 		i+=dict1size*(codeword2sym[c]-dict1size);
 811 | 	}
 812 | 	else
 813 | 	i+=dict12size*(codeword2sym[c]-dict1plus2);
 814 | 
 815 | 	DECODE_GETC(c);
 816 | 	PRINT_CODEWORDS(("DC2 c=%d i=%d\n",c,i));
 817 | 
 818 | 	if (codeword2sym[c]<dict1size){
 819 | 		PRINT_CODEWORDS(("DC3b c=%d\n",codeword2sym[c]));
 820 | 		i+=codeword2sym[c];
 821 | 		i+=bound3; //dictNo=3;
 822 | 		DECODE_WORD(dictNo, i);
 823 | 		return s_size;
 824 | 	}
 825 | 	else
 826 | 	if (codeword2sym[c]<dict1plus2)
 827 | 	i+=dict1size*(codeword2sym[c]-dict1size);
 828 | 
 829 | 
 830 | 	DECODE_GETC(c);
 831 | 	PRINT_CODEWORDS(("DC3 c=%d i=%d\n",c,i));
 832 | 
 833 | 	if (codeword2sym[c]<dict1size)
 834 | 	i+=codeword2sym[c];
 835 | 	//else 
 836 | 	//printf("File is corrupted (codeword2sym[c]<dict1size)!\n");
 837 | 
 838 | 	i+=bound4; //dictNo=4;
 839 | 	DECODE_WORD(dictNo, i);
 840 | 	return s_size;
 841 | }
 842 | 
// Return the next decoded byte of the original data, or -1 once
// fileCorrupted has been set.
//
// The decoder keeps one look-ahead input byte in WRTd_c. A dictionary word is
// expanded into WRTd_s as a whole and then handed out one byte per call;
// s_sizep is the number of bytes of the current word already returned.
int XWRT_Decoder::WRT_decode(){
	int rchar=0;
//	int c;
	// NOTE(review): function-local static - shared by all decoder instances and
	// not reset by WRT_start_decoding().
	static int s_sizep=0;
	// still inside a previously decoded word: hand out its next byte
	if (s_sizep<s_size && s_sizep!=0 ){
		rchar=WRTd_s[s_sizep];
		last_c=rchar;
		++s_sizep;
		return rchar;
	}
	// the word has been fully returned: fetch the next look-ahead byte
	if(s_sizep==s_size && s_sizep!=0 ){
		DECODE_GETC(WRTd_c);
		s_sizep=0;
		s_size=0;
	}
	while (1){
		if (fileCorrupted)
		return -1;

		PRINT_CHARS(("c=%d (%c)\n",WRTd_c,WRTd_c));

		// control byte or first byte of a codeword
		if (outputSet[WRTd_c]){
			PRINT_CHARS(("addSymbols[%d] upperWord=%d\n",WRTd_c,upperWord));

			switch (WRTd_c)
			{
			case CHAR_ESCAPE:
				// the byte after the escape is returned literally
				WRTd_upper=false;
				upperWord=UFALSE;
				DECODE_GETC(WRTd_c);
				PRINT_CHARS(("c==CHAR_ESCAPE, next=%x\n",WRTd_c));
				rchar=WRTd_c;
				last_c=rchar;
				DECODE_GETC(WRTd_c);
				return rchar;

			case CHAR_FIRSTUPPER:
				PRINT_CHARS(("c==CHAR_FIRSTUPPER\n"));

				// capitalize the first letter of what follows; `continue`
				// restarts the enclosing while loop
				WRTd_upper=true;
				upperWord=UFALSE;
				DECODE_GETC(WRTd_c);
				continue;

			case CHAR_UPPERWORD:
				PRINT_CHARS(("c==CHAR_UPPERWORD\n"));

				// upper-case the whole following word
				upperWord=FORCE;
				DECODE_GETC(WRTd_c);
				continue;

			}

			if (upperWord==FORCE)
			upperWord=UTRUE;
			else
			upperWord=UFALSE;

			// expand the codeword into WRTd_s
			s_size=decodeCodeWord(WRTd_s,WRTd_c);

			if (WRTd_upper){
				WRTd_upper=false;
				WRTd_s[0]=toupper(WRTd_s[0]);
			}
			
			if (upperWord!=UFALSE)
			toUpper(&WRTd_s[0],s_size);
			
			upperWord=UFALSE;
			
			// return the first byte now; the rest follows on later calls.
			// With s_size==0 control falls through to the code below.
			if (s_size>0){
				s_sizep=1;
				rchar=WRTd_s[0];;
				last_c=rchar;
				return rchar;
			}
		}

		// digits are passed through unchanged
		if (WRTd_c>='0' && WRTd_c<='9'){
			//unsigned int no,mult;
			//int c,i;
			//no=0;
			//mult=1;
			//static int wType=0;
			rchar=WRTd_c;
			DECODE_GETC(WRTd_c);
			last_c=rchar;
			return rchar;
		}

		// NOTE(review): the format string below has three conversions but only
		// two arguments are passed.
		PRINT_CHARS(("other c=%d (%d) upperWord=%d\n",fileLenMB,upperWord));

		// plain byte: apply a pending upper-case request, if any
		if (upperWord!=UFALSE){
			if (upperWord==FORCE)
			upperWord=UTRUE;

			// upper-casing continues only while lower-case letters follow
			if (WRTd_c>='a' && WRTd_c<='z')
			WRTd_c=toupper(WRTd_c);
			else
			upperWord=UFALSE;
		}
		else
		if (WRTd_upper){
			WRTd_upper=false;
			WRTd_c=toupper(WRTd_c);
		}
		rchar=WRTd_c;
		last_c=rchar;
		DECODE_GETC(WRTd_c);
		return rchar;
	}
}
 955 | 
 956 | void XWRT_Decoder::read_dict(){
 957 | 	int i,c,count;
 958 | 	unsigned char* bound=(unsigned char*)&word_hash[0] + HASH_TABLE_SIZE*sizeof(word_hash[0]) - 6;
 959 | 
 960 | 	unsigned char* bufferData=(unsigned char*)&word_hash[0] + 3;
 961 | 	
 962 | 	for (i=0, count=0; i<3; i++){
 963 | 		GETC(c);
 964 | 		count=count*256+c;
 965 | 	}
 966 | 
 967 | 	fread_fast(bufferData,count,XWRT_file);
 968 | 	
 969 | 
 970 | 	count=bufferData[0]; bufferData++;
 971 | 	count+=256*bufferData[0]; bufferData++;
 972 | 	count+=65536*bufferData[0]; bufferData++;
 973 | 	
 974 | 	sortedDict.clear();
 975 | 	
 976 | 	PRINT_DICT(("count=%d\n",count));
 977 | 	
 978 | 	std::string s;
 979 | 	std::string last_s;
 980 | 	for (i=0; i<count; i++){
 981 | 		if ( bufferData[0]>=128){
 982 | 			s.append(last_s.c_str(),bufferData[0]-128);
 983 | 			bufferData++;
 984 | 		}
 985 | 
 986 | 		while (bufferData[0]!=10){
 987 | 			s.append(1,bufferData[0]);
 988 | 			bufferData++;
 989 | 
 990 | 			if (s.size()>WORD_MAX_SIZE || bufferData>bound)
 991 | 			{
 992 | 				//printf("File corrupted (s.size()>WORD_MAX_SIZE)!\n");
 993 | 				OUT_OF_MEMORY();
 994 | 			}
 995 | 		}
 996 | 		bufferData++;
 997 | 
 998 | 		sortedDict.push_back(s);
 999 | 		last_s=s;
1000 | 		s.erase();
1001 | 	}
1002 | 
1003 | 	sortedDictSize=(int)sortedDict.size();
1004 | 	PRINT_DICT(("read_dict count2=%d\n",count));
1005 | 
1006 | }
1007 | 
1008 | 
1009 | int XWRT_Decoder::WRT_start_decoding(FILE* in){
1010 | 	int c;
1011 | 	XWRT_file=in;
1012 | 	last_c=0;
1013 | 	WRTd_upper=false;
1014 | 	upperWord=UFALSE;
1015 | 	s_size=0;
1016 | 	collision=0;
1017 | 
1018 | 	defaultSettings(0); 
1019 | 	GETC(maxMemSize); 
1020 | 	maxMemSize*=1024*1024;
1021 | 	int fileLen;
1022 | 
1023 | 	GETC(c);
1024 | 	fileLen=c;
1025 | 	GETC(c);
1026 | 	fileLen=fileLen|(c<<8);
1027 | 	GETC(c);
1028 | 	fileLen=fileLen|(c<<16);
1029 | 	GETC(c);
1030 | 	fileLen=fileLen|(c<<24);
1031 | 	fileLenMB=fileLen/(1024*1024);
1032 | 	if (fileLenMB>255*256)
1033 | 	fileLenMB=255*256;
1034 | 
1035 | 	PRINT_DICT(("maxMemSize=%d fileLenMB=%d\n",maxMemSize,fileLenMB));
1036 | 	read_dict();
1037 | 
1038 | 	cont.readMemBuffers(preprocFlag,maxMemSize);
1039 | 	cont.memout->memsize=0;
1040 | 
1041 | 	WRT_deinitialize();
1042 | 
1043 | 	decoding=true;
1044 | 	if (!initialize(false))
1045 | 	return 0;
1046 | 
1047 | 	DECODE_GETC(WRTd_c);
1048 | 	PRINT_CHARS(("WRT_start_decoding WRTd_c=%d ftell=%d\n",WRTd_c,ftell(XWRT_file)));
1049 | 
1050 | 	return fileLen;
1051 | }
1052 | 
1053 | 
1054 | 
1055 | /////////////////////////////////////////////////////////////////////
1056 | ///////////
1057 | //////////////////////////////////////////////////////////////////////
1058 | 
1059 | 
// Encoder side of the word-replacing transform: replaces words found in the
// dictionary by codewords and writes the result through the containers.
class XWRT_Encoder : public XWRT_Common
{
public:

	XWRT_Encoder();
	~XWRT_Encoder();

	void WRT_start_encoding(FILE* in, FILE* out,unsigned int fileLen,bool type_detected);

private:

	void WRT_encode( int filelen);
	// Emits the 1-4 byte codeword for dictionary index i.
	inline void encodeCodeWord(int &i);
	// Emits the pending run of `spaces` spaces and resets the counter.
	inline void encodeSpaces();
	inline void encodeWord(unsigned char* s,int s_size,EWordType wordType,int& c);
	// Emits the bytes of s literally, escaping bytes flagged in addSymbols[].
	inline void encodeAsText(unsigned char* &s,int &s_size,EWordType wordType);
	// Looks for a dictionary word that is a prefix of s; returns its index or -1.
	inline int findShorterWord(unsigned char* &s,int &s_size);
	inline void toLower(unsigned char* s,int &s_size);
	inline void toUpper(unsigned char* s,int &s_size);
	inline void checkWord(unsigned char* &s,int &s_size,int& c);
	
	// Exact dictionary lookup of s; i receives the index or -1.
	inline void checkHashExactly(unsigned char* &s,int &s_size,int& i);
	// Dictionary lookup of the first s_size bytes of s using hash h; returns the index or -1.
	inline int checkHash(unsigned char* &s,int &s_size,int h);
	inline void stringHash(const unsigned char *ptr, int len,int& hash);
	
	//void encodeMixed(unsigned char* s,int s_size,int& c);
	void sortDict(int size);

	void write_dict();
	int WRT_detectFileType(int filelen);
	void WRT_detectFinish();

	int s_size;
	// input history maintained by ENCODE_GETC: last_c_bak is the byte read most
	// recently, last_c the one before it, last_last_c the one before that
	int last_c_bak,last_c,last_last_c;
	int filelento; // incremented by ENCODE_GETC on every read


	unsigned char* dynmem;
	unsigned char *dictbound;

public:
}; // end class 
1102 | 
1103 | int compare_freq( const void *arg1, const void *arg2 );
1104 | 
1105 | 
1106 | XWRT_Encoder::XWRT_Encoder() :  last_c_bak(0),filelento(0)
1107 | { 	
1108 | };
1109 | XWRT_Encoder::~XWRT_Encoder(){ 
1110 | 	
1111 | }
// Append byte c to the main container (cont.memout). Nothing is written while
// `detect` is set. Before the byte is stored, if the buffer's memsize counter
// has grown past maxMemSize, all containers are flushed with
// cont.writeMemBuffers() and the counter is reset.
// Note that c is expanded more than once (debug print and OutTgtByte).
#define ENCODE_PUTC(c)\
	{ \
		if (!detect) \
		{ \
			if (cont.memout->memsize>maxMemSize) \
			{ \
				PRINT_DICT(("%d maxMemSize=%d\n",cont.memout->memsize,maxMemSize)); \
				cont.writeMemBuffers(preprocFlag); \
				cont.memout->memsize=0; \
			} \
			\
			PRINT_CHARS(("output=%d (%c)\n",c,c)); \
			cont.memout->OutTgtByte(c); \
		} \
	}
1127 | 
// Read the next input byte from XWRT_file into c (getc, so EOF at end of
// input). Also shifts the input history - last_last_c <- last_c <- last_c_bak,
// with last_c_bak set to the byte just read - and increments filelento.
#define ENCODE_GETC(c) \
	{ \
		last_last_c=last_c; \
		last_c=last_c_bak; \
		\
		c=getc(XWRT_file); \
		filelento++;\
		last_c_bak=c; \
	}
1137 | 
1138 | 
1139 | // encode word (should be lower case) using n-gram array (when word doesn't exist in the dictionary)
1140 | inline void XWRT_Encoder::encodeAsText(unsigned char* &s,int &s_size,EWordType wordType){
1141 | 	int i=0;
1142 | 	for (i=0; i<s_size; i++)
1143 | 	{
1144 | 		if (addSymbols[s[i]])
1145 | 		ENCODE_PUTC(CHAR_ESCAPE);
1146 | 		ENCODE_PUTC(s[i]);
1147 | 	}
1148 | 	return;
1149 | }
1150 | inline void XWRT_Encoder::encodeCodeWord(int &i){
1151 | 	int first,second,third,fourth;
1152 | 	first=i-1;
1153 | 	if (first>=bound4){
1154 | 		first-=bound4;
1155 | 		fourth=first/dict123size;
1156 | 		first=first%dict123size;
1157 | 		third=first/dict12size;		
1158 | 		first=first%dict12size;
1159 | 		second=first/dict1size;		
1160 | 		first=first%dict1size;
1161 | 		ENCODE_PUTC(sym2codeword[dict1plus2plus3+fourth]);
1162 | 		PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1plus2plus3+fourth]));
1163 | 		ENCODE_PUTC(sym2codeword[dict1plus2+third]);
1164 | 		PRINT_CODEWORDS(("2nd=%d ",sym2codeword[dict1plus2+third]));
1165 | 		ENCODE_PUTC(sym2codeword[dict1size+second]);
1166 | 		PRINT_CODEWORDS(("3rd=%d ",sym2codeword[dict1size+second]));
1167 | 		ENCODE_PUTC(sym2codeword[first]);
1168 | 		PRINT_CODEWORDS(("4th=%d ",sym2codeword[first]));
1169 | 	}
1170 | 	else
1171 | 	if (first>=bound3){
1172 | 		first-=bound3;
1173 | 		third=first/dict12size;		
1174 | 		first=first%dict12size;
1175 | 		second=first/dict1size;		
1176 | 		first=first%dict1size;
1177 | 		ENCODE_PUTC(sym2codeword[dict1plus2+third]);
1178 | 		PRINT_CODEWORDS(("1st=%d(%d) ",sym2codeword[dict1plus2+third],third));
1179 | 		ENCODE_PUTC(sym2codeword[dict1size+second]);
1180 | 		PRINT_CODEWORDS(("2nd=%d(%d) ",sym2codeword[dict1size+second],second));
1181 | 		ENCODE_PUTC(sym2codeword[first]);
1182 | 		PRINT_CODEWORDS(("3rd=%d(%d) ",sym2codeword[first],first));
1183 | 	}
1184 | 	else
1185 | 	if (first>=dict1size){
1186 | 		first-=dict1size;
1187 | 		second=first/dict1size;		
1188 | 		first=first%dict1size;
1189 | 		ENCODE_PUTC(sym2codeword[dict1size+second]);
1190 | 		PRINT_CODEWORDS(("1st=%d ",sym2codeword[dict1size+second]));
1191 | 		
1192 | 		ENCODE_PUTC(sym2codeword[first]);
1193 | 		PRINT_CODEWORDS(("2nd=%d ",sym2codeword[first]));
1194 | 	}
1195 | 	else{
1196 | 		ENCODE_PUTC(sym2codeword[first]);
1197 | 		PRINT_CODEWORDS(("1st=%d ",sym2codeword[first]));
1198 | 	}
1199 | 
1200 | }
1201 | 
1202 | inline void XWRT_Encoder::encodeSpaces(){
1203 | 	if (spaces==1){
1204 | 		ENCODE_PUTC(' ');
1205 | 	}
1206 | 	else
1207 | 	if (spaces>0){
1208 | 		while (spaces>0){
1209 | 			int sp=spaces;
1210 | 			if (spaces>=256)
1211 | 			sp=255;
1212 | 			
1213 | 			while (sp>0 && spacesCodeword[sp]==0) sp--;
1214 | 			if (spacesCodeword[sp])	{		
1215 | 				encodeCodeWord(spacesCodeword[sp]);
1216 | 				spaces-=sp;
1217 | 			}
1218 | 			else{
1219 | 				ENCODE_PUTC(' ');
1220 | 				spaces--;
1221 | 			}
1222 | 		}
1223 | 	}
1224 | 	spaces=0;
1225 | }
1226 | // make hash from string
1227 | inline void XWRT_Encoder::stringHash(const unsigned char *ptr, int len,int& hash){
1228 | 	for (hash = 0; len>0; len--, ptr++){
1229 | 		hash *= HASH_MULT;
1230 | 		hash += *ptr;
1231 | 	}
1232 | 	hash=hash&(HASH_TABLE_SIZE-1);
1233 | }
1234 | // check if word "s" does exist in the dictionary 
1235 | inline void XWRT_Encoder::checkHashExactly(unsigned char* &s,int &s_size,int& i){
1236 | 	int h;
1237 | 	stringHash(s,s_size,h);
1238 | 	i=word_hash[h];
1239 | 	if (i>0){
1240 | 		if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0){
1241 | 			i=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
1242 | 			if (i>0){
1243 | 				if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0){
1244 | 					i=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
1245 | 					if (i>0){
1246 | 						if (dictlen[i]!=s_size || memcmp(dict[i],s,s_size)!=0)
1247 | 						i=-1;
1248 | 					}
1249 | 					else
1250 | 					i=-1;
1251 | 				}
1252 | 			}
1253 | 			else
1254 | 			i=-1;
1255 | 		}
1256 | 	}
1257 | 	else
1258 | 	i=-1;
1259 | 	if (i>=dictionary)
1260 | 	i=-1;
1261 | }
1262 | // check if word "s" (prefix of original word) does exist in the dictionary using hash "h" 
1263 | inline int XWRT_Encoder::checkHash(unsigned char* &s,int &s_size,int h){
1264 | 	int i=word_hash[h];
1265 | 	if (i>0){
1266 | 		if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0){
1267 | 			i=word_hash[(h+s_size*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
1268 | 			if (i>0){
1269 | 				if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0){
1270 | 					i=word_hash[(h+s_size*HASH_DOUBLE_MULT*HASH_DOUBLE_MULT)&(HASH_TABLE_SIZE-1)];
1271 | 					if (i>0){
1272 | 						if (dictlen[i]>s_size || memcmp(dict[i],s,s_size)!=0)
1273 | 						i=-1;
1274 | 					}
1275 | 					else
1276 | 					i=-1;
1277 | 				}
1278 | 			}
1279 | 			else
1280 | 			i=-1;
1281 | 		}
1282 | 	}
1283 | 	else
1284 | 	i=-1;
1285 | 	if (i>=dictionary)
1286 | 	i=-1;
1287 | 	return i;
1288 | }
1289 | // check if word "s" or prefix of word "s" does exist in the dictionary using hash "h" 
1290 | inline int XWRT_Encoder::findShorterWord(unsigned char* &s,int &s_size){
1291 | 	int ret;
1292 | 	int i;
1293 | 	int best;
1294 | 	unsigned int hash;
1295 | 	hash = 0;
1296 | 	for (i=0; i<WORD_MIN_SIZE+tryShorterBound; i++)
1297 | 	hash = HASH_MULT * hash + s[i];
1298 | 
1299 | 	best=-1;
1300 | 	for (i=WORD_MIN_SIZE+tryShorterBound; i<s_size; i++){
1301 | 		ret=checkHash(s,i,hash&(HASH_TABLE_SIZE-1));	
1302 | 		if (ret>=0)
1303 | 		best=ret;
1304 | 		hash = HASH_MULT*hash + s[i];
1305 | 	}
1306 | 	return best;
1307 | }
1308 | // convert lower string to upper
1309 | inline void XWRT_Encoder::toUpper(unsigned char* s,int &s_size){
1310 | 	for (int i=0; i<s_size; i++)
1311 | 	s[i]=toupper(s[i]); 
1312 | }
1313 | // convert upper string to lower
1314 | inline void XWRT_Encoder::toLower(unsigned char* s,int &s_size){
1315 | 	for (int i=0; i<s_size; i++)
1316 | 	s[i]=tolower(s[i]);
1317 | }
1318 | /*void XWRT_Encoder::encodeMixed(unsigned char* s,int s_size,int& old_c){
1319 | 	int c,size,start,ptr=0;
1320 | 	EWordType wordType;
1321 | 	unsigned char* s2;
1322 | 	do
1323 | 	{
1324 | 		start=ptr;
1325 | 		do
1326 | 		{
1327 | 			c=s[ptr++];
1328 | 			letterType=letterSet[c];
1329 | 		}
1330 | 		while (ptr<s_size && letterType==NUMBERCHAR);
1331 | 		
1332 | 		if (letterType!=NUMBERCHAR)
1333 | 		ptr--;
1334 | 		wordType=NUMBER;
1335 | 		encodeWord(s+start,ptr-start,wordType,old_c);
1336 | 		
1337 | 		if (ptr>=s_size)
1338 | 		break;
1339 | 		
1340 | 		start=ptr;
1341 | 		do
1342 | 		{
1343 | 			c=s[ptr++];
1344 | 			letterType=letterSet[c];
1345 | 		}
1346 | 		while (ptr<s_size && letterType!=NUMBERCHAR);
1347 | 		
1348 | 		if (letterType==NUMBERCHAR)
1349 | 		ptr--;
1350 | 		wordType=VARWORD;
1351 | 		s2=s+start;
1352 | 		size=ptr-start;
1353 | 		encodeAsText(s2,size,wordType);
1354 | 	}
1355 | 	while (ptr<s_size);
1356 | }*/
1357 | // encode word "s" using dictionary
1358 | void XWRT_Encoder::encodeWord(unsigned char* s,int s_size,EWordType wordType,int& c){
1359 | 	if (detect)	{
1360 | 		if (s_size>0 && wordType!=3){
1361 | 			//s[s_size+1]=0;
1362 | 			//printf("%s %d %d\n",s,s_size,wordType);
1363 | 			toLower(s,s_size);
1364 | 		}
1365 | 		if ((s_size<3 && s[0]>='0' && s[0]<='9') && wrtnum==1) return; //???
1366 | 	//	if (s_size==4 && ((s[0]=='1' && s[1]=='9') || (s[0]=='2' && s[1]=='0'))) return;
1367 | 		
1368 | 		checkWord(s,s_size,c);
1369 | 		return;
1370 | 	}
1371 | 	if (s_size<1){
1372 | 		encodeSpaces();
1373 | 		return;
1374 | 	}
1375 | 	int i=-1;
1376 | 	int size=0;
1377 | 	int flagToEncode=-1;
1378 | //	bool justAdded=false;
1379 | 	
1380 | 	if (s_size>=WORD_MIN_SIZE){
1381 | 		checkHashExactly(s,s_size,i);
1382 | 		PRINT_CODEWORDS(("i=%d/%d %s(%d)\n",i,sizeDynDict,s,s_size));
1383 | 		
1384 | 		if (i>=0)// && codeWordSize(i)<=s_size)
1385 | 		wordType=LOWERWORD;
1386 | 		
1387 | 		if (i<0){
1388 | 			if (wordType==FIRSTUPPER || wordType==UPPERWORD){
1389 | 				if (wordType==FIRSTUPPER){
1390 | 					flagToEncode=CHAR_FIRSTUPPER;
1391 | 					s[0]=tolower(s[0]);
1392 | 				}
1393 | 				else // wordType==UPPERWORD
1394 | 				{
1395 | 					flagToEncode=CHAR_UPPERWORD;
1396 | 					toLower(s,s_size);
1397 | 				}
1398 | 				checkHashExactly(s,s_size,i);
1399 | 				PRINT_CODEWORDS(("checkHashExactly i=%d %d=%s\n",i,s_size,s));
1400 | 			}
1401 | 			
1402 | 			
1403 | 			if (i<0 ){
1404 | 				// try to find shorter version of word in dictionary
1405 | 				i=findShorterWord(s,s_size);
1406 | 				PRINT_CODEWORDS(("findShorterWord i=%d\n",i));
1407 | 				//s[s_size+1]=0;
1408 | 				//if (i>0 ) printf("findShorterWord i=%d %s\n",i, s);
1409 | 				if (i>=0){
1410 | 					size=dictlen[i];
1411 | 					if (wordType==UPPERWORD){
1412 | 						int ss=s_size-size;
1413 | 						toUpper(s+size,ss);
1414 | 					}
1415 | 				}
1416 | 			}
1417 | 		}
1418 | 	}
1419 | 	if (i>=0){
1420 | 		encodeSpaces();
1421 | 		if (wordType==FIRSTUPPER || wordType==UPPERWORD){
1422 | 			ENCODE_PUTC(flagToEncode);
1423 | 		}
1424 | 		encodeCodeWord(i);
1425 | 		if (size>0){
1426 | 			if (wordType==FIRSTUPPER)
1427 | 			wordType=LOWERWORD;
1428 | 			unsigned char* s2=s+size;
1429 | 			int s_size2=s_size-size;
1430 | 			i=-1;
1431 | 			if (s_size2>(tryShorterBound+1)){  //try remainig word
1432 | 				// try to find shorter version of word in dictionary
1433 | 				i=findShorterWord(s2,s_size2);
1434 | 				PRINT_CODEWORDS(("findShorterWord i=%d\n",i));
1435 | 			}
1436 | 			if (i>=0 && wordType!=UPPERWORD){
1437 | 				size=dictlen[i];
1438 | 				//encodeSpaces();
1439 | 				encodeCodeWord(i);
1440 | 				s2=s2+size;
1441 | 				s_size2=s_size2-size;
1442 | 				if (s_size2>0) encodeAsText(s2,s_size2,wordType);
1443 | 			}
1444 | 			else encodeAsText(s2,s_size2,wordType);
1445 | 		}
1446 | 	}
1447 | 	else
1448 | 	{
1449 | 		if (wordType==FIRSTUPPER)
1450 | 		s[0]=toupper(s[0]);
1451 | 		else if (wordType==UPPERWORD)
1452 | 		toUpper(s,s_size);
1453 | 		encodeSpaces();
1454 | 		encodeAsText(s,s_size,wordType);
1455 | 	}
1456 | 	return;
1457 | }
1458 | // process the file
1459 | void XWRT_Encoder::WRT_encode(int filelen){
1460 | 	unsigned char s[STRING_MAX_SIZE];
1461 | 	EWordType wordType;
1462 | 	int c;
1463 | 	spaces=0;
1464 | 	s_size=0;
1465 | 	last_c=0;
1466 | 	filelento=-1;
1467 | 	wordType=LOWERWORD;
1468 | 	ENCODE_GETC(c);
1469 | 	while (true) 
1470 | 	{
1471 | 		if (filelento==filelen)
1472 | 		break;
1473 | 		PRINT_CHARS(("c=%c (%d) last=%c \n",c,c,last_c));
1474 | 		
1475 | 		if (detect){
1476 | 			letterType=letterSet[c];
1477 | 		}
1478 | 		else
1479 | 		{
1480 | 			if (c==13){
1481 | 				encodeWord(s,s_size,wordType,c);
1482 | 				s_size=0;
1483 | 				ENCODE_GETC(c);
1484 | 				if (addSymbols[13])
1485 | 				ENCODE_PUTC(CHAR_ESCAPE);
1486 | 				ENCODE_PUTC(13);
1487 | 				continue;
1488 | 			}
1489 | 			
1490 | 			letterType=letterSet[c];
1491 | 			
1492 | 			if (letterType==RESERVEDCHAR){
1493 | 				PRINT_CHARS(("reservedSet[c] c=%d (%c)\n",c,c));
1494 | 				
1495 | 				encodeWord(s,s_size,wordType,c);
1496 | 				s_size=0;
1497 | 				
1498 | 				PRINT_CHARS(("out CHAR_ESCAPE=%d\n",CHAR_ESCAPE));
1499 | 				ENCODE_PUTC(CHAR_ESCAPE);	
1500 | 				ENCODE_PUTC(c);
1501 | 				
1502 | 				ENCODE_GETC(c);
1503 | 				continue;
1504 | 			}
1505 | 			
1506 | 			
1507 | 			if (letterType==NUMBERCHAR && wrtnum==0){	
1508 | 				encodeWord(s,s_size,wordType,c);
1509 | 				s_size=0;
1510 | 				ENCODE_PUTC(c);
1511 | 				ENCODE_GETC(c);
1512 | 				//	wordType=LOWERWORD;
1513 | 				continue;
1514 | 			}		
1515 | 			
1516 | 		}
1517 | 		if (wordSet[c]){
1518 | 			if (c!=' '){
1519 | 				if (s_size==0){
1520 | 					if (last_c!=' ')
1521 | 					beforeWord=last_c;
1522 | 					else
1523 | 					beforeWord=last_last_c;
1524 | 					if (letterType==LOWERCHAR)
1525 | 					wordType=LOWERWORD;
1526 | 					else
1527 | 					if (letterType==UPPERCHAR)
1528 | 					wordType=FIRSTUPPER;
1529 | 					else
1530 | 					wordType=VARWORD;
1531 | 				}
1532 | 				else
1533 | 				{
1534 | 					switch (wordType)
1535 | 					{
1536 | 					case LOWERWORD:
1537 | 						if (letterType!=LOWERCHAR)
1538 | 						wordType=VARWORD;
1539 | 						break;
1540 | 					case UPPERWORD:
1541 | 						if (letterType!=UPPERCHAR)
1542 | 						wordType=VARWORD;
1543 | 						break;
1544 | 					case FIRSTUPPER:
1545 | 						if (letterType!=LOWERCHAR)
1546 | 						{
1547 | 							if (s_size==1 && letterType==UPPERCHAR)
1548 | 							wordType=UPPERWORD;
1549 | 							else
1550 | 							wordType=VARWORD;
1551 | 						}
1552 | 						break;
1553 | 					}
1554 | 				}
1555 | 			}
1556 | 			else
1557 | 			{
1558 | 				encodeWord(s,s_size,wordType,c);
1559 | 				s_size=0;
1560 | 				spaces++;
1561 | 				while (true){
1562 | 					ENCODE_GETC(c);
1563 | 					if (c!=' ')
1564 | 					break;
1565 | 					spaces++;
1566 | 				}
1567 | 				continue;
1568 | 			}
1569 | 			//detect words like and split. HiTerraMonda
1570 | 			if(s_size>2 && letterType==UPPERCHAR && letterSet[last_c]==LOWERCHAR){
1571 | 				if (s_size>2 && wordType==VARWORD){
1572 | 					if (letterSet[s[0]]==UPPERCHAR){
1573 | 						wordType=FIRSTUPPER;
1574 | 						for(int i=1;i<=s_size;i++){
1575 | 							if (letterSet[s[i]]==UPPERCHAR){
1576 | 								wordType=VARWORD;
1577 | 								break;
1578 | 							}
1579 | 						}
1580 | 					}
1581 | 				}
1582 | 				encodeWord(s,s_size,wordType,c);
1583 | 				//s[s_size++]=0;
1584 | 				//printf("%s %d %d\n",s,s_size,wordType);
1585 | 				s_size=0;
1586 | 				
1587 | 				continue;
1588 | 			}
1589 | 			s[s_size++]=c;
1590 | 			if (s_size>=STRING_MAX_SIZE-2){
1591 | 				encodeWord(s,s_size,wordType,c);
1592 | 				s_size=0;
1593 | 			}
1594 | 			ENCODE_GETC(c);
1595 | 			continue;
1596 | 		}
1597 | 		encodeWord(s,s_size,wordType,c);
1598 | 		s_size=0;
1599 | 		ENCODE_PUTC(c);
1600 | 		ENCODE_GETC(c);
1601 | 	}
1602 | 	encodeWord(s,s_size,wordType,c);
1603 | 	s_size=0;
1604 | }
// Return the number of leading positions at which offset1 and offset2 hold the
// same byte, examining at most "bound" positions. The comparison does not stop
// at a NUL byte; "bound" is the only limit. A bound <= 0 yields 0.
inline int common(const char* offset1,const char* offset2, int bound){
	int lp=0;
	// The bound is checked before the bytes are read, so no element at index
	// "bound" or beyond is ever accessed.
	while (lp<bound && offset1[lp]==offset2[lp])
		lp++;
	return lp;
}
1611 | void XWRT_Encoder::write_dict(){
1612 | 	int i,count=0;
1613 | 	unsigned char *bound=(unsigned char*)&word_hash[0]+HASH_TABLE_SIZE*sizeof(word_hash[0])-WORD_MAX_SIZE;
1614 | 	unsigned char *writeBuffer=(unsigned char*)&word_hash[0]; //putcBuffer;
1615 | 	unsigned char *bufferData=writeBuffer+3;
1616 | 	
1617 | 	unsigned char *count_header=bufferData;
1618 | 	bufferData+=3;
1619 | 	PRINT_DICT(("sortedDict.size()=%d\n",sortedDict.size()));
1620 | 	int cmn;
1621 | 	count=(int)sortedDict.size();
1622 | 	for (i=0; i<count; i++){
1623 | 		cmn=0;
1624 | 		//if (i>0)
1625 | 		//	cmn=common(sortedDict[i-1].c_str(),sortedDict[i].c_str(),min(sortedDict[i].size(),sortedDict[i-1].size()));
1626 | 		if ((cmn>0 || (unsigned char)(sortedDict[i][0])>=128))
1627 | 		bufferData+=sprintf((char*)bufferData,"%c%s\n",128+cmn,sortedDict[i].c_str()+cmn);
1628 | 		else
1629 | 		bufferData+=sprintf((char*)bufferData,"%s\n",sortedDict[i].c_str());
1630 | 		if (bufferData>bound)
1631 | 		break;
1632 | 	}
1633 | 	sortedDictSize=(int)i; // i<=count
1634 | 	PRINT_DICT(("sortedDictCount=%d\n",sortedDictSize));
1635 | 	count_header[0]=sortedDictSize%256;
1636 | 	count_header[1]=(sortedDictSize/256)%256;
1637 | 	count_header[2]=sortedDictSize/65536;
1638 | 	count=(int)(bufferData-(writeBuffer+3));
1639 | 	PRINT_DICT(("write_dict count=%d\n",count));
1640 | 	PUTC(count>>16);
1641 | 	PUTC(count>>8);
1642 | 	PUTC(count);
1643 | 	fwrite_fast((unsigned char*)writeBuffer+3,count,XWRT_fileout);
1644 | }
1645 | 
1646 | void XWRT_Encoder::WRT_start_encoding(FILE* in, FILE* out,unsigned int fileLen,bool type_detected){
1647 | 	collision=0;
1648 | 	XWRT_file=in;
1649 | 	XWRT_fileout=out;
1650 | 
1651 | 	fileLenMB=fileLen/(1024*1024);
1652 | 	if (fileLenMB>255*256)
1653 | 	fileLenMB=255*256;
1654 | 	
1655 | 	cont.prepareMemBuffers();
1656 | 	cont.memout->memsize=0;
1657 | 	//if (fileLenMB>64) minWordFreq-=4,tryShorterBound=2;
1658 | 	
1659 | 	/*	if (fileLenMB>64) minWordFreq-=16;
1660 | 	if (fileLenMB<16)
1661 | 		minWordFreq-=7;
1662 | 	//if (fileLenMB<6)
1663 | 	//	minWordFreq=minWordFreq+15;*/
1664 | if (fileLenMB<1)
1665 | 		minWordFreq=minWordFreq*6;
1666 | 	//if (fileLenMB<1)		minWordFreq=9,tryShorterBound=4;
1667 | 	//if (fileLen<256*1024)		minWordFreq=7,tryShorterBound=3;
1668 | 	int pos=ftell(XWRT_file);
1669 | 	if (!type_detected)
1670 | 	WRT_detectFileType(fileLen);
1671 | 
1672 | 	fseek(XWRT_file, pos, SEEK_SET );
1673 | 
1674 | 	PUTC(maxMemSize/(1024*1024));
1675 | 	PUTC(fileLen&0xFF);
1676 | 	PUTC((fileLen>>8)&0xFF);
1677 | 	PUTC((fileLen>>16)&0xFF);
1678 | 	PUTC((fileLen>>24)&0xFF);
1679 | 
1680 | 	PRINT_DICT(("maxMemSize=%d fileLenMB=%d\n",maxMemSize,fileLenMB));
1681 | 	write_dict(); // przed initialize()
1682 | 	decoding=false;
1683 | 	WRT_deinitialize();
1684 | 	if (!initialize(true))
1685 | 	return;
1686 | 	WRT_encode(fileLen);
1687 | 	cont.writeMemBuffers(preprocFlag);
1688 | 	cont.freeMemBuffers(true);
1689 | }
1690 | 
1691 | inline void XWRT_Encoder::checkWord(unsigned char* &s,int &s_size,int& c){
1692 | 	if (s_size<1){
1693 | 		spaces=0;
1694 | 		return;
1695 | 	}
1696 | 	if (s_size>WORD_MAX_SIZE)
1697 | 	s_size=WORD_MAX_SIZE; 
1698 | 	
1699 | 	spaces=0;
1700 | 	if (s_size<WORD_MIN_SIZE){
1701 | 		spaces=0;
1702 | 		return;
1703 | 	} 
1704 | 	int i;
1705 | 	checkHashExactly(s,s_size,i);
1706 | 	if (i<0){
1707 | 		if (dynmem>dictbound){
1708 | 			if (firstWarn){
1709 | 				//printf("warning: dictionary too big\n"); //-b option
1710 | 				firstWarn=false;
1711 | 			}
1712 | 			return;
1713 | 		}
1714 | 		memcpy(dynmem,s,s_size);
1715 | 		if (addWord(dynmem,s_size)==1){
1716 | 			dynmem+=(s_size/4+1)*4;
1717 | 			dictfreq[sizeDict-1]=1;
1718 | 		}
1719 | 	}
1720 | 	else
1721 | 	{
1722 | 		dictfreq[i]++;
1723 | 	}
1724 | }
1725 | int XWRT_Encoder::WRT_detectFileType(int filelen){
1726 | 	detect=true;
1727 | 	//memset(value,0,sizeof(value));
1728 | 	memset(addSymbols,0,sizeof(addSymbols));
1729 | 	memset(reservedSet,0,sizeof(reservedSet));
1730 | 	memset(spacesCont,0,sizeof(spacesCont));
1731 | 	spaces=0;
1732 | 	firstWarn=true;
1733 | 	sizeDict=1;
1734 | 	PRINT_DICT(("maxDynDictBuf=%d maxMemSize=%d\n",maxDynDictBuf,maxMemSize));
1735 | 	dictionary=maxDynDictBuf*(MAX_DYNAMIC_DICT_COUNT/256);  // 512k, dblp=372k
1736 | 	dictmem=(unsigned char*)calloc(dictionary*WORD_AVG_SIZE,1);
1737 | 	dictbound=dictmem+dictionary*WORD_AVG_SIZE-WORD_MAX_SIZE;
1738 | 	dict=(unsigned char**)calloc(sizeof(unsigned char*)*(dictionary+1),1);
1739 | 	dictlen=(unsigned char*)calloc(sizeof(unsigned char)*(dictionary+1),1);
1740 | 	dictfreq=(int*)calloc(sizeof(int)*(dictionary+1),1);
1741 | 	memset(&word_hash[0],0,HASH_TABLE_SIZE*sizeof(word_hash[0]));
1742 | 	dynmem=dictmem;
1743 | 	PRINT_DICT(("maxDict=%d allocatedMemory=%d hashTable=%d\n",dictionary,dictionary*WORD_AVG_SIZE+sizeof(unsigned char*)*(dictionary+1)+sizeof(unsigned char)*(dictionary+1)+sizeof(int)*(dictionary+1),HASH_TABLE_SIZE*sizeof(word_hash[0])));
1744 | 	if (dictmem && dict && dictlen && dictfreq){
1745 | 		initializeLetterSet();
1746 | 		WRT_encode(filelen);
1747 | 		WRT_detectFinish();
1748 | 	}
1749 | 	WRT_deinitialize();
1750 | 	if (collision>0)
1751 | 	PRINT_DICT(("warning: hash collisions=%d\n",collision));
1752 | 	detect=false;
1753 | 	return preprocFlag;
1754 | }
1755 | int compare_str( const void *arg1, const void *arg2 ){
1756 | 	int a=*(int*)arg1;
1757 | 	int b=*(int*)arg2;
1758 | 	return strcmp((char*)dict[a],(char*)dict[b]);
1759 | }
1760 | int compare_str_rev( const void *arg1, const void *arg2 ){
1761 | 	int a=*(int*)arg1;
1762 | 	int b=*(int*)arg2;
1763 | 	int minv=min(dictlen[a],dictlen[b]);
1764 | 	for (int i=1; i<=minv; i++){
1765 | 		if (dict[a][dictlen[a]-i]!=dict[b][dictlen[b]-i])
1766 | 		return dict[a][dictlen[a]-i] - dict[b][dictlen[b]-i];
1767 | 	}
1768 | 	return dictlen[a] - dictlen[b];
1769 | }
1770 | int compare_freq( const void *arg1, const void *arg2 ){
1771 | 	int a=*(int*)arg1;
1772 | 	int b=*(int*)arg2;
1773 | 	return dictfreq[b]-dictfreq[a];
1774 | }
1775 | void XWRT_Encoder::sortDict(int size){
1776 | 	int i,add;
1777 | 	size--;
1778 | 	if (size<20)
1779 | 	return;
1780 | 	initializeCodeWords(size,false);
1781 | 	add=0;
1782 | 	dict1size-=add;
1783 | 	bound3-=add;
1784 | 	bound4-=add;
1785 | 	int* inttable=new int[size];
1786 | 	if (!inttable)
1787 | 	OUT_OF_MEMORY();
1788 | 	for (i=0; i<size; i++)
1789 | 	inttable[i]=i+1;
1790 | 	qsort(&inttable[0],size,sizeof(inttable[0]),compare_freq);
1791 | 	qsort(&inttable[0],min(size,dict1size),sizeof(inttable[0]),compare_str); //compare_str
1792 | 	
1793 | 	if (size>dict1size)
1794 | 	qsort(&inttable[dict1size],min(size,bound3)-dict1size,sizeof(inttable[0]),compare_str);//compare_str
1795 | 	if (size>bound3)
1796 | 	qsort(&inttable[bound3],min(size,bound4)-bound3,sizeof(inttable[0]),compare_str);//compare_str
1797 | 	if (size>bound4)
1798 | 	qsort(&inttable[bound4],size-bound4,sizeof(inttable[0]),compare_str);//compare_str
1799 | 	
1800 | 	for (i=0; i<size; i++){
1801 | 		std::string str=(char*)dict[inttable[i]];
1802 | 		sortedDict.push_back(str);
1803 | 	}
1804 | 	delete(inttable);
1805 | }
1806 | void XWRT_Encoder::WRT_detectFinish(){	
1807 | 	int i,j;
1808 | 	PRINT_DICT(("%d words ",sizeDict-1));
1809 | 	sortedDict.clear();
1810 | 	int num;
1811 | 	int minWordFreq2;
1812 | 	if (minWordFreq<6)
1813 | 	minWordFreq2=minWordFreq;
1814 | 	else
1815 | 	minWordFreq2=minWordFreq-2;
1816 | 	for (i=1; i<sizeDict-1; i++){
1817 | 		num=dictfreq[i];
1818 | 		if (num>=minWordFreq || (num>=minWordFreq2 && (dictlen[i]>=7))) 
1819 | 		;
1820 | 		else
1821 | 		dictfreq[i]=0;
1822 | 	}
1823 | 	for (i=1, j=sizeDict-2; i<j; i++){
1824 | 		if (dictfreq[i]>0)
1825 | 		continue;
1826 | 		while (j>0 && dictfreq[j]==0) j--;
1827 | 		if (i>j)
1828 | 		break;
1829 | 		dict[i]=dict[j];
1830 | 		dictlen[i]=dictlen[j];
1831 | 		dictfreq[i]=dictfreq[j];
1832 | 		dictfreq[j--]=0;
1833 | 	}
1834 | 	sizeDict=i;
1835 | 	if (sizeDict>maxDictSize)
1836 | 	sizeDict=maxDictSize;
1837 | 	PRINT_DICT(("reduced to %d words (freq>=%d)\n",sizeDict,minWordFreq));
1838 | 	sortDict(sizeDict);
1839 | }
1840 | 


--------------------------------------------------------------------------------