├── README.md └── compress.py /README.md: -------------------------------------------------------------------------------- 1 | ascii-zip 2 | ========= 3 | 4 | A deflate compressor that emits compressed data that is in the [A-Za-z0-9] ASCII byte range. 5 | 6 | Example 7 | ======= 8 | 9 | ```bash 10 | $ echo 'Hello ASCII world!' >hello 11 | $ ./compress.py --mode raw --output ./hello.infalted ./hello >/dev/null 12 | $ cat ./hello.infalted 13 | D0Up0IZUnnnnnnnnnnnnnnnnnnnUU5nnnnnn3SUUnUUUwCiudIbEAtwwwEt333 14 | G0GDGGDtGptw0GwDDDGtDGDt33333www03333sDdFPsgWWwackSKKaowOWGQ4 15 | ``` 16 | 17 | Why? 18 | ==== 19 | 20 | It can be used for bypassing certain filters in certain applications :) 21 | 22 | This algorithm is at the heart of the Rosetta Flash vulnerability. For additional information, see 23 | 24 | * [JSONP & handcrafted Flash files](http://quaxio.com/jsonp_handcrafted_flash_files/) 25 | * [Handcrafting ASCII Flash Files for Fun and Profit](https://molnarg.github.io/ascii-flash/) 26 | * [Bypassing Same Origin Policy With JSONP APIs and Flash](https://hackerone.com/reports/10373) 27 | * [Abusing JSONP with Rosetta Flash](http://miki.it/blog/2014/7/8/abusing-jsonp-with-rosetta-flash/). 28 | -------------------------------------------------------------------------------- /compress.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import binascii 4 | import zlib 5 | import struct 6 | import sys 7 | import argparse 8 | 9 | 10 | def binary(n, length, reverse=False): 11 | bits = ''.join(str(1 & (n >> i)) for i in range(length)[::-1]) 12 | return bits if reverse else bits[::-1] 13 | 14 | 15 | class WritableBitStream(object): 16 | def __init__(self): 17 | self.bits = '' 18 | 19 | def write(self, value, length=None, reverse=False): 20 | bits = value if (length is None) else binary(value, length, reverse) 21 | self.bits += bits 22 | 23 | def data(self): 24 | data = '' 25 | for cursor in range(0, len(self.bits), 8): 26 | bits = self.bits[cursor:cursor+8][::-1] 27 | #bits += '0' * (8 - len(bits)) 28 | data += chr(int(bits, 2)) 29 | return data 30 | 31 | 32 | class ASCIICompressor(object): 33 | def __init__(self, allowed): 34 | self.allowed = map(lambda x: binary(ord(x), 8), allowed) 35 | self.stream = WritableBitStream() 36 | # self._test() 37 | self.block_count = 0 38 | # self._padding_block() 39 | 40 | def _test(self): 41 | decompressor = zlib.decompressobj() 42 | decompressor.decompress('\x08' + chr(31 - (0x08*256) % 31)) 43 | print 'self test:',\ 44 | repr(decompressor.decompress(self.stream.data())) 45 | # print 'self test flush:', \ 46 | repr(decompressor.flush()) 47 | 48 | def compress(self, uncompressed_data): 49 | data = uncompressed_data 50 | previous_block_type = 2 51 | while len(data) > 0: 52 | block_type = 2 53 | cursor = 1 54 | 55 | # Choose the longest possible chunk for the type 2 encoder 56 | distinct_bytes = {ord(data[0])} 57 | while (cursor < len(data) and 58 | len(distinct_bytes) <= 50 and 59 | max(distinct_bytes) < 216): 60 | distinct_bytes.add(ord(data[cursor])) 61 | cursor += 1 62 | if cursor != len(data): 63 | cursor -= 1 64 | 65 | # Reduce the chunk until the type 2 encoder can actually encode it 66 | while cursor > 0: 67 | huffman = self._generate_huffman_2(data[:cursor]) 68 | if huffman is None: 69 | cursor -= 1 70 | else: 71 | break 72 | 73 | # If the type 1 encoder does better, then use that 74 | if cursor == 0: 75 | cursor += 1 76 | while cursor <= len(data): 77 | new_huffman = self._generate_huffman(data[:cursor]) 78 | if new_huffman is None: 79 | break 80 | else: 81 | huffman = new_huffman 82 | block_type = 1 83 | cursor += 1 84 | if block_type == 1: 85 | cursor -= 1 86 | 87 | # Do the actual encoding with the calculated huffman 88 | chunk = data[:cursor] 89 | data = data[cursor:] 90 | self.block_count += 1 91 | print 'compress', self.block_count, repr(chunk), huffman 92 | if previous_block_type == 2: 93 | self._padding_block() 94 | (self._compress_chunk if block_type == 1 else self._compress_chunk_2)( 95 | chunk, 96 | huffman[0], 97 | huffman[1], 98 | (len(data) == 0) 99 | ) 100 | previous_block_type = block_type 101 | 102 | print 'size:', len(self.stream.data()) 103 | 104 | return self.stream.data(), uncompressed_data 105 | 106 | def _generate_huffman(self, data): 107 | # print '_generate_huffman', repr(data) 108 | first_valid_8bit_code = 0b00011100 109 | valid_codes = sorted(map(lambda c: int(c, 2), self.allowed)) 110 | valid_codes = filter(lambda c: c >= first_valid_8bit_code, valid_codes) 111 | 112 | distinct_bytes = sorted(map(ord, list(set(data)))) 113 | 114 | def assign_codes(symbols, codes, valid): 115 | #print symbols, codes 116 | if len(symbols) == len(codes): 117 | return codes 118 | prev_code = codes[-1] 119 | prev_symbol = symbols[len(codes)-1] 120 | symbol = symbols[len(codes)] 121 | 122 | max_code = prev_code + (symbol - prev_symbol) 123 | reachable_codes = filter(lambda c: c <= max_code, valid) 124 | for chosen_code in reachable_codes[::-1]: 125 | assigned_codes = assign_codes( 126 | symbols, 127 | codes + [chosen_code], 128 | filter(lambda c: c > chosen_code, valid) 129 | ) 130 | if assigned_codes: 131 | return assigned_codes 132 | 133 | assigned_codes = assign_codes( 134 | [-1] + distinct_bytes, 135 | [first_valid_8bit_code - 1], 136 | valid_codes 137 | ) 138 | if not assigned_codes: 139 | return None 140 | assigned_codes = assigned_codes[1:] 141 | symbols = dict(zip(distinct_bytes, assigned_codes)) 142 | symbols[256] = 0b000011 143 | 144 | needed_6 = 3 # plus the end of block symbol and 3 after that 145 | needed_8 = assigned_codes[0] - first_valid_8bit_code 146 | code_lengths = [] 147 | while len(code_lengths) < 257 or needed_6 or needed_8: 148 | if len(distinct_bytes) > 0 and len(code_lengths) == distinct_bytes[0]: 149 | assert needed_8 == 0 150 | code_lengths.append(8) 151 | this_code = assigned_codes.pop(0) 152 | distinct_bytes.pop(0) 153 | if len(assigned_codes) > 0: 154 | needed_8 = assigned_codes[0] - this_code - 1 155 | else: 156 | needed_8 = 228 - code_lengths.count(8) 157 | elif len(code_lengths) == 256: 158 | if needed_6 > 0: 159 | return None 160 | else: 161 | code_lengths.append(6) 162 | needed_6 = 3 163 | elif needed_8 > 0: 164 | code_lengths.append(8) 165 | needed_8 -= 1 166 | elif needed_6 > 0: 167 | code_lengths.append(6) 168 | needed_6 -= 1 169 | else: 170 | code_lengths.append(0) 171 | 172 | assert ((pow(2, 6) - code_lengths.count(6))*4 - code_lengths.count(8)) == 0 173 | return code_lengths, symbols 174 | 175 | def _generate_huffman_2(self, data): 176 | print '_generate_huffman_2', repr(data) 177 | valid_codes = filter( 178 | lambda c: (binary(c & 0b00111111, 6, True) + '10') in self.allowed, 179 | range(0b10000000, 0b11000000) 180 | ) 181 | first_valid_8bit_code = 0b10000100 182 | valid_codes = filter(lambda c: c >= first_valid_8bit_code, valid_codes) 183 | # print self.allowed, len(self.allowed), valid_codes, len(valid_codes) 184 | # for c in valid_codes: 185 | # self.stream.write(c, 8, reverse=True) 186 | # print self.stream.data() 187 | 188 | # print valid_codes 189 | 190 | distinct_bytes = sorted(map(ord, list(set(data)))) 191 | # print 'distinct bytes:', len(distinct_bytes), distinct_bytes 192 | 193 | def assign_codes(symbols, codes, valid): 194 | # print symbols, codes 195 | if len(symbols) == len(codes): 196 | return codes 197 | prev_code = codes[-1] 198 | prev_symbol = symbols[len(codes)-1] 199 | symbol = symbols[len(codes)] 200 | 201 | max_code = min( 202 | prev_code + (symbol - prev_symbol), # max possible code 203 | valid[-(len(symbols) - len(codes))] # leave space for others 204 | ) 205 | reachable_codes = filter(lambda c: c <= max_code, valid) 206 | if symbol == ord(data[-1]): 207 | # The last char's code must be OK with 00 in the most 208 | # significant bits, since 00 is the end of block marker's code 209 | reachable_codes = filter( 210 | lambda c: (binary(c, 8, True)[2:] + '00') in self.allowed, 211 | reachable_codes 212 | ) 213 | for chosen_code in reachable_codes[::-1]: 214 | assigned_codes = assign_codes( 215 | symbols, 216 | codes + [chosen_code], 217 | filter(lambda c: c > chosen_code, valid) 218 | ) 219 | if assigned_codes: 220 | return assigned_codes 221 | 222 | assigned_codes = assign_codes( 223 | [-1] + distinct_bytes, 224 | [first_valid_8bit_code - 1], 225 | valid_codes 226 | ) 227 | if not assigned_codes: 228 | return None 229 | assigned_codes = assigned_codes[1:] 230 | symbols = dict(zip(distinct_bytes, assigned_codes)) 231 | symbols[256] = 0 232 | 233 | # A 257 legyen 2-es (00 kod) 234 | # Kell 1db 2-es a 257 utan (01 kod) 235 | # Kell 1db 6-s vhol (100001 kod) 236 | # Az utolso szimbolum vegzodjon 011-re (hogy utana lehessen 00 kod) 237 | needed_2 = 0 238 | needed_6 = 1 239 | needed_8 = assigned_codes[0] - first_valid_8bit_code 240 | code_lengths = [] 241 | while len(code_lengths) < 257 or needed_2 or needed_6 or needed_8: 242 | if len(distinct_bytes) > 0 and len(code_lengths) == distinct_bytes[0]: 243 | assert needed_8 == 0 244 | code_lengths.append(8) 245 | this_code = assigned_codes.pop(0) 246 | distinct_bytes.pop(0) 247 | if len(assigned_codes) > 0: 248 | needed_8 = assigned_codes[0] - this_code - 1 249 | else: 250 | # 256 - (covered by 2s) - (covered by 6s) - (covered by 8s) 251 | needed_8 = 256 - 64*2 - 4 - code_lengths.count(8) 252 | elif len(code_lengths) == 256: 253 | code_lengths.append(2) 254 | needed_2 = 1 255 | elif needed_8 > 0: 256 | code_lengths.append(8) 257 | needed_8 -= 1 258 | elif needed_6 > 0: 259 | code_lengths.append(6) 260 | needed_6 -= 1 261 | elif needed_2 > 0: 262 | code_lengths.append(2) 263 | needed_2 -= 1 264 | else: 265 | code_lengths.append(0) 266 | 267 | extra_codelengths = 257 - len(code_lengths) 268 | if 13 <= extra_codelengths <= 15 or extra_codelengths > 28: 269 | # HLIT would be invalid 270 | return None 271 | 272 | assert sum(map(lambda l: l and pow(2, 8-l), code_lengths)) == 256 273 | # sys.exit() 274 | return code_lengths, symbols 275 | 276 | def _padding_block(self): 277 | """Makes the next block start at (byte boundary - 2 bits)""" 278 | 279 | # Header 280 | self.stream.write(0, 1) # Not last block 281 | self.stream.write(2, 2) # Dynamic Huffman 282 | self.stream.write(8, 5) # HLIT = 8 283 | self.stream.write(16, 5) # HDIST = 16 284 | self.stream.write(9, 4) # HCLEN = 9 285 | 286 | # Lengths Huffman table definition 287 | self.stream.write(2, 3) # 16 length = 2 288 | self.stream.write(5, 3) # 17 length = 5 289 | self.stream.write(0, 3) # 18 length = 0 290 | self.stream.write(4, 3) # 0 length = 4 291 | self.stream.write(3, 3) # 8 length = 3 292 | self.stream.write(0, 3) # 7 length = 0 293 | self.stream.write(6, 3) # 9 length = 6 294 | self.stream.write(4, 3) # 6 length = 4 295 | self.stream.write(4, 3) # 10 length = 4 296 | self.stream.write(4, 3) # 5 length = 4 297 | self.stream.write(4, 3) # 11 length = 4 298 | self.stream.write(6, 3) # 4 length = 6 299 | self.stream.write(2, 3) # 12 length = 2 300 | 301 | # Liternal+length Huffman table definition 302 | def repeat(code, n): 303 | first = True 304 | while n > 0: 305 | # print n, len(self.stream.bits) % 8 306 | if n > 6 and not first and len(self.stream.bits) % 8 == 0: 307 | x = min(n, 10) 308 | self.stream.write('01', reverse=True) # Huffman 16 309 | self.stream.write(x-7, 2) # Repeat 3-6x 310 | self.stream.write('01', reverse=True) # Huffman 16 311 | self.stream.write(1, 2) # Repeat 4x 312 | n -= x 313 | else: 314 | self.stream.write(code, reverse=True) 315 | n -= 1 316 | first = False 317 | repeat('1010', 197) 318 | repeat('1100', 261 - 197) 319 | repeat('1010', 265 - 261) # TODO: Kell ez? 320 | 321 | # Distance Huffman table definition 322 | repeat('1010', 17) 323 | 324 | # Data 325 | self.stream.write('111011', reverse=True) # End of Block 326 | 327 | overhead = 0 328 | 329 | def _compress_chunk(self, chunk, code_lengths, symbols, last): 330 | l = len(self.stream.bits) 331 | 332 | # Header 333 | self.stream.write(last, 1) # Is it the last block? 334 | self.stream.write(2, 2) # Dynamic Huffman 335 | self.stream.write(len(code_lengths)-257, 5) # HLIT 336 | self.stream.write(25, 5) # HDIST = 25 337 | self.stream.write(9, 4) # HCLEN = 9 338 | 339 | # Lengths Huffman table definition 340 | self.stream.write(2, 3) # 16 length = 2 341 | self.stream.write(4, 3) # 17 length = 4 342 | self.stream.write(3, 3) # 18 length = 3 343 | self.stream.write(4, 3) # 0 length = 4 344 | self.stream.write(4, 3) # 8 length = 4 345 | self.stream.write(5, 3) # 7 length = 5 346 | self.stream.write(4, 3) # 9 length = 4 347 | self.stream.write(4, 3) # 6 length = 4 348 | self.stream.write(4, 3) # 10 length = 4 349 | self.stream.write(0, 3) # 5 length = 0 350 | self.stream.write(3, 3) # 11 length = 3 351 | self.stream.write(5, 3) # 4 length = 5 352 | self.stream.write(4, 3) # 12 length = 4 353 | 354 | # Liternal+length Huffman table definition 355 | def repeat(code, n): 356 | first = True 357 | while n > 0: 358 | if n > 6 and not first and len(self.stream.bits) % 8 == 2: 359 | x = n/6 360 | for i in range(x): 361 | self.stream.write('00', reverse=True) # Huffman 16 362 | self.stream.write(3, 2) # Repeat previous 6x 363 | n -= x*6 364 | else: 365 | self.stream.write(code, reverse=True) 366 | n -= 1 367 | first = False 368 | runs = [] 369 | for code_length in code_lengths: 370 | if len(runs) > 0 and runs[-1][0] == code_length: 371 | runs[-1][1] += 1 372 | else: 373 | runs.append([code_length, 1]) 374 | code_values = { 375 | 0: '1000', 376 | 6: '1001', 377 | 8: '1010' 378 | } 379 | for run in runs: 380 | repeat(code_values[run[0]], run[1]) 381 | # print runs 382 | 383 | # Distance Huffman table definition 384 | if len(self.stream.bits) % 8 == 2: 385 | self.stream.write('011', reverse=True) # Huffman 18 386 | self.stream.write(11, 7) # Repeat zero (11+11)x 387 | self.stream.write('00', reverse=True) # Huffman 16 388 | self.stream.write(1, 2) # Repeat previous 4x 389 | else: 390 | self.stream.write('1000', reverse=True) # Huffman 0 391 | self.stream.write('011', reverse=True) # Huffman 18 392 | self.stream.write(10, 7) # Repeat zero (11+10)x 393 | self.stream.write('00', reverse=True) # Huffman 16 394 | self.stream.write(1, 2) # Repeat previous 4x 395 | 396 | # Data 397 | for byte in chunk: 398 | symbol = symbols[ord(byte)] 399 | # print byte, symbol 400 | self.stream.write(symbol, 8, reverse=True) 401 | self.stream.write(symbols[256], 6, reverse=True) 402 | 403 | overhead = (len(self.stream.bits) - l) / 8 404 | self.overhead += overhead 405 | # print overhead, float(overhead) / len(chunk) 406 | 407 | def _compress_chunk_2(self, chunk, code_lengths, symbols, last): 408 | # Header 409 | self.stream.write(last, 1) # Is it the last block? 410 | self.stream.write(2, 2) # Dynamic Huffman 411 | self.stream.write(len(code_lengths)-257, 5) # HLIT 412 | self.stream.write(5, 5) # HDIST = 5 413 | self.stream.write(13, 4) # HCLEN = 13 414 | 415 | # Lengths Huffman table definition 416 | self.stream.write(2, 3) # 16 length = 2 417 | self.stream.write(5, 3) # 17 length = 5 418 | self.stream.write(3, 3) # 18 length = 3 419 | self.stream.write(4, 3) # 0 length = 4 420 | self.stream.write(4, 3) # 8 length = 4 421 | self.stream.write(5, 3) # 7 length = 5 422 | self.stream.write(4, 3) # 9 length = 4 423 | self.stream.write(4, 3) # 6 length = 4 424 | self.stream.write(4, 3) # 10 length = 4 425 | self.stream.write(0, 3) # 5 length = 0 426 | self.stream.write(3, 3) # 11 length = 3 427 | self.stream.write(5, 3) # 4 length = 5 428 | self.stream.write(0, 3) # 12 length = 0 429 | self.stream.write(5, 3) # 3 length = 5 430 | self.stream.write(0, 3) # 13 length = 0 431 | self.stream.write(4, 3) # 2 length = 4 432 | self.stream.write(0, 3) # 14 length = 0 433 | 434 | # Liternal+length Huffman table definition 435 | def repeat(code, n): 436 | first = True 437 | while n > 0: 438 | if n > 6 and not first and len(self.stream.bits) % 8 == 2: 439 | x = n/6 440 | for i in range(x): 441 | self.stream.write('00', reverse=True) # Huffman 16 442 | self.stream.write(3, 2) # Repeat previous 6x 443 | n -= x*6 444 | else: 445 | self.stream.write(code, reverse=True) 446 | n -= 1 447 | first = False 448 | runs = [] 449 | for code_length in code_lengths: 450 | if len(runs) > 0 and runs[-1][0] == code_length: 451 | runs[-1][1] += 1 452 | else: 453 | runs.append([code_length, 1]) 454 | code_values = { 455 | 0: '1000', 456 | 2: '1001', 457 | 6: '1010', 458 | 8: '1011', 459 | } 460 | for run in runs: 461 | repeat(code_values[run[0]], run[1]) 462 | 463 | # Distance Huffman table definition 464 | if len(self.stream.bits) % 8 == 2: 465 | self.stream.write('1000', reverse=True) # Huffman 0 466 | self.stream.write('1000', reverse=True) # Huffman 0 467 | self.stream.write('00', reverse=True) # Huffman 16 468 | self.stream.write(1, 2) # Repeat previous 4x 469 | else: 470 | self.stream.write('1001', reverse=True) # Huffman 2 471 | self.stream.write('00', reverse=True) # Huffman 16 472 | self.stream.write(0, 2) # Repeat previous 3x 473 | self.stream.write('1000', reverse=True) # Huffman 0 474 | self.stream.write('1000', reverse=True) # Huffman 0 475 | 476 | # Data 477 | for byte in chunk: 478 | symbol = symbols[ord(byte)] 479 | self.stream.write(symbol, 8, reverse=True) 480 | self.stream.write(symbols[256], 2, reverse=True) 481 | 482 | 483 | parser = argparse.ArgumentParser() 484 | parser.add_argument('filename', 485 | type=str, 486 | help='the file to compress') 487 | parser.add_argument('--mode', 488 | type=str, 489 | choices=['raw', 'gzip', 'zlib', 'swf'], 490 | help='format of the output data') 491 | parser.add_argument('--output', 492 | type=str, 493 | default=None, 494 | help='output file (the default is the standard output)') 495 | args = parser.parse_args() 496 | 497 | data = open(args.filename).read() 498 | compressor = ASCIICompressor( 499 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789' 500 | # 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 501 | # map(chr, range(1, 128)) 502 | # map(chr, range(32,127)) 503 | ) 504 | 505 | output = open(args.output, 'wb') if args.output else sys.stdout 506 | 507 | 508 | def wrap_gzip(compressed): 509 | return ( 510 | binascii.unhexlify( 511 | '1f8b' + # Magic 512 | '08' + # Compression Method 513 | '00' + # Flags 514 | '00000000' + # MTime 515 | '00' + # Extra flags 516 | '99' # OS 517 | ) + 518 | compressed + 519 | struct.pack('