├── LICENSE ├── Makefile ├── README.md ├── example.c ├── fuzzing.c ├── smaz2.c ├── smaz2.h ├── smaz2.py └── words_256.txt /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Salvatore Sanfilippo. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: smaz2 fuzzing 2 | 3 | smaz2: smaz2.c example.c 4 | $(CC) -O2 -Wall -W --pedantic smaz2.c example.c -o smaz2 5 | 6 | fuzzing: smaz2.c fuzzing.c 7 | $(CC) -O2 -Wall -W --pedantic smaz2.c fuzzing.c -o fuzzing 8 | 9 | clean: 10 | rm -f smaz2 fuzzing 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SMAZ2 small messages compression algorithm 2 | 3 | *Note: this library is no longer compatible with the old version, Smaz (still available in the old repository). It was redesigned to be much more resistant to non compressible input (it rarely enlarges the input text, almost never actually). It also compresses better than before.* 4 | 5 | ## Motivations 6 | 7 | LoRa networks have an extremely limited bandwidth and each message 8 | requires a long channel time in order to be transmitted (often in the order 9 | of *seconds!*). When LoRa is used to send messages between humans, a form 10 | of compression improves the channel utilization in a significant way. 11 | 12 | This compression scheme is designed to compress small messages in extremely 13 | memory constrained devices, like ESP32 devices running MicroPython. 14 | The basic idea is to use pre-computed bigram and word tables to encode 15 | short messages more efficiently, for a total RAM usage of less than 16 | 2kbytes. 17 | 18 | The words table is composed of 256 words. Short words (len less than 4 bytes) 19 | are not present because they are better encoded with bigrams. 
20 | This is the full list of the 256 words: 21 | 22 | "that", "this", "with", "from", "your", "have", "more", "will", 23 | "home", "about", "page", "search", "free", "other", "information", "time", 24 | "they", "site", "what", "which", "their", "news", "there", "only", 25 | "when", "contact", "here", "business", "also", "help", "view", "online", 26 | "first", "been", "would", "were", "services", "some", "these", "click", 27 | "like", "service", "than", "find", "price", "date", "back", "people", 28 | "list", "name", "just", "over", "state", "year", "into", "email", 29 | "health", "world", "next", "used", "work", "last", "most", "products", 30 | "music", "data", "make", "them", "should", "product", "system", "post", 31 | "city", "policy", "number", "such", "please", "available", "copyright", 32 | "support", "message", "after", "best", "software", "then", "good", "video", 33 | "well", "where", "info", "rights", "public", "books", "high", "school", 34 | "through", "each", "links", "review", "years", "order", "very", "privacy", 35 | "book", "items", "company", "read", "group", "need", "many", "user", 36 | "said", "does", "under", "general", "research", "university", "january", "mail", 37 | "full", "reviews", "program", "life", "know", "games", "days", "management", 38 | "part", "could", "great", "united", "hotel", "real", "item", "international", 39 | "center", "ebay", "must", "store", "travel", "comments", "made", "development", 40 | "report", "member", "details", "line", "terms", "before", "hotels", "send", 41 | "right", "type", "because", "local", "those", "using", "results", "office", 42 | "education", "national", "design", "take", "posted", "internet", "address", 43 | "community", "within", "states", "area", "want", "phone", "shipping", 44 | "reserved", "subject", "between", "forum", "family", "long", "based", "code", 45 | "show", "even", "black", "check", "special", "prices", "website", "index", 46 | "being", "women", "much", "sign", "file", "link", "open", 
"today", "technology", 47 | "south", "case", "project", "same", "pages", "version", "section", "found", 48 | "sports", "house", "related", "security", "both", "county", "american", "photo", 49 | "game", "members", "power", "while", "care", "network", "down", "computer", 50 | "systems", "three", "total", "place", "following", "download", "without", 51 | "access", "think", "north", "resources", "current", "posts", "media", "control", 52 | "water", "history", "pictures", "size", "personal", "since", "including", 53 | "guide", "shop", "directory", "board", "location", "change", "white", "text", 54 | "small", "rating", "rate", "government" 55 | 56 | If a word match is not found, the bigram table is used. The table is composed of the most common 128 bigrams by frequency, for a total of 256 bytes: 57 | 58 | *"intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty"* 59 | 60 | When not even a matching bigram is found, bytes with value 0 or in the range 61 | from 9 to 127 can be encoded with a single byte (this happens for instance for 62 | all the ASCII uppercase letters, symbols, numbers...). The byte value can be 63 | left as it is. 64 | 65 | For bytes in the range from 1 to 8 and from 128 to 255, an escape sequence 66 | is generated and from 1 to 5 verbatim bytes are emitted. Bytes with values 67 | 6, 7, 8 are used as special escapes to emit a word from the table. The 68 | value of 6 is used. 69 | 70 | ## Encoding 71 | 72 | So this is how the encoding works: 73 | 74 | * A byte with value from 128 to 255 encodes a bigram with ID from 0 to 127. 75 | * A byte with value 0 or from 9 to 127 is just what it is. 76 | * A byte with value of 6 is followed by a byte representing the word ID to emit. 77 | * A byte with value 7 is like 6, but after the word a space is emitted. 
78 | * A byte with value 8 is like 6, but before the word a space is emitted. 79 | * A byte with a value from 1 to 5 means that from 1 to 5 verbatim bytes follow. 80 | 81 | This means that this compression scheme will use more space than the input 82 | string only when emitting verbatim bytes, that is when the string contains 83 | special or unicode characters that happen to have bytes with values between 1 and 8 or >= 128: this is a very rare condition in practice (at least in English), and even when it happens it is almost always compensated by the other words. 84 | 85 | As long as the messages are latin letters natural language messages with common statistical properties, the program will never use more space than needed and will often be able to compress words to fewer bytes. However, programs using this scheme are likely to have a one bit flag in the header in order to signal if the message is compressed or not, so that every time the result would be larger than the uncompressed message, no compression is used in order to transmit the message. 86 | 87 | ## Real world compression achieved 88 | 89 | ./smaz2 "The program is designed to work well with English text" 90 | Compressed length: 44.44% 91 | 92 | ./smaz2 "As long as the messages are latin letters natural language messages with common statistical properties, the program will only seldom use more space than needed" 93 | Compressed length: 54.72% 94 | 95 | ./smaz2 "Anche se in maniera meno efficiente, questo algoritmo di compressione è in grado di comprimere testi in altre lingue." 96 | Compressed length: 66.95% 97 | 98 | ## Implementations 99 | 100 | In this repository you will find both a C and a Python implementation. 101 | The implementation is optimized for space (both RAM and code executable) 102 | and not for speed, since most use cases will use it very seldom, only when 103 | a short message will be sent. 
So the algorithm scans the tables at every 104 | string position, which is very costly in general, but should still be 105 | adequate for this library. 106 | 107 | ## License 108 | 109 | MIT license. 110 | -------------------------------------------------------------------------------- /example.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "smaz2.h" 5 | 6 | int main(int argc, char **argv) { 7 | if (argc != 2) { 8 | fprintf(stderr, 9 | "Usage: %s 'string to test'\n", argv[0]); 10 | exit(1); 11 | } 12 | 13 | unsigned char buf[256]; 14 | unsigned long olen; 15 | 16 | olen = smaz2_compress(buf,sizeof(buf),(unsigned char*)argv[1],strlen(argv[1])); 17 | printf("Compressed length (%lu): %.02f%%\n", olen, (float)olen/strlen(argv[1])*100); 18 | return 0; 19 | } 20 | -------------------------------------------------------------------------------- /fuzzing.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "smaz2.h" 7 | 8 | int main(void) { 9 | unsigned char in[512]; 10 | unsigned char out[4096]; 11 | unsigned char d[4096]; 12 | int comprlen, decomprlen; 13 | int j, ranlen; 14 | int times = 100000; 15 | char *strings[] = { 16 | "This is a small string", 17 | "foobar", 18 | "the end", 19 | "not-a-g00d-Exampl333", 20 | "Smaz2 is a simple compression library", 21 | "Nothing is more difficult, and therefore more precious, than to be able to decide", 22 | "When words in the table are used business internet however", 23 | "1000!! 
numbers 2000?!~~ and special...characters", 24 | "and now a few italian sentences:", 25 | "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura", 26 | "Mi illumino di immenso", 27 | "L'autore di questa libreria vive in Sicilia", 28 | "You shouldn’t connect through Bluetooth, it should connect when you open the app", 29 | "try it against urls", 30 | "http://google.com", 31 | "http://programming.reddit.com", 32 | "http://github.com/antirez/smaz/tree/master", 33 | "/media/hdb1/music/Alben/The Bla", 34 | NULL 35 | }; 36 | 37 | j=0; 38 | while(strings[j]) { 39 | int comprlevel; 40 | 41 | comprlen = smaz2_compress(out,sizeof(out),(unsigned char*)strings[j],strlen(strings[j])); 42 | comprlevel = 100-((100*comprlen)/strlen(strings[j])); 43 | decomprlen = smaz2_decompress(d,sizeof(d),out,comprlen); 44 | if (strlen(strings[j]) != (unsigned)decomprlen || 45 | memcmp(strings[j],d,decomprlen)) 46 | { 47 | printf("BUG: error compressing '%s'\n", strings[j]); 48 | printf("COMPRESSED TO: "); 49 | for (int j = 0; j < comprlen; j++) 50 | if (isprint(out[j])) 51 | printf("%c",out[j]); 52 | else 53 | printf("[%02x]",out[j]); 54 | printf("\n"); 55 | printf("DECOMPRESSED TO: '%.*s'\n", (int)decomprlen, d); 56 | exit(1); 57 | } 58 | if (comprlevel < 0) { 59 | printf("'%s' enlarged by %d%%\n",strings[j],-comprlevel); 60 | } else { 61 | printf("'%s' compressed by %d%%\n",strings[j],comprlevel); 62 | } 63 | j++; 64 | } 65 | printf("Compressing and decompressing %d test strings...\n", times); 66 | while(times--) { 67 | char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. 
"; 68 | ranlen = random() % 512; 69 | 70 | for (j = 0; j < ranlen; j++) { 71 | if (times & 1) 72 | in[j] = charset[random() % (sizeof(charset)-1)]; 73 | else 74 | in[j] = (char)(random() & 0xff); 75 | } 76 | comprlen = smaz2_compress(out,sizeof(out),in,ranlen); 77 | decomprlen = smaz2_decompress(d,sizeof(d),out,comprlen); 78 | 79 | if (ranlen != decomprlen || memcmp(in,d,ranlen)) { 80 | printf("Bug! TEST NOT PASSED\n"); 81 | exit(1); 82 | } 83 | if (times % 10000 == 0) { 84 | printf("."); 85 | fflush(stdout); 86 | } 87 | } 88 | printf("TEST PASSED :)\n"); 89 | return 0; 90 | } 91 | -------------------------------------------------------------------------------- /smaz2.c: -------------------------------------------------------------------------------- 1 | /* Copyright (C) 2024 by Salvatore Sanfilippo -- All rights reserved. 2 | * This code is licensed under the MIT license. See LICENSE file for info. */ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* 128 common bigrams. */ 10 | const char *bigrams = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty"; 11 | 12 | /* 256 common English words of length four letters or more. 
/* Copyright (C) 2024 by Salvatore Sanfilippo -- All rights reserved.
 * This code is licensed under the MIT license. See LICENSE file for info. */

#include <stdio.h>
#include <string.h>

/* 128 common bigrams, stored back to back: the bigram with ID 'n' is made
 * of the two bytes at offset n*2. */
const char *bigrams = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty";

/* 256 common English words of length four letters or more. The position
 * in the table is the word ID stored in the compressed stream. */
char *words[256] = {
    "that", "this", "with", "from", "your", "have", "more", "will", "home",
    "about", "page", "search", "free", "other", "information", "time", "they",
    "what", "which", "their", "news", "there", "only", "when", "contact", "here",
    "business", "also", "help", "view", "online", "first", "been", "would", "were",
    "some", "these", "click", "like", "service", "than", "find", "date", "back",
    "people", "list", "name", "just", "over", "year", "into", "email", "health",
    "world", "next", "used", "work", "last", "most", "music", "data", "make",
    "them", "should", "product", "post", "city", "policy", "number", "such",
    "please", "available", "copyright", "support", "message", "after", "best",
    "software", "then", "good", "video", "well", "where", "info", "right", "public",
    "high", "school", "through", "each", "order", "very", "privacy", "book", "item",
    "company", "read", "group", "need", "many", "user", "said", "does", "under",
    "general", "research", "university", "january", "mail", "full", "review",
    "program", "life", "know", "days", "management", "part", "could", "great",
    "united", "real", "international", "center", "ebay", "must", "store", "travel",
    "comment", "made", "development", "report", "detail", "line", "term", "before",
    "hotel", "send", "type", "because", "local", "those", "using", "result",
    "office", "education", "national", "design", "take", "posted", "internet",
    "address", "community", "within", "state", "area", "want", "phone", "shipping",
    "reserved", "subject", "between", "forum", "family", "long", "based", "code",
    "show", "even", "black", "check", "special", "price", "website", "index",
    "being", "women", "much", "sign", "file", "link", "open", "today", "technology",
    "south", "case", "project", "same", "version", "section", "found", "sport",
    "house", "related", "security", "both", "county", "american", "game", "member",
    "power", "while", "care", "network", "down", "computer", "system", "three",
    "total", "place", "following", "download", "without", "access", "think",
    "north", "resource", "current", "media", "control", "water", "history",
    "picture", "size", "personal", "since", "including", "guide", "shop",
    "directory", "board", "location", "change", "white", "text", "small", "rating",
    "rate", "government", "child", "during", "return", "student", "shopping",
    "account", "site", "level", "digital", "profile", "previous", "form", "event",
    "love", "main", "another", "class", "still"
};

/* Compress the string 's' of 'len' bytes and store the result in 'dst',
 * writing at most 'dstlen' bytes. Returns the number of bytes written.
 *
 * Each input byte produces at most two output bytes, so a destination of
 * 2*len bytes is always large enough. When 'dst' is too small the output
 * is truncated, but only at a token boundary: a word escape is never
 * written without its word ID, nor a verbatim length without its first
 * byte. */
unsigned long smaz2_compress(unsigned char *dst, unsigned long dstlen, unsigned char *s, unsigned long len)
{
    int debug = 0; // Log debugging messages.
    int verblen = 0; /* Length of the verbatim sequence being emitted, 0 if
                      * the last token was not a verbatim byte, otherwise
                      * 1...4: it never stays at 5, since as soon as a
                      * sequence reaches 5 bytes (the maximum) it is reset
                      * to 0 so that the next verbatim byte starts a new
                      * sequence. */
    unsigned long y = 0; // Index of next byte to set in 'dst'.

    while(len && y < dstlen) {
        /* Try to emit a word, optionally preceded by a space. The first
         * matching table entry wins. */
        if (len >= 4) {
            unsigned int space = s[0] == ' ';
            unsigned int i, wordlen = 0;
            for (i = 0; i < 256; i++) {
                const char *w = words[i];
                wordlen = strlen(w);
                if (len >= wordlen+space &&
                    memcmp(w,s+space,wordlen) == 0) break; // Match.
            }

            /* Emit word if a match was found.
             * The escapes are:
             * byte value 6: simple word.
             * byte value 7: word + space.
             * byte value 8: space + word. */
            if (i != 256) {
                unsigned char escape;

                /* Escape and word ID are emitted together or not at all. */
                if (y+2 > dstlen) break;

                if (space) {
                    if (debug) printf("( %s)", words[i]);
                    escape = 8; // Space + word.
                } else if (len > wordlen && s[wordlen] == ' ') {
                    if (debug) printf("(%s )", words[i]);
                    escape = 7; // Word + space.
                } else {
                    if (debug) printf("(%s)", words[i]);
                    escape = 6; // Simple word.
                }
                dst[y++] = escape;
                dst[y++] = i; // Word ID.

                /* Consume the word and, for escapes 7 and 8, the space. */
                unsigned int consumed = wordlen + (escape != 6);
                s += consumed;
                len -= consumed;
                verblen = 0;
                continue;
            }
        }

        /* Try to emit a bigram. */
        if (len >= 2) {
            int i;
            for (i = 0; i < 128; i++) {
                const char *b = bigrams + i*2;
                if (s[0] == b[0] && s[1] == b[1]) break;
            }

            /* Emit bigram if a match was found: high bit set + ID. */
            if (i != 128) {
                dst[y++] = 0x80 | i;

                /* Consume. */
                if (debug) printf("[%c%c]", bigrams[i*2], bigrams[i*2+1]);
                s += 2;
                len -= 2;
                verblen = 0;
                continue;
            }
        }

        /* No word/bigram match. Check if this byte can be represented by
         * itself, without escaping. That's true for 0 and for 9...127:
         * values 1...8 are escapes and values >= 128 are bigram IDs. */
        if (s[0] == 0 || (s[0] > 8 && s[0] < 128)) {
            dst[y++] = s[0];

            /* Consume. */
            if (debug) printf("{%c}", s[0]);
            s++;
            len--;
            verblen = 0;
            continue;
        }

        /* If we are here the byte must be emitted verbatim, either
         * starting a new escape sequence or extending the current one. */
        verblen++;
        if (verblen == 1) {
            if (debug) printf("_%c", s[0]);
            if (y+1 == dstlen) break; /* No room for 2 bytes. */
            dst[y++] = verblen;
            dst[y++] = s[0];
        } else {
            if (debug) printf("%c", s[0]);
            dst[y++] = s[0];
            dst[y-(verblen+1)] = verblen; // Fix the verbatim bytes length.
            if (verblen == 5) verblen = 0; // Start to emit a new sequence.
        }

        /* Consume. */
        s++;
        len--;
    }
    return y;
}

/* Decompress the string 'c' of 'len' bytes and store the result in 'dst',
 * writing at most 'dstlen' bytes. Returns the number of bytes written.
 *
 * If 'dst' is too small the output is truncated to 'dstlen' bytes and
 * decoding stops: the function never writes past the buffer. Returns 0 if
 * a word escape is found as the last byte of the input (malformed). */
unsigned long smaz2_decompress(unsigned char *dst, unsigned long dstlen, unsigned char *c, unsigned long len)
{
    unsigned long y = 0; // Index of next byte to set in 'dst'.
    unsigned long i = 0; // Index of next byte to read from 'c'.

    /* Here there is always room for at least one output byte. */
    while (i < len && y < dstlen) {
        unsigned char code = c[i++];

        if (code & 128) {
            /* Emit bigram. */
            const char *b = bigrams + (code&127)*2;
            dst[y++] = b[0];
            if (y < dstlen) dst[y++] = b[1];
        } else if (code > 0 && code < 6) {
            /* Emit verbatim sequence: 'code' is the number of bytes. */
            unsigned int vlen = code;
            while(vlen-- && i < len && y < dstlen) dst[y++] = c[i++];
        } else if (code > 5 && code < 9) {
            /* Emit word. */
            if (i == len) return 0; // Malformed input: missing word ID.
            const char *w = words[c[i++]];
            if (code == 8) dst[y++] = ' ';
            /* Stop when the destination is full: the position in the word
             * only advances when a byte is written, so without this check
             * the loop would never end. */
            while(*w && y < dstlen) dst[y++] = *w++;
            if (code == 7 && y < dstlen) dst[y++] = ' ';
        } else {
            /* Emit byte as it is. */
            dst[y++] = code;
        }
    }
    return y;
}
# Copyright (C) 2024 Salvatore Sanfilippo
# All Rights Reserved
#
# This code is released under the MIT license.
# See the LICENSE file for more information.

# 128 common bigrams stored back to back: the bigram with ID n is made of
# the two characters at offset n*2.
bigrams = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty"

# 256 common English words: the position in the list is the word ID stored
# in the compressed stream.
words = [
    "that", "this", "with", "from", "your", "have", "more", "will", "home",
    "about", "page", "search", "free", "other", "information", "time", "they",
    "what", "which", "their", "news", "there", "only", "when", "contact", "here",
    "business", "also", "help", "view", "online", "first", "been", "would", "were",
    "some", "these", "click", "like", "service", "than", "find", "date", "back",
    "people", "list", "name", "just", "over", "year", "into", "email", "health",
    "world", "next", "used", "work", "last", "most", "music", "data", "make",
    "them", "should", "product", "post", "city", "policy", "number", "such",
    "please", "available", "copyright", "support", "message", "after", "best",
    "software", "then", "good", "video", "well", "where", "info", "right", "public",
    "high", "school", "through", "each", "order", "very", "privacy", "book", "item",
    "company", "read", "group", "need", "many", "user", "said", "does", "under",
    "general", "research", "university", "january", "mail", "full", "review",
    "program", "life", "know", "days", "management", "part", "could", "great",
    "united", "real", "international", "center", "ebay", "must", "store", "travel",
    "comment", "made", "development", "report", "detail", "line", "term", "before",
    "hotel", "send", "type", "because", "local", "those", "using", "result",
    "office", "education", "national", "design", "take", "posted", "internet",
    "address", "community", "within", "state", "area", "want", "phone", "shipping",
    "reserved", "subject", "between", "forum", "family", "long", "based", "code",
    "show", "even", "black", "check", "special", "price", "website", "index",
    "being", "women", "much", "sign", "file", "link", "open", "today", "technology",
    "south", "case", "project", "same", "version", "section", "found", "sport",
    "house", "related", "security", "both", "county", "american", "game", "member",
    "power", "while", "care", "network", "down", "computer", "system", "three",
    "total", "place", "following", "download", "without", "access", "think",
    "north", "resource", "current", "media", "control", "water", "history",
    "picture", "size", "personal", "since", "including", "guide", "shop",
    "directory", "board", "location", "change", "white", "text", "small", "rating",
    "rate", "government", "child", "during", "return", "student", "shopping",
    "account", "site", "level", "digital", "profile", "previous", "form", "event",
    "love", "main", "another", "class", "still"
]

# SMAZ2 compression function
def smax_compress(s):
    """Compress the string 's' and return the compressed form as bytes.

    The string is encoded with str.encode() and then, at each position,
    the encoder tries in order: a word of the table (escape 6, 7 or 8
    followed by the word ID), a bigram (byte with the high bit set), a
    byte that can represent itself (0 and 9...127), and finally a verbatim
    sequence (length 1...5 followed by the bytes).
    """
    s = s.encode()
    dst = bytearray()
    verblen = 0  # Length of the verbatim sequence being emitted, if any.

    while len(s) > 0:
        # Try to find a matching word, optionally preceded by a space.
        if len(s) >= 4:
            space = 1 if s[0] == 32 else 0
            # None means "no match". A truthiness test can't be used here
            # because 0 is a valid ID (the word "that").
            word_id = None
            for i, w in enumerate(words):
                if s.startswith(w.encode(), space):
                    word_id = i
                    break

            if word_id is not None:
                wordlen = len(words[word_id])
                if space:
                    escape = 8  # Space + word escape.
                elif len(s) > wordlen and s[wordlen] == 32:
                    escape = 7  # Word + space escape.
                else:
                    escape = 6  # Just word escape.
                dst.append(escape)
                dst.append(word_id)

                # Consume the word and, for escapes 7 and 8, the space.
                s = s[wordlen + (1 if escape != 6 else 0):]
                verblen = 0
                continue

        # Try to find a matching bigram.
        if len(s) >= 2:
            # Same as above: ID 0 (the bigram "in") is a valid match.
            bigram_id = None
            for i in range(len(bigrams) // 2):
                if s[:2] == bigrams[i*2:i*2+2].encode():
                    bigram_id = i
                    break

            if bigram_id is not None:
                dst.append(1 << 7 | bigram_id)
                s = s[2:]
                verblen = 0
                continue

        # Can this byte be represented by itself?
        if not (0 < s[0] < 9) and s[0] < 128:
            dst.append(s[0])
            s = s[1:]
            verblen = 0
            continue

        # Otherwise, emit or update a verbatim sequence.
        verblen += 1
        if verblen == 1:
            dst.extend(bytes([verblen,s[0]]))
        else:
            dst.append(s[0])
            dst[-(verblen + 1)] = verblen  # Fix the sequence length byte.
            if verblen == 5:
                verblen = 0  # Max length reached: start a new sequence.

        s = s[1:]

    return bytes(dst)

# SMAZ2 decompression function
def smax_decompress(c):
    """Decompress the bytes 'c' and return the original string.

    Raises IndexError if a word escape is the last byte of the input, and
    whatever bytes.decode() raises if the decompressed bytes can't be
    decoded.
    """
    i = 0
    res = bytearray()
    while i < len(c):
        if c[i] & 128 != 0: # Emit bigram
            idx = c[i]&127
            res.extend(bigrams[idx*2:idx*2+2].encode())
            i += 1
        elif 0 < c[i] < 6: # Emit verbatim
            res.extend(c[i+1:i+1+c[i]])
            i += 1+c[i]
        elif 5 < c[i] < 9: # Emit word
            if c[i] == 8: res.append(32)
            res.extend(words[c[i+1]].encode())
            if c[i] == 7: res.append(32)
            i += 2
        else: # Emit byte as it is
            res.append(c[i])
            i += 1
    return res.decode()

# Main function for command-line interface
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        sys.exit("Usage: {} 'string to compress'".format(sys.argv[0]))

    # The ratio is undefined for an empty input.
    if len(sys.argv[1]) == 0:
        sys.exit("Nothing to compress: the input string is empty")

    compressed = smax_compress(sys.argv[1])
    print("Compressed length: {:.02f}%".format(len(compressed) / len(sys.argv[1].encode()) * 100))
    print(compressed)
    decompressed = smax_decompress(compressed)
    print("Decompress back: ", decompressed)
    print("The strings are the same after the back and forth?", sys.argv[1] == decompressed)
-------------------------------------------------------------------------------- /words_256.txt: -------------------------------------------------------------------------------- 1 | "that", "this", "with", "from", "your", "have", "more", "will", "home", 2 | "about", "page", "search", "free", "other", "information", "time", "they", 3 | "what", "which", "their", "news", "there", "only", "when", "contact", "here", 4 | "business", "also", "help", "view", "online", "first", "been", "would", "were", 5 | "some", "these", "click", "like", "service", "than", "find", "date", "back", 6 | "people", "list", "name", "just", "over", "year", "into", "email", "health", 7 | "world", "next", "used", "work", "last", "most", "music", "data", "make", 8 | "them", "should", "product", "post", "city", "policy", "number", "such", 9 | "please", "available", "copyright", "support", "message", "after", "best", 10 | "software", "then", "good", "video", "well", "where", "info", "right", "public", 11 | "high", "school", "through", "each", "order", "very", "privacy", "book", "item", 12 | "company", "read", "group", "need", "many", "user", "said", "does", "under", 13 | "general", "research", "university", "january", "mail", "full", "review", 14 | "program", "life", "know", "days", "management", "part", "could", "great", 15 | "united", "real", "international", "center", "ebay", "must", "store", "travel", 16 | "comment", "made", "development", "report", "detail", "line", "term", "before", 17 | "hotel", "send", "type", "because", "local", "those", "using", "result", 18 | "office", "education", "national", "design", "take", "posted", "internet", 19 | "address", "community", "within", "state", "area", "want", "phone", "shipping", 20 | "reserved", "subject", "between", "forum", "family", "long", "based", "code", 21 | "show", "even", "black", "check", "special", "price", "website", "index", 22 | "being", "women", "much", "sign", "file", "link", "open", "today", "technology", 23 | "south", "case", 
"project", "same", "version", "section", "found", "sport", 24 | "house", "related", "security", "both", "county", "american", "game", "member", 25 | "power", "while", "care", "network", "down", "computer", "system", "three", 26 | "total", "place", "following", "download", "without", "access", "think", 27 | "north", "resource", "current", "media", "control", "water", "history", 28 | "picture", "size", "personal", "since", "including", "guide", "shop", 29 | "directory", "board", "location", "change", "white", "text", "small", "rating", 30 | "rate", "government", "child", "during", "return", "student", "shopping", 31 | "account", "site", "level", "digital", "profile", "previous", "form", "event", 32 | "love", "main", "another", "class", "still" 33 | --------------------------------------------------------------------------------