├── LICENSE
├── Makefile
├── README.md
├── example.c
├── fuzzing.c
├── smaz2.c
├── smaz2.h
├── smaz2.py
└── words_256.txt


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2024 Salvatore Sanfilippo.
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all: smaz2 fuzzing
 2 | 
 3 | smaz2: smaz2.c example.c
 4 | 	$(CC) -O2 -Wall -W --pedantic smaz2.c example.c -o smaz2
 5 | 
 6 | fuzzing: smaz2.c fuzzing.c
 7 | 	$(CC) -O2 -Wall -W --pedantic smaz2.c fuzzing.c -o fuzzing
 8 | 
 9 | clean:
10 | 	rm -f smaz2 fuzzing
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # SMAZ2 small messages compression algorithm
  2 | 
  3 | *Note: this library is no longer compatible with the old version, Smaz (still available in the old repository). It was redesigned to be much more resistant to non compressible input (it rarely enlarges the input text, almost never actually). It also compresses better than before.*
  4 | 
  5 | ## Motivations
  6 | 
  7 | LoRa networks have an extremely limited bandwidth and each message
  8 | requires a long channel time in order to be transmitted (often in the order
  9 | of *seconds!*). When LoRa is used to send messages between humans, a form
 10 | of compression noticeably improves the channel utilization.
 11 | 
 12 | This compression scheme is designed to compress small messages in extremely
 13 | memory constrained devices, like ESP32 devices running MicroPython.
 14 | The basic idea is to use pre-computed bigram and word tables to encode
 15 | short messages more efficiently, for a total RAM usage of less than
 16 | 2kbytes.
 17 | 
 18 | The words table is composed of 256 words. Short words (fewer than 4 bytes long)
 19 | are not present because they are better encoded with bigrams.
 20 | The list below shows the kind of words in the table; the exact list used by the code is in `words_256.txt`:
 21 | 
 22 | "that", "this", "with", "from", "your", "have", "more", "will",
 23 | "home", "about", "page", "search", "free", "other", "information", "time",
 24 | "they", "site", "what", "which", "their", "news", "there", "only",
 25 | "when", "contact", "here", "business", "also", "help", "view", "online",
 26 | "first", "been", "would", "were", "services", "some", "these", "click",
 27 | "like", "service", "than", "find", "price", "date", "back", "people",
 28 | "list", "name", "just", "over", "state", "year", "into", "email",
 29 | "health", "world", "next", "used", "work", "last", "most", "products",
 30 | "music", "data", "make", "them", "should", "product", "system", "post",
 31 | "city", "policy", "number", "such", "please", "available", "copyright",
 32 | "support", "message", "after", "best", "software", "then", "good", "video",
 33 | "well", "where", "info", "rights", "public", "books", "high", "school",
 34 | "through", "each", "links", "review", "years", "order", "very", "privacy",
 35 | "book", "items", "company", "read", "group", "need", "many", "user",
 36 | "said", "does", "under", "general", "research", "university", "january", "mail",
 37 | "full", "reviews", "program", "life", "know", "games", "days", "management",
 38 | "part", "could", "great", "united", "hotel", "real", "item", "international",
 39 | "center", "ebay", "must", "store", "travel", "comments", "made", "development",
 40 | "report", "member", "details", "line", "terms", "before", "hotels", "send",
 41 | "right", "type", "because", "local", "those", "using", "results", "office",
 42 | "education", "national", "design", "take", "posted", "internet", "address",
 43 | "community", "within", "states", "area", "want", "phone", "shipping",
 44 | "reserved", "subject", "between", "forum", "family", "long", "based", "code",
 45 | "show", "even", "black", "check", "special", "prices", "website", "index",
 46 | "being", "women", "much", "sign", "file", "link", "open", "today", "technology",
 47 | "south", "case", "project", "same", "pages", "version", "section", "found",
 48 | "sports", "house", "related", "security", "both", "county", "american", "photo",
 49 | "game", "members", "power", "while", "care", "network", "down", "computer",
 50 | "systems", "three", "total", "place", "following", "download", "without",
 51 | "access", "think", "north", "resources", "current", "posts", "media", "control",
 52 | "water", "history", "pictures", "size", "personal", "since", "including",
 53 | "guide", "shop", "directory", "board", "location", "change", "white", "text",
 54 | "small", "rating", "rate", "government"
 55 | 
 56 | If a word match is not found, the bigram table is used. The table is composed of the most common 128 bigrams by frequency, for a total of 256 bytes:
 57 | 
 58 | *"intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty"*
 59 | 
 60 | When not even a matching bigram is found, bytes with value 0 or in the range
 61 | from 9 to 127 can be encoded with a single byte (this happens for instance for
 62 | all the ASCII uppercase letters, symbols, numbers...). The byte value can be
 63 | left as it is.
 64 | 
 65 | For bytes in the range from 1 to 8 and from 128 to 255, an escape sequence
 66 | is generated and from 1 to 5 verbatim bytes are emitted. Output bytes with values
 67 | 6, 7, 8 are used as special escapes to emit a word from the table: 6 emits the
 68 | word alone, 7 the word followed by a space, 8 the word preceded by a space.
 69 | 
 70 | ## Encoding
 71 | 
 72 | So this is how the encoding works:
 73 | 
 74 | * A byte with value from 128 to 255 encodes a bigram with ID from 0 to 127.
 75 | * A byte with value 0 or from 9 to 127 is just what it is.
 76 | * A byte with value of 6 is followed by a byte representing the word ID to emit.
 77 | * A byte with value 7 is like 6, but after the word a space is emitted.
 78 | * A byte with value 8 is like 6, but before the word a space is emitted.
 79 | * A byte with a value from 1 to 5 means that from 1 to 5 verbatim bytes follow.
 80 | 
 81 | This means that this compression scheme will use more space than the input
 82 | string only when emitting verbatim bytes, that is when the string contains
 83 | special or unicode characters that happen to have bytes with values from 1 to 8 or >= 128: this is a very rare condition in practice (at least in English), and even when it happens it is almost always compensated by the other words.
 84 | 
 85 | As long as the messages are latin letters natural language messages with common statistical properties, the program will never use more space than needed and will often be able to compress words to fewer bytes. However programs using this scheme are likely to have a one bit flag in the header in order to signal if the message is compressed or not, so that every time the result would be larger than the uncompressed message, no compression is used in order to transmit the message.
 86 | 
 87 | ## Real world compression achieved
 88 | 
 89 | ./smaz2 "The program is designed to work well with English text"
 90 | Compressed length: 44.44%
 91 | 
 92 | ./smaz2 "As long as the messages are latin letters natural language messages with common statistical properties, the program will only seldom use more space than needed"
 93 | Compressed length: 54.72%
 94 | 
 95 | ./smaz2 "Anche se in maniera meno efficiente, questo algoritmo di compressione è in grado di comprimere testi in altre lingue."
 96 | Compressed length: 66.95%
 97 | 
 98 | ## Implementations
 99 | 
100 | In this repository you will find both a C and a Python implementation.
101 | The implementation is optimized for space (both RAM and code executable)
102 | and not for speed, since most use cases will use it very seldom, only when
103 | a short message will be sent. So the algorithm scans the tables at every
104 | string position, which is very costly in general, but should still be
105 | adequate for this library.
106 | 
107 | ## License
108 | 
109 | MIT license.
110 | 


--------------------------------------------------------------------------------
/example.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include "smaz2.h"
 5 | 
 6 | int main(int argc, char **argv) {
 7 |     if (argc != 2) {
 8 |         fprintf(stderr,
 9 |         "Usage: %s 'string to test'\n", argv[0]);
10 |         exit(1);
11 |     }
12 | 
13 |     unsigned char buf[256];
14 |     unsigned long olen;
15 | 
16 |     olen = smaz2_compress(buf,sizeof(buf),(unsigned char*)argv[1],strlen(argv[1]));
17 |     printf("Compressed length (%lu): %.02f%%\n", olen, (float)olen/strlen(argv[1])*100);
18 |     return 0;
19 | }
20 | 


--------------------------------------------------------------------------------
/fuzzing.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <ctype.h>
 5 | 
 6 | #include "smaz2.h"
 7 | 
 8 | int main(void) {
 9 |     unsigned char in[512];
10 |     unsigned char out[4096];
11 |     unsigned char d[4096];
12 |     int comprlen, decomprlen;
13 |     int j, ranlen;
14 |     int times = 100000;
15 |     char *strings[] = {
16 |         "This is a small string",
17 |         "foobar",
18 |         "the end",
19 |         "not-a-g00d-Exampl333",
20 |         "Smaz2 is a simple compression library",
21 |         "Nothing is more difficult, and therefore more precious, than to be able to decide",
22 |         "When words in the table are used business internet however",
23 |         "1000!! numbers 2000?!~~ and special...characters",
24 |         "and now a few italian sentences:",
25 |         "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura",
26 |         "Mi illumino di immenso",
27 |         "L'autore di questa libreria vive in Sicilia",
28 |         "You shouldn’t connect through Bluetooth, it should connect when you open the app",
29 |         "try it against urls",
30 |         "http://google.com",
31 |         "http://programming.reddit.com",
32 |         "http://github.com/antirez/smaz/tree/master",
33 |         "/media/hdb1/music/Alben/The Bla",
34 |         NULL
35 |     };
36 | 
37 |     j=0;
38 |     while(strings[j]) {
39 |         int comprlevel;
40 | 
41 |         comprlen = smaz2_compress(out,sizeof(out),(unsigned char*)strings[j],strlen(strings[j]));
42 |         comprlevel = 100-((100*comprlen)/strlen(strings[j]));
43 |         decomprlen = smaz2_decompress(d,sizeof(d),out,comprlen);
44 |         if (strlen(strings[j]) != (unsigned)decomprlen ||
45 |             memcmp(strings[j],d,decomprlen))
46 |         {
47 |             printf("BUG: error compressing '%s'\n", strings[j]);
48 |             printf("COMPRESSED TO: ");
49 |             for (int j = 0; j < comprlen; j++)
50 |                 if (isprint(out[j]))
51 |                     printf("%c",out[j]);
52 |                 else
53 |                     printf("[%02x]",out[j]);
54 |             printf("\n");
55 |             printf("DECOMPRESSED TO: '%.*s'\n", (int)decomprlen, d);
56 |             exit(1);
57 |         }
58 |         if (comprlevel < 0) {
59 |             printf("'%s' enlarged by %d%%\n",strings[j],-comprlevel);
60 |         } else {
61 |             printf("'%s' compressed by %d%%\n",strings[j],comprlevel);
62 |         }
63 |         j++;
64 |     }
65 |     printf("Compressing and decompressing %d test strings...\n", times);
66 |     while(times--) {
67 |         char charset[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvxyz/. ";
68 |         ranlen = random() % 512;
69 | 
70 |         for (j = 0; j < ranlen; j++) {
71 |             if (times & 1)
72 |                 in[j] = charset[random() % (sizeof(charset)-1)];
73 |             else
74 |                 in[j] = (char)(random() & 0xff);
75 |         }
76 |         comprlen = smaz2_compress(out,sizeof(out),in,ranlen);
77 |         decomprlen = smaz2_decompress(d,sizeof(d),out,comprlen);
78 | 
79 |         if (ranlen != decomprlen || memcmp(in,d,ranlen)) {
80 |             printf("Bug! TEST NOT PASSED\n");
81 |             exit(1);
82 |         }
83 |         if (times % 10000 == 0) {
84 |             printf(".");
85 |             fflush(stdout);
86 |         }
87 |     }
88 |     printf("TEST PASSED :)\n");
89 |     return 0;
90 | }
91 | 


--------------------------------------------------------------------------------
/smaz2.c:
--------------------------------------------------------------------------------
  1 | /* Copyright (C) 2024 by Salvatore Sanfilippo -- All rights reserved.
  2 |  * This code is licensed under the MIT license. See LICENSE file for info. */
  3 | 
  4 | #include <stdio.h>
  5 | #include <string.h>
  6 | #include <ctype.h>
  7 | #include <stdlib.h>
  8 | 
/* 128 common bigrams, stored back to back: the bigram with ID 'i' is the
 * two characters at offset i*2. In the compressed stream a bigram is the
 * single byte 128|i, so the order of the entries is part of the format. */
const char *bigrams = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty";

/* 256 common English words of length four letters or more.
 *
 * The array index is the word ID written after the escape bytes 6, 7 and 8,
 * so the order of the entries is part of the format. The compressor scans
 * the table from index 0 and uses the first entry that matches. */
char *words[256] = {
"that", "this", "with", "from", "your", "have", "more", "will", "home",
"about", "page", "search", "free", "other", "information", "time", "they",
"what", "which", "their", "news", "there", "only", "when", "contact", "here",
"business", "also", "help", "view", "online", "first", "been", "would", "were",
"some", "these", "click", "like", "service", "than", "find", "date", "back",
"people", "list", "name", "just", "over", "year", "into", "email", "health",
"world", "next", "used", "work", "last", "most", "music", "data", "make",
"them", "should", "product", "post", "city", "policy", "number", "such",
"please", "available", "copyright", "support", "message", "after", "best",
"software", "then", "good", "video", "well", "where", "info", "right", "public",
"high", "school", "through", "each", "order", "very", "privacy", "book", "item",
"company", "read", "group", "need", "many", "user", "said", "does", "under",
"general", "research", "university", "january", "mail", "full", "review",
"program", "life", "know", "days", "management", "part", "could", "great",
"united", "real", "international", "center", "ebay", "must", "store", "travel",
"comment", "made", "development", "report", "detail", "line", "term", "before",
"hotel", "send", "type", "because", "local", "those", "using", "result",
"office", "education", "national", "design", "take", "posted", "internet",
"address", "community", "within", "state", "area", "want", "phone", "shipping",
"reserved", "subject", "between", "forum", "family", "long", "based", "code",
"show", "even", "black", "check", "special", "price", "website", "index",
"being", "women", "much", "sign", "file", "link", "open", "today", "technology",
"south", "case", "project", "same", "version", "section", "found", "sport",
"house", "related", "security", "both", "county", "american", "game", "member",
"power", "while", "care", "network", "down", "computer", "system", "three",
"total", "place", "following", "download", "without", "access", "think",
"north", "resource", "current", "media", "control", "water", "history",
"picture", "size", "personal", "since", "including", "guide", "shop",
"directory", "board", "location", "change", "white", "text", "small", "rating",
"rate", "government", "child", "during", "return", "student", "shopping",
"account", "site", "level", "digital", "profile", "previous", "form", "event",
"love", "main", "another", "class", "still"
};
 47 | 
 48 | /* Compress the string 's' of 'len' bytes and stores the compression
 49 |  * result in 'dst' for a maximum of 'dstlen' bytes. Returns the
 50 |  * amount of bytes written. */
 51 | unsigned long smaz2_compress(unsigned char *dst, unsigned long dstlen, unsigned char *s, unsigned long len)
 52 | {
 53 | 
 54 |     int debug = 0;       // Log debugging messages.
 55 |     int verblen = 0;     /* Length of the emitted verbatim sequence, 0 if
 56 |                           * no verbating sequence was emitted last time,
 57 |                           * otherwise 1...5, it never reaches 8 even if we have
 58 |                           * vertabim len of 8, since as we emit a verbatim
 59 |                           * sequence of 8 bytes we reset verblen to 0 to
 60 |                           * star emitting a new verbatim sequence. */
 61 |     unsigned long y = 0; // Index of next byte to set in 'dst'.
 62 | 
 63 |     while(len && y < dstlen) {
 64 |         /* Try to emit a word. */
 65 |         if (len >= 4) {
 66 |             unsigned int i, wordlen;
 67 |             for (i = 0; i < 256; i++) {
 68 |                 const char *w = words[i];
 69 |                 wordlen = strlen(w);
 70 |                 unsigned int space = s[0] == ' ';
 71 | 
 72 |                 if (len >= wordlen+space &&
 73 |                     memcmp(w,s+space,wordlen) == 0) break; // Match.
 74 |             }
 75 | 
 76 |             /* Emit word if a match was found.
 77 |              * The escapes are:
 78 |              * byte value 6: simple word.
 79 |              * byte value 7: word + space.
 80 |              * byte value 8: space + word. */
 81 |             if (i != 256) {
 82 |                 if (s[0] == ' ') {
 83 |                     if (debug) printf("( %s)", words[i]);
 84 |                     if (y < dstlen) dst[y++] = 8; // Space + word.
 85 |                     if (y < dstlen) dst[y++] = i; // Word ID.
 86 |                     s++; len--; // Account for the space.
 87 |                 } else if (len > wordlen && s[wordlen] == ' ') {
 88 |                     if (debug) printf("(%s )", words[i]);
 89 |                     if (y < dstlen) dst[y++] = 7; // Word + space.
 90 |                     if (y < dstlen) dst[y++] = i; // Word ID.
 91 |                     s++; len--; // Account for the space.
 92 |                 } else {
 93 |                     if (debug) printf("(%s)", words[i]);
 94 |                     if (y < dstlen) dst[y++] = 6; // Simple word.
 95 |                     if (y < dstlen) dst[y++] = i; // Word ID.
 96 |                 }
 97 |                 
 98 |                 /* Consume. */
 99 |                 s += wordlen;
100 |                 len -= wordlen;
101 |                 verblen = 0;
102 |                 continue;
103 |             }
104 |         }
105 | 
106 |         /* Try to emit a bigram. */
107 |         if (len >= 2) {
108 |             int i;
109 |             for (i = 0; i < 128; i++) {
110 |                 const char *b = bigrams + i*2;
111 |                 if (s[0] == b[0] && s[1] == b[1]) break;
112 |             }
113 | 
114 |             /* Emit bigram if a match was found. */
115 |             if (i != 128) {
116 |                 int x = 1;
117 |                 if (y < dstlen) dst[y++] = x<<7 | i;
118 |                 
119 |                 /* Consume. */
120 |                 s += 2;
121 |                 len -= 2;
122 |                 verblen = 0;
123 |                 if (debug) printf("[%c%c]", bigrams[i*2], bigrams[i*2+1]);
124 |                 continue;
125 |             }
126 |         }
127 | 
128 |         /* No word/bigram match. Let's try if we can represent this
129 |          * byte with a single output byte without escaping. We can
130 |          * for all the bytes values but 1, 2, 3, 4, 5, 6, 7, 8. */
131 |         if (!(s[0] > 0 && s[0] < 9) && s[0] < 128) {
132 |             if (y < dstlen) dst[y++] = s[0];
133 | 
134 |             /* Consume. */
135 |             if (debug) printf("{%c}", s[0]);
136 |             s++;
137 |             len--;
138 |             verblen = 0;
139 |             continue;
140 |         }
141 | 
142 |         /* If we are here, we got no match nor in the bigram nor
143 |          * with the single byte. We have to emit 'varbatim' bytes
144 |          * with the escape sequence. */
145 |         verblen++;
146 |         if (verblen == 1) {
147 |             if (debug) printf("_%c", s[0]);
148 |             if (y+1 == dstlen) break; /* No room for 2 bytes. */
149 |             dst[y++] = verblen;
150 |             dst[y++] = s[0];
151 |         } else {
152 |             if (debug) printf("%c", s[0]);
153 |             dst[y++] = s[0];
154 |             dst[y-(verblen+1)] = verblen; // Fix the verbatim bytes length.
155 |             if (verblen == 5) verblen = 0; // Start to emit a new sequence.
156 |         }
157 | 
158 |         /* Consume. */
159 |         s++;
160 |         len--;
161 |     }
162 |     return y;
163 | }
164 | 
165 | /* Decompress the string 'c' of 'len' bytes and stores the compression
166 |  * result in 'dst' for a maximum of 'dstlen' bytes. Returns the
167 |  * amount of bytes written. */
168 | unsigned long smaz2_decompress(unsigned char *dst, unsigned long dstlen, unsigned char *c, unsigned long len)
169 | {
170 |     unsigned long orig_dstlen = dstlen, i = 0;
171 | 
172 |     while (i < len) {
173 |         if ((c[i] & 128) != 0) {
174 |             /* Emit bigram. */
175 |             unsigned char idx = c[i]&127;
176 |             if (dstlen && dstlen-- && i < len) *dst++ = bigrams[idx*2];
177 |             if (dstlen && dstlen-- && i < len) *dst++ = bigrams[idx*2+1];
178 |             i++;
179 |         } else if (c[i] > 0 && c[i] < 6) {
180 |             /* Emit verbatim sequence. */
181 |             unsigned char vlen = c[i++];
182 |             while(vlen-- && i < len)
183 |                 if (dstlen && dstlen--) *dst++ = c[i++];
184 |         } else if (c[i] > 5 && c[i] < 9) {
185 |             /* Emit word. */
186 |             unsigned char escape = c[i];
187 |             if (dstlen && escape == 8 && dstlen--) *dst++ = ' ';
188 |             i++; // Go to word ID byte.
189 |             if (i == len) return 0; // Malformed input.
190 |             unsigned char idx = c[i++], j = 0;
191 |             while(words[idx][j] != 0)
192 |                 if (dstlen && dstlen--) *dst++ = words[idx][j++];
193 |             if (dstlen && escape == 7 && dstlen--) *dst++ = ' ';
194 |         } else {
195 |             /* Emit byte as it is. */
196 |             if (dstlen--) *dst++ = c[i++];
197 |         }
198 |     }
199 |     return orig_dstlen - dstlen;
200 | }
201 | 


--------------------------------------------------------------------------------
/smaz2.h:
--------------------------------------------------------------------------------
 1 | /* Copyright (C) 2024 by Salvatore Sanfilippo -- All rights reserved.
 2 |  * This code is licensed under the MIT license. See LICENSE file for info. */
 3 | 
 4 | #ifndef SMAZ2_H
 5 | #define SMAZ2_H
 6 | 
/* Compress the 'len' bytes at 's' into 'dst', writing at most 'dstlen'
 * bytes. Returns the number of bytes written to 'dst'. */
unsigned long smaz2_compress(unsigned char *dst, unsigned long dstlen, unsigned char *s, unsigned long len);
/* Decompress the 'len' compressed bytes at 'c' into 'dst', writing at most
 * 'dstlen' bytes. Returns the number of bytes written to 'dst', or 0 when
 * the input ends right after a word escape byte. */
unsigned long smaz2_decompress(unsigned char *dst, unsigned long dstlen, unsigned char *c, unsigned long len);
 9 | 
10 | #endif
11 | 


--------------------------------------------------------------------------------
/smaz2.py:
--------------------------------------------------------------------------------
  1 | # Copyright (C) 2024 Salvatore Sanfilippo <antirez@gmail.com>
  2 | # All Rights Reserved
  3 | #
  4 | # This code is released under the MIT license.
  5 | # See the LICENSE file for more information.
  6 | 
# Define common bigrams and words.
#
# 'bigrams' holds the two-character bigrams back to back: the bigram with
# ID n is bigrams[n*2:n*2+2], and it is encoded as the single byte 128|n.
bigrams = "intherreheanonesorteattistenntartondalitseediseangoulecomeneriroderaioicliofasetvetasihamaecomceelllcaurlachhidihofonsotacnarssoprrtsassusnoiltsemctgeloeebetrnipeiepancpooldaadviunamutwimoshyoaiewowosfiepttmiopiaweagsuiddoooirspplscaywaigeirylytuulivimabty"

# The list index of a word is the word ID written after the escape bytes
# 6, 7 and 8, so the order of the entries is part of the compressed format.
# The compressor scans the list from the start and uses the first match.
words = [
"that", "this", "with", "from", "your", "have", "more", "will", "home",
"about", "page", "search", "free", "other", "information", "time", "they",
"what", "which", "their", "news", "there", "only", "when", "contact", "here",
"business", "also", "help", "view", "online", "first", "been", "would", "were",
"some", "these", "click", "like", "service", "than", "find", "date", "back",
"people", "list", "name", "just", "over", "year", "into", "email", "health",
"world", "next", "used", "work", "last", "most", "music", "data", "make",
"them", "should", "product", "post", "city", "policy", "number", "such",
"please", "available", "copyright", "support", "message", "after", "best",
"software", "then", "good", "video", "well", "where", "info", "right", "public",
"high", "school", "through", "each", "order", "very", "privacy", "book", "item",
"company", "read", "group", "need", "many", "user", "said", "does", "under",
"general", "research", "university", "january", "mail", "full", "review",
"program", "life", "know", "days", "management", "part", "could", "great",
"united", "real", "international", "center", "ebay", "must", "store", "travel",
"comment", "made", "development", "report", "detail", "line", "term", "before",
"hotel", "send", "type", "because", "local", "those", "using", "result",
"office", "education", "national", "design", "take", "posted", "internet",
"address", "community", "within", "state", "area", "want", "phone", "shipping",
"reserved", "subject", "between", "forum", "family", "long", "based", "code",
"show", "even", "black", "check", "special", "price", "website", "index",
"being", "women", "much", "sign", "file", "link", "open", "today", "technology",
"south", "case", "project", "same", "version", "section", "found", "sport",
"house", "related", "security", "both", "county", "american", "game", "member",
"power", "while", "care", "network", "down", "computer", "system", "three",
"total", "place", "following", "download", "without", "access", "think",
"north", "resource", "current", "media", "control", "water", "history",
"picture", "size", "personal", "since", "including", "guide", "shop",
"directory", "board", "location", "change", "white", "text", "small", "rating",
"rate", "government", "child", "during", "return", "student", "shopping",
"account", "site", "level", "digital", "profile", "previous", "form", "event",
"love", "main", "another", "class", "still"
]
 44 | 
 45 | # SMAX compression function
 46 | def smax_compress(s):
 47 |     s = s.encode()
 48 |     dst = bytearray()
 49 |     verblen = 0
 50 | 
 51 |     while len(s) > 0:
 52 |         # Try to find a matching word.
 53 |         if len(s) >= 4:
 54 |             for i, w in enumerate(words):
 55 |                 wordlen = len(w)
 56 |                 space = s[0] == 32
 57 | 
 58 |                 if len(s) >= wordlen + space and s[space:wordlen+space] == w.encode():
 59 |                     break
 60 |             else:
 61 |                 i = False
 62 | 
 63 |             if i:
 64 |                 if s[0] == 32:
 65 |                     dst.append(8) # Space + word escape.
 66 |                     dst.append(i)
 67 |                     s = s[1:]
 68 |                 elif len(s) > wordlen and s[wordlen] == 32:
 69 |                     dst.append(7) # Word + space escape.
 70 |                     dst.append(i)
 71 |                     s = s[1:]
 72 |                 else:
 73 |                     dst.append(6) # Just word escape.
 74 |                     dst.append(i)
 75 | 
 76 |                 s = s[wordlen:]
 77 |                 verblen = 0
 78 |                 continue
 79 | 
 80 |         # Try to find a matching bigram.
 81 |         if len(s) >= 2:
 82 |             for i in range(0, len(bigrams), 2):
 83 |                 if s[:2] == bigrams[i:i+2].encode():
 84 |                     break
 85 |             else:
 86 |                 i = False
 87 | 
 88 |             if i:
 89 |                 dst.append(1 << 7 | i // 2)
 90 |                 s = s[2:]
 91 |                 verblen = 0
 92 |                 continue
 93 | 
 94 |         # Can this byte be represented by itself?
 95 |         if not (0 < s[0] < 9) and s[0] < 128:
 96 |             dst.append(s[0])
 97 |             s = s[1:]
 98 |             verblen = 0
 99 |             continue
100 | 
101 |         # Otherwise, emit or update a verbatim sequence.
102 |         verblen += 1
103 |         if verblen == 1:
104 |             dst.extend(bytes([verblen,s[0]]))
105 |         else:
106 |             dst.append(s[0])
107 |             dst[-(verblen + 1)] = verblen
108 |             if verblen == 5:
109 |                 verblen = 0
110 | 
111 |         s = s[1:]
112 | 
113 |     return bytes(dst)
114 | 
# SMAX decompression function
def smax_decompress(c):
    """Decompress the bytes `c` produced by `smax_compress()`.

    Returns the decompressed data decoded to `str` with `bytes.decode()`.
    """
    out = bytearray()
    pos = 0
    end = len(c)

    while pos < end:
        code = c[pos]

        if code & 128 != 0:
            # Bigram: the low 7 bits are the bigram ID.
            start = (code & 127) * 2
            out.extend(bigrams[start:start+2].encode())
            pos += 1
        elif 0 < code < 6:
            # Verbatim sequence: 'code' raw bytes follow.
            first = pos + 1
            out.extend(c[first:first+code])
            pos = first + code
        elif 5 < code < 9:
            # Word escape followed by the word ID. Escape 8 puts a space
            # before the word, escape 7 puts it after.
            if code == 8:
                out.append(32)
            out.extend(words[c[pos+1]].encode())
            if code == 7:
                out.append(32)
            pos += 2
        else:
            # Byte emitted as it is.
            out.append(code)
            pos += 1

    return out.decode()
138 | 
# Main function for command-line interface: compress the string given as
# the only argument, print the compression ratio and the compressed bytes,
# then decompress them and check that the original string is obtained.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        sys.exit("Usage: {} 'string to compress'".format(sys.argv[0]))

    original = sys.argv[1]
    original_len = len(original.encode())
    if original_len == 0:
        # The ratio below would divide by zero.
        sys.exit("Nothing to compress: the string is empty")

    compressed = smax_compress(original)
    print("Compressed length: {:.02f}%".format(len(compressed) / original_len * 100))
    print(compressed)
    decompressed = smax_decompress(compressed)
    print("Decompress back: ", decompressed)
    print("The strings are the same after the back and forth?", original == decompressed)
152 | 


--------------------------------------------------------------------------------
/words_256.txt:
--------------------------------------------------------------------------------
 1 | "that", "this", "with", "from", "your", "have", "more", "will", "home",
 2 | "about", "page", "search", "free", "other", "information", "time", "they",
 3 | "what", "which", "their", "news", "there", "only", "when", "contact", "here",
 4 | "business", "also", "help", "view", "online", "first", "been", "would", "were",
 5 | "some", "these", "click", "like", "service", "than", "find", "date", "back",
 6 | "people", "list", "name", "just", "over", "year", "into", "email", "health",
 7 | "world", "next", "used", "work", "last", "most", "music", "data", "make",
 8 | "them", "should", "product", "post", "city", "policy", "number", "such",
 9 | "please", "available", "copyright", "support", "message", "after", "best",
10 | "software", "then", "good", "video", "well", "where", "info", "right", "public",
11 | "high", "school", "through", "each", "order", "very", "privacy", "book", "item",
12 | "company", "read", "group", "need", "many", "user", "said", "does", "under",
13 | "general", "research", "university", "january", "mail", "full", "review",
14 | "program", "life", "know", "days", "management", "part", "could", "great",
15 | "united", "real", "international", "center", "ebay", "must", "store", "travel",
16 | "comment", "made", "development", "report", "detail", "line", "term", "before",
17 | "hotel", "send", "type", "because", "local", "those", "using", "result",
18 | "office", "education", "national", "design", "take", "posted", "internet",
19 | "address", "community", "within", "state", "area", "want", "phone", "shipping",
20 | "reserved", "subject", "between", "forum", "family", "long", "based", "code",
21 | "show", "even", "black", "check", "special", "price", "website", "index",
22 | "being", "women", "much", "sign", "file", "link", "open", "today", "technology",
23 | "south", "case", "project", "same", "version", "section", "found", "sport",
24 | "house", "related", "security", "both", "county", "american", "game", "member",
25 | "power", "while", "care", "network", "down", "computer", "system", "three",
26 | "total", "place", "following", "download", "without", "access", "think",
27 | "north", "resource", "current", "media", "control", "water", "history",
28 | "picture", "size", "personal", "since", "including", "guide", "shop",
29 | "directory", "board", "location", "change", "white", "text", "small", "rating",
30 | "rate", "government", "child", "during", "return", "student", "shopping",
31 | "account", "site", "level", "digital", "profile", "previous", "form", "event",
32 | "love", "main", "another", "class", "still"
33 | 


--------------------------------------------------------------------------------