├── .gitignore ├── setup.py ├── tests.py ├── README.md └── equitext.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | doc/ 3 | *.pyc -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='equitext', 5 | version='1', 6 | py_modules=['equitext'], 7 | test_suite='tests', 8 | 9 | author='foobuzz', 10 | author_email='dprosium@gmail.com', 11 | description='A text-to-text encoding. Characters have the same number of ' 12 | 'occurences in the encoded text', 13 | keywords='encoding text character occurence frequency', 14 | url='https://github.com/foobuzz/equitext' 15 | ) -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import string, random, traceback, sys 4 | 5 | import equitext 6 | 7 | 8 | def get_random_text(max_len): 9 | text = '' 10 | for i in range(random.randrange(max_len)): 11 | text += random.choice(string.printable) 12 | return text 13 | 14 | 15 | if __name__ == '__main__': 16 | texts = ['', 'a'] + [get_random_text(1000) for i in range(100)] 17 | for t in texts: 18 | try: 19 | t2 = equitext.decode(equitext.encode(t)) 20 | assert t2 == t 21 | except: 22 | type_, value, tb = sys.exc_info() 23 | print(type_, value) 24 | traceback.print_tb(tb) 25 | print('** Text:') 26 | print(t) 27 | break 28 | else: 29 | print('OK') 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | equitext 2 | ======== 3 | 4 | A Python module which encode strings so that every character has the same number of occurrences in the encoded string. It makes the string length grow by a factor of about 1.44. 5 | 6 | >>> import equitext 7 | >>> message = "A histogram is a graphical representation of the distribution of numerical data. It is an estimate of the probability distribution of a continuous variable (quantitative variable) and was first introduced by Karl Pearson." 8 | >>> equitext.histogram(message) 9 | ======================================================================== 0.145 10 | i ================================================= 0.1 11 | a ================================================= 0.1 12 | t ============================================= 0.09 13 | r =============================== 0.063 14 | o ============================= 0.059 15 | e ============================= 0.059 16 | n =========================== 0.054 17 | s ======================== 0.05 18 | b =============== 0.032 19 | u =============== 0.032 20 | d ============= 0.027 21 | l ============= 0.027 22 | f =========== 0.023 23 | h ========= 0.018 24 | c ========= 0.018 25 | m ====== 0.014 26 | v ====== 0.014 27 | p ====== 0.014 28 | y ==== 0.009 29 | g ==== 0.009 30 | . ==== 0.009 31 | ) == 0.005 32 | A == 0.005 33 | q == 0.005 34 | w == 0.005 35 | I == 0.005 36 | P == 0.005 37 | K == 0.005 38 | ( == 0.005 39 | >>> encoded = equitext.encode(message) 40 | >>> encoded 41 | ' ImobKAfgh)levscPwtp.(ydarnqui mrqolb(fevItu.npgiwyasAdch)KP sdc.pqrAnmihtaovfuKegw)lbIP(y huynbqo(iswlfIr.eP)gmtAapvKcd oKyahgsmwe)ntfuIip.PlrvqAd(bc (tawq.vAcplugI)mfsndyoirbPeKh( aeifnrPvtdhqwImgpylu.cb)AKos(.PwtsIboufmipKeAahv gn)yqldrc grPKIvil(dyA)s.hcpuqtbmaeofwn oKs(c)AgP.uhevIlqarmtpwnfydbi vwmhKqlrbf()ietopysPc.IudngAa' 42 | >>> equitext.histogram(encoded) 43 | . ======================================================================== 0.033 44 | m ======================================================================== 0.033 45 | ) ======================================================================== 0.033 46 | b ======================================================================== 0.033 47 | r ======================================================================== 0.033 48 | ( ======================================================================== 0.033 49 | ======================================================================== 0.033 50 | v ======================================================================== 0.033 51 | h ======================================================================== 0.033 52 | A ======================================================================== 0.033 53 | c ======================================================================== 0.033 54 | o ======================================================================== 0.033 55 | t ======================================================================== 0.033 56 | w ======================================================================== 0.033 57 | n ======================================================================== 0.033 58 | I ======================================================================== 0.033 59 | y ======================================================================== 0.033 60 | s ======================================================================== 0.033 61 | u ======================================================================== 0.033 62 | f ======================================================================== 0.033 63 | P ======================================================================== 0.033 64 | p ======================================================================== 0.033 65 | g ======================================================================== 0.033 66 | i ======================================================================== 0.033 67 | K ======================================================================== 0.033 68 | e ======================================================================== 0.033 69 | d ======================================================================== 0.033 70 | a ======================================================================== 0.033 71 | q ======================================================================== 0.033 72 | l ======================================================================== 0.033 73 | >>> equitext.decode(encoded) 74 | 'A histogram is a graphical representation of the distribution of numerical data. It is an estimate of the probability distribution of a continuous variable (quantitative variable) and was first introduced by Karl Pearson.' 75 | 76 | - Installation: install the `equitext` package for Python 3 via pip3 77 | 78 | - The algorithm: [on my blog](https://foobuzz.github.io/equitext-blog/) 79 | 80 | - Module documentation: [here](https://pythonhosted.org/equitext/) -------------------------------------------------------------------------------- /equitext.py: -------------------------------------------------------------------------------- 1 | """ 2 | A text-to-text encoding. Characters in the encoded text all have the same 3 | number of occurences. It makes the text length grow by a factor of about 4 | 1.44. 5 | """ 6 | 7 | import math 8 | 9 | version = '1' 10 | 11 | 12 | def get_combindex(chunk, tebahpla): 13 | """Return the index of a combination of characters belonging to the given 14 | alphabet. 15 | 16 | :param chunk: The combination of characters, given as a string 17 | :param tebahpla: a dictionary containing characters of the alphabet as 18 | keys and their ordinal (that is, their position in the alphabet) 19 | as values. 20 | 21 | :returns: The index of the given combination, an integer.""" 22 | p = len(chunk) - 1 23 | total = 0 24 | for char in chunk: 25 | total += tebahpla[char] * len(tebahpla)**p 26 | p -= 1 27 | return total 28 | 29 | 30 | def get_combination(index, alphabet): 31 | """Return the combination of characters having the given index according 32 | to the given alphabet. 33 | 34 | :param index: The index of the combination, given as an integer 35 | :param alphabet: The alphabet, given as a list of characters 36 | 37 | :returns: The combination corresponding to the index, a string.""" 38 | chunk = '' 39 | quot = index 40 | while quot != 0: 41 | quot, remain = divmod(quot, len(alphabet)) 42 | chunk = ''.join([alphabet[remain], chunk]) 43 | return chunk 44 | 45 | 46 | def get_permindex(permutation, alphabet): 47 | """Return the index of a permutation of the alphabet. 48 | 49 | :param permutation: The permutation of the alphabet, given as a string 50 | :param alphabet: The alphabet, given as a list of characters 51 | 52 | :returns: The index of the given permutation, an integer.""" 53 | radix = len(alphabet)-1 54 | index = 0 55 | subalphabet = alphabet.copy() 56 | for char in permutation: 57 | index += subalphabet.index(char) * math.factorial(radix) 58 | subalphabet.remove(char) 59 | radix -= 1 60 | return index 61 | 62 | 63 | def get_permutation(index, alphabet): 64 | """Return the permutation of the alphabet having the given index. 65 | 66 | :param index: The index of the permutation, given as an integer 67 | :param alphabet: The alphabet, given as a list of characters 68 | 69 | :returns: The permutation corresponding to the index, a string.""" 70 | digits = convert_factorial(index) 71 | digits = [0]*(len(alphabet)-len(digits)) + digits 72 | subalphabet = alphabet.copy() 73 | permutation = '' 74 | for d in digits: 75 | char = subalphabet[d] 76 | permutation += char 77 | subalphabet.remove(char) 78 | return permutation 79 | 80 | 81 | def convert_factorial(index): 82 | """Convert a number into the factorial number system. 83 | 84 | :param index: The integer to convert 85 | 86 | :returns: The digits of the number in the factorial number system, 87 | given as a list of integers. The digits are given in the reverse 88 | positional fashion, that is, digits corresponding to the highest 89 | radices are given first""" 90 | digits = [] 91 | quot = index 92 | value = 1 93 | while quot != 0: 94 | quot, remain = divmod(quot, value) 95 | digits.insert(0, remain) 96 | value += 1 97 | return digits 98 | 99 | 100 | def get_alphabet_from_encoded(text): 101 | """Return the alphabet of an equitext-encoded text. 102 | 103 | This simply returns the sorted list of characters used in the text (each 104 | character has a unique occurence in the list). While this is optimized 105 | for equitext-encoded texts, the simple :code:`sorted(list(set(text)))` 106 | is fine for nondescript texts 107 | 108 | :param text: The text from which extract the alphabet 109 | 110 | :returns: The alphabet used by the text, as a sorted list of characters""" 111 | alphabet = set() 112 | for char in text: 113 | if char in alphabet: 114 | break 115 | alphabet.add(char) 116 | return sorted(list(alphabet)) 117 | 118 | 119 | def get_chunk_length(alphabet): 120 | """Return the length of chunks that equitext should use for texts using the 121 | given alphabet. 122 | 123 | :param alphabet: The alphabet, given as a list of characters 124 | 125 | :returns: The length of chunks to use with this alphabet, an integer""" 126 | len_chunk = 0 127 | while len(alphabet)**len_chunk <= math.factorial(len(alphabet)): 128 | len_chunk += 1 129 | return len_chunk - 1 130 | 131 | 132 | def histogram(text, size=1, precision=3, occ=False, symbol='=', sort=1, 133 | reverse=True): 134 | """Print the histogram of occurences of characters in the given text 135 | 136 | :param text: The text to print the histogram for 137 | :param size: A coefficient which impacts the width of the histogram 138 | proportionally to its default width which is 80-characters 139 | :param precision: The precision for the ratios printed at the tip of 140 | the histogram's bar. Must be an integer. 141 | :param occ: A boolean which indicates whether to display the absolute 142 | number of occurences of each character in addition to their ratio 143 | :param symbol: The string which constitutes one unit of histogram's bar 144 | :param sort: If set to 0, sort the histogram according to the 145 | chracters Unicode code points. If set to 1, sort the histogram 146 | according to the number of occurences of each characters 147 | :param reverse: Reverse the order of the sort 148 | 149 | :returns: None 150 | """ 151 | occurs = {} 152 | max_occur = 0 153 | for char in text: 154 | if char in occurs: 155 | occurs[char] += 1 156 | else: 157 | occurs[char] = 1 158 | if occurs[char] > max_occur: 159 | max_occur = occurs[char] 160 | 161 | s_occurs = sorted(list(occurs.items()), key=lambda e: e[sort], 162 | reverse=reverse) 163 | 164 | # One line is made of: 165 | # character + space + bar + space + frequency + (space + occ) 166 | # This is character + space + space + frequency = 3 + frequency: 167 | barless_len = 3 + len(str(round(max_occur/len(text), precision))) 168 | # Additional optional occurence between parenthesis: 169 | if occ: 170 | barless_len += 3 + len(str(max_occur)) 171 | # We want the biggest line to fit in 80 characters 172 | unitlen = (80-barless_len)/max_occur 173 | # The size makes the histogram bigger or shorter: 174 | unitlen *= size 175 | 176 | for char, occur in s_occurs: 177 | line = ' '.join([ 178 | char, 179 | symbol*int(unitlen*occur), 180 | str(round(occur/len(text), precision)) 181 | ]) 182 | if occ: 183 | line += ' ({})'.format(occur) 184 | print(line) 185 | 186 | 187 | def encode(text): 188 | """Encode a text using equitext 189 | 190 | :param text: The string to encode 191 | 192 | :returns: The encoded string 193 | """ 194 | alphabet = sorted(list(set(text))) 195 | if len(alphabet) <= 1: 196 | return text 197 | tebahpla = {c:i for i, c in enumerate(alphabet)} 198 | len_chunk = get_chunk_length(alphabet) 199 | len_pad = len_chunk - len(text)%len_chunk 200 | text += alphabet[len_pad] * len_pad 201 | encoded = '' 202 | for i in range(0, len(text), len_chunk): 203 | chunk = text[i:i+len_chunk] 204 | index = get_combindex(chunk, tebahpla) 205 | encoded += get_permutation(index, alphabet) 206 | return encoded 207 | 208 | 209 | def decode(text): 210 | """Decode a text using equitext 211 | 212 | :param text: The string to decode 213 | 214 | :returns: The decoded string 215 | """ 216 | alphabet = get_alphabet_from_encoded(text) 217 | if len(alphabet) <= 1: 218 | return text 219 | len_chunk = get_chunk_length(alphabet) 220 | decoded = '' 221 | for i in range(0, len(text), len(alphabet)): 222 | permutation = text[i:i+len(alphabet)] 223 | index = get_permindex(permutation, alphabet) 224 | chunk = get_combination(index, alphabet) 225 | chunk = ''.join([alphabet[0]*(len_chunk-len(chunk)), chunk]) 226 | decoded += chunk 227 | len_pad = alphabet.index(decoded[-1]) 228 | return decoded[:-len_pad] 229 | 230 | 231 | if __name__ == '__main__': 232 | histogram('hello, world!') 233 | --------------------------------------------------------------------------------