├── .gitignore
├── setup.py
├── tests.py
├── README.md
└── equitext.py


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | doc/
3 | *.pyc


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 | 	name='equitext',
 5 | 	version='1',
 6 | 	py_modules=['equitext'],
 7 | 	test_suite='tests',
 8 | 
 9 | 	author='foobuzz',
10 | 	author_email='dprosium@gmail.com',
11 | 	description='A text-to-text encoding. Characters have the same number of '
12 | 	'occurences in the encoded text',
13 | 	keywords='encoding text character occurence frequency',
14 | 	url='https://github.com/foobuzz/equitext'
15 | )


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | 
 3 | import string, random, traceback, sys
 4 | 
 5 | import equitext
 6 | 
 7 | 
 8 | def get_random_text(max_len):
 9 | 	text = ''
10 | 	for i in range(random.randrange(max_len)):
11 | 		text += random.choice(string.printable)
12 | 	return text
13 | 
14 | 
if __name__ == '__main__':
	# Round-trip check: decoding an encoded text must give the text back.
	# The empty string and a single-character string are always included,
	# followed by 100 random texts.
	texts = ['', 'a'] + [get_random_text(1000) for _ in range(100)]
	for t in texts:
		try:
			t2 = equitext.decode(equitext.encode(t))
			# Explicit raise rather than `assert`, which is stripped under -O
			# and would turn this script into a no-op.
			if t2 != t:
				raise AssertionError('decoded text differs from the original')
		except Exception as exc:
			# `Exception` only: Ctrl-C and SystemExit must still stop the run
			# instead of being reported as a failing text.
			print(type(exc), exc)
			traceback.print_tb(exc.__traceback__)
			print('** Text:')
			print(t)
			break
	else:
		# Reached only when no text made the loop break.
		print('OK')
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | equitext
 2 | ========
 3 | 
 4 | A Python module that encodes strings so that every character has the same number of occurrences in the encoded string. This makes the string grow in length by a factor of about 1.44.
 5 | 
 6 | 	>>> import equitext
 7 | 	>>> message = "A histogram is a graphical representation of the distribution of numerical data. It is an estimate of the probability distribution of a continuous variable (quantitative variable) and was first introduced by Karl Pearson."
 8 | 	>>> equitext.histogram(message)
 9 | 	  ======================================================================== 0.145
10 | 	i ================================================= 0.1
11 | 	a ================================================= 0.1
12 | 	t ============================================= 0.09
13 | 	r =============================== 0.063
14 | 	o ============================= 0.059
15 | 	e ============================= 0.059
16 | 	n =========================== 0.054
17 | 	s ======================== 0.05
18 | 	b =============== 0.032
19 | 	u =============== 0.032
20 | 	d ============= 0.027
21 | 	l ============= 0.027
22 | 	f =========== 0.023
23 | 	h ========= 0.018
24 | 	c ========= 0.018
25 | 	m ====== 0.014
26 | 	v ====== 0.014
27 | 	p ====== 0.014
28 | 	y ==== 0.009
29 | 	g ==== 0.009
30 | 	. ==== 0.009
31 | 	) == 0.005
32 | 	A == 0.005
33 | 	q == 0.005
34 | 	w == 0.005
35 | 	I == 0.005
36 | 	P == 0.005
37 | 	K == 0.005
38 | 	( == 0.005
39 | 	>>> encoded = equitext.encode(message)
40 | 	>>> encoded
41 | 	' ImobKAfgh)levscPwtp.(ydarnqui mrqolb(fevItu.npgiwyasAdch)KP sdc.pqrAnmihtaovfuKegw)lbIP(y huynbqo(iswlfIr.eP)gmtAapvKcd oKyahgsmwe)ntfuIip.PlrvqAd(bc (tawq.vAcplugI)mfsndyoirbPeKh( aeifnrPvtdhqwImgpylu.cb)AKos(.PwtsIboufmipKeAahv gn)yqldrc grPKIvil(dyA)s.hcpuqtbmaeofwn oKs(c)AgP.uhevIlqarmtpwnfydbi vwmhKqlrbf()ietopysPc.IudngAa'
42 | 	>>> equitext.histogram(encoded)
43 | 	. ======================================================================== 0.033
44 | 	m ======================================================================== 0.033
45 | 	) ======================================================================== 0.033
46 | 	b ======================================================================== 0.033
47 | 	r ======================================================================== 0.033
48 | 	( ======================================================================== 0.033
49 | 	  ======================================================================== 0.033
50 | 	v ======================================================================== 0.033
51 | 	h ======================================================================== 0.033
52 | 	A ======================================================================== 0.033
53 | 	c ======================================================================== 0.033
54 | 	o ======================================================================== 0.033
55 | 	t ======================================================================== 0.033
56 | 	w ======================================================================== 0.033
57 | 	n ======================================================================== 0.033
58 | 	I ======================================================================== 0.033
59 | 	y ======================================================================== 0.033
60 | 	s ======================================================================== 0.033
61 | 	u ======================================================================== 0.033
62 | 	f ======================================================================== 0.033
63 | 	P ======================================================================== 0.033
64 | 	p ======================================================================== 0.033
65 | 	g ======================================================================== 0.033
66 | 	i ======================================================================== 0.033
67 | 	K ======================================================================== 0.033
68 | 	e ======================================================================== 0.033
69 | 	d ======================================================================== 0.033
70 | 	a ======================================================================== 0.033
71 | 	q ======================================================================== 0.033
72 | 	l ======================================================================== 0.033
73 | 	>>> equitext.decode(encoded)
74 | 	'A histogram is a graphical representation of the distribution of numerical data. It is an estimate of the probability distribution of a continuous variable (quantitative variable) and was first introduced by Karl Pearson.'
75 | 
76 |  - Installation: install the `equitext` package for Python 3 via pip3
77 | 
78 |  - The algorithm: [on my blog](https://foobuzz.github.io/equitext-blog/)
79 | 
80 |  - Module documentation: [here](https://pythonhosted.org/equitext/)


--------------------------------------------------------------------------------
/equitext.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A text-to-text encoding. Characters in the encoded text all have the same
  3 | number of occurrences. It makes the text length grow by a factor of about
  4 | 1.44.
  5 | """
  6 | 
  7 | import math
  8 | 
  9 | version = '1'
 10 | 
 11 | 
 12 | def get_combindex(chunk, tebahpla):
 13 | 	"""Return the index of a combination of characters belonging to the given
 14 | 		alphabet.
 15 | 
 16 | 		:param chunk: The combination of characters, given as a string
 17 | 		:param tebahpla: a dictionary containing characters of the alphabet as
 18 | 			keys and their ordinal (that is, their position in the alphabet)
 19 | 			as values.
 20 | 	
 21 | 		:returns: The index of the given combination, an integer."""
 22 | 	p = len(chunk) - 1
 23 | 	total = 0
 24 | 	for char in chunk:
 25 | 		total += tebahpla[char] * len(tebahpla)**p
 26 | 		p -= 1
 27 | 	return total
 28 | 
 29 | 
 30 | def get_combination(index, alphabet):
 31 | 	"""Return the combination of characters having the given index according
 32 | 		to the given alphabet.
 33 | 
 34 | 		:param index: The index of the combination, given as an integer
 35 | 		:param alphabet: The alphabet, given as a list of characters
 36 | 	
 37 | 		:returns: The combination corresponding to the index, a string."""
 38 | 	chunk = ''
 39 | 	quot = index
 40 | 	while quot != 0:
 41 | 		quot, remain = divmod(quot, len(alphabet))
 42 | 		chunk = ''.join([alphabet[remain], chunk])
 43 | 	return chunk
 44 | 
 45 | 
 46 | def get_permindex(permutation, alphabet):
 47 | 	"""Return the index of a permutation of the alphabet.
 48 | 
 49 | 		:param permutation: The permutation of the alphabet, given as a string
 50 | 		:param alphabet: The alphabet, given as a list of characters
 51 | 	
 52 | 		:returns: The index of the given permutation, an integer."""
 53 | 	radix = len(alphabet)-1
 54 | 	index = 0
 55 | 	subalphabet = alphabet.copy()
 56 | 	for char in permutation:
 57 | 		index += subalphabet.index(char) * math.factorial(radix)
 58 | 		subalphabet.remove(char)
 59 | 		radix -= 1
 60 | 	return index
 61 | 
 62 | 
 63 | def get_permutation(index, alphabet):
 64 | 	"""Return the permutation of the alphabet having the given index.
 65 | 
 66 | 		:param index: The index of the permutation, given as an integer
 67 | 		:param alphabet: The alphabet, given as a list of characters
 68 | 	
 69 | 		:returns: The permutation corresponding to the index, a string."""
 70 | 	digits = convert_factorial(index)
 71 | 	digits = [0]*(len(alphabet)-len(digits)) + digits
 72 | 	subalphabet = alphabet.copy()
 73 | 	permutation = ''
 74 | 	for d in digits:
 75 | 		char = subalphabet[d]
 76 | 		permutation += char
 77 | 		subalphabet.remove(char)
 78 | 	return permutation
 79 | 
 80 | 
 81 | def convert_factorial(index):
 82 | 	"""Convert a number into the factorial number system.
 83 | 
 84 | 		:param index: The integer to convert
 85 | 
 86 | 		:returns: The digits of the number in the factorial number system,
 87 | 			given as a list of integers. The digits are given in the reverse
 88 | 			positional fashion, that is, digits corresponding to the highest
 89 | 			radices are given first"""
 90 | 	digits = []
 91 | 	quot = index
 92 | 	value = 1
 93 | 	while quot != 0:
 94 | 		quot, remain = divmod(quot, value)
 95 | 		digits.insert(0, remain)
 96 | 		value += 1
 97 | 	return digits
 98 | 
 99 | 
def get_alphabet_from_encoded(text):
	"""Return the alphabet of an equitext-encoded text.

		Characters are collected from the start of the text up to the first
		character that has already been seen; the rest of the text is not
		read. For nondescript texts, :code:`sorted(set(text))` is the way to
		get the full alphabet.

		:param text: The text from which to extract the alphabet

		:returns: The collected characters, as a sorted list in which each
			character occurs once"""
	seen = set()
	for char in text:
		size_before = len(seen)
		seen.add(char)
		if len(seen) == size_before:
			# The set did not grow: this character is the first repetition.
			break
	return sorted(seen)
117 | 
118 | 
def get_chunk_length(alphabet):
	"""Return the length of chunks that equitext should use for texts using the
		given alphabet.

		This is the largest length such that the number of possible chunks
		(``len(alphabet) ** length``) does not exceed the number of
		permutations of the alphabet (``len(alphabet)!``).

		:param alphabet: The alphabet, given as a list of at least two
			characters

		:returns: The length of chunks to use with this alphabet, an integer
		:raises ValueError: if the alphabet has fewer than two characters"""
	size = len(alphabet)
	if size < 2:
		# 0**k and 1**k never exceed the factorial: the search below would
		# never stop.
		raise ValueError('alphabet must contain at least two characters')
	limit = math.factorial(size)
	len_chunk = 0
	# Invariant: power == size ** (len_chunk + 1)
	power = size
	while power <= limit:
		len_chunk += 1
		power *= size
	return len_chunk
130 | 
131 | 
def histogram(text, size=1, precision=3, occ=False, symbol='=', sort=1,
	reverse=True):
	"""Print the histogram of occurrences of characters in the given text.

		One line is printed per distinct character. Nothing is printed for
		an empty text.

		:param text: The text to print the histogram for
		:param size: A coefficient applied to the width of the bars. With the
			default of 1 and a one-character symbol, the longest line fits
			in 80 characters
		:param precision: The precision for the ratios printed at the tip of
			the histogram's bars. Must be an integer.
		:param occ: A boolean which indicates whether to display the absolute
			number of occurrences of each character in addition to its ratio
		:param symbol: The string which constitutes one unit of a bar
		:param sort: If set to 0, sort the histogram according to the
			characters' Unicode code points. If set to 1, sort the histogram
			according to the number of occurrences of each character
		:param reverse: Reverse the order of the sort

		:returns: None
		"""
	if not text:
		# No character to count: the ratios below would divide by zero.
		return

	occurs = {}
	for char in text:
		occurs[char] = occurs.get(char, 0) + 1
	max_occur = max(occurs.values())
	total = len(text)

	# Items are (character, occurrences) pairs: `sort` picks the sort field.
	s_occurs = sorted(occurs.items(), key=lambda e: e[sort],
		reverse=reverse)

	# One line is made of:
	# character + space + bar + space + frequency + (space + occ)
	# This is character + space + space + frequency = 3 + frequency:
	barless_len = 3 + len(str(round(max_occur/total, precision)))
	# Additional optional occurrence count between parentheses:
	if occ:
		barless_len += 3 + len(str(max_occur))
	# We want the biggest line to fit in 80 characters
	unitlen = (80-barless_len)/max_occur
	# The size makes the histogram bigger or shorter:
	unitlen *= size

	for char, occur in s_occurs:
		line = ' '.join([
			char,
			symbol*int(unitlen*occur),
			str(round(occur/total, precision))
			])
		if occ:
			line += ' ({})'.format(occur)
		print(line)
185 | 
186 | 
def encode(text):
	"""Encode a text using equitext

		The text is padded to a whole number of chunks, then every chunk is
		replaced by the permutation of the alphabet having the same index.
		Texts using fewer than two distinct characters are returned as is.

		:param text: The string to encode

		:returns: The encoded string
	"""
	alphabet = sorted(set(text))
	if len(alphabet) < 2:
		return text
	ordinals = dict(zip(alphabet, range(len(alphabet))))
	chunk_size = get_chunk_length(alphabet)
	# Between 1 and chunk_size padding characters are always appended. The
	# padding character is the one whose ordinal is the padding length.
	pad_size = chunk_size - len(text) % chunk_size
	padded = text + alphabet[pad_size] * pad_size
	pieces = []
	for start in range(0, len(padded), chunk_size):
		rank = get_combindex(padded[start:start + chunk_size], ordinals)
		pieces.append(get_permutation(rank, alphabet))
	return ''.join(pieces)
207 | 
208 | 
def decode(text):
	"""Decode a text using equitext

		The text is split into permutations of its alphabet and every
		permutation is replaced by the chunk having the same index. Texts
		whose detected alphabet has fewer than two characters are returned
		as is.

		:param text: The string to decode

		:returns: The decoded string
	"""
	alphabet = get_alphabet_from_encoded(text)
	width = len(alphabet)
	if width < 2:
		return text
	chunk_size = get_chunk_length(alphabet)
	filler = alphabet[0]
	pieces = []
	for start in range(0, len(text), width):
		rank = get_permindex(text[start:start + width], alphabet)
		chunk = get_combination(rank, alphabet)
		# Restore the leading first-alphabet characters, which carry no
		# weight in the index and are therefore missing from the chunk.
		pieces.append(chunk.rjust(chunk_size, filler))
	decoded = ''.join(pieces)
	# The ordinal of the last character is the number of characters to strip.
	pad_size = alphabet.index(decoded[-1])
	return decoded[:-pad_size]
229 | 
230 | 
# When run as a script, print the character histogram of a sample string.
if __name__ == '__main__':
	histogram('hello, world!')
233 | 


--------------------------------------------------------------------------------