├── README.md
├── create_dictionnary.py
├── decipher_MCMC.py
├── liste.de.mots.francais.frgut.txt
├── swann.txt
└── text_utils.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Decipher using Markov Chain Monte Carlo

Code I created for my YouTube video:

[![screenshot](https://github.com/user-attachments/assets/409208d8-0d14-4727-a22b-42d2928658a2)](https://www.youtube.com/watch?v=z4tkHuWZbRA)

Provided "as is" and without warranties of any kind :-)

I don't do any maintenance on it.

Run create_dictionnary.py once to create the dictionnary.data file.

Then run decipher_MCMC.py:
- set COUNT_BIGRAMS = True at least once to create the bigrams file (it can subsequently be set to False)
- put the text to decipher in the ciphered_text string
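To try the decoder on a text of your choice, you can build a ciphered input with the helpers from text_utils.py. A minimal sketch (the plaintext below is just an example, not from the repository):

```python
import numpy as np
from text_utils import transform_to_caps, apply_code

# Encode an example sentence with a random substitution.
# Index 0 is the space character and stays fixed.
original = transform_to_caps("la marquise sortit a cinq heures")
true_code = [0] + list(np.random.permutation(range(1, 27)))
print(apply_code(original, true_code))  # paste the output into ciphered_text
```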
--------------------------------------------------------------------------------
/create_dictionnary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 10 10:20:37 2021

@author: david
"""

import codecs
import pickle
from text_utils import transform_to_caps

filepath = "liste.de.mots.francais.frgut.txt"

whole_dico = ""
with codecs.open(filepath, "r", "utf-8") as lines:
    for l in lines:
        whole_dico += l[:-1] + " "

dico = transform_to_caps(whole_dico)
# We also add the single letters and QU (for elisions like "qu'")
words = ['QU'] + [chr(i) for i in range(65, 91)] + dico.split(" ")

pickle.dump(words, open("dictionnary.data", "wb"))
--------------------------------------------------------------------------------
/decipher_MCMC.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
from text_utils import load_corpus, transform_to_caps, count_bigrams, char_to_id, id_to_char, count_correct_words, score_correct_words, frequency_order, apply_code, invert_code
import pickle

# =============================================================================

# CREATE BIGRAMS (needed only the first time)
COUNT_BIGRAMS = True

corpus_filename = "swann.txt"
bigrams_filename = "bigrams.dat"

if COUNT_BIGRAMS:
    corpus = load_corpus(corpus_filename)
    corpus = transform_to_caps(corpus)
    count_bigrams(corpus, bigrams_filename, "bigrams.png")

# =============================================================================

# LOAD BIGRAMS and build the log(probability) matrix

bigrams = np.fromfile(bigrams_filename, dtype="int32").reshape(27, 27)
p = bigrams.astype(float) / bigrams.sum(axis=1, keepdims=True)
p[np.isnan(p)] = 0
EPSILON = 1e-6
logp = np.log(p + EPSILON)

# =============================================================================

# Log-likelihood per character
def likelihood(s):
    res = 0
    c1 = s[0]
    for c2 in s[1:]:
        i = char_to_id(c1)
        j = char_to_id(c2)
        res += logp[i, j]
        c1 = c2

    return res / len(s)
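# Illustrative check (added commentary, not part of the original script):
# genuine French should score above the THRESHOLD used below (-2.05 per
# character), while gibberish scores much lower. For instance, with the
# opening sentence of the corpus:
# print(likelihood(transform_to_caps("Longtemps, je me suis couche de bonne heure.")))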
# Swap two entries of a code (a transposition)
def permute_code(code, i, j):
    newcode = code.copy()
    newcode[j] = code[i]
    newcode[i] = code[j]
    return newcode


# Text used to compute the letter frequencies for the initial guess
freq_text = transform_to_caps(load_corpus("swann.txt"))

ciphered_text = "Ci kmvjpx vb ci wmoxi"

# =============================================================================
# =============================================================================

# Text to decipher: choose among the following options

# =============================================================================

## 1) TEST TEXT : random sentence from corpus

#test_text = transform_to_caps(load_corpus("swann.txt"))
#START = np.random.randint(180000)
#SIZE = 30  # number of words
#original = " ".join(test_text.split(" ")[START:(START + SIZE)])
#true_code = [0] + list(np.random.permutation(range(1, 27)))
#ciphered_text = apply_code(original, true_code)

# =============================================================================

# 2) FOR ERIC

#foreric = """NASJX OXH NXH SOE BXYA SJXEA PY SNXYZEA ! YH ZASJSEG BSP ZAEJESG, RSA G SP ZY JY ? OS BAXBXPEZEXH EHRGYZ GS DEPBSAEZEXH, P'SKKASHRUEPPSHZ D'YH SZZAENYZ BSAOE JEHTZ-PEM. BSP RXH ! (XYE C'SE KSEZ KXAZ)
# JXEGS QYE PSHP PXYBRXH PSYAS ASJEA ZXH RENXYGXZ, XY DY OXEHP ZXH SAZ DY BLZUXH. SY BGSEPEA D'YH CXYA HXYP JXEA SYZXYA D'YH NXRV XY DY KGSRXH D'YH JEH, QY'EG PXEZ NGSHR XY AYNEP. ZXH JEG SOE"""
#ciphered_text = transform_to_caps(foreric)

# =============================================================================

# 3) BAER

#otis = """Mais, vous savez, moi je ne crois pas qu’il y ait de bonne ou de mauvaise situation. Moi, si je devais résumer ma vie aujourd’hui avec vous, je dirais que c’est d’abord des rencontres, des gens qui m’ont tendu la main"""#, peut-être à un moment où je ne pouvais pas, où j’étais seul chez moi. Et c’est assez curieux de se dire que les hasards, les rencontres forgent une destinée… Parce que quand on a le goût de la chose, quand on a le goût de la chose bien faite, le beau geste, parfois on ne trouve pas l’interlocuteur en face, je dirais, le miroir qui vous aide à avancer"""
#otis = """Et c’est assez curieux de se dire que les hasards, les rencontres forgent une destinée"""
#otis = """moi je ne crois pas qu’il y ait de bonne ou de mauvaise situation"""
#
#original = transform_to_caps(otis)
#np.random.seed(3)
#true_code = [0] + list(np.random.permutation(range(1, 27)))
#ciphered_text = apply_code(original, true_code)
#print(" ".join([chr(64 + i) for i in true_code[1:]]))
#true_decode = invert_code(true_code)
#print(" ".join([chr(64 + i) for i in true_decode[1:]]))
#
#print(original + " (" + str(len(original)) + " chars)")
#print(likelihood(original))
#print("\n\n")


# =============================================================================
# =============================================================================

ciphered_text = transform_to_caps(ciphered_text)

print(ciphered_text + " Likelihood: {0:.2f}".format(likelihood(ciphered_text)))
print("\n")

# Initial guess from letter frequencies

ref_freq = frequency_order(freq_text)
obs_freq = frequency_order(ciphered_text)

freq_code = [0] + list(range(1, 27))

for i in range(1, 27):
    pos = obs_freq.index(i)
    freq_code[i] = ref_freq[pos]

cur_code = freq_code.copy()
cur_trad = apply_code(ciphered_text, cur_code)
cur_like = likelihood(cur_trad)


# Best found so far
best_code = cur_code.copy()
best_like = cur_like
best_trad = cur_trad

print(best_trad + " N=" + str(0) + " L={0:.2f}".format(best_like))
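# Added commentary: the loop below is a Metropolis sampler. A transposition
# of two letters is proposed; it is always accepted when it improves the
# likelihood (p >= 1), and otherwise accepted with probability
# p = exp(ALPHA * (tt_like - cur_like) * len(ciphered_text)).
# Multiplying by len(ciphered_text) converts the per-character score back
# into a total log-likelihood before exponentiating.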
# Now the main loop

MIN_ITER = 2000
MAX_ITER = 100000
THRESHOLD = -2.05
ALPHA = 1

for k in range(MAX_ITER):

    # Build a tentative move
    i = np.random.randint(1, 27)
    j = np.random.randint(1, 27)
    tt_code = permute_code(cur_code, i, j)

    tt_trad = apply_code(ciphered_text, tt_code)
    tt_like = likelihood(tt_trad)

    # Test whether the move should be accepted
    x = np.random.rand()
    p = np.exp(ALPHA * (tt_like - cur_like) * len(ciphered_text))

    if x < p:
        cur_code = tt_code.copy()
        cur_trad = tt_trad
        cur_like = tt_like
        #print("ACCEPT")

    if cur_like > best_like:
        best_code = cur_code.copy()
        best_like = cur_like
        best_trad = cur_trad
        print(best_trad + " [k=" + str(k) + " L={0:.2f}]".format(best_like))

    if k > MIN_ITER and best_like > THRESHOLD:
        break


#######################################################################################################

print("\nEnter second phase")


with open('dictionnary.data', 'rb') as filehandle:
    dictionnary_words = pickle.load(filehandle)

cnt, total = count_correct_words(best_trad, dictionnary_words)
word_score = score_correct_words(best_trad, dictionnary_words)

print("Correct words " + str(cnt) + "/" + str(total) + " score=" + str(word_score))

GAMMA = 4.0
best_score = GAMMA * word_score + best_like


cur_code = best_code.copy()
cur_score = best_score
cur_trad = best_trad

NITER2 = 2000
temperature = 0.05
rho = 0.999

for k in range(NITER2):

    # Build a tentative move and compute its score
    i = np.random.randint(1, 27)
    j = np.random.randint(1, 27)
    tt_code = permute_code(cur_code, i, j)
    tt_trad = apply_code(ciphered_text, tt_code)
    tt_word_score = score_correct_words(tt_trad, dictionnary_words)
    tt_like = likelihood(tt_trad)
    tt_score = GAMMA * tt_word_score + tt_like

    # Test whether the move should be accepted
    x = np.random.rand()
    p = np.exp((tt_score - cur_score) / temperature)
    temperature = temperature * rho

    if x < p:
        cur_code = tt_code.copy()
        cur_trad = tt_trad
        cur_score = tt_score

    if cur_score > best_score:
        best_code = cur_code.copy()
        best_score = cur_score
        best_trad = cur_trad
        print(tt_trad + " W={0:.2f}".format(tt_word_score))
        #print(tt_trad + " W={0:.2f} L={1:.2f} S={2:.2f} T={3:.3f} k={4}".format(tt_word_score, tt_like, tt_score, temperature, k))
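# Added commentary on the second phase above: it is a simulated-annealing
# pass whose score mixes the dictionary word score (weighted by GAMMA) with
# the bigram likelihood. The temperature decays geometrically, so after
# NITER2 = 2000 steps T is about 0.05 * 0.999**2000, roughly 0.0068, and late
# iterations accept almost only improving moves.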
--------------------------------------------------------------------------------
/text_utils.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import codecs
import re
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Functions mapping space and A-Z to [0-26]

def char_to_id(c):
    return 0 if c == " " else ord(c) - 64

def id_to_char(i):
    return " " if i == 0 else chr(i + 64)


def apply_code(s, code):
    res = ""
    for c in s:
        i = char_to_id(c)
        res += id_to_char(code[i])
    return res


def invert_code(code):
    res = [-1] * 27
    for i in range(27):
        j = code.index(i)
        res[i] = j
    return res


def transform_to_caps(s):
    # Substitute the "oe" ligatures
    s = re.sub(chr(338), "OE", s)
    s = re.sub(chr(339), "OE", s)

    # Replace accented letters, turn punctuation signs into spaces.
    # In the end, everything should be a space (ascii 32) or a capital A-Z (ascii 65-90).
    to_A = [192, 224, 226]
    to_C = [199, 231]
    to_E = [200, 201, 202, 232, 233, 234, 235]
    to_I = [238, 239]
    to_O = [244]
    to_U = [249, 251, 252]
    to_SPACE = list(range(33, 65)) + [171, 187, 8217, 8230]

    s_sub = ""
    for c in s:
        c2 = c
        if ord(c2) in range(97, 123):
            c2 = chr(ord(c2) - 32)
        if ord(c2) in to_SPACE:
            c2 = " "
        if ord(c2) in to_A:
            c2 = "A"
        if ord(c2) in to_C:
            c2 = "C"
        if ord(c2) in to_E:
            c2 = "E"
        if ord(c2) in to_I:
            c2 = "I"
        if ord(c2) in to_O:
            c2 = "O"
        if ord(c2) in to_U:
            c2 = "U"
        s_sub += c2

    # Collapse multiple spaces
    res = re.sub(r'\s+', ' ', s_sub)

    return res

def load_corpus(filename):

    # Load the text file and make it one single string
    encoding = "utf-8"
    whole_text = ""
    with codecs.open(filename, "r", encoding) as lines:
        for l in lines:
            # :-2 to strip the line ending, then concatenate with a space
            whole_text = whole_text + l[:-2] + " "

    return whole_text


def count_correct_words(s, dictionnary_words):
    cnt = 0
    word_list = s.split(" ")
    for w in word_list:
        if w in dictionnary_words:
            cnt += 1
    return cnt, len(word_list)

def score_correct_words(s, dictionnary_words):
    # Fraction of letters belonging to words found in the dictionary
    res = 0
    tot = 0
    word_list = s.split(" ")
    for w in word_list:
        if w in dictionnary_words:
            res += len(w)
        tot += len(w)
    return res / tot

def find_wrong_words(s, dictionnary_words):
    word_list = s.split(" ")
    for w in word_list:
        if w not in dictionnary_words:
            print(w)


def frequency_order(s):
    # Letter ids (1-26) sorted from most to least frequent in s
    res = np.zeros(26)
    for i in range(26):
        res[i] = s.count(chr(65 + i))
    return list(1 + np.argsort(res)[::-1])

def count_bigrams(corpus, outfile, imagefile=None):

    # Count the ASCII characters, to check everything is OK
    count = np.zeros(512)
    for c in corpus:
        count[ord(c)] += 1

    for i in range(512):
        if count[i] > 0:
            print(str(i) + " " + chr(i) + " " + str(count[i]))

    # Now we are ready to count the bigrams
    bigrams = np.zeros((27, 27), dtype='int32')
    i = 0
    for c in corpus:
        j = 0 if c == " " else ord(c) - 64
        bigrams[i, j] += 1
        i = j

    bigrams.tofile(outfile)

    if imagefile is not None:
        # Plot of the matrix, normalized per line
        p2D = bigrams.astype(float) / bigrams.sum(axis=1, keepdims=True)
        p2D[np.isnan(p2D)] = 0

        alpha = 0.33
        p2Da = p2D**alpha
        plt.figure(figsize=(8, 8))
        plt.imshow(p2Da, interpolation='nearest', cmap='inferno')
        plt.axis('off')

        for ip, i in enumerate([32] + list(range(65, 91))):
            plt.text(-1, ip, chr(i), horizontalalignment='center',
                     verticalalignment='center')
            plt.text(ip, -1, chr(i), horizontalalignment='center',
                     verticalalignment='center')
        plt.savefig(imagefile)

        return p2Da


def create_dictionnary():
    filepath = "liste.de.mots.francais.frgut.txt"

    whole_dico = ""
    with codecs.open(filepath, "r", "utf-8") as lines:
        for l in lines:
            whole_dico += l[:-1] + " "

    dico = transform_to_caps(whole_dico)
    # We also add some single letters and QU (for elisions like "qu'")
    words = ['QU'] + ['A', 'C', 'D', 'L', 'Y', 'M', 'N', 'S', 'T'] + dico.split(" ")

    pickle.dump(words, open("dictionnary.data", "wb"))


emolist = [
    '\U0001F600',
    '\U0001F601',
    '\U0001F602',
    '\U0001F603',
    '\U0001F604',
    '\U0001F605',
    '\U0001F606',
    '\U0001F607',
    '\U0001F608',
    '\U0001F609',
    '\U0001F610',
    '\U0001F611',
    '\U0001F612',
    '\U0001F613',
    '\U0001F614',
    '\U0001F615',
    '\U0001F616',
    '\U0001F617',
    '\U0001F618',
    '\U0001F619',
    '\U0001F620',
    '\U0001F621',
    '\U0001F622',
    '\U0001F623',
    '\U0001F624',
    '\U0001F625']
--------------------------------------------------------------------------------