├── README.md
├── create_dictionnary.py
├── decipher_MCMC.py
├── liste.de.mots.francais.frgut.txt
├── swann.txt
└── text_utils.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Decipher using Markov Chain Monte Carlo

Code I created for my YouTube video:

[![screenshot](https://github.com/user-attachments/assets/409208d8-0d14-4727-a22b-42d2928658a2)](https://www.youtube.com/watch?v=z4tkHuWZbRA)

Provided "as is" and without warranties of any kind :-)

I don't do any maintenance on it.

Run create_dictionnary.py once to create the dictionnary.data file.

Then run decipher_MCMC.py:
- set COUNT_BIGRAMS = True at least once to create the bigrams file (it can subsequently be set to False)
- put the text to decipher in the ciphered_text string
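To try the decoder on a text of your choice, you can build a ciphered input with the helpers from text_utils.py. A minimal sketch (the plaintext below is just an example, not from the repository):

```python
import numpy as np
from text_utils import transform_to_caps, apply_code

# Encode an example sentence with a random substitution.
# Index 0 is the space character and stays fixed.
original = transform_to_caps("la marquise sortit a cinq heures")
true_code = [0] + list(np.random.permutation(range(1, 27)))
print(apply_code(original, true_code))  # paste the output into ciphered_text
```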
--------------------------------------------------------------------------------
/create_dictionnary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 10 10:20:37 2021

@author: david
"""

import codecs
import pickle
from text_utils import transform_to_caps

filepath = "liste.de.mots.francais.frgut.txt"

whole_dico = ""
with codecs.open(filepath, "r", "utf-8") as lines:
    for l in lines:
        whole_dico += l[:-1] + " "

dico = transform_to_caps(whole_dico)
# We also add the single letters and QU (for elisions like "qu'")
words = ['QU'] + [chr(i) for i in range(65, 91)] + dico.split(" ")

pickle.dump(words, open("dictionnary.data", "wb"))
--------------------------------------------------------------------------------
/decipher_MCMC.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
from text_utils import load_corpus, transform_to_caps, count_bigrams, char_to_id, id_to_char, count_correct_words, score_correct_words, frequency_order, apply_code, invert_code
import pickle

# =============================================================================

# CREATE BIGRAMS (needed only the first time)
COUNT_BIGRAMS = True

corpus_filename = "swann.txt"
bigrams_filename = "bigrams.dat"

if COUNT_BIGRAMS:
    corpus = load_corpus(corpus_filename)
    corpus = transform_to_caps(corpus)
    count_bigrams(corpus, bigrams_filename, "bigrams.png")

# =============================================================================

# LOAD BIGRAMS and build the log(probability) matrix

bigrams = np.fromfile(bigrams_filename, dtype="int32").reshape(27, 27)
p = bigrams.astype(float) / bigrams.sum(axis=1, keepdims=True)
p[np.isnan(p)] = 0
EPSILON = 1e-6
logp = np.log(p + EPSILON)

# =============================================================================

# Log-likelihood per character
def likelihood(s):
    res = 0
    c1 = s[0]
    for c2 in s[1:]:
        i = char_to_id(c1)
        j = char_to_id(c2)
        res += logp[i, j]
        c1 = c2

    return res / len(s)
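# Illustrative check (added commentary, not part of the original script):
# genuine French should score above the THRESHOLD used below (-2.05 per
# character), while gibberish scores much lower. For instance, with the
# opening sentence of the corpus:
# print(likelihood(transform_to_caps("Longtemps, je me suis couche de bonne heure.")))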
# Swap two entries of a code (a transposition)
def permute_code(code, i, j):
    newcode = code.copy()
    newcode[j] = code[i]
    newcode[i] = code[j]
    return newcode


# Text used to compute the letter frequencies for the initial guess
freq_text = transform_to_caps(load_corpus("swann.txt"))

ciphered_text = "Ci kmvjpx vb ci wmoxi"

# =============================================================================
# =============================================================================

# Text to decipher: choose among the following options

# =============================================================================

## 1) TEST TEXT : random sentence from corpus

#test_text = transform_to_caps(load_corpus("swann.txt"))
#START = np.random.randint(180000)
#SIZE = 30  # number of words
#original = " ".join(test_text.split(" ")[START:(START + SIZE)])
#true_code = [0] + list(np.random.permutation(range(1, 27)))
#ciphered_text = apply_code(original, true_code)

# =============================================================================

# 2) FOR ERIC

#foreric = """NASJX OXH NXH SOE BXYA SJXEA PY SNXYZEA ! YH ZASJSEG BSP ZAEJESG, RSA G SP ZY JY ? OS BAXBXPEZEXH EHRGYZ GS DEPBSAEZEXH, P'SKKASHRUEPPSHZ D'YH SZZAENYZ BSAOE JEHTZ-PEM. BSP RXH ! (XYE C'SE KSEZ KXAZ)
# JXEGS QYE PSHP PXYBRXH PSYAS ASJEA ZXH RENXYGXZ, XY DY OXEHP ZXH SAZ DY BLZUXH. SY BGSEPEA D'YH CXYA HXYP JXEA SYZXYA D'YH NXRV XY DY KGSRXH D'YH JEH, QY'EG PXEZ NGSHR XY AYNEP. ZXH JEG SOE"""
#ciphered_text = transform_to_caps(foreric)

# =============================================================================

# 3) BAER

#otis = """Mais, vous savez, moi je ne crois pas qu’il y ait de bonne ou de mauvaise situation. Moi, si je devais résumer ma vie aujourd’hui avec vous, je dirais que c’est d’abord des rencontres, des gens qui m’ont tendu la main"""#, peut-être à un moment où je ne pouvais pas, où j’étais seul chez moi. Et c’est assez curieux de se dire que les hasards, les rencontres forgent une destinée… Parce que quand on a le goût de la chose, quand on a le goût de la chose bien faite, le beau geste, parfois on ne trouve pas l’interlocuteur en face, je dirais, le miroir qui vous aide à avancer"""
#otis = """Et c’est assez curieux de se dire que les hasards, les rencontres forgent une destinée"""
#otis = """moi je ne crois pas qu’il y ait de bonne ou de mauvaise situation"""
#
#original = transform_to_caps(otis)
#np.random.seed(3)
#true_code = [0] + list(np.random.permutation(range(1, 27)))
#ciphered_text = apply_code(original, true_code)
#print(" ".join([chr(64 + i) for i in true_code[1:]]))
#true_decode = invert_code(true_code)
#print(" ".join([chr(64 + i) for i in true_decode[1:]]))
#
#print(original + " (" + str(len(original)) + " chars)")
#print(likelihood(original))
#print("\n\n")


# =============================================================================
# =============================================================================

ciphered_text = transform_to_caps(ciphered_text)

print(ciphered_text + " Likelihood: {0:.2f}".format(likelihood(ciphered_text)))
print("\n")

# Initial guess from letter frequencies

ref_freq = frequency_order(freq_text)
obs_freq = frequency_order(ciphered_text)

freq_code = [0] + list(range(1, 27))

for i in range(1, 27):
    pos = obs_freq.index(i)
    freq_code[i] = ref_freq[pos]

cur_code = freq_code.copy()
cur_trad = apply_code(ciphered_text, cur_code)
cur_like = likelihood(cur_trad)


# Best found so far
best_code = cur_code.copy()
best_like = cur_like
best_trad = cur_trad

print(best_trad + " N=" + str(0) + " L={0:.2f}".format(best_like))
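# Added commentary: the loop below is a Metropolis sampler. A transposition
# of two letters is proposed; it is always accepted when it improves the
# likelihood (p >= 1), and otherwise accepted with probability
# p = exp(ALPHA * (tt_like - cur_like) * len(ciphered_text)).
# Multiplying by len(ciphered_text) converts the per-character score back
# into a total log-likelihood before exponentiating.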
# Now the main loop

MIN_ITER = 2000
MAX_ITER = 100000
THRESHOLD = -2.05
ALPHA = 1

for k in range(MAX_ITER):

    # Build a tentative move
    i = np.random.randint(1, 27)
    j = np.random.randint(1, 27)
    tt_code = permute_code(cur_code, i, j)

    tt_trad = apply_code(ciphered_text, tt_code)
    tt_like = likelihood(tt_trad)

    # Test whether the move should be accepted
    x = np.random.rand()
    p = np.exp(ALPHA * (tt_like - cur_like) * len(ciphered_text))

    if x < p:
        cur_code = tt_code.copy()
        cur_trad = tt_trad
        cur_like = tt_like
        #print("ACCEPT")

    if cur_like > best_like:
        best_code = cur_code.copy()
        best_like = cur_like
        best_trad = cur_trad
        print(best_trad + " [k=" + str(k) + " L={0:.2f}]".format(best_like))

    if k > MIN_ITER and best_like > THRESHOLD:
        break


#######################################################################################################

print("\nEnter second phase")


with open('dictionnary.data', 'rb') as filehandle:
    dictionnary_words = pickle.load(filehandle)

cnt, total = count_correct_words(best_trad, dictionnary_words)
word_score = score_correct_words(best_trad, dictionnary_words)

print("Correct words " + str(cnt) + "/" + str(total) + " score=" + str(word_score))

GAMMA = 4.0
best_score = GAMMA * word_score + best_like


cur_code = best_code.copy()
cur_score = best_score
cur_trad = best_trad

NITER2 = 2000
temperature = 0.05
rho = 0.999

for k in range(NITER2):

    # Build a tentative move and compute its score
    i = np.random.randint(1, 27)
    j = np.random.randint(1, 27)
    tt_code = permute_code(cur_code, i, j)
    tt_trad = apply_code(ciphered_text, tt_code)
    tt_word_score = score_correct_words(tt_trad, dictionnary_words)
    tt_like = likelihood(tt_trad)
    tt_score = GAMMA * tt_word_score + tt_like

    # Test whether the move should be accepted
    x = np.random.rand()
    p = np.exp((tt_score - cur_score) / temperature)
    temperature = temperature * rho

    if x < p:
        cur_code = tt_code.copy()
        cur_trad = tt_trad
        cur_score = tt_score

    if cur_score > best_score:
        best_code = cur_code.copy()
        best_score = cur_score
        best_trad = cur_trad
        print(tt_trad + " W={0:.2f}".format(tt_word_score))
        #print(tt_trad + " W={0:.2f} L={1:.2f} S={2:.2f} T={3:.3f} k={4}".format(tt_word_score, tt_like, tt_score, temperature, k))
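# Added commentary on the second phase above: it is a simulated-annealing
# pass whose score mixes the dictionary word score (weighted by GAMMA) with
# the bigram likelihood. The temperature decays geometrically, so after
# NITER2 = 2000 steps T is about 0.05 * 0.999**2000, roughly 0.0068, and late
# iterations accept almost only improving moves.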
--------------------------------------------------------------------------------
/text_utils.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import codecs
import re
import numpy as np
import matplotlib.pyplot as plt
import pickle

# Functions mapping space and A-Z to [0-26]

def char_to_id(c):
    return 0 if c == " " else ord(c) - 64

def id_to_char(i):
    return " " if i == 0 else chr(i + 64)


def apply_code(s, code):
    res = ""
    for c in s:
        i = char_to_id(c)
        res += id_to_char(code[i])
    return res


def invert_code(code):
    res = [-1] * 27
    for i in range(27):
        j = code.index(i)
        res[i] = j
    return res


def transform_to_caps(s):
    # Substitute the "oe" ligatures
    s = re.sub(chr(338), "OE", s)
    s = re.sub(chr(339), "OE", s)

    # Replace accented letters, turn punctuation signs into spaces.
    # In the end, everything should be a space (ascii 32) or a capital A-Z (ascii 65-90).
    to_A = [192, 224, 226]
    to_C = [199, 231]
    to_E = [200, 201, 202, 232, 233, 234, 235]
    to_I = [238, 239]
    to_O = [244]
    to_U = [249, 251, 252]
    to_SPACE = list(range(33, 65)) + [171, 187, 8217, 8230]

    s_sub = ""
    for c in s:
        c2 = c
        if ord(c2) in range(97, 123):
            c2 = chr(ord(c2) - 32)
        if ord(c2) in to_SPACE:
            c2 = " "
        if ord(c2) in to_A:
            c2 = "A"
        if ord(c2) in to_C:
            c2 = "C"
        if ord(c2) in to_E:
            c2 = "E"
        if ord(c2) in to_I:
            c2 = "I"
        if ord(c2) in to_O:
            c2 = "O"
        if ord(c2) in to_U:
            c2 = "U"
        s_sub += c2

    # Collapse multiple spaces
    res = re.sub(r'\s+', ' ', s_sub)

    return res

def load_corpus(filename):

    # Load the text file and make it one single string
    encoding = "utf-8"
    whole_text = ""
    with codecs.open(filename, "r", encoding) as lines:
        for l in lines:
            # :-2 to strip the line ending, then concatenate with a space
            whole_text = whole_text + l[:-2] + " "

    return whole_text


def count_correct_words(s, dictionnary_words):
    cnt = 0
    word_list = s.split(" ")
    for w in word_list:
        if w in dictionnary_words:
            cnt += 1
    return cnt, len(word_list)

def score_correct_words(s, dictionnary_words):
    # Fraction of letters belonging to words found in the dictionary
    res = 0
    tot = 0
    word_list = s.split(" ")
    for w in word_list:
        if w in dictionnary_words:
            res += len(w)
        tot += len(w)
    return res / tot

def find_wrong_words(s, dictionnary_words):
    word_list = s.split(" ")
    for w in word_list:
        if w not in dictionnary_words:
            print(w)


def frequency_order(s):
    # Letter ids (1-26) sorted from most to least frequent in s
    res = np.zeros(26)
    for i in range(26):
        res[i] = s.count(chr(65 + i))
    return list(1 + np.argsort(res)[::-1])

def count_bigrams(corpus, outfile, imagefile=None):

    # Count the ASCII characters, to check everything is OK
    count = np.zeros(512)
    for c in corpus:
        count[ord(c)] += 1

    for i in range(512):
        if count[i] > 0:
            print(str(i) + " " + chr(i) + " " + str(count[i]))

    # Now we are ready to count the bigrams
    bigrams = np.zeros((27, 27), dtype='int32')
    i = 0
    for c in corpus:
        j = 0 if c == " " else ord(c) - 64
        bigrams[i, j] += 1
        i = j

    bigrams.tofile(outfile)

    if imagefile is not None:
        # Plot of the matrix, normalized per line
        p2D = bigrams.astype(float) / bigrams.sum(axis=1, keepdims=True)
        p2D[np.isnan(p2D)] = 0

        alpha = 0.33
        p2Da = p2D**alpha
        plt.figure(figsize=(8, 8))
        plt.imshow(p2Da, interpolation='nearest', cmap='inferno')
        plt.axis('off')

        for ip, i in enumerate([32] + list(range(65, 91))):
            plt.text(-1, ip, chr(i), horizontalalignment='center',
                     verticalalignment='center')
            plt.text(ip, -1, chr(i), horizontalalignment='center',
                     verticalalignment='center')
        plt.savefig(imagefile)

        return p2Da


def create_dictionnary():
    filepath = "liste.de.mots.francais.frgut.txt"

    whole_dico = ""
    with codecs.open(filepath, "r", "utf-8") as lines:
        for l in lines:
            whole_dico += l[:-1] + " "

    dico = transform_to_caps(whole_dico)
    # We also add some single letters and QU (for elisions like "qu'")
    words = ['QU'] + ['A', 'C', 'D', 'L', 'Y', 'M', 'N', 'S', 'T'] + dico.split(" ")

    pickle.dump(words, open("dictionnary.data", "wb"))


emolist = [
    '\U0001F600',
    '\U0001F601',
    '\U0001F602',
    '\U0001F603',
    '\U0001F604',
    '\U0001F605',
    '\U0001F606',
    '\U0001F607',
    '\U0001F608',
    '\U0001F609',
    '\U0001F610',
    '\U0001F611',
    '\U0001F612',
    '\U0001F613',
    '\U0001F614',
    '\U0001F615',
    '\U0001F616',
    '\U0001F617',
    '\U0001F618',
    '\U0001F619',
    '\U0001F620',
    '\U0001F621',
    '\U0001F622',
    '\U0001F623',
    '\U0001F624',
    '\U0001F625']
--------------------------------------------------------------------------------