├── anechoic_chamber_recordings ├── ps2_keybrd │ ├── frog │ ├── fruit │ ├── fourier │ └── stress └── laptop_keybrd │ ├── dear │ ├── frog │ ├── fruit │ ├── fourier │ └── stress ├── recordwords.sh ├── dispsim.py ├── sorcmd.sh ├── createwmat.py ├── README ├── simlist.py └── final_ver5.py /anechoic_chamber_recordings/ps2_keybrd/frog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/frog -------------------------------------------------------------------------------- /anechoic_chamber_recordings/ps2_keybrd/fruit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/fruit -------------------------------------------------------------------------------- /anechoic_chamber_recordings/laptop_keybrd/dear: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/dear -------------------------------------------------------------------------------- /anechoic_chamber_recordings/laptop_keybrd/frog: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/frog -------------------------------------------------------------------------------- /anechoic_chamber_recordings/laptop_keybrd/fruit: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/fruit -------------------------------------------------------------------------------- /anechoic_chamber_recordings/ps2_keybrd/fourier: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/fourier -------------------------------------------------------------------------------- /anechoic_chamber_recordings/ps2_keybrd/stress: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/stress -------------------------------------------------------------------------------- /anechoic_chamber_recordings/laptop_keybrd/fourier: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/fourier -------------------------------------------------------------------------------- /anechoic_chamber_recordings/laptop_keybrd/stress: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/stress -------------------------------------------------------------------------------- /recordwords.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $samples 4 | 5 | while read inputline; 6 | do 7 | word="$inputline" 8 | if [ -z "${word}" ]; 9 | then 10 | exit 11 | fi 12 | 13 | let samples=${#word}*48000 14 | echo "Samples:"$samples 15 | echo "Type the word:"$word 16 | python2 audio_to_file.py -I hw:0,0 -N $samples $1$word 17 | echo -e "\nWord saved" 18 | echo -e "\nEnter new word:" 19 | done 20 | 21 | -------------------------------------------------------------------------------- /dispsim.py: -------------------------------------------------------------------------------- 1 | import simlist 2 | import sys,ast,itertools,math 3 | 4 | key=simlist.KEY 5 | adj=simlist.ADJ 6 | near=simlist.NEAR 7 | dist=simlist.DIST 8 | 9 | comb=list(itertools.combinations(sys.argv[1],2)) 10 | ret=[] 11 | for i,tup in enumerate(comb): 12 | inter=[] 13 | if ( tup[0].upper() in key and tup[1].upper() in key): 14 | if tup[1].upper() == tup[0].upper(): 15 | inter.append('EQ') 16 | if tup[1].upper() in adj[tup[0].upper()]: 17 | inter.append('ADJ') 18 | if tup[1].upper() in near[tup[0].upper()]: 19 | inter.append('NEAR') 20 | if tup[1].upper() in dist[tup[0].upper()]: 21 | inter.append('DIST') 22 | ret.append(tup[0]+tup[1]+"-->"+str(inter)) 23 | 24 | print ret 25 | -------------------------------------------------------------------------------- /sorcmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | file=$1 3 | for w in $(tr 'A-Z ,."()?!;:' 'a-z\n' < $file); do echo ${#w} $w; done | sort -u | sort -n | cut -d " " -f2 > "${file}_sorted" 4 | #for w in $(tr 'A-Z ,."()?!;:' 'a-z\n' < $file); do echo ${#w} $w; done | sort -u | sort -n > "${file}_sorted" 5 | a="{" 6 | lineno=2 7 | for i in `seq 1 22`; 8 | do 9 | len=`tr ' ' '\n' <"${file}_sorted" | awk -v n=$i 'length($0)==n' | wc -l` 10 | let lineend=$len+$lineno-1 11 | a="${a}${i}:[$lineno,$lineend], " 12 | let lineno=$lineno+$len 13 | done 14 | a="$a}" 15 | sed -i "1i $a" ./"${file}_sorted" 16 | echo "Created sorted file "${file}_sorted"" 17 | 18 | if [ -d "wmat" ]; then 19 | echo "Removing wmat directory" 20 | rm -rf ./wmat 21 | fi 22 | mkdir wmat 23 | 24 | python2 ./createwmat.py "${file}_sorted" 25 | 26 | 27 | -------------------------------------------------------------------------------- /createwmat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys,ast,itertools,math 3 | 4 | import simlist 5 | 6 | pth="./wmat/" 7 | 8 | key=simlist.KEY 9 | adj=simlist.ADJ 10 | near=simlist.NEAR 11 | dist=simlist.DIST 12 | 13 | def nCr(n,r): 14 | f = math.factorial 15 | return f(n) / f(r) / f(n-r) 16 | 17 | 18 | fp=open(sys.argv[1],"r") 19 | index=ast.literal_eval(fp.readline()) 20 | 21 | #make W(adj,near and dist) matrix for set each n length words 22 | lastl=0 23 | cl=0 24 | weq=np.zeros(0,dtype='int') 25 | wadj=np.zeros(0,dtype='int') 26 | wnear=np.zeros(0,dtype='int') 27 | wdist=np.zeros(0,dtype='int') 28 | fw=open(pth+"dummy","w") 29 | for j,line in enumerate(fp): 30 | #taking \n to account 31 | line=line.replace("\n","") 32 | l=len(line) 33 | if(l!=lastl and l>=2): 34 | np.save(fw,weq) 35 | np.save(fw,wadj) 36 | np.save(fw,wnear) 37 | np.save(fw,wdist) 38 | fw.close() 39 | fw=open(pth+"wmat_"+str(l),"w") 40 | lastl=l 41 | coll=index[l][1]-index[l][0]+1 42 | weq=np.zeros((nCr(l,2),coll),dtype='int') 43 | wadj=np.zeros((nCr(l,2),coll),dtype='int') 44 | wnear=np.zeros((nCr(l,2),coll),dtype='int') 45 | wdist=np.zeros((nCr(l,2),coll),dtype='int') 46 | cl=0 47 | #print "cl init",coll 48 | 49 | comb=list(itertools.combinations(line,2)) 50 | 51 | for i,tup in enumerate(comb): 52 | #print l,i,cl,line 53 | if ( tup[0].upper() in key and tup[1].upper() in key): 54 | if (tup[1].upper() == tup[0].upper()): 55 | weq[i][cl]=1 56 | 57 | if tup[1].upper() in adj[tup[0].upper()]: 58 | wadj[i][cl]=1 59 | 60 | if tup[1].upper() in near[tup[0].upper()]: 61 | wnear[i][cl]=1 62 | 63 | if tup[1].upper() in dist[tup[0].upper()]: 64 | wdist[i][cl]=1 65 | cl=cl+1 66 | #if line=='dove': 67 | #print cl-1,weq[:,cl-1],"\n",wadj[:,cl-1],"\n",wnear[:,cl-1],"\n",wdist[:,cl-1] 68 | 69 | print "Matrix creation done" 70 | fp.close() 71 | 72 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ----------------------------------------------------------------------------------- 2 | Directory Structure 3 | ----------------------------------------------------------------------------------- 4 | 5 | key_board_sidechannel_10307933_10307936 6 | ├── audio_to_file.py 7 | ├── createwmat.py 8 | ├── dict 9 | ├── dispsim.py 10 | ├── final_ver5.py 11 | ├── Readme 12 | ├── recordwords.sh 13 | ├── simlist.py 14 | └── sorcmd.sh 15 | 16 | ----------------------------------------------------------------------------------- 17 | Dependencies 18 | ----------------------------------------------------------------------------------- 19 | Numpy and Scipy --- For matrix and array operations 20 | Matplotlib --- For plotting 21 | Gnuradio --- For basic audio processing and recording (any 22 | other program can be used to directly record 23 | from mic ) 24 | 25 | Python 2.x --- Aliased as python2 everywhere in the program 26 | 27 | ----------------------------------------------------------------------------------- 28 | Repository 29 | ----------------------------------------------------------------------------------- 30 | Entire code along with the audio recordings can be found in the github repository. 31 | Audio recordings are not included in this submission archive and should be 32 | downloaded from the repository as the file sizes are large. 33 | 34 | HTTP link: https://github.com/prajithrg/accoustic_cryptanalysis 35 | 36 | or clone the entire project by 37 | 38 | git clone https://github.com/prajithrg/accoustic_cryptanalysis.git 39 | git clone git://github.com/prajithrg/accoustic_cryptanalysis.git 40 | 41 | Audio recordings can be found in anechoic_chamber_recordings folder 42 | in the repository. 43 | 44 | ----------------------------------------------------------------------------------- 45 | Program files 46 | ----------------------------------------------------------------------------------- 47 | Readme --- This readme file 48 | dict --- Sample dictionary containing 27K words 49 | simlist.py --- ADJ, DIST and NEAR tables for QWERTY keyboard 50 | createwmat.py --- Program that creates constraint matrices (weq,wadj etc) 51 | audio_to_file.py --- Used for recording keystrokes from standard audio input 52 | recordwords.sh --- Used for continuous recording of various words 53 | sorcmd.sh --- Sorts the dictionary according to word length and adds 54 | header which makes further accessing easier. Makes use 55 | of createwmat.py to create constraint matrices. 56 | dispsim.py --- Displays original constraints for any given word 57 | final_ver5.py --- Original program that implements the attack 58 | 59 | ----------------------------------------------------------------------------------- 60 | Usage and brief explanation 61 | ----------------------------------------------------------------------------------- 62 | 1) Audio recording -- optional 63 | 64 | python2 audio_to_file.py -I hw:0,0 -N 48000 sampledump 65 | 66 | check -h option for more details 67 | 68 | 2) Sorting dictionary and creating constraint matrices for dictionary words 69 | 70 | ./sorcmd.sh dict 71 | 72 | This created a 'dict_sorted' file and 'wmat' directory that contains the 73 | constraint matrices for various wordlengths 74 | 75 | 3) Actual constraints 76 | 77 | python2 dispsim.py 78 | 79 | e.g python2 dispsim.py dear 80 | 81 | This program can be used for generating the actual constraints for a word 82 | 83 | 4) Final attack 84 | 85 | python2 final_ver5.py 86 | 87 | e.g. python2 final_ver5.py anechoic_chamber_recordings/laptop_keybrd/ 88 | 89 | ----------------------------------------------------------------------------------- 90 | -------------------------------------------------------------------------------- /simlist.py: -------------------------------------------------------------------------------- 1 | KEY=['Q','A','Z','W','S','X','E','D','C','R','F','V','T', 2 | 'G','B','Y','H','N','U','J','M','I','K','O','L','P'] 3 | 4 | ADJ={ 5 | 'Q':['Q','W','S','A'], 6 | 'A':['A','Q','W','S','Z'], 7 | 'Z':['Z','A','S','X'], 8 | 'W':['W','Q','A','S','D','E'], 9 | 'S':['S','Q','A','Z','X','D','E','W'], 10 | 'X':['X','Z','A','S','D','C'], 11 | 'E':['E','W','S','D','F','R'], 12 | 'D':['D','E','W','S','X','C','F','R'], 13 | 'C':['C','X','D','F','V'], 14 | 'R':['R','E','D','F','G','T'], 15 | 'F':['F','R','E','D','C','V','G','T'], 16 | 'V':['V','C','D','F','G','B'], 17 | 'T':['T','R','F','G','H','Y'], 18 | 'G':['G','T','R','F','V','B','H','Y'], 19 | 'B':['B','V','G','H','N'], 20 | 'Y':['Y','T','G','H','J','U'], 21 | 'H':['H','Y','T','G','B','N','J','U'], 22 | 'N':['N','B','H','J','M'], 23 | 'U':['U','Y','H','J','K','I'], 24 | 'J':['J','U','Y','H','N','M','K','I'], 25 | 'M':['M','N','J','K'], 26 | 'I':['I','U','J','K','L','O'], 27 | 'K':['K','I','U','J','M','L','O'], 28 | 'O':['O','I','K','L','P'], 29 | 'L':['L','O','I','K','P'], 30 | 'P':['P','O','L'], 31 | } 32 | 33 | NEAR={ 34 | 'Q':['Q','W','A','E','S','Z','D','X'], 35 | 'A':['A','Q','Z','W','S','X','E','D'], 36 | 'Z':['Z','Q','A','W','S','X','E','D','C'], 37 | 'W':['W','Q','A','Z','S','X','E','D','C','R','F'], 38 | 'S':['S','Q','A','Z','W','X','E','D','C','R','F'], 39 | 'X':['X','Q','A','Z','W','S','E','D','C','F','V'], 40 | 'E':['E','Q','A','Z','W','S','X','D','C','R','F','V','T','G'], 41 | 'D':['D','Q','A','Z','W','S','X','E','C','R','F','V','T','G'], 42 | 'C':['C','W','S','Z','E','S','X','R','D','F','V','T','G','B'], 43 | 'R':['R','W','S','X','E','D','C','F','V','T','G','B','Y','H'], 44 | 'F':['F','W','S','X','E','D','C','R','V','T','G','Y','H','B'], 45 | 'V':['V','E','D','X','R','F','C','T','G','B','Y','H','N'], 46 | 'T':['T','E','D','C','R','F','V','G','Y','H','B','U','J','N'], 47 | 'G':['G','E','D','C','R','F','V','T','B','Y','H','N','U','J'], 48 | 'B':['R','D','C','T','F','V','G','Y','H','N','U','J','M'], 49 | 'Y':['Y','R','F','V','T','G','B','U','H','I','J','N'], 50 | 'H':['H','R','F','V','T','G','B','Y','U','J','N','I','K','M'], 51 | 'N':['N','T','F','V','Y','G','B','H','U','J','M','I','K'], 52 | 'U':['U','T','G','B','Y','H','N','I','J','O','K','M','L'], 53 | 'J':['J','T','G','B','Y','H','U','N','I','K','M','O','L'], 54 | 'M':['M','Y','H','B','U','J','N','I','K','O','L'], 55 | 'I':['I','Y','H','N','U','J','M','O','K','P','L'], 56 | 'K':['K','Y','H','N','U','J','M','I','O','L','P'], 57 | 'O':['O','U','J','M','I','K','P','L'], 58 | 'L':['L','U','J','N','I','K','M','O','P'], 59 | 'P':['P','I','J','M','I','K','O','L'] 60 | } 61 | 62 | DIST={ 63 | 'Q':['B','C','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'], 64 | 'A':['B','C','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'], 65 | 'Z':['B','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'], 66 | 'W':['B','G','H','I','J','K','L','M','N','O','P','T','U','V','Y'], 67 | 'S':['B','G','H','I','J','K','L','M','N','O','P','T','U','V','Y'], 68 | 'X':['B','G','H','I','J','K','L','M','N','O','P','R','T','U','Y'], 69 | 'E':['B','H','I','J','K','L','M','N','O','P','U','Y'], 70 | 'D':['B','H','I','J','K','L','M','N','O','P','U','Y'], 71 | 'C':['A','H','I','J','K','L','M','N','O','P','Q','U','Y'], 72 | 'R':['A','I','J','K','L','M','N','O','P','Q','U','Z'], 73 | 'F':['A','I','J','K','L','M','N','O','P','Q','U','Z'], 74 | 'V':['A','I','J','K','L','M','O','P','Q','S','U','W','Z'], 75 | 'T':['A','I','K','L','M','O','P','Q','S','W','X','Z',], 76 | 'G':['A','I','K','L','M','O','P','Q','S','W','X','Z'], 77 | 'B':['A','B','E','I','K','L','O','P','Q','S','W','X','Z'], 78 | 'Y':['A','C','D','E','K','L','M','O','P','Q','S','W','X','Z'], 79 | 'H':['A','C','D','E','L','O','P','Q','S','W','X','Z'], 80 | 'N':['A','C','D','E','L','O','P','Q','R','S','W','X','Z'], 81 | 'U':['A','C','D','E','F','P','Q','R','S','V','W','X','Z'], 82 | 'J':['A','C','D','E','F','P','Q','R','S','V','W','X','Z'], 83 | 'M':['A','C','D','E','F','G','P','Q','R','S','T','V','W','X','Z'], 84 | 'I':['A','B','C','D','E','F','G','Q','R','S','T','V','W','X','Z'], 85 | 'K':['A','B','C','D','E','F','G','Q','R','S','T','V','W','X','Z'], 86 | 'O':['A','B','C','D','E','F','G','H','N','Q','R','S','T','V','W','X','Y','Z'], 87 | 'L':['A','B','C','D','E','F','G','H','Q','R','S','T','V','W','X','Y','Z'], 88 | 'P':['A','B','C','D','E','F','G','H','N','Q','R','S','T','U','V','W','X','Y','Z'] 89 | } 90 | 91 | -------------------------------------------------------------------------------- /final_ver5.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | import scipy.signal as sg 4 | import matplotlib.pyplot as plt 5 | 6 | import ast,random,sys 7 | 8 | 9 | #sample_rate=44100 10 | pth="./wmat/" 11 | sample_rate=48000 12 | stroke_period=.002 13 | 14 | div=int(sample_rate*stroke_period) 15 | 16 | data=np.fromfile(sys.argv[1],dtype="float32") 17 | #print len(data) 18 | dl=data.tolist() 19 | plt.figure(0) 20 | t=np.arange(0,len(dl),1) 21 | plt.plot(t,dl) 22 | 23 | 24 | 25 | fft_d=np.fft.fft(dl) 26 | l=len(fft_d) 27 | #plt.figure(1) 28 | #t=np.arange(0,len(dl),1) 29 | #plt.plot(t,abs(fft_d)) 30 | abs_fft=abs(fft_d) 31 | #print len(abs_fft) 32 | sum_lst=[] 33 | 34 | for i in range(0,l/div): 35 | fft_v=np.fft.fft(dl[i*div:(i+1)*div]) 36 | dummy=np.sum(abs(fft_v)) 37 | sum_lst.append(dummy) 38 | 39 | #nomalize the values 40 | sum_lst=np.array(sum_lst)/max(sum_lst) 41 | 42 | 43 | #new code to find a key press and release. 44 | press_pos=[] 45 | release_pos=[] 46 | energythresh=0.22 47 | glitchwin=5 48 | pressrelwin=20 49 | twokeysep=100 50 | 51 | #find postions where energy is greater than a threshold 52 | key_pos=np.array([i for i,x in enumerate(sum_lst) if x >energythresh]) 53 | 54 | #split the array for different key-strokes 55 | keyslots=key_pos[np.diff(key_pos)>=twokeysep] 56 | keyslotindex=[np.where(key_pos==j)[0][0]+1 for i,j in np.ndenumerate(keyslots)] 57 | keyseparate=np.split(key_pos,keyslotindex) 58 | 59 | #split each key to find press postion and release position 60 | pressrelslots=[val[np.diff(val)>=pressrelwin] for val in keyseparate] 61 | prerelslotindex=[] 62 | for k,arr in enumerate(pressrelslots): 63 | inter=[] 64 | for i,j in np.ndenumerate(arr): 65 | inter.append(np.where(keyseparate[k]==j)[0][0]+1) 66 | #print inter 67 | prerelslotindex.append(inter) 68 | 69 | prerelseparate=[np.split(val,prerelslotindex[i]) for i,val in enumerate(keyseparate)] 70 | 71 | press_pos=[val[0].tolist() for i,val in enumerate(prerelseparate)] 72 | for i,val in enumerate(prerelseparate): 73 | if len(val)>1: 74 | release_pos.append(val[1].tolist()) 75 | else: 76 | release_pos.append([]) 77 | 78 | #print "key",keyslots,keyslotindex,keyseparate 79 | #print 80 | #print "pressrelslots",pressrelslots 81 | #print 82 | #print "pressrelslotindex",prerelslotindex 83 | #print 84 | #print "prerelseparate",prerelseparate 85 | #print 86 | print "--"*70 87 | print "Press position matrix:\n",np.matrix(press_pos) 88 | print "--"*70 89 | print "Release position matrix\n",np.matrix(release_pos) 90 | print "--"*70 91 | 92 | 93 | #create key-similarity matrices 94 | def simmat(posarr,dl): 95 | #count=4 96 | retmat=[] 97 | for row,val in enumerate(posarr): 98 | if posarr[row]!=[]: 99 | corrstart=posarr[row][0]*div 100 | #corrend=(posarr[row][-1]+1)*div 101 | corrend=(posarr[row][0]+25)*div 102 | res=[] 103 | for r,c in enumerate(posarr): 104 | if posarr[r]!=[]: 105 | cs=posarr[r][0]*div 106 | #ce=(posarr[r][-1]+1)*div 107 | ce=(posarr[r][0]+25)*div 108 | #print "correlationpoints",corrstart,corrend,cs,ce 109 | mid=(len(dl[corrstart:corrend])+len(dl[cs:ce]))/2.0 110 | first=dl[corrstart:corrend] 111 | second=dl[cs:ce] 112 | #l=np.correlate(dl[corrstart:corrend],dl[cs:ce],'full').tolist() 113 | l=np.correlate(first,second,'full').tolist() 114 | #plt.figure(count) 115 | #t=np.arange(0,len(l),1) 116 | #plt.title("fig"+str(count)) 117 | #plt.plot(t,abs(np.array(l))) 118 | #count=count+1 119 | res.append(abs(mid-l.index(max(l)))) 120 | #res.append(l.index(max(l))) 121 | else: 122 | res.append([]) 123 | #checkarr=list(res) 124 | #checkarr.remove(1.0) 125 | #print "arrmin",np.array(checkarr)-min(checkarr) 126 | #print "arrmax",np.array(checkarr)-max(checkarr) 127 | retmat.append(res) 128 | else: 129 | retmat.append([]) 130 | return retmat 131 | 132 | dl=np.array(dl)/max(dl) 133 | dl=dl.tolist() 134 | presssim_mat=simmat(press_pos,dl) 135 | #releasesim_mat=simmat(release_pos,dl) 136 | #mean will not work change later 137 | releasesim_mat=simmat(release_pos,dl) 138 | 139 | 140 | print "Press correlation matrix\n",np.matrix(presssim_mat) 141 | print "--"*70 142 | print "Release correlation matrix\n",releasesim_mat 143 | print "--"*70 144 | 145 | #finding the mean matrix between press and relase sim matrices 146 | simmeanmat=[] 147 | for i,val in enumerate(presssim_mat): 148 | if releasesim_mat[i] !=[]: 149 | inter=[] 150 | for j,el in enumerate(val): 151 | if releasesim_mat[i][j]!=[]: 152 | inter.append((el+releasesim_mat[i][j])/2.0) 153 | else: 154 | inter.append(el) 155 | simmeanmat.append(inter) 156 | else: 157 | simmeanmat.append(val) 158 | 159 | 160 | print "Mean correlation matrix\n",np.matrix(simmeanmat) 161 | print "--"*70 162 | 163 | def rankmat_justsorted(simmat): 164 | retmat=[] 165 | for row,val in enumerate(simmat): 166 | rowd=np.delete(val,row) 167 | s=sorted(rowd) 168 | #print "s,rowd",s,rowd 169 | res=[s.index(i) for i in rowd] 170 | res.insert(row,-1) 171 | retmat.append(res) 172 | return retmat 173 | 174 | 175 | #find the rank matrices 176 | def rankmat(simmat): 177 | retmat=[] 178 | for row,val in enumerate(simmat): 179 | rowd=np.delete(val,row) 180 | rowdl=rowd.tolist() 181 | s=sorted(rowd) 182 | d=np.array(s) 183 | diffth=min(np.diff(d)) 184 | dummypos=[] 185 | pos1=d[d<=50] 186 | pos2=d[np.logical_and(d>50,d<=150)] 187 | pos3=d[np.logical_and(d>150,d<=250)] 188 | pos4=d[np.logical_and(d>250,d<=350)] 189 | pos5=d[np.logical_and(d>350,d<=500)] 190 | pos6=d[np.logical_and(d>500,d<=900)] 191 | pos7=d[d>900] 192 | #print pos1,pos2,pos3,pos4,pos5,pos6,pos7 193 | if(len(pos1)): 194 | dummypos.append(pos1[-1]) 195 | if(len(pos2)): 196 | dummypos.append(pos2[-1]) 197 | if(len(pos3)): 198 | dummypos.append(pos3[-1]) 199 | if(len(pos4)): 200 | dummypos.append(pos4[-1]) 201 | if(len(pos5)): 202 | dummypos.append(pos5[-1]) 203 | if(len(pos6)): 204 | dummypos.append(pos6[-1]) 205 | if(len(pos7)): 206 | dummypos.append(pos7[-1]) 207 | #print "dummypos",dummypos 208 | pos=np.array(dummypos) 209 | posindex=[np.where(d==j)[0][0]+1 for i,j in np.ndenumerate(pos)] 210 | #print "posindex",posindex 211 | grouped=np.split(d,posindex) 212 | res=[3]*len(s) 213 | #print "grouped",diffth,grouped 214 | print np.matrix(grouped) 215 | for i,val in enumerate(grouped): 216 | for k,l in np.ndenumerate(val): 217 | #print l,s.index(l) 218 | res[rowdl.index(l)]=i+1 219 | res.insert(row,-1) 220 | retmat.append(res) 221 | return retmat 222 | 223 | #rmat=rankmat(presssim_mat) 224 | #rmat=rankmat(simmeanmat) 225 | print "Grouped Matrix" 226 | rmat=rankmat(simmeanmat) 227 | print "--"*70 228 | print "Rank Matrix\n",rmat 229 | print "--"*70 230 | 231 | 232 | rule_4=[['EQ','EQ','ADJ','NEAR'], 233 | ['EQ','ADJ','NEAR','DIST'], 234 | ['ADJ','NEAR','NEAR','DIST'], 235 | ['NEAR','DIST','DIST','DIST'] 236 | ] 237 | 238 | #assuming wordlength 239 | wl=len(rmat[0]) 240 | #prints rule list 241 | rulemat=[] 242 | for i in range(wl): 243 | #res=[] 244 | for j in range(i,wl): 245 | if i!=j: 246 | if (rmat[i][j]<=3 and rmat[j][i]<=3): 247 | rulemat.append(rule_4[rmat[i][j]][rmat[j][i]]) 248 | else: 249 | rulemat.append('DIST') 250 | #res.insert(i,'EQ') 251 | #rulemat.append(res) 252 | 253 | #rulemat=['DIST', 'DIST', 'DIST', 'ADJ','ADJ','NEAR', 'DIST', 'NEAR', 'DIST', 'DIST'] 254 | print "Rule Matrix\n",rulemat 255 | print "--"*70 256 | 257 | 258 | #load wadj,wnear and wdist matrices and find the word 259 | fp=open(pth+"wmat_"+str(wl),"r") 260 | weq=np.load(fp) 261 | wadj=np.load(fp) 262 | wnear=np.load(fp) 263 | wdist=np.load(fp) 264 | fp.close() 265 | 266 | #all constrains b/w two words 267 | allc=[] 268 | for i,val in enumerate(rulemat): 269 | if val=='EQ': 270 | allc.append(weq[i]) 271 | elif val=='ADJ': 272 | allc.append(wadj[i]) 273 | elif val=='NEAR': 274 | allc.append(wnear[i]) 275 | elif val=='DIST': 276 | allc.append(wdist[i]) 277 | #print i,len(allc) 278 | fullcon=np.matrix(allc) 279 | def all_indices_equal(value, qlist): 280 | indices = [] 281 | idx = -1 282 | while True: 283 | try: 284 | idx = qlist.index(value, idx+1) 285 | indices.append(idx) 286 | except ValueError: 287 | break 288 | return indices 289 | 290 | indexcnt={} 291 | tcsum=np.matrix(np.zeros(len(wadj[0]))) 292 | for i in range(50): 293 | #conin=np.matrix(np.random.randint(2,size=len(rulemat))) 294 | #conin=np.matrix(np.ones(len(rulemat))) 295 | #random length 296 | tcl=len(rulemat) 297 | rlen=np.random.randint(int(tcl*0.8),tcl) 298 | a = np.arange(tcl) 299 | np.random.shuffle(a) 300 | oneindex=a[:rlen] 301 | con=np.zeros(tcl) 302 | con[oneindex]=1 303 | conin=np.matrix(con) 304 | 305 | csum=conin*fullcon 306 | tcsum=csum+tcsum 307 | #print "csum",conin,csum.max(),csum[0,643] 308 | csum=csum.tolist()[0] 309 | #indices=all_indices_equal(max(csum),csum) 310 | #all indices greater than a value 311 | cmax=max(csum) 312 | mid=cmax/2 313 | indices=[ i for i,x in enumerate(csum) if x>=mid] 314 | for j in indices: 315 | if j in indexcnt.keys(): 316 | indexcnt[j]=indexcnt[j]+1 317 | else: 318 | indexcnt[j]=1 319 | 320 | ranklist=sorted([(value,key) for (key,value) in indexcnt.items()])[::-1] 321 | topindex=[i[1] for i in ranklist] 322 | tcsumlist=tcsum.tolist()[0] 323 | #top25=[i for i in topindex if i>=50] 324 | valthresh=int(max(tcsumlist)*0.8) 325 | ranklist=[i for i,val in enumerate(tcsumlist) if val>=valthresh] 326 | #top25=sorted(ranklist)[::-1][:150] 327 | top25=ranklist[:150] 328 | #print "tcsumdetails",len(top25),tcsum.max(),tcsum[0,484],len(top25),ranklist 329 | 330 | count=0 331 | print "Top word results" 332 | #display words 333 | fp=open("dict_sorted","r") 334 | index=ast.literal_eval(fp.readline()) 335 | #print "index",index[wl] 336 | #-2 added to count for file line numbering 337 | npindices=np.array(top25)+index[wl][0]-2 338 | #print "npindices",npindices 339 | for lineno,line in enumerate(fp): 340 | #line=line.replace("\n","") 341 | #if line=='cloud': 342 | #print "cloudline",lineno 343 | if lineno in npindices: 344 | print line.replace("\n",""),", ", 345 | count=count+1 346 | 347 | print 348 | print "--"*70 349 | plt.figure(3) 350 | t=np.arange(0,len(sum_lst),1) 351 | #plt.plot(t,abs(np.array(sum_diff_lst)),'bo') 352 | plt.plot(t,abs(np.array(sum_lst))) 353 | plt.show() 354 | --------------------------------------------------------------------------------