├── anechoic_chamber_recordings
    ├── ps2_keybrd
    │   ├── frog
    │   ├── fruit
    │   ├── fourier
    │   └── stress
    └── laptop_keybrd
    │   ├── dear
    │   ├── frog
    │   ├── fruit
    │   ├── fourier
    │   └── stress
├── recordwords.sh
├── dispsim.py
├── sorcmd.sh
├── createwmat.py
├── README
├── simlist.py
└── final_ver5.py


/anechoic_chamber_recordings/ps2_keybrd/frog:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/frog


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/ps2_keybrd/fruit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/fruit


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/laptop_keybrd/dear:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/dear


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/laptop_keybrd/frog:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/frog


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/laptop_keybrd/fruit:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/fruit


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/ps2_keybrd/fourier:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/fourier


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/ps2_keybrd/stress:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/ps2_keybrd/stress


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/laptop_keybrd/fourier:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/fourier


--------------------------------------------------------------------------------
/anechoic_chamber_recordings/laptop_keybrd/stress:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/prajithrg/Accoustic_Cryptanalysis/HEAD/anechoic_chamber_recordings/laptop_keybrd/stress


--------------------------------------------------------------------------------
/recordwords.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo $samples
 4 | 
 5 | while read inputline; 
 6 | do
 7 | word="$inputline"
 8 | if [ -z "${word}" ];
 9 | then
10 | exit
11 | fi
12 | 
13 | let samples=${#word}*48000
14 | echo "Samples:"$samples
15 | echo "Type the word:"$word
16 | python2 audio_to_file.py  -I hw:0,0 -N $samples $1$word
17 | echo -e "\nWord saved" 
18 | echo -e "\nEnter new word:"
19 | done
20 | 
21 | 


--------------------------------------------------------------------------------
/dispsim.py:
--------------------------------------------------------------------------------
 1 | import simlist
 2 | import sys,ast,itertools,math
 3 |     
 4 | key=simlist.KEY
 5 | adj=simlist.ADJ
 6 | near=simlist.NEAR
 7 | dist=simlist.DIST
 8 | 
 9 | comb=list(itertools.combinations(sys.argv[1],2))
10 | ret=[]
11 | for i,tup in enumerate(comb):
12 |     inter=[]
13 |     if ( tup[0].upper() in key and  tup[1].upper() in key):
14 |         if tup[1].upper() ==  tup[0].upper():
15 |             inter.append('EQ')
16 |         if tup[1].upper() in adj[tup[0].upper()]:
17 |             inter.append('ADJ')
18 |         if tup[1].upper() in near[tup[0].upper()]:
19 |             inter.append('NEAR')
20 |         if tup[1].upper() in dist[tup[0].upper()]:
21 |             inter.append('DIST')
22 |         ret.append(tup[0]+tup[1]+"-->"+str(inter))
23 | 
24 | print ret
25 | 


--------------------------------------------------------------------------------
/sorcmd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | file=$1
 3 | for w in $(tr 'A-Z ,."()?!;:' 'a-z\n' < $file); do echo ${#w} $w; done | sort -u | sort -n | cut -d " " -f2 > "${file}_sorted"
 4 | #for w in $(tr 'A-Z ,."()?!;:' 'a-z\n' < $file); do echo ${#w} $w; done | sort -u | sort -n > "${file}_sorted"
 5 | a="{"
 6 | lineno=2
 7 | for i in `seq 1 22`;
 8 | do
 9 | len=`tr ' ' '\n' <"${file}_sorted" | awk -v n=$i 'length($0)==n' | wc -l`
10 | let lineend=$len+$lineno-1
11 | a="${a}${i}:[$lineno,$lineend], "
12 | let lineno=$lineno+$len
13 | done  
14 | a="$a}"
15 | sed -i "1i $a" ./"${file}_sorted"
16 | echo "Created sorted file "${file}_sorted""
17 | 
18 | if [ -d "wmat" ]; then
19 | echo "Removing wmat directory"
20 | rm -rf ./wmat
21 | fi
22 | mkdir wmat
23 | 
24 | python2 ./createwmat.py "${file}_sorted"
25 | 
26 | 
27 | 


--------------------------------------------------------------------------------
/createwmat.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import sys,ast,itertools,math
 3 | 
 4 | import simlist
 5 | 
 6 | pth="./wmat/"
 7 | 
 8 | key=simlist.KEY
 9 | adj=simlist.ADJ
10 | near=simlist.NEAR
11 | dist=simlist.DIST
12 | 
13 | def nCr(n,r):
14 |     f = math.factorial
15 |     return f(n) / f(r) / f(n-r)
16 | 
17 | 
18 | fp=open(sys.argv[1],"r")
19 | index=ast.literal_eval(fp.readline())
20 | 
21 | #make W(adj,near and dist) matrix for set each n length words
22 | lastl=0
23 | cl=0
24 | weq=np.zeros(0,dtype='int')
25 | wadj=np.zeros(0,dtype='int')
26 | wnear=np.zeros(0,dtype='int')
27 | wdist=np.zeros(0,dtype='int')
28 | fw=open(pth+"dummy","w")
29 | for j,line in enumerate(fp):
30 |     #taking \n to account
31 |     line=line.replace("\n","")
32 |     l=len(line)
33 |     if(l!=lastl and l>=2):
34 |         np.save(fw,weq)
35 |         np.save(fw,wadj)
36 |         np.save(fw,wnear)
37 |         np.save(fw,wdist)
38 |         fw.close()
39 |         fw=open(pth+"wmat_"+str(l),"w")
40 |         lastl=l
41 |         coll=index[l][1]-index[l][0]+1
42 |         weq=np.zeros((nCr(l,2),coll),dtype='int')
43 |         wadj=np.zeros((nCr(l,2),coll),dtype='int')
44 |         wnear=np.zeros((nCr(l,2),coll),dtype='int')
45 |         wdist=np.zeros((nCr(l,2),coll),dtype='int')
46 |         cl=0
47 | #print "cl init",coll
48 | 
49 |     comb=list(itertools.combinations(line,2))
50 |     
51 |     for i,tup in enumerate(comb):
52 | #print l,i,cl,line
53 |         if ( tup[0].upper() in key and  tup[1].upper() in key):
54 |             if (tup[1].upper() ==  tup[0].upper()):
55 |                 weq[i][cl]=1
56 | 
57 |             if tup[1].upper() in adj[tup[0].upper()]:
58 |                 wadj[i][cl]=1
59 |         
60 |             if tup[1].upper() in near[tup[0].upper()]:
61 |                 wnear[i][cl]=1
62 | 
63 |             if tup[1].upper() in dist[tup[0].upper()]:
64 |                 wdist[i][cl]=1
65 |     cl=cl+1
66 | #if line=='dove':
67 | #print cl-1,weq[:,cl-1],"\n",wadj[:,cl-1],"\n",wnear[:,cl-1],"\n",wdist[:,cl-1]
68 | 
69 | print "Matrix creation done"
70 | fp.close()
71 | 
72 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | -----------------------------------------------------------------------------------
 2 | Directory Structure
 3 | -----------------------------------------------------------------------------------
 4 | 
 5 | key_board_sidechannel_10307933_10307936
 6 | ├── audio_to_file.py 
 7 | ├── createwmat.py
 8 | ├── dict
 9 | ├── dispsim.py
10 | ├── final_ver5.py
11 | ├── Readme
12 | ├── recordwords.sh
13 | ├── simlist.py
14 | └── sorcmd.sh
15 | 
16 | -----------------------------------------------------------------------------------
17 | Dependencies
18 | -----------------------------------------------------------------------------------
19 | Numpy and Scipy --- For matrix and array operations
20 | Matplotlib      --- For plotting
21 | Gnuradio        --- For basic audio processing and recording (any 
22 | 					other program can be used to directly record 
23 | 					from mic )
24 | 
25 | Python 2.x      --- Aliased as python2 everywhere in the program
26 | 
27 | -----------------------------------------------------------------------------------
28 | Repository
29 | -----------------------------------------------------------------------------------
30 | Entire code along with the audio recordings can be found in the github repository. 
31 | Audio recordings are not included in this submission archive and should be 
32 | downloaded from the repository as the file sizes are large.
33 | 
34 | HTTP link: https://github.com/prajithrg/accoustic_cryptanalysis 
35 | 
36 | or clone the entire project by 
37 | 
38 | git clone https://github.com/prajithrg/accoustic_cryptanalysis.git
39 | git clone git://github.com/prajithrg/accoustic_cryptanalysis.git
40 | 
41 | Audio recordings can be found in anechoic_chamber_recordings folder
42 | in the repository.
43 | 
44 | -----------------------------------------------------------------------------------
45 | Program files
46 | -----------------------------------------------------------------------------------
47 | Readme              --- This readme file
48 | dict                --- Sample dictionary containing 27K words
49 | simlist.py          --- ADJ, DIST and NEAR tables for QWERTY keyboard
50 | createwmat.py       --- Program that creates constraint matrices (weq,wadj etc)
51 | audio_to_file.py    --- Used for recording keystrokes from standard audio input 
52 | recordwords.sh      --- Used for continuous recording of various words
53 | sorcmd.sh           --- Sorts the dictionary according to word length and adds 
54 |                         header which makes further accessing easier. Makes use 
55 |                         of createwmat.py to create constraint matrices.
56 | dispsim.py          --- Displays original constraints for any given word
57 | final_ver5.py       --- Original program that implements the attack
58 | 
59 | -----------------------------------------------------------------------------------
60 | Usage and brief explanation
61 | -----------------------------------------------------------------------------------
62 | 1) Audio recording -- optional 
63 | 
64 | 	python2 audio_to_file.py  -I hw:0,0 -N 48000 sampledump
65 | 
66 | 	check -h option for more details
67 | 
68 | 2) Sorting dictionary and creating constraint matrices for dictionary words   
69 | 	
70 | 	./sorcmd.sh dict
71 | 
72 | 	This creates a 'dict_sorted' file and a 'wmat' directory that contains the
73 | 	constraint matrices for the various word lengths
74 | 
75 | 3) Actual constraints
76 | 
77 |     python2 dispsim.py <word>
78 | 
79 | 	e.g. python2 dispsim.py dear
80 | 
81 | 	This program can be used for generating the actual constraints for a word
82 | 
83 | 4) Final attack
84 |    
85 | 	python2 final_ver5.py <audio_recording>
86 | 
87 | 	e.g. python2 final_ver5.py anechoic_chamber_recordings/laptop_keybrd/<recording>
88 | 
89 | -----------------------------------------------------------------------------------
90 | 


--------------------------------------------------------------------------------
/simlist.py:
--------------------------------------------------------------------------------
 1 | KEY=['Q','A','Z','W','S','X','E','D','C','R','F','V','T',
 2 | 'G','B','Y','H','N','U','J','M','I','K','O','L','P']
 3 | # KEY lists the 26 letter keys; ADJ maps each key to the keys treated as adjacent to it (every list starts with the key itself).
 4 | ADJ={
 5 |  		'Q':['Q','W','S','A'],
 6 |  		'A':['A','Q','W','S','Z'],
 7 |  		'Z':['Z','A','S','X'],
 8 |  		'W':['W','Q','A','S','D','E'], 
 9 |  		'S':['S','Q','A','Z','X','D','E','W'], 
10 |  		'X':['X','Z','A','S','D','C'],
11 |  		'E':['E','W','S','D','F','R'],
12 |  		'D':['D','E','W','S','X','C','F','R'],
13 |  		'C':['C','X','D','F','V'],
14 |  		'R':['R','E','D','F','G','T'],
15 |  		'F':['F','R','E','D','C','V','G','T'],
16 |  		'V':['V','C','D','F','G','B'],
17 |  		'T':['T','R','F','G','H','Y'],
18 |  		'G':['G','T','R','F','V','B','H','Y'],
19 |  		'B':['B','V','G','H','N'],
20 |  		'Y':['Y','T','G','H','J','U'],
21 |  		'H':['H','Y','T','G','B','N','J','U'],
22 |  		'N':['N','B','H','J','M'],
23 |  		'U':['U','Y','H','J','K','I'],
24 |  		'J':['J','U','Y','H','N','M','K','I'],
25 |  		'M':['M','N','J','K'],
26 |  		'I':['I','U','J','K','L','O'],
27 |  		'K':['K','I','U','J','M','L','O'],
28 |  		'O':['O','I','K','L','P'],
29 |  		'L':['L','O','I','K','P'],
30 |  		'P':['P','O','L'],
31 | 	 }
32 | 
33 | NEAR={
34 |   		'Q':['Q','W','A','E','S','Z','D','X'],
35 |   		'A':['A','Q','Z','W','S','X','E','D'],
36 |   		'Z':['Z','Q','A','W','S','X','E','D','C'], 
37 |   		'W':['W','Q','A','Z','S','X','E','D','C','R','F'], 
38 |   		'S':['S','Q','A','Z','W','X','E','D','C','R','F'],
39 |   		'X':['X','Q','A','Z','W','S','E','D','C','F','V'],
40 |   		'E':['E','Q','A','Z','W','S','X','D','C','R','F','V','T','G'],
41 |   		'D':['D','Q','A','Z','W','S','X','E','C','R','F','V','T','G'],
42 |   		'C':['C','W','S','Z','E','S','X','R','D','F','V','T','G','B'],
43 |   		'R':['R','W','S','X','E','D','C','F','V','T','G','B','Y','H'],
44 |   		'F':['F','W','S','X','E','D','C','R','V','T','G','Y','H','B'],
45 |   		'V':['V','E','D','X','R','F','C','T','G','B','Y','H','N'],
46 |   		'T':['T','E','D','C','R','F','V','G','Y','H','B','U','J','N'],
47 |   		'G':['G','E','D','C','R','F','V','T','B','Y','H','N','U','J'],
48 |   		'B':['R','D','C','T','F','V','G','Y','H','N','U','J','M'],
49 |   		'Y':['Y','R','F','V','T','G','B','U','H','I','J','N'],
50 |   		'H':['H','R','F','V','T','G','B','Y','U','J','N','I','K','M'],
51 |   		'N':['N','T','F','V','Y','G','B','H','U','J','M','I','K'],
52 |   		'U':['U','T','G','B','Y','H','N','I','J','O','K','M','L'],
53 |   		'J':['J','T','G','B','Y','H','U','N','I','K','M','O','L'],
54 |   		'M':['M','Y','H','B','U','J','N','I','K','O','L'],
55 |   		'I':['I','Y','H','N','U','J','M','O','K','P','L'],
56 |   		'K':['K','Y','H','N','U','J','M','I','O','L','P'],
57 |   		'O':['O','U','J','M','I','K','P','L'],
58 |   		'L':['L','U','J','N','I','K','M','O','P'],
59 |   		'P':['P','I','J','M','I','K','O','L']
60 | 	 }
61 | 
62 | DIST={
63 |   		'Q':['B','C','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'], 
64 |   		'A':['B','C','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'],
65 |   		'Z':['B','F','G','H','I','J','K','L','M','N','O','P','R','T','U','V','Y'],
66 |   		'W':['B','G','H','I','J','K','L','M','N','O','P','T','U','V','Y'],
67 |   		'S':['B','G','H','I','J','K','L','M','N','O','P','T','U','V','Y'],
68 |   		'X':['B','G','H','I','J','K','L','M','N','O','P','R','T','U','Y'],
69 |   		'E':['B','H','I','J','K','L','M','N','O','P','U','Y'],
70 |   		'D':['B','H','I','J','K','L','M','N','O','P','U','Y'],
71 |   		'C':['A','H','I','J','K','L','M','N','O','P','Q','U','Y'],
72 |   		'R':['A','I','J','K','L','M','N','O','P','Q','U','Z'],
73 |   		'F':['A','I','J','K','L','M','N','O','P','Q','U','Z'],
74 |   		'V':['A','I','J','K','L','M','O','P','Q','S','U','W','Z'],
75 |   		'T':['A','I','K','L','M','O','P','Q','S','W','X','Z',],
76 |   		'G':['A','I','K','L','M','O','P','Q','S','W','X','Z'],
77 |   		'B':['A','B','E','I','K','L','O','P','Q','S','W','X','Z'],
78 |   		'Y':['A','C','D','E','K','L','M','O','P','Q','S','W','X','Z'],
79 |   		'H':['A','C','D','E','L','O','P','Q','S','W','X','Z'],
80 |   		'N':['A','C','D','E','L','O','P','Q','R','S','W','X','Z'],
81 |   		'U':['A','C','D','E','F','P','Q','R','S','V','W','X','Z'],
82 |   		'J':['A','C','D','E','F','P','Q','R','S','V','W','X','Z'],
83 |   		'M':['A','C','D','E','F','G','P','Q','R','S','T','V','W','X','Z'],
84 |   		'I':['A','B','C','D','E','F','G','Q','R','S','T','V','W','X','Z'],
85 |   		'K':['A','B','C','D','E','F','G','Q','R','S','T','V','W','X','Z'],
86 |   		'O':['A','B','C','D','E','F','G','H','N','Q','R','S','T','V','W','X','Y','Z'],
87 |   		'L':['A','B','C','D','E','F','G','H','Q','R','S','T','V','W','X','Y','Z'],
88 |   		'P':['A','B','C','D','E','F','G','H','N','Q','R','S','T','U','V','W','X','Y','Z']
89 | 	}
90 | 
91 | 


--------------------------------------------------------------------------------
/final_ver5.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy as sp
  3 | import scipy.signal as sg
  4 | import matplotlib.pyplot as plt
  5 | 
  6 | import ast,random,sys
  7 | 
  8 | 
  9 | #sample_rate=44100
 10 | pth="./wmat/"
 11 | sample_rate=48000
 12 | stroke_period=.002
 13 | #number of samples in one analysis window of stroke_period seconds
 14 | div=int(sample_rate*stroke_period)
 15 | #the recording is read as a flat sequence of float32 samples
 16 | data=np.fromfile(sys.argv[1],dtype="float32")
 17 | #print len(data)
 18 | dl=data.tolist()
 19 | plt.figure(0)
 20 | t=np.arange(0,len(dl),1)
 21 | plt.plot(t,dl)
 22 | 
 23 | #FFT of the whole recording; in this script only its length l is used
 24 | #afterwards (abs_fft is not read again)
 25 | fft_d=np.fft.fft(dl)
 26 | l=len(fft_d)
 27 | #plt.figure(1)
 28 | #t=np.arange(0,len(dl),1)
 29 | #plt.plot(t,abs(fft_d))
 30 | abs_fft=abs(fft_d)
 31 | #print len(abs_fft)
 32 | sum_lst=[]
 33 | #energy measure per window: sum of the FFT magnitudes of each div-sample slice
 34 | for i in range(0,l/div):
 35 |     fft_v=np.fft.fft(dl[i*div:(i+1)*div])
 36 |     dummy=np.sum(abs(fft_v))
 37 |     sum_lst.append(dummy)
 38 | 
 39 | #normalize the values
 40 | sum_lst=np.array(sum_lst)/max(sum_lst)
 41 | 
 42 | 
 43 | #new code to find a key press and release.
 44 | press_pos=[]
 45 | release_pos=[]
 46 | energythresh=0.22
 47 | glitchwin=5
 48 | pressrelwin=20
 49 | twokeysep=100
 50 | 
 51 | #find postions where energy is greater than a threshold
 52 | key_pos=np.array([i for i,x in enumerate(sum_lst) if x >energythresh])
 53 | 
 54 | #split the array for different key-strokes
 55 | keyslots=key_pos[np.diff(key_pos)>=twokeysep]
 56 | keyslotindex=[np.where(key_pos==j)[0][0]+1 for i,j in np.ndenumerate(keyslots)]
 57 | keyseparate=np.split(key_pos,keyslotindex)
 58 | 
 59 | #split each key to find press postion and release position
 60 | pressrelslots=[val[np.diff(val)>=pressrelwin] for val in keyseparate]
 61 | prerelslotindex=[] 
 62 | for k,arr in enumerate(pressrelslots):
 63 |     inter=[]
 64 |     for i,j in np.ndenumerate(arr):
 65 |         inter.append(np.where(keyseparate[k]==j)[0][0]+1)
 66 | #print inter
 67 |     prerelslotindex.append(inter)    
 68 | 
 69 | prerelseparate=[np.split(val,prerelslotindex[i]) for i,val in enumerate(keyseparate)]
 70 | 
 71 | press_pos=[val[0].tolist() for i,val in enumerate(prerelseparate)]
 72 | for i,val in enumerate(prerelseparate):
 73 |     if len(val)>1:
 74 |         release_pos.append(val[1].tolist())
 75 |     else:
 76 |         release_pos.append([])
 77 | 
 78 | #print "key",keyslots,keyslotindex,keyseparate
 79 | #print
 80 | #print "pressrelslots",pressrelslots
 81 | #print
 82 | #print "pressrelslotindex",prerelslotindex
 83 | #print
 84 | #print "prerelseparate",prerelseparate
 85 | #print
 86 | print "--"*70
 87 | print "Press position matrix:\n",np.matrix(press_pos)
 88 | print "--"*70
 89 | print "Release position matrix\n",np.matrix(release_pos)
 90 | print "--"*70
 91 | 
 92 | 
 93 | #create key-similarity matrices
 94 | def simmat(posarr,dl):
 95 | #count=4
 96 |     retmat=[]
 97 |     for row,val in enumerate(posarr):
 98 |         if posarr[row]!=[]:
 99 |             corrstart=posarr[row][0]*div
100 | #corrend=(posarr[row][-1]+1)*div
101 |             corrend=(posarr[row][0]+25)*div
102 |             res=[]
103 |             for r,c in enumerate(posarr):
104 |                 if posarr[r]!=[]:
105 |                     cs=posarr[r][0]*div
106 | #ce=(posarr[r][-1]+1)*div
107 |                     ce=(posarr[r][0]+25)*div
108 | #print "correlationpoints",corrstart,corrend,cs,ce
109 |                     mid=(len(dl[corrstart:corrend])+len(dl[cs:ce]))/2.0
110 |                     first=dl[corrstart:corrend]
111 |                     second=dl[cs:ce]
112 | #l=np.correlate(dl[corrstart:corrend],dl[cs:ce],'full').tolist()
113 |                     l=np.correlate(first,second,'full').tolist()
114 | #plt.figure(count)
115 | #t=np.arange(0,len(l),1)
116 | #plt.title("fig"+str(count))
117 | #plt.plot(t,abs(np.array(l)))
118 | #count=count+1
119 |                     res.append(abs(mid-l.index(max(l))))
120 | #res.append(l.index(max(l)))
121 |                 else:
122 |                     res.append([])
123 | #checkarr=list(res)
124 | #checkarr.remove(1.0)
125 | #print "arrmin",np.array(checkarr)-min(checkarr)
126 | #print "arrmax",np.array(checkarr)-max(checkarr)
127 |             retmat.append(res)
128 |         else:
129 |             retmat.append([])
130 |     return retmat
131 | 
132 | dl=np.array(dl)/max(dl)   #scale the samples by the largest sample value
133 | dl=dl.tolist()
134 | presssim_mat=simmat(press_pos,dl)
135 | #releasesim_mat=simmat(release_pos,dl)
136 | #mean will not work change later
137 | releasesim_mat=simmat(release_pos,dl)
138 | 
139 | #note: releasesim_mat can contain [] placeholders for strokes without a release burst
140 | print "Press correlation matrix\n",np.matrix(presssim_mat)
141 | print "--"*70
142 | print "Release correlation matrix\n",releasesim_mat
143 | print "--"*70
144 | 
145 | #finding the mean matrix between press and release sim matrices (press value alone where no release value exists)
146 | simmeanmat=[]
147 | for i,val in enumerate(presssim_mat):
148 |     if releasesim_mat[i] !=[]:
149 |         inter=[]
150 |         for j,el in enumerate(val):
151 |             if releasesim_mat[i][j]!=[]:
152 |                 inter.append((el+releasesim_mat[i][j])/2.0)
153 |             else:
154 |                 inter.append(el)
155 |         simmeanmat.append(inter)
156 |     else:
157 |         simmeanmat.append(val)
158 | 
159 | 
160 | print "Mean correlation matrix\n",np.matrix(simmeanmat)
161 | print "--"*70
162 | 
163 | def rankmat_justsorted(simmat):
164 |     retmat=[]
165 |     for row,val in enumerate(simmat):
166 |         rowd=np.delete(val,row)
167 |         s=sorted(rowd)
168 | #print "s,rowd",s,rowd
169 |         res=[s.index(i) for i in rowd]
170 |         res.insert(row,-1)
171 |         retmat.append(res)
172 |     return retmat
173 | 
174 | 
175 | #find the rank matrices
176 | def rankmat(simmat):
177 |     retmat=[]
178 |     for row,val in enumerate(simmat):
179 |         rowd=np.delete(val,row)
180 |         rowdl=rowd.tolist()
181 |         s=sorted(rowd)
182 |         d=np.array(s)
183 |         diffth=min(np.diff(d))
184 |         dummypos=[]
185 |         pos1=d[d<=50]
186 |         pos2=d[np.logical_and(d>50,d<=150)]
187 |         pos3=d[np.logical_and(d>150,d<=250)]
188 |         pos4=d[np.logical_and(d>250,d<=350)]
189 |         pos5=d[np.logical_and(d>350,d<=500)]
190 |         pos6=d[np.logical_and(d>500,d<=900)]
191 |         pos7=d[d>900]
192 | #print pos1,pos2,pos3,pos4,pos5,pos6,pos7
193 |         if(len(pos1)):
194 |             dummypos.append(pos1[-1])
195 |         if(len(pos2)):
196 |             dummypos.append(pos2[-1])
197 |         if(len(pos3)):
198 |             dummypos.append(pos3[-1])
199 |         if(len(pos4)):
200 |             dummypos.append(pos4[-1])
201 |         if(len(pos5)):
202 |             dummypos.append(pos5[-1])
203 |         if(len(pos6)):
204 |             dummypos.append(pos6[-1])
205 |         if(len(pos7)):
206 |             dummypos.append(pos7[-1])
207 | #print "dummypos",dummypos
208 |         pos=np.array(dummypos)
209 |         posindex=[np.where(d==j)[0][0]+1 for i,j in np.ndenumerate(pos)]
210 | #print "posindex",posindex
211 |         grouped=np.split(d,posindex)
212 |         res=[3]*len(s)
213 | #print "grouped",diffth,grouped
214 |         print np.matrix(grouped)
215 |         for i,val in enumerate(grouped):
216 |             for k,l in np.ndenumerate(val):
217 | #print l,s.index(l)
218 |                 res[rowdl.index(l)]=i+1
219 |         res.insert(row,-1)
220 |         retmat.append(res)
221 |     return retmat
222 | 
223 | #rmat=rankmat(presssim_mat)
224 | #rmat=rankmat(simmeanmat)
225 | print "Grouped Matrix"
226 | rmat=rankmat(simmeanmat)
227 | print "--"*70
228 | print "Rank Matrix\n",rmat
229 | print "--"*70
230 | 
231 | #rule_4[a][b]: relation assumed for a letter pair whose two mutual ranks are a and b
232 | rule_4=[['EQ','EQ','ADJ','NEAR'],
233 |         ['EQ','ADJ','NEAR','DIST'],
234 |         ['ADJ','NEAR','NEAR','DIST'],
235 |         ['NEAR','DIST','DIST','DIST']
236 |         ]
237 | #NOTE(review): rankmat numbers its groups from 1, so row/column 0 of rule_4 is never selected below -- confirm this is intended
238 | #assuming wordlength
239 | wl=len(rmat[0])
240 | #build the rule list: one relation per letter pair (i,j) with i<j, taken row by row
241 | rulemat=[]
242 | for i in range(wl):
243 | #res=[]
244 |     for j in range(i,wl):
245 |         if i!=j:
246 |             if (rmat[i][j]<=3 and rmat[j][i]<=3):
247 |                 rulemat.append(rule_4[rmat[i][j]][rmat[j][i]]) 
248 |             else:
249 |                 rulemat.append('DIST')
250 | #res.insert(i,'EQ')
251 | #rulemat.append(res)
252 | 
253 | #rulemat=['DIST', 'DIST', 'DIST', 'ADJ','ADJ','NEAR', 'DIST', 'NEAR', 'DIST', 'DIST']
254 | print "Rule Matrix\n",rulemat
255 | print "--"*70
256 | 
257 | 
258 | #load the weq, wadj, wnear and wdist matrices (stored in this order) for this word length
259 | fp=open(pth+"wmat_"+str(wl),"r")
260 | weq=np.load(fp)
261 | wadj=np.load(fp)
262 | wnear=np.load(fp)
263 | wdist=np.load(fp)
264 | fp.close()
265 | 
266 | #for every letter pair take the row of the matrix that matches its rule
267 | allc=[]
268 | for i,val in enumerate(rulemat):
269 |     if val=='EQ':
270 |         allc.append(weq[i])
271 |     elif val=='ADJ':
272 |         allc.append(wadj[i])
273 |     elif val=='NEAR':
274 |         allc.append(wnear[i])
275 |     elif val=='DIST':
276 |         allc.append(wdist[i])
277 | #print i,len(allc)
278 | fullcon=np.matrix(allc)
279 | def all_indices_equal(value, qlist):
280 |     indices = []
281 |     idx = -1
282 |     while True:
283 |         try:
284 |             idx = qlist.index(value, idx+1)
285 |             indices.append(idx)
286 |         except ValueError:
287 |             break
288 |     return indices
289 | 
290 | indexcnt={}
291 | tcsum=np.matrix(np.zeros(len(wadj[0])))
292 | for i in range(50):
293 | #conin=np.matrix(np.random.randint(2,size=len(rulemat)))
294 | #conin=np.matrix(np.ones(len(rulemat)))
295 | #enable a random subset of the constraints; its size is drawn from [int(0.8*tcl), tcl)
296 |     tcl=len(rulemat)
297 |     rlen=np.random.randint(int(tcl*0.8),tcl)
298 |     a = np.arange(tcl)
299 |     np.random.shuffle(a)
300 |     oneindex=a[:rlen]
301 |     con=np.zeros(tcl)
302 |     con[oneindex]=1
303 |     conin=np.matrix(con)
304 |     #csum: per column of fullcon, the sum over the enabled constraint rows; tcsum accumulates it over all rounds
305 |     csum=conin*fullcon
306 |     tcsum=csum+tcsum
307 | #print "csum",conin,csum.max(),csum[0,643]
308 |     csum=csum.tolist()[0]
309 | #indices=all_indices_equal(max(csum),csum)
310 | #all indices whose sum reaches at least half of this round's maximum
311 |     cmax=max(csum)
312 |     mid=cmax/2
313 |     indices=[ i for i,x in enumerate(csum) if x>=mid]
314 |     for j in indices:
315 |         if j in indexcnt.keys():
316 |             indexcnt[j]=indexcnt[j]+1
317 |         else:
318 |             indexcnt[j]=1
319 | #NOTE(review): the counts in indexcnt end up unused -- ranklist is reassigned below and topindex is never read
320 | ranklist=sorted([(value,key) for (key,value) in indexcnt.items()])[::-1]
321 | topindex=[i[1] for i in ranklist]
322 | tcsumlist=tcsum.tolist()[0]
323 | #top25=[i  for i in topindex if i>=50]
324 | valthresh=int(max(tcsumlist)*0.8)
325 | ranklist=[i  for i,val in enumerate(tcsumlist) if val>=valthresh]
326 | #top25=sorted(ranklist)[::-1][:150]
327 | top25=ranklist[:150]
328 | #print "tcsumdetails",len(top25),tcsum.max(),tcsum[0,484],len(top25),ranklist
329 | 
330 | count=0
331 | print "Top word results"
332 | #display words
333 | fp=open("dict_sorted","r")
334 | index=ast.literal_eval(fp.readline())
335 | #print "index",index[wl]
336 | #-2 converts the header's 1-based file line numbers to the 0-based count of enumerate(fp), which starts after the header line
337 | npindices=np.array(top25)+index[wl][0]-2
338 | #print "npindices",npindices
339 | for lineno,line in enumerate(fp):
340 | #line=line.replace("\n","")
341 | #if line=='cloud':
342 | #print "cloudline",lineno
343 |     if lineno in npindices:
344 |         print line.replace("\n",""),",  ",
345 |         count=count+1
346 | 
347 | print 
348 | print "--"*70
349 | plt.figure(3)
350 | t=np.arange(0,len(sum_lst),1)
351 | #plt.plot(t,abs(np.array(sum_diff_lst)),'bo')
352 | plt.plot(t,abs(np.array(sum_lst)))
353 | plt.show()
354 | 


--------------------------------------------------------------------------------