├── README.md
├── data
    ├── test.arm
    ├── test.x86
    └── test_labels.txt
├── instr_sim.py
├── output
    ├── out.arm.We.zip
    ├── out.arm.words
    ├── out.arm.zip
    ├── out.x86.We.zip
    ├── out.x86.words
    └── out.x86.zip
├── query.py
├── senvec.py
└── tsne2.py


/README.md:
--------------------------------------------------------------------------------
 1 | # cross-arch-instr-model.github.io
 2 | 
 3 | Thank you for looking at our work!
 4 | The programs included here were created for the following paper:
 5 | 
 6 | "A Cross-Architecture Instruction Embedding Model for Natural Language Processing-Inspired Binary Code Analysis"
 7 | 
 8 | Kimberly Redmond, Lannan Luo, and Qiang Zeng
 9 | 
10 | The NDSS Workshop on Binary Analysis Research (BAR), 2019.
11 | 
12 | ############################
13 | 
14 | The trained cross-architecture instruction embedding models used in our paper are included in the output/ directory. Please remember to unzip the four output files.
15 | 
16 | Our embeddings were trained on the model Bivec, which is based on Word2Vec.
17 | You may find it here:
18 | 
19 | https://github.com/lmthang/bivec
20 | 
21 | ############################
22 | 
23 | ABOUT THESE PROGRAMS
24 | 
25 | All file paths and instruction selections are hard-coded into these programs. For your
26 | convenience, they are listed in variables near the top; feel free to modify them for your use.
27 | 
28 | ./senvec.py
29 | 
30 | Returns ROC plots and AUC scores for cross-architecture basic block similarity tests.
31 | Basic block embeddings are calculated by summing the instruction embeddings within a block.
32 | 
33 | Similarity is computed using cosine similarity.
34 | 
35 | ./tsne2.py
36 | 
37 | Returns 2 t-SNE figures with different displays:
38 | 	1) an unlabeled figure displaying all instructions in one vector space
39 | 	2) a labeled figure displaying selected instructions in one vector space
40 | 
41 | ./instr_sim.py
42 | 
43 | Returns 2 ROC plots and AUC scores for instruction-level similarity tests.
44 | Instructions are evaluated in pairs, in 2 ways:
45 | 	1) mono-architecture
46 | 	2) cross-architecture
47 | 
48 | The similarity metric used is cosine similarity.
49 | 
50 | ./query.py
51 | 
52 | For each query instruction, prints its most similar instructions:
53 | the top 6 instructions from its own architecture
54 | (#1 is itself), and the top 5 instructions from the other architecture,
55 | according to cosine similarity.
56 | 


--------------------------------------------------------------------------------
/instr_sim.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """
  4 |     Kim Redmond
  5 |     A program to evaluate cosine similarity
  6 |     between INSTRUCTIONS, not basic blocks.
  7 |     These instructions are randomly hand-chosen.
  8 |         - ARM-ARM (100)
  9 |         - x86-x86 (100)
 10 |         - ARM-x86 (50)
 11 |     Test accuracy by plotting ROC/AUC.
 12 | """
 13 | 
 14 | import math
 15 | import numpy
 16 | from scipy import spatial
 17 | from sklearn import metrics
 18 | from sklearn.metrics import pairwise
 19 | import matplotlib.pyplot as plt
 20 | 
 21 | 
 22 | ###      CHANGE FILES HERE      ###
 23 | 
 24 | arm_emb = "output/out.arm"
 25 | x86_emb = "output/out.x86"
 26 | dim = 200
 27 | 
 28 | 
 29 | ###
 30 | #  Instruction pairs to test...
 31 | ###
 32 | 
 33 | # mono-architecture
 34 | # 50 similar, 50 dissimilar
 35 | 
 36 | arm1codes = ['ADD~R0,R5,R7', 'MOV~R1,0', 'CMP~R0,0', 'ORR~R0,R0,0', 'AND~R12,R6,0',
 37 |             'ADD~R0,R9,R7,LSL0', 'MOV~R0,R4', 'LDR~R0,[R0]', 'ORRS~R0,R5,R7', 'ADDS~R1,R1,0',
 38 |             'SBCS~R2,R2,0', 'SUBS~R2,R1,0', 'MOVEQ~R5,0', 'LDRB~R1,[R6+R7]', 'MVN~R12,0',
 39 |             'MOVNE~R2,R4', 'STM~R2,{R0,R1,R5}', 'ADC~R9,R5,R4', 'LSL~R1,R3,0', 'EOR~R11,R6,R1',
 40 |             'STMIB~SP,{R0,R7}', 'LDRLO~R1,[SP+0]', 'LDM~R7,{R3,R7}', 'RSBS~R3,R3,0', 'ORREQ~R6,R11,0',
 41 |             'ANDNE~R3,R3,0', 'MOVGE~R7,0', 'RSCS~R1,R10,0', 'ADDEQ~R8,R8,0', 'TST~R0,0',
 42 |             'LDRBNE~R7,[R9+R7]', 'STR~R6,[R0]', 'STRB~R0,[R2-0]', 'UMULL~R1,R2,R0,R3', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,R11,LR}',
 43 |             'BIC~SP,SP,0', 'MLA~R10,R2,R4,R0', 'SBC~R3,R12,0', 'MOV~R2,R7', 'MUL~R1,R0,R2',
 44 |             'LDMIB~R7,{R2,R3}', 'MOVLE~R0,R1', 'ASR~R0,R0,0', 'LSLGE~R2,R3,R7', 'POP~{R4,R5,R6,R7,R8,R9,R10,R11,LR}',
 45 |             'MVNNE~R7,0', 'ASRNE~R6,R6,0', 'STMNE~R0,{R6,R8}', 'SUB~R0,R8,R0', 'LDREQ~R0,<TAG>',
 46 |             'ADD~R0,R5,R7', 'ADDS~R1,R1,0', 'ADD~R0,SP,0', 'ADDS~R4,R4,0', 'ADD~R9,R10,R10,LSL0',
 47 |             'ADD~R10,SP,0', 'ADD~R11,SP,0', 'ADD~R1,SP,0', 'ADD~R0,R0,R4', 'ADDS~R2,R11,R0',
 48 |             'BL~FOO', 'BEQ~<TAG>', 'MVN~R2,0', 'UMULL~R2,R3,R0,R1', 'ADC~R0,R5,R6',
 49 |             'LDR~R1,<TAG>', 'LDR~R1,[R5]', 'MOV~R9,R8', 'MOVNE~R0,0', 'B~<TAG>',
 50 |             'PUSH~{R4,R5,R6,R7,R8,LR}', 'RSCS~R3,R1,R4,ASR0', 'BHS~<TAG>', 'ORRSNE~R2,R2,R3', 'CMN~R0,0',
 51 |             'ORR~R8,R1,R8', 'ORR~R10,R3,R10', 'ORRS~R1,R0,R2', 'ADD~R6,SP,0', 'BLO~<TAG>',
 52 |             'MOV~R2,R5', 'MOV~R2,0', 'MOV~R0,R9', 'MOV~R6,R4', 'MOV~R0,R8',
 53 |             'STRH~R0,[SP+0]', 'CMPNE~R1,0', 'SUBS~R2,R4,0', 'MOVNE~R1,R4', 'ORR~R1,R1,0',
 54 |             'POP~{R4,R5,R6,LR}', 'UMULL~R3,R7,R0,R4', 'UMULL~R6,R5,R3,R4', 'CMPEQ~R6,0', 'MOV~R0,R10',
 55 |             'ASR~R11,R0,0', 'ORRNE~R6,R8,R6,LSL0', 'LDR~R2,[SP+0]', 'RSBS~R0,R5,0', 'BLS~<TAG>']
 56 | 
 57 | arm2codes = ['ADD~R6,R6,R10', 'MOV~R10,0', 'CMP~R4,0', 'ORR~R2,R2,0', 'AND~R1,R1,0',
 58 |             'ADD~R2,R1,R8,LSL0', 'MOV~R0,R9', 'LDR~R0,[SP+0]', 'ORRS~R0,R0,R1', 'ADDS~R4,R1,0',
 59 |             'SBCS~R0,R4,R1', 'SUBS~R2,R0,R4', 'MOVEQ~R0,0', 'LDRB~R5,[R4+R7+LSR0]', 'MVN~R2,0',
 60 |             'MOVNE~R7,0', 'STM~R10,{R0,R1}', 'ADC~R3,R10,0', 'LSL~R2,R7,0', 'EOR~R0,R5,0',
 61 |             'STMIB~R0,{R8,R9}', 'LDRLO~R1,[R12]', 'LDM~R1,{R0,R1}', 'RSBS~R4,R4,0', 'ORREQ~R9,R9,R5,LSL0',
 62 |             'ANDNE~R7,R4,R6,LSR0', 'MOVGE~R1,0', 'RSCS~R1,R2,0', 'ADDEQ~R2,R0,0', 'TST~R3,0',
 63 |             'LDRBNE~R3,[R0]', 'STR~R4,[SP+0]', 'STRB~R7,[R0],0', 'UMULL~R8,R0,R2,R6', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,LR}',
 64 |             'BIC~R0,R0,0', 'MLA~R2,R3,R8,R1', 'SBC~R3,R5,R3', 'MOV~PC,R2', 'MUL~R3,R0,R2',
 65 |             'LDMIB~R7,{R0,R1}', 'MOVLE~R1,R0', 'ASR~R3,R2,0', 'LSLGE~R4,R0,R3', 'POP~{R4,LR}',
 66 |             'MVNNE~R0,0', 'ASRNE~R0,R5,0', 'STMNE~R1,{R3,R12}', 'SUB~SP,R11,0', 'LDREQ~R0,[R10+0]',
 67 |             'SUBS~R2,R0,R4', 'SUB~R0,SP,R0', 'SUBS~R4,R2,R0', 'SUBS~R0,R0,0', 'SUBS~R7,R10,R7',
 68 |             'BNE~<TAG>', 'LSR~R0,R0,0', 'MOV~R8,R0', 'BLT~<TAG>', 'B~<TAG>',
 69 |             'ADC~R4,R0,0', 'STM~SP,{R0,R4}', 'BL~FOO', 'LDRB~R0,[R5+0]!', 'ORRS~R0,R0,R1',
 70 |             'TST~R11,0', 'BHI~<TAG>', 'ADD~SP,SP,0', 'LDRB~R6,[R4]', 'LSL~R2,R1,0',
 71 |             'POP~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 'CMP~R0,0', 'ADD~R1,R0,0', 'ANDS~R5,R8,0', 'AND~R6,R2,0',
 72 |             'AND~R2,R0,0', 'AND~R0,R0,0', 'ASR~R1,R0,0', 'SUB~R0,R6,0', 'BIC~R1,R7,0',
 73 |             'BGE~<TAG>', 'LDMIB~R6,{R2,R3}', 'SUBS~R7,R2,R0', 'AND~R0,R0,0', 'AND~R4,R1,0',
 74 |             'BL~FOO', 'STR~R6,[SP+0]', 'ADDS~R10,R10,0', 'AND~R2,R2,R1', 'CMN~R9,-0',
 75 |             'PUSH~{R11,LR}', 'LDM~R2,{R3,R6}', 'LDRB~R6,[R1+0]', 'ADC~R10,R10,0', 'BNE~<TAG>',
 76 |             'SUB~SP,SP,0', 'TST~R0,0', 'BNE~<TAG>', 'EOR~R0,R5,0', 'MLA~R3,R10,R6,R2']
 77 | 
 78 | x861codes = ['ADDQ~RSP,0', 'MOVQ~RDI,RBX', 'CMPQ~RDX,0', 'XORL~EAX,EAX', 'ANDQ~R15,-0',
 79 |             'ADDQ~R14,RAX', 'MOVQ~RDX,RCX', 'LEAQ~RAX,[RBX+0]', 'XORL~EDI,EDI', 'ADDL~EAX,-0',
 80 |             'SUBQ~RSP,0', 'SUBL~EAX,R15D', 'MOVL~EAX,EBX', 'CMPL~ESI,0', 'SHLL~EBX,0',
 81 |             'MOVUPS~[RAX+0],XMM0', 'TESTQ~RCX,RCX', 'ADCQ~RBX,0', 'ANDB~AL,0', 'CMPB~AL,0',
 82 |             'ORL~ECX,EAX', 'TESTB~DL,0', 'MOVZBL~EDX,[RAX+0]', 'SHRL~ESI,0', 'ORL~ESI,0',
 83 |             'IDIVL~EDI', 'CMOVNEQ~R13,R15', 'NEGQ~R10', 'SETE~DL', 'ORQ~RDX,RDI',
 84 |             'NOTL~ECX', 'MOVSLQ~R8,[R13+0]', 'CMOVBL~EAX,ESI', 'NOTQ~RDX', 'CMOVAL~R11D,EDI',
 85 |             'PSHUFD~XMM0,XMM1,0', 'PUNPCKLDQ~XMM0,[RIP+<TAG>]', 'SUBPD~XMM0,[RIP+<TAG>]', 'MOVAPD~[RSP],XMM1', 'XORPS~XMM0,XMM0',
 86 |             'MOVAPS~[RSP+0],XMM0', 'NOTB~BL', 'CALLQ~*RAX', 'MOVABSQ~RCX,0', 'POPQ~RBX',
 87 |             'MOVDQU~[RDI,RDX,0-0],XMM2', 'PSHUFLW~XMM3,XMM3,0', 'PSRAD~XMM2,0', 'PUNPCKLBW~XMM2,XMM2', 'PADDQ~XMM3,XMM1',
 88 |             'ADDQ~RSP,0', 'ADDQ~R12,0', 'ADDQ~RAX,-0', 'PADDQ~XMM0,XMM2', 'ADDQ~R13,0',
 89 |             'NOTQ~RCX', 'CLTQ', 'NEGL~R9D', 'SETE~R15B', 'SHRB~BL,0',
 90 |             'JMP~<TAG>', 'CALLQ~FOO', 'RETQ', 'JAE~<TAG>', 'JNE~<TAG>',
 91 |             'MOVB~AL,0', 'MOVL~EAX,0', 'MOVL~ESI,0', 'MOVZBL~EAX,BL', 'MOVL~EDI,<STR>',
 92 |             'XORL~ECX,ECX', 'XORL~EBX,EBX', 'XORL~EAX,EAX', 'XORL~R8D,R8D', 'XORL~EDI,EDI',
 93 |             'CMPL~R14D,0', 'CMPQ~RDX,RBX', 'CMPL~EAX,-0', 'CMPQ~R14,-0', 'CMPQ~[R12+0],0',
 94 |             'MOVW~<TAG>+[RIP+0],0', 'MOVQ~RCX,RBX', 'MOVZBL~EAX,[RIP+<TAG>]', 'MOVQ~RDI,R12', 'MOVQ~RBX,RDI',
 95 |             'LEAQ~RSI,[RBX+0]', 'LEAQ~RSI,[RCX+0]', 'LEAQ~RDI,[RSP+0]', 'LEAL~EDI,[RDX,RSI]', 'LEAL~EBP,[RSI-0]',
 96 |             'PUSHQ~RBP', 'PUSHQ~R15', 'PUSHQ~R12', 'PUSHQ~RBX', 'PUSHQ~R14',
 97 |             'JMPQ~*[RAX*0+<TAG>]', 'CALLQ~*[RIP+<TAG>]', 'CMOVGL~R8D,EDX', 'IMULL~EDI,EDX,0', 'SBBL~EBP,EBP']
 98 | 
 99 | x862codes = ['ADDQ~R14,R12', 'MOVQ~R14,RSP', 'CMPQ~[RIP+<TAG>],RAX', 'XORL~R14D,R14D', 'ANDQ~RAX,R15',
100 |             'ADDQ~RDX,-0', 'MOVQ~RAX,[R8+0]', 'LEAQ~R14,[R8+0]', 'XORL~EBX,EBX', 'ANDQ~RBP,-0',
101 |             'SUBQ~RBX,RAX', 'SUBL~ESI,EBP', 'MOVL~EDX,[RBX+0]', 'CMPL~R15D,0', 'SHLL~EDX,CL',
102 |             'MOVUPS~XMM0,[RIP+<TAG>]', 'TESTQ~R8,R8', 'ADCQ~R8,0', 'ANDB~[RBX+0],-0', 'CMPB~[RSP+0],0',
103 |             'ORL~EAX,R8D', 'TESTB~[RSI+0],0', 'MOVZBL~EDX,R13B', 'SHRL~ECX,0', 'ORL~EAX,0',
104 |             'IDIVL~R15D', 'CMOVNEQ~RAX,R10', 'NEGQ~RDX', 'SETE~CL', 'ORQ~RAX,R15',
105 |             'NOTL~EDX', 'MOVSLQ~RSI,EBP', 'CMOVBL~EAX,ECX', 'NOTQ~RAX', 'CMOVAL~EAX,ECX',
106 |             'PSHUFD~XMM2,XMM1,0', 'PUNPCKLDQ~XMM2,[RIP+<TAG>]', 'SUBPD~XMM2,[RIP+<TAG>]', 'MOVAPD~XMM5,XMM1', 'XORPS~XMM1,XMM1',
107 |             'MOVAPS~XMM0,[RIP+<TAG>]', 'NOTB~[R15]', 'CALLQ~FOO', 'MOVABSQ~RAX,0', 'POPQ~R13',
108 |             'MOVDQU~XMM0,[R12]', 'PSHUFLW~XMM0,XMM0,0', 'PSRAD~XMM0,0', 'PUNPCKLBW~XMM5,<TAG>', 'PADDQ~XMM2,XMM10',
109 |             'SUBL~ESI,EBP', 'SUBL~EAX,R15D', 'SUBL~ESI,[RSP+0]', 'SUBQ~RSP,0', 'SUBL~EAX,R12D',
110 |             'ADDL~R12D,-0', 'ORL~EBP,R12D', 'TESTQ~RAX,RAX', 'CMPB~[R15,RBX],0', 'ORB~DL,R8B',
111 |             'ANDL~EAX,0', 'XORL~ESI,ESI', 'POPQ~R15', 'MOVB~R15B,0', 'MOVZBL~ECX,[R14+0]',
112 |             'SHLQ~R8,0', 'POPQ~RBX', 'POPQ~R14', 'JE~<TAG>', 'JA~<TAG>',
113 |             'TESTB~[RBX+0],0', 'TESTL~EAX,EAX', 'TESTQ~RDI,RDI', 'ANDL~R11D,0', 'ANDL~R9D,0',
114 |             'ADDQ~R14,RBX', 'XORL~ESI,ESI', 'JAE~<TAG>', 'JMP~<TAG>', 'TESTQ~RAX,RAX',
115 |             'ANDL~ECX,-0', 'RETQ', 'CMPB~[R12],0', 'POPQ~RBX', 'TESTL~EAX,EAX',
116 |             'CMPQ~RBX,0', 'TESTB~BPL,0', 'SETNE~AL', 'JMP~<TAG>', 'CMPQ~RBP,0',
117 |             'POPQ~RBX', 'POPQ~RBP', 'POPQ~R15', 'POPQ~R12', 'XORL~EAX,EAX',
118 |             'SHRQ~R8,0', 'MOVUPS~[RCX],XMM0', 'SUBQ~RBP,RDI', 'JB~<TAG>', 'CALLQ~FOO']
119 | true_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
120 |                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
121 |                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
122 |                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
123 |                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
124 |                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
125 | num = 100
126 | 
127 | # cross-architecture instructions
128 | # 25 similar, 25 dissimilar
129 | arm_cross = ['ADD~R0,R5,0', 'ADD~R6,R0,0', 'ADC~R7,R1,0', 'ADD~SP,SP,0', 'ADDS~R0,R2,R0',
130 |              'ORR~R0,R0,0', 'ORR~R9,R9,0', 'AND~R0,R9,0', 'AND~R2,R2,0', 'ORRS~R3,R1,R2',
131 |              'SUBS~R0,R6,0', 'SUB~R3,R3,R8,LSL0', 'SUBS~R0,R7,R10', 'SUB~SP,SP,0', 'SUBS~R2,R10,0',
132 |              'CMP~R0,0', 'MOV~R0,R5', 'MOV~R6,0', 'LDR~R0,[R8]', 'LDR~R0,[R10+0]',
133 |              'BL~FOO', 'BNE~<TAG>', 'EOR~R1,R4,R9,ASR0', 'B~<TAG>', 'STR~R8,[R4]',
134 | 
135 |              'ADDNE~R0,R12,0', 'ADC~R6,R6,0', 'ADC~R1,R1,0', 'ADDS~R9,R9,0', 'ADDS~R0,R0,R7',
136 |              'ORR~R1,R1,R0,LSR0', 'ORRS~R0,R6,R7', 'AND~R0,R4,0', 'AND~R0,R3,R2', 'ORRS~R0,R0,R11',
137 |              'SUB~R11,R0,R3', 'SUBS~R2,R0,0', 'SUBS~R6,R0,R2', 'LDR~R2,[SP+0]', 'LDR~R0,[R6]',
138 |              'POP~{R4,LR}', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 'MOV~R9,R2', 'MOVEQ~R2,R1', 'CMP~R5,0',
139 |              'TST~R0,0', 'BL~FOO', 'BEQ~<TAG>', 'SBC~R7,R7,0', 'SMULL~R6,R5,R7,LR']
140 | 
141 | x86_cross = ['ADDQ~RAX,0', 'ADDQ~RBX,RAX', 'ADDL~ECX,-0', 'ADDQ~RSP,0', 'ADDL~EBP,0',
142 |              'XORL~EBP,EBP', 'ORQ~RCX,RDX', 'ANDL~EDX,0', 'TESTQ~RCX,RCX', 'ORQ~RSI,RBX',
143 |              'SUBQ~RAX,[RSP+0]', 'SUBQ~R12,RAX', 'SUBL~ESI,EBX', 'SUBQ~RSP,0', 'SUBL~ESI,R12D',
144 |              'CMPQ~[R14],0', 'MOVQ~RBX,RAX', 'MOVQ~RDI,RAX', 'LEAQ~RBX,[R14,RBP]', 'LEAQ~RSI,[RSP+0]',
145 |              'CALLQ~FOO', 'JNE~<TAG>', 'XORL~EDX,EDX', 'JMP~<TAG>', 'MOVQ~RAX,[RBX]',
146 | 
147 |              'CMOVLL~ECX,EAX', 'PUSHQ~RBX', 'PSUBQ~XMM0,XMM1', 'XORL~EAX,EAX', 'SUBQ~RSP,0',
148 |              'POPQ~RBP', 'MOVQ~RDI,R13', 'TESTB~AL,AL', 'CMOVNEQ~RBX,RCX', 'IMULQ~RBX,R8',
149 |              'CMPQ~[RSP+0],0', 'CMPL~EAX,0', 'MOVZWL~EAX,[R13+0]', 'PUSHQ~R15', 'JS~<TAG>',
150 |              'BTQ~R14,RAX', 'JA~<TAG>', 'DIVB~[RSP+0]', 'JNE~<TAG>', 'MOVSLQ~R13,EBP',
151 |              'SUBL~EAX,[RIP+<TAG>]', 'LEAL~EAX,[R14,RBP]', 'RETQ', 'MOVQ~RDI,RAX', 'ADDQ~RBX,RBP']
152 | 
153 | cross_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154 |                 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
155 |                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
156 | 
157 | armdict = {}
158 | x86dict = {}
159 | 
160 | 
161 | # Method to read from embedding files
162 | # Stored as [instruction]:embedding pairs
def read_file(fname, dictionary, dim):
    """Load a text-format embedding file into ``dictionary``.

    Every data line has the form ``<instruction> <v1> <v2> ...`` with
    whitespace-separated fields. The two-field header row, the ``</s>``
    and ``<unk>`` pseudo-tokens and blank lines are skipped.

    Args:
        fname: Path of the embedding file to read.
        dictionary: Mapping updated in place with
            ``instruction -> numpy array of floats``.
        dim: Embedding dimension. Kept for interface compatibility; the
            vector length is taken from the fields found on each line.

    Raises:
        OSError: If ``fname`` cannot be opened.
        ValueError: If a feature field is not a valid float.
    """
    with open(fname) as f:
        for line in f:
            # split() without a separator drops the trailing newline and
            # collapses runs of spaces, so there is no dangling "\n" field
            # to delete by index and a blank line simply yields [].
            items = line.split()

            # skip blank lines and the header row
            if not items or len(items) == 2:
                continue

            # skip the pseudo-tokens written by the embedding trainer
            if "</s>" in items or "<unk>" in items:
                continue

            instr = items[0]
            emb = numpy.array([float(feat) for feat in items[1:]])

            # turn corrupted values to 0
            if numpy.any(numpy.isnan(emb)):
                print("Corrupted!: %s" % instr)
                emb = numpy.nan_to_num(emb)

            dictionary[instr] = emb
191 | 
192 | 
193 | ##                          ##
194 | #   Construct dictionaries   #
195 | ##                          ##
196 | print("Constructing dictionaries from files...")
197 | read_file(arm_emb, armdict, dim)
198 | read_file(x86_emb, x86dict, dim)
199 | 
200 | 
201 | ##                           ##
202 | #   Instr Cosine Similarity   #
203 | ##                           ##
204 | print("Calculating cosine similarities...")
205 | 
def cos_sim(out_list, dictry, codes1, codes2):
    """Append the cosine similarity of each instruction pair to ``out_list``.

    Pairs are formed positionally from ``codes1`` and ``codes2``; each pair
    is also printed together with its score.

    Args:
        out_list: List extended in place with one score per pair.
        dictry: Mapping from instruction string to its embedding array.
        codes1: First instruction of every pair.
        codes2: Second instruction of every pair.

    Raises:
        KeyError: If an instruction is missing from ``dictry``.
    """
    for x, y in zip(codes1, codes2):
        # cosine_similarity works on 2-D (samples x features) inputs
        emb1 = dictry[x].reshape(1, -1)
        emb2 = dictry[y].reshape(1, -1)

        # The result is a 1x1 array; take the scalar out before formatting
        # so "%f" never has to convert a 2-D array to a float.
        sim = pairwise.cosine_similarity(emb1, emb2)[0, 0]

        print("%s %s     %f" % (x, y, sim))
        out_list.append(sim)
218 | 
# mono-architecture scores: both instructions come from the same dictionary
arm_cos = []
cos_sim(arm_cos, armdict, arm1codes, arm2codes)

x86_cos = []
cos_sim(x86_cos, x86dict, x861codes, x862codes)

# cross-architecture scores: the ARM instruction is looked up in armdict and
# the x86 instruction in x86dict, so the single-dictionary cos_sim() helper
# cannot be used here
cross_cos = []
for x, y in zip(arm_cross, x86_cross):
    emb1 = armdict[x].reshape(1, -1)
    emb2 = x86dict[y].reshape(1, -1)

    # cosine_similarity returns a 1x1 array; extract the scalar before it is
    # formatted with "%f" and stored
    sim = pairwise.cosine_similarity(emb1, emb2)[0, 0]

    print("%s %s       %f" % (x, y, sim))
    cross_cos.append(sim)
236 | 
# ROC curves for the three test sets (label 1 marks a similar pair)
fpra, tpra, thresholdsa = metrics.roc_curve(true_labels, arm_cos, pos_label=1)
fprx, tprx, thresholdsx = metrics.roc_curve(true_labels, x86_cos, pos_label=1)
fprc, tprc, thresholdsc = metrics.roc_curve(cross_labels, cross_cos, pos_label=1)

# area under each curve
auca = metrics.auc(fpra, tpra)
aucx = metrics.auc(fprx, tprx)
aucc = metrics.auc(fprc, tprc)


def _show_roc_figure(title, curves):
    """Draw and show one ROC figure.

    ``curves`` is a sequence of ``(fpr, tpr, legend_label)`` tuples that are
    plotted in order, followed by the dashed red diagonal.
    """
    plt.title(title)
    for fpr, tpr, caption in curves:
        plt.plot(fpr, tpr, label=caption)

    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel("True Positive Rate")
    plt.xlabel("False Positive Rate")
    plt.show()


_show_roc_figure(
    "Mono-Architecture Instruction-Level Similarity",
    [
        (fpra, tpra, "ARM Instructions, AUC=%f" % auca),
        (fprx, tprx, "X86 Instructions, AUC=%f" % aucx),
    ],
)

_show_roc_figure(
    "Cross-Architecture Instruction-Level Similarity",
    [(fprc, tprc, "ARM-X86 Instructions, AUC=%f" % aucc)],
)
268 | 


--------------------------------------------------------------------------------
/output/out.arm.We.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.arm.We.zip


--------------------------------------------------------------------------------
/output/out.arm.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.arm.zip


--------------------------------------------------------------------------------
/output/out.x86.We.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.x86.We.zip


--------------------------------------------------------------------------------
/output/out.x86.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.x86.zip


--------------------------------------------------------------------------------
/query.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """
  4 |     Query Program for Word Similarity.
  5 |     - Query the most similar instructions (according to
  6 |       cosine similarity) within and across architectures:
  7 |         ARM-ARM
  8 |         ARM-x86
  9 |         x86-x86
 10 |         x86-ARM
 11 |     - Print the top 6 same-architecture matches (#1 is the query itself) and the top 5 cross-architecture matches, with their similarity scores.
 12 | """
 13 | 
 14 | import math
 15 | import numpy
 16 | from scipy import spatial
 17 | from sklearn import metrics
 18 | from sklearn.metrics import pairwise
 19 | import matplotlib.pyplot as plt
 20 | 
 21 | 
 22 | ###      CHANGE FILES HERE      ###
 23 | 
 24 | # Files containing embeddings
 25 | arm_out_emb = "output/out.arm"
 26 | x86_out_emb = "output/out.x86"
 27 | 
 28 | # Dimension size
 29 | dim = 200
 30 | 
 31 | # Instructions to query
 32 | arm_query = ['ADD~SP,SP,0', 'SUB~SP,SP,0',
 33 |             'LDR~R0,[R5+0]', 'BL~FOO', 
 34 |             'LDRNE~R4,[SP+0]', 'ADD~R1,R0,R7',
 35 |             'BLT~<TAG>', 'BEQ~<TAG>', 'MOV~R0,R5',
 36 |             'MOV~R8,R2', 'ADD~R1,R0,R7', 'SUB~SP,SP,0',
 37 |             'LDR~R0,[R5+0]',  'MVN~R0,0', 'CMP~R8,0']
 38 | x86_query = ['ADDQ~RSP,0', 'SUBQ~RSP,0',
 39 |             'MOVQ~RDI,[R12+0]','CALLQ~FOO',
 40 |             'CMOVNEQ~RCX,RAX', 'ADDQ~RSI,R12',
 41 |             'JLE~<TAG>', 'JE~<TAG>', 'MOVL~EAX,R14D',
 42 |             'MOVQ~R13,RDX', 'ADDQ~R13,RBX', 'SUBQ~RSP,0',
 43 |             'MOVQ~RDI,[R12+0]', 'MOVL~EAX,-0', 'CMPL~R13D,0']
 44 | noinstr = 15
 45 | 
 46 | 
 47 | ##                        ##
 48 | #   ARM/x86 Dictionaries   #
 49 | ##                        ##
 50 | armdict = {}
 51 | x86dict = {}
 52 | armlist = []
 53 | x86list = []
 54 | 
 55 | 
 56 | # Method to read from embedding files
 57 | # Stored as [instruction]:embedding pairs
 58 | def read_file(fname, dictionary, dim, lst):
 59 |     with open(fname) as f:
 60 |         for line in f:
 61 |             items = line.split(" ")
 62 | 
 63 |             # remove header row
 64 |             if len(items) == 2:
 65 |                 continue
 66 | 
 67 |             if "</s>" in items:
 68 |                 continue
 69 | 
 70 |             if "<unk>" in items:
 71 |                 continue
 72 | 
 73 |             if "\n" in items:
 74 |                 endline = int(dim)+1
 75 |                 del items[endline]
 76 | 
 77 |             instr = items[0]
 78 |             emb = numpy.array([float(feat) for feat in items[1:]])
 79 | 
 80 |             # turn corrupted values to 0
 81 |             if numpy.any(numpy.isnan(emb)):
 82 |                 print("Corrupted!: %s" % instr)
 83 |                 #emb = numpy.nan_to_num(emb)
 84 | 
 85 |             dictionary[instr] = emb
 86 |             lst.append(instr)
 87 | 
 88 | 
 89 | ##                          ##
 90 | #   Construct dictionaries   #
 91 | ##                          ##
 92 | print("Constructing dictionaries from files...")
 93 | read_file(arm_out_emb, armdict, dim, armlist)
 94 | read_file(x86_out_emb, x86dict, dim, x86list)
 95 | 
 96 | 
 97 | ##                               ##
 98 | #   Calculate similarity scores   #
 99 | ##                               ##
100 | print("Calculating similarity scores...")
101 | 
# vocabulary sizes; these are the column counts of the tables below
arm = len(armlist)
x86 = len(x86list)

# One row per query instruction, one column per vocabulary entry.
# The tables start out as zeros rather than uninitialised memory, so a
# column that is never filled cannot win the later arg-max with a garbage
# value.
arm2arm = numpy.zeros([noinstr, arm])
arm2x86 = numpy.zeros([noinstr, x86])
x862x86 = numpy.zeros([noinstr, x86])
x862arm = numpy.zeros([noinstr, arm])
109 | 
def cosinesim(instr1, instr2):
    """Return the cosine similarity between two embedding vectors.

    Both inputs are reshaped to a single row (``reshape(1, -1)``) because
    ``pairwise.cosine_similarity`` operates on 2-D arrays; its array result
    is returned unchanged.
    """
    row1, row2 = instr1.reshape(1, -1), instr2.reshape(1, -1)
    return pairwise.cosine_similarity(row1, row2)
115 | 
116 | 
117 | # Calc similarities for each query instruction
# Fill one table row per query instruction.
# Columns follow armlist / x86list, the same lists that are later used to
# turn a column index back into an instruction name, so a score and its
# name always refer to the same vocabulary entry.
for row, instr in enumerate(arm_query):
    emb = armdict[instr]

    for index, name in enumerate(armlist):
        # cosinesim() returns an array; store its single element
        arm2arm[row, index] = cosinesim(emb, armdict[name])[0, 0]

    for index, name in enumerate(x86list):
        arm2x86[row, index] = cosinesim(emb, x86dict[name])[0, 0]

for row, instr in enumerate(x86_query):
    emb = x86dict[instr]

    for index, name in enumerate(x86list):
        x862x86[row, index] = cosinesim(emb, x86dict[name])[0, 0]

    for index, name in enumerate(armlist):
        x862arm[row, index] = cosinesim(emb, armdict[name])[0, 0]
147 | 
148 | 
149 | ##                               ##
150 | #   Top 5 Similar Instructions   #
151 | ##                               ##
152 | 
153 | print("Returning top 5 similar instructions...")
154 | 
def _top_matches(scores, names, count):
    """Return the ``count`` best ``(name, score)`` pairs, best first.

    Ranking is done on the full score row, so every index still refers to
    the matching position in ``names``. (Deleting an element from the row
    after each hit would shift later indices while ``names`` stays the same
    length, and mislabel every hit after the first.) The stable sort keeps
    the earlier entry first among equal scores.
    """
    order = numpy.argsort(-scores, kind="stable")[:count]
    return [(names[pos], scores[pos]) for pos in order]


def _print_matches(heading, instr, scores, names, count):
    """Print a header for ``instr`` followed by its ``count`` best matches."""
    print("\t%s\t%s" % (heading, instr))
    for word, cos in _top_matches(scores, names, count):
        print("%s\t\t\t%f" % (word, cos))
    print("\n")


# Six same-architecture hits are printed because the best one is expected
# to be the query instruction itself; five hits for the other architecture.
for row, instr in enumerate(arm_query):
    _print_matches("ARM-ARM", instr, arm2arm[row], armlist, 6)
    _print_matches("ARM-X86", instr, arm2x86[row], x86list, 5)

for row, instr in enumerate(x86_query):
    _print_matches("X86-X86", instr, x862x86[row], x86list, 6)
    _print_matches("X86-ARM", instr, x862arm[row], armlist, 5)
212 | 


--------------------------------------------------------------------------------
/senvec.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | """
  4 |     Sentence vector program.
  5 |     Takes word embeddings and raw instruction file.
  6 |     Stores word embeddings in dictionary.
  7 |     Scans raw instruction file and queries each embedding.
  8 |     Embeddings are summed per basic block.
  9 |     Basic blocks are compared across architecture languages.
 10 | """
 11 | 
 12 | import math
 13 | import numpy
 14 | from numpy import linalg
 15 | from scipy import spatial
 16 | from sklearn import metrics
 17 | from sklearn.metrics import confusion_matrix
 18 | import matplotlib.pyplot as plt
 19 | 
 20 | 
 21 | ###      CHANGE FILES HERE      ###
 22 | 
 23 | # files containing word + embedding
 24 | param_file1a = "output/out.arm"
 25 | param_file1x = "output/out.x86"
 26 | 
 27 | # files containing test set of instructions
 28 | test_arm = "data/test.arm"
 29 | test_x86 = "data/test.x86"
 30 | 
 31 | # ground truth labels for test set
 32 | test_labels = "data/test_labels.txt"
 33 | 
 34 | # dimension size
 35 | dim = 200
 36 | 
 37 | 
 38 | # test set raw-text basic blocks
 39 | armtest = []
 40 | x86test = []
 41 | 
 42 | # dictionary for each trained lang/model
 43 | armdict1 = {}
 44 | x86dict1 = {}
 45 | 
 46 | 
 47 | """
 48 |     Store the raw instructions from test files.
 49 |     Each line is a basic block.
 50 | """
 51 | with open(test_arm) as f:
 52 |     for line in f:
 53 |         armtest.append(line.split())
 54 | 
 55 | with open(test_x86) as f:
 56 |     for line in f:
 57 |         x86test.append(line.split())
 58 | 
 59 | 
 60 | """
 61 |     Each model will have its words + embeddings
 62 |     stored into a dictionary.
 63 | """
 64 | 
 65 | def read_file(fname, dictionary, dim):
 66 |     with open(fname) as f:
 67 |         for line in f:
 68 | 
 69 |             # separate instructions and features
 70 |             items = line.split(" ")
 71 | 
 72 |             # remove header row
 73 |             if len(items) == 2:
 74 |                 continue
 75 | 
 76 |             if "</s>" in items:
 77 |                 continue
 78 | 
 79 |             if "<unk>" in items:
 80 |                 continue
 81 | 
 82 |             if "\n" in items:
 83 |                 endline = int(dim)+1
 84 |                 del items[endline]
 85 | 
 86 |             instr = items[0]
 87 |             emb = numpy.array([float(feat) for feat in items[1:]])
 88 | 
 89 |             # turn corrupted read-in values to 0
 90 |             if numpy.any(numpy.isnan(emb)):
 91 |                 print("Corrupted!: %s" % instr)
 92 |                 emb = numpy.nan_to_num(emb)
 93 | 
 94 |             dictionary[instr] = emb
 95 | 
 96 | 
 97 | # Read in embeddings from each architecture file
 98 | print("Reading in embedding files...")
 99 | read_file(param_file1a, armdict1, dim)
100 | read_file(param_file1x, x86dict1, dim)
101 | 
102 | 
103 | # create sentence embedding for each bb
104 | # find an instruction, query the dict
105 | # sum instr embeddings to form a bb
def calculateSentenceVectors(testset, dim, dictry):
    """Build one summed embedding per basic block.

    Args:
        testset: Iterable of basic blocks, each an iterable of instruction
            strings.
        dim: Embedding dimension (converted with ``int``).
        dictry: Mapping from instruction string to its embedding array.

    Returns:
        A list with one ``(1, dim)`` array per block: the sum (not the
        average) of the embeddings of the instructions found in ``dictry``.
        Instructions missing from ``dictry`` are ignored. A block with no
        known instruction is reported and represented by a vector filled
        with 0.1, because an all-zero vector cannot be used for cosine
        similarity.
    """
    block_vectors = []
    for block in testset:
        total = numpy.zeros([1, int(dim)])

        # embeddings of the instructions this model knows about
        known = [dictry[token] for token in block if token in dictry]
        for vec in known:
            total += vec

        if not known:
            print("Empty BB: no instructions found in dictionary.")
            # placeholder for a block without any usable embedding
            total.fill(0.1)

        block_vectors.append(total)

    return block_vectors
135 | 
136 | 
137 | print("Calculating sentence vectors...")
138 | bbfinal_a1 = calculateSentenceVectors(armtest, dim, armdict1)
139 | bbfinal_x1 = calculateSentenceVectors(x86test, dim, x86dict1)
140 | 
141 | 
142 | ###                 ###
143 | #  Cosine Similarity  #
144 | ###                 ###
145 | 
146 | """
147 |     - Calculate cosine similarity between ARM/X86
148 |         basic blocks
149 |     - Plot ROC based on true labels
150 | """
151 | 
# ground truth labels, one integer per non-blank line of the label file
true_labels = []

with open(test_labels) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # a blank (e.g. trailing) line carries no label; int('') would
            # raise ValueError
            continue
        # the fields of a line are concatenated before conversion
        true_labels.append(int(''.join(fields)))
160 | 
161 | 
162 | # method to calculate cos similarity / ROC curve
def ROC(bbfinal_a, bbfinal_x):
    """Score each ARM/x86 basic-block pair and compute the ROC curve.

    Args:
        bbfinal_a: Sequence of ARM basic-block vectors.
        bbfinal_x: Sequence of x86 basic-block vectors, paired positionally
            with ``bbfinal_a``.

    Returns:
        The result of ``metrics.roc_curve`` for the cosine similarities of
        the pairs against the module-level ``true_labels``, with 1 as the
        positive label.
    """
    cos_sim = []
    for bb1, bb2 in zip(bbfinal_a, bbfinal_x):
        # Flatten first: the distance function takes 1-D vectors, so a
        # (1, dim) block vector must not be passed as a 2-D array.
        vec1 = numpy.asarray(bb1).ravel()
        vec2 = numpy.asarray(bb2).ravel()

        # cosine() is a distance; similarity is its complement
        sim = 1 - spatial.distance.cosine(vec1, vec2)
        cos_sim.append(sim)

    return metrics.roc_curve(true_labels, cos_sim, pos_label=1)
170 | 
171 | 
print("Calculating similarity scores, ROC...")
fpr1, tpr1, thresholds1 = ROC(bbfinal_a1, bbfinal_x1)
auc1 = metrics.auc(fpr1, tpr1)
print("AUC avg cos: %f" % auc1)

# ROC curve of the basic-block similarity test, with the AUC in the legend
roc_caption = "Cosine Similarity, AUC=%f" % auc1
unit_range = [0, 1]

plt.title('ARM-X86 Basic Block Similarity')
plt.plot(fpr1, tpr1, label=roc_caption)
plt.legend(loc='lower right')

# dashed red diagonal from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], 'r--')

plt.xlim(list(unit_range))
plt.ylim(list(unit_range))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
188 | 


--------------------------------------------------------------------------------
/tsne2.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | """
  4 |     t-SNE to create a visualization of x86, arm embeddings.
  5 |     Output: 2 t-SNE plots
  6 |         1) unlabeled plot displaying all instructions
  7 |         2) labeled plot displaying selected instructions
  8 | """
  9 | 
 10 | from sklearn.manifold import TSNE
 11 | import matplotlib
 12 | import matplotlib.pyplot as plt
 13 | from sklearn.decomposition import PCA
 14 | 
 15 | import math
 16 | import numpy
 17 | from numpy import dot
 18 | from numpy.linalg import norm
 19 | from adjustText import adjust_text
 20 | 
 21 | 
 22 | ##                          ##
 23 | #            data            #
 24 | ##                          ##
 25 | 
 26 | 
 27 | ###      CHANGE FILES HERE      ###
 28 | 
 29 | DATA_PATH_ARM = 'output/out.arm'
 30 | DATA_PATH_X86 = 'output/out.x86'
 31 | VECTOR_SIZE = 200
 32 | 
 33 | 
 34 | # select instructions to plot only a few
 35 | armcodes = ['ADD~R1,R0,R7', 'SUB~SP,SP,0',
 36 |             'LDR~R0,[R5+0]', 
 37 |             'CMP~R8,0', 'POP~{R4,LR}',
 38 |             'B~<TAG>', 'BL~FOO']
 39 | x86codes = ['ADDQ~R13,RBX', 'SUBQ~RSP,0',
 40 |             'MOVQ~RDI,[R12+0]', 
 41 |             'CMPL~R13D,0', 'POPQ~RBP',
 42 |             'JMP~<TAG>', 'CALLQ~FOO']
 43 | instr_ct = 7
 44 | 
 45 | 
 46 | all_arm_codes = []
 47 | all_x86_codes = []
 48 | 
 49 | armdict = {}
 50 | x86dict = {}
 51 | 
 52 | arm_embedding_matrix = numpy.empty([instr_ct, VECTOR_SIZE])
 53 | x86_embedding_matrix = numpy.empty([instr_ct, VECTOR_SIZE])
 54 | 
 55 | 
 56 | ##                          ##
 57 | #            code            #
 58 | ##                          ##
 59 | 
 60 | # construct ARM dictionary
 61 | print("Constructing arm dictionary...")
 62 | with open(DATA_PATH_ARM) as f:
 63 |     for line in f:
 64 |         items = line.split(" ")
 65 | 
 66 |         # remove header row
 67 |         if len(items) == 2:
 68 |             dim1 = items[1]
 69 |             continue
 70 | 
 71 |         if "</s>" in items:
 72 |             continue
 73 | 
 74 |         if "\n" in items:
 75 |             endline = int(dim1)+1
 76 |             del items[endline]
 77 | 
 78 |         instr = items[0]
 79 |         emb = numpy.array([float(feat) for feat in items[1:]])
 80 | 
 81 |         armdict[instr] = emb
 82 | 
 83 | # construct X86 dictionary
 84 | print("Constructing x86 dictionary...")
 85 | with open(DATA_PATH_X86) as f:
 86 |     for line in f:
 87 |         items = line.split(" ")
 88 | 
 89 |         # remove header row
 90 |         if len(items) == 2:
 91 |             dim1 = items[1]
 92 |             continue
 93 | 
 94 |         if "</s>" in items:
 95 |             continue
 96 | 
 97 |         if "\n" in items:
 98 |             endline = int(dim1)+1
 99 |             del items[endline]
100 | 
101 |         instr = items[0]
102 |         emb = numpy.array([float(feat) for feat in items[1:]])
103 | 
104 |         x86dict[instr] = emb
105 | 
106 | 
107 | ##                    ##
108 | #     TSNE and PLOT    #
109 | ##                    ##
110 | 
print("Compiling all embeddings together...")

# One row per instruction, in dictionary iteration order.
arm_matrix = numpy.empty([len(armdict), VECTOR_SIZE])
for row, embedding in enumerate(armdict.values()):
    arm_matrix[row] = embedding

x86_matrix = numpy.empty([len(x86dict), VECTOR_SIZE])
for row, embedding in enumerate(x86dict.values()):
    x86_matrix[row] = embedding

# Stack both architectures: ARM rows first, x86 rows after them.
arm_rows = len(arm_matrix)
final_len = len(armdict) + len(x86dict)
final_matrix = numpy.empty([final_len, VECTOR_SIZE])
final_matrix[:arm_rows] = arm_matrix
final_matrix[arm_rows:] = x86_matrix
index = arm_rows + len(x86_matrix)

### final_matrix contains ALL embeddings

# sanity check: expected row count vs rows actually copied
print("Final len: %d Matrix size: %d" % (final_len, index))

# Dimension reduction in two stages: PCA down to 50 components,
# then t-SNE down to the 2 plotted coordinates.
print("Running PCA...")
pca = PCA(n_components=50)
new_final_matrix = pca.fit_transform(final_matrix)

print("Running TSNE...")
tsne = TSNE(n_components=2)
tsne_matrix = tsne.fit_transform(new_final_matrix)
151 | 
152 | 
# Plot settings
matplotlib.rcParams.update({'font.size': 7})

# The labeled plot shows only the queried instructions, not every
# instruction, so separate (x, y) arrays are reserved for that small sample.
selection_shape = [instr_ct, 2]
plot_arm_matrix = numpy.zeros(selection_shape)
plot_x86_matrix = numpy.zeros(selection_shape)
plot_all_matrix = numpy.zeros([(instr_ct * 2), 2])
plot_arm_index = plot_x86_index = plot_all_index = 0

armlabels, x86labels, alllabels = [], [], []

# Every instruction name: ARM keys first, then x86 keys.
instr_list = list(armdict.keys()) + list(x86dict.keys())
178 | 
179 | 
###     TSNE - All Instructions     ###

print("Plotting ALL instructions:")
print("Setting scatter plot...")

# The first len(armdict) rows of tsne_matrix are the ARM points and the
# remaining rows are the x86 points; copy each group into its own array.
arm_total = len(armdict)
final_arm_matrix = numpy.zeros([arm_total, 2])
final_x86_matrix = numpy.zeros([len(x86dict), 2])
final_arm_matrix[:] = tsne_matrix[:arm_total]
final_x86_matrix[:] = tsne_matrix[arm_total:]

# ARM as blue circles, x86 as red triangles, both mostly transparent.
for points, shape, colour in ((final_arm_matrix, "o", 'blue'),
                              (final_x86_matrix, "^", 'red')):
    plt.scatter(points[:, 0], points[:, 1], s=50, marker=shape, c=colour, alpha=0.1)

plt.xticks([])
plt.yticks([])
plt.show()
204 | 
###     TSNE - Selected Instructions    ###

# Gather the coordinates of the selected instructions into lists and size
# the arrays from what was actually found. With fixed pre-zeroed arrays, a
# selected instruction missing from the embeddings left an all-zero row that
# was drawn as an unlabeled marker at (0, 0), and extra matches would have
# overflowed the arrays.
arm_selected = set(armcodes)
x86_selected = set(x86codes)
arm_points = []
x86_points = []

for key, x, y in zip(instr_list, tsne_matrix[:,0], tsne_matrix[:,1]):
    if key in arm_selected:
        arm_points.append((x, y))
        armlabels.append(key)

    if key in x86_selected:
        x86_points.append((x, y))
        x86labels.append(key)

# reshape keeps the (n, 2) layout even when nothing was found
plot_arm_matrix = numpy.array(arm_points, dtype=float).reshape(-1, 2)
plot_x86_matrix = numpy.array(x86_points, dtype=float).reshape(-1, 2)
plot_arm_index = len(arm_points)
plot_x86_index = len(x86_points)

# Report selections that could not be plotted rather than dropping them silently.
missing = [code for code in armcodes + x86codes
           if code not in armlabels and code not in x86labels]
if missing:
    print("Selected instructions not found in the embeddings: %s" % ", ".join(missing))

# Plot selected instructions only
print("Plotting selected instructions only:")
print("Setting scatter plot...")

# ARM labels hang below-right of their point, x86 labels above-left,
# so labels of nearby points overlap less.
for label, x, y in zip(armlabels, plot_arm_matrix[:,0], plot_arm_matrix[:,1]):
    plt.annotate(
        label,
        size = 12,
        xy = (x, y),
        ha = 'left',
        va = 'top')

for label, x, y in zip(x86labels, plot_x86_matrix[:,0], plot_x86_matrix[:,1]):
    plt.annotate(
        label,
        size = 12,
        xy = (x, y),
        ha = 'right',
        va = 'bottom')

plt.scatter(plot_arm_matrix[:,0], plot_arm_matrix[:,1], s=50, marker="o", c='blue', alpha=0.8)
plt.scatter(plot_x86_matrix[:,0], plot_x86_matrix[:,1], s=50, marker="^", c='red', alpha=0.8)


plt.xticks([])
plt.yticks([])
plt.show()
245 | 
246 | 


--------------------------------------------------------------------------------