├── README.md ├── data ├── test.arm ├── test.x86 └── test_labels.txt ├── instr_sim.py ├── output ├── out.arm.We.zip ├── out.arm.words ├── out.arm.zip ├── out.x86.We.zip ├── out.x86.words └── out.x86.zip ├── query.py ├── senvec.py └── tsne2.py /README.md: -------------------------------------------------------------------------------- 1 | # cross-arch-instr-model.github.io 2 | 3 | Thank you for looking at our work! 4 | The programs included here were created for the following paper: 5 | 6 | "A Cross-Architecture Instruction Embedding Model for Natural Language Processing-Inspired Binary Code Analysis" 7 | 8 | Kimberly Redmond, Lannan Luo, and Qiang Zeng 9 | 10 | The NDSS Workshop on Binary Analysis Research (BAR), 2019. 11 | 12 | ############################ 13 | 14 | The trained cross-architecture instruction embedding model used in our paper are included in the output/ directory. Please remember to unzip the four output files. 15 | 16 | Our embeddings were trained on the model Bivec, which is based on Word2Vec. 17 | You may find it here: 18 | 19 | https://github.com/lmthang/bivec 20 | 21 | ############################ 22 | 23 | ABOUT THESE PROGRAMS 24 | 25 | All file paths and instruction selections are hard-coded into these programs. For your 26 | convenience, they are listed in variables near the top; feel free to modify them for your use. 27 | 28 | ./senvec.py 29 | 30 | Returns ROC plots and AUC scores for cross-architecture basic block similarity tests. 31 | Basic block embeddings are calculated by summing instruction embeddings within a block 32 | 33 | Similarity is computed using Cosine similarity 34 | 35 | ./tsne2.py 36 | 37 | Returns 2 t-SNE figures with different displays: 38 | 1) an unlabeled figure displaying all instructions in one vector space 39 | 2) a labeled figure displaying selected instructions in one vector space 40 | 41 | ./instr_sim.py 42 | 43 | Returns 2 ROC plots and AUC scores for instruction-level similarity tests. 44 | Instructions are evaluated in pairs, in 2 ways: 45 | 1) mono-architecture 46 | 2) cross-architecture 47 | 48 | The similarity metric used is cosine similarity. 49 | 50 | ./query.py 51 | 52 | Returns a list of the top-5 most similar instructions, given an instruction. 53 | Each instruction returns the top 6 instructions from its own architecture 54 | (#1 is itself), and the top 5 instructions from the other architecture, 55 | according to cosine similarity. 56 | -------------------------------------------------------------------------------- /instr_sim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Kim Redmond 5 | A program to evaluate cosine similarity 6 | between INSTRUCTIONS, not basic blocks. 7 | These instructions are randomly hand-chosen. 8 | - ARM-ARM (100) 9 | - x86-x86 (100) 10 | - ARM-x86 (50) 11 | Test accuracy by plotting ROC/AUC. 12 | """ 13 | 14 | import math 15 | import numpy 16 | from scipy import spatial 17 | from sklearn import metrics 18 | from sklearn.metrics import pairwise 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | ### CHANGE FILES HERE ### 23 | 24 | arm_emb = "output/out.arm" 25 | x86_emb = "output/out.x86" 26 | dim = 200 27 | 28 | 29 | ### 30 | # Instruction pairs to test... 31 | ### 32 | 33 | # mono-architecture 34 | # 50 similar, 50 dissimilar 35 | 36 | arm1codes = ['ADD~R0,R5,R7', 'MOV~R1,0', 'CMP~R0,0', 'ORR~R0,R0,0', 'AND~R12,R6,0', 37 | 'ADD~R0,R9,R7,LSL0', 'MOV~R0,R4', 'LDR~R0,[R0]', 'ORRS~R0,R5,R7', 'ADDS~R1,R1,0', 38 | 'SBCS~R2,R2,0', 'SUBS~R2,R1,0', 'MOVEQ~R5,0', 'LDRB~R1,[R6+R7]', 'MVN~R12,0', 39 | 'MOVNE~R2,R4', 'STM~R2,{R0,R1,R5}', 'ADC~R9,R5,R4', 'LSL~R1,R3,0', 'EOR~R11,R6,R1', 40 | 'STMIB~SP,{R0,R7}', 'LDRLO~R1,[SP+0]', 'LDM~R7,{R3,R7}', 'RSBS~R3,R3,0', 'ORREQ~R6,R11,0', 41 | 'ANDNE~R3,R3,0', 'MOVGE~R7,0', 'RSCS~R1,R10,0', 'ADDEQ~R8,R8,0', 'TST~R0,0', 42 | 'LDRBNE~R7,[R9+R7]', 'STR~R6,[R0]', 'STRB~R0,[R2-0]', 'UMULL~R1,R2,R0,R3', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 43 | 'BIC~SP,SP,0', 'MLA~R10,R2,R4,R0', 'SBC~R3,R12,0', 'MOV~R2,R7', 'MUL~R1,R0,R2', 44 | 'LDMIB~R7,{R2,R3}', 'MOVLE~R0,R1', 'ASR~R0,R0,0', 'LSLGE~R2,R3,R7', 'POP~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 45 | 'MVNNE~R7,0', 'ASRNE~R6,R6,0', 'STMNE~R0,{R6,R8}', 'SUB~R0,R8,R0', 'LDREQ~R0,', 46 | 'ADD~R0,R5,R7', 'ADDS~R1,R1,0', 'ADD~R0,SP,0', 'ADDS~R4,R4,0', 'ADD~R9,R10,R10,LSL0', 47 | 'ADD~R10,SP,0', 'ADD~R11,SP,0', 'ADD~R1,SP,0', 'ADD~R0,R0,R4', 'ADDS~R2,R11,R0', 48 | 'BL~FOO', 'BEQ~', 'MVN~R2,0', 'UMULL~R2,R3,R0,R1', 'ADC~R0,R5,R6', 49 | 'LDR~R1,', 'LDR~R1,[R5]', 'MOV~R9,R8', 'MOVNE~R0,0', 'B~', 50 | 'PUSH~{R4,R5,R6,R7,R8,LR}', 'RSCS~R3,R1,R4,ASR0', 'BHS~', 'ORRSNE~R2,R2,R3', 'CMN~R0,0', 51 | 'ORR~R8,R1,R8', 'ORR~R10,R3,R10', 'ORRS~R1,R0,R2', 'ADD~R6,SP,0', 'BLO~', 52 | 'MOV~R2,R5', 'MOV~R2,0', 'MOV~R0,R9', 'MOV~R6,R4', 'MOV~R0,R8', 53 | 'STRH~R0,[SP+0]', 'CMPNE~R1,0', 'SUBS~R2,R4,0', 'MOVNE~R1,R4', 'ORR~R1,R1,0', 54 | 'POP~{R4,R5,R6,LR}', 'UMULL~R3,R7,R0,R4', 'UMULL~R6,R5,R3,R4', 'CMPEQ~R6,0', 'MOV~R0,R10', 55 | 'ASR~R11,R0,0', 'ORRNE~R6,R8,R6,LSL0', 'LDR~R2,[SP+0]', 'RSBS~R0,R5,0', 'BLS~'] 56 | 57 | arm2codes = ['ADD~R6,R6,R10', 'MOV~R10,0', 'CMP~R4,0', 'ORR~R2,R2,0', 'AND~R1,R1,0', 58 | 'ADD~R2,R1,R8,LSL0', 'MOV~R0,R9', 'LDR~R0,[SP+0]', 'ORRS~R0,R0,R1', 'ADDS~R4,R1,0', 59 | 'SBCS~R0,R4,R1', 'SUBS~R2,R0,R4', 'MOVEQ~R0,0', 'LDRB~R5,[R4+R7+LSR0]', 'MVN~R2,0', 60 | 'MOVNE~R7,0', 'STM~R10,{R0,R1}', 'ADC~R3,R10,0', 'LSL~R2,R7,0', 'EOR~R0,R5,0', 61 | 'STMIB~R0,{R8,R9}', 'LDRLO~R1,[R12]', 'LDM~R1,{R0,R1}', 'RSBS~R4,R4,0', 'ORREQ~R9,R9,R5,LSL0', 62 | 'ANDNE~R7,R4,R6,LSR0', 'MOVGE~R1,0', 'RSCS~R1,R2,0', 'ADDEQ~R2,R0,0', 'TST~R3,0', 63 | 'LDRBNE~R3,[R0]', 'STR~R4,[SP+0]', 'STRB~R7,[R0],0', 'UMULL~R8,R0,R2,R6', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,LR}', 64 | 'BIC~R0,R0,0', 'MLA~R2,R3,R8,R1', 'SBC~R3,R5,R3', 'MOV~PC,R2', 'MUL~R3,R0,R2', 65 | 'LDMIB~R7,{R0,R1}', 'MOVLE~R1,R0', 'ASR~R3,R2,0', 'LSLGE~R4,R0,R3', 'POP~{R4,LR}', 66 | 'MVNNE~R0,0', 'ASRNE~R0,R5,0', 'STMNE~R1,{R3,R12}', 'SUB~SP,R11,0', 'LDREQ~R0,[R10+0]', 67 | 'SUBS~R2,R0,R4', 'SUB~R0,SP,R0', 'SUBS~R4,R2,R0', 'SUBS~R0,R0,0', 'SUBS~R7,R10,R7', 68 | 'BNE~', 'LSR~R0,R0,0', 'MOV~R8,R0', 'BLT~', 'B~', 69 | 'ADC~R4,R0,0', 'STM~SP,{R0,R4}', 'BL~FOO', 'LDRB~R0,[R5+0]!', 'ORRS~R0,R0,R1', 70 | 'TST~R11,0', 'BHI~', 'ADD~SP,SP,0', 'LDRB~R6,[R4]', 'LSL~R2,R1,0', 71 | 'POP~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 'CMP~R0,0', 'ADD~R1,R0,0', 'ANDS~R5,R8,0', 'AND~R6,R2,0', 72 | 'AND~R2,R0,0', 'AND~R0,R0,0', 'ASR~R1,R0,0', 'SUB~R0,R6,0', 'BIC~R1,R7,0', 73 | 'BGE~', 'LDMIB~R6,{R2,R3}', 'SUBS~R7,R2,R0', 'AND~R0,R0,0', 'AND~R4,R1,0', 74 | 'BL~FOO', 'STR~R6,[SP+0]', 'ADDS~R10,R10,0', 'AND~R2,R2,R1', 'CMN~R9,-0', 75 | 'PUSH~{R11,LR}', 'LDM~R2,{R3,R6}', 'LDRB~R6,[R1+0]', 'ADC~R10,R10,0', 'BNE~', 76 | 'SUB~SP,SP,0', 'TST~R0,0', 'BNE~', 'EOR~R0,R5,0', 'MLA~R3,R10,R6,R2'] 77 | 78 | x861codes = ['ADDQ~RSP,0', 'MOVQ~RDI,RBX', 'CMPQ~RDX,0', 'XORL~EAX,EAX', 'ANDQ~R15,-0', 79 | 'ADDQ~R14,RAX', 'MOVQ~RDX,RCX', 'LEAQ~RAX,[RBX+0]', 'XORL~EDI,EDI', 'ADDL~EAX,-0', 80 | 'SUBQ~RSP,0', 'SUBL~EAX,R15D', 'MOVL~EAX,EBX', 'CMPL~ESI,0', 'SHLL~EBX,0', 81 | 'MOVUPS~[RAX+0],XMM0', 'TESTQ~RCX,RCX', 'ADCQ~RBX,0', 'ANDB~AL,0', 'CMPB~AL,0', 82 | 'ORL~ECX,EAX', 'TESTB~DL,0', 'MOVZBL~EDX,[RAX+0]', 'SHRL~ESI,0', 'ORL~ESI,0', 83 | 'IDIVL~EDI', 'CMOVNEQ~R13,R15', 'NEGQ~R10', 'SETE~DL', 'ORQ~RDX,RDI', 84 | 'NOTL~ECX', 'MOVSLQ~R8,[R13+0]', 'CMOVBL~EAX,ESI', 'NOTQ~RDX', 'CMOVAL~R11D,EDI', 85 | 'PSHUFD~XMM0,XMM1,0', 'PUNPCKLDQ~XMM0,[RIP+]', 'SUBPD~XMM0,[RIP+]', 'MOVAPD~[RSP],XMM1', 'XORPS~XMM0,XMM0', 86 | 'MOVAPS~[RSP+0],XMM0', 'NOTB~BL', 'CALLQ~*RAX', 'MOVABSQ~RCX,0', 'POPQ~RBX', 87 | 'MOVDQU~[RDI,RDX,0-0],XMM2', 'PSHUFLW~XMM3,XMM3,0', 'PSRAD~XMM2,0', 'PUNPCKLBW~XMM2,XMM2', 'PADDQ~XMM3,XMM1', 88 | 'ADDQ~RSP,0', 'ADDQ~R12,0', 'ADDQ~RAX,-0', 'PADDQ~XMM0,XMM2', 'ADDQ~R13,0', 89 | 'NOTQ~RCX', 'CLTQ', 'NEGL~R9D', 'SETE~R15B', 'SHRB~BL,0', 90 | 'JMP~', 'CALLQ~FOO', 'RETQ', 'JAE~', 'JNE~', 91 | 'MOVB~AL,0', 'MOVL~EAX,0', 'MOVL~ESI,0', 'MOVZBL~EAX,BL', 'MOVL~EDI,', 92 | 'XORL~ECX,ECX', 'XORL~EBX,EBX', 'XORL~EAX,EAX', 'XORL~R8D,R8D', 'XORL~EDI,EDI', 93 | 'CMPL~R14D,0', 'CMPQ~RDX,RBX', 'CMPL~EAX,-0', 'CMPQ~R14,-0', 'CMPQ~[R12+0],0', 94 | 'MOVW~+[RIP+0],0', 'MOVQ~RCX,RBX', 'MOVZBL~EAX,[RIP+]', 'MOVQ~RDI,R12', 'MOVQ~RBX,RDI', 95 | 'LEAQ~RSI,[RBX+0]', 'LEAQ~RSI,[RCX+0]', 'LEAQ~RDI,[RSP+0]', 'LEAL~EDI,[RDX,RSI]', 'LEAL~EBP,[RSI-0]', 96 | 'PUSHQ~RBP', 'PUSHQ~R15', 'PUSHQ~R12', 'PUSHQ~RBX', 'PUSHQ~R14', 97 | 'JMPQ~*[RAX*0+]', 'CALLQ~*[RIP+]', 'CMOVGL~R8D,EDX', 'IMULL~EDI,EDX,0', 'SBBL~EBP,EBP'] 98 | 99 | x862codes = ['ADDQ~R14,R12', 'MOVQ~R14,RSP', 'CMPQ~[RIP+],RAX', 'XORL~R14D,R14D', 'ANDQ~RAX,R15', 100 | 'ADDQ~RDX,-0', 'MOVQ~RAX,[R8+0]', 'LEAQ~R14,[R8+0]', 'XORL~EBX,EBX', 'ANDQ~RBP,-0', 101 | 'SUBQ~RBX,RAX', 'SUBL~ESI,EBP', 'MOVL~EDX,[RBX+0]', 'CMPL~R15D,0', 'SHLL~EDX,CL', 102 | 'MOVUPS~XMM0,[RIP+]', 'TESTQ~R8,R8', 'ADCQ~R8,0', 'ANDB~[RBX+0],-0', 'CMPB~[RSP+0],0', 103 | 'ORL~EAX,R8D', 'TESTB~[RSI+0],0', 'MOVZBL~EDX,R13B', 'SHRL~ECX,0', 'ORL~EAX,0', 104 | 'IDIVL~R15D', 'CMOVNEQ~RAX,R10', 'NEGQ~RDX', 'SETE~CL', 'ORQ~RAX,R15', 105 | 'NOTL~EDX', 'MOVSLQ~RSI,EBP', 'CMOVBL~EAX,ECX', 'NOTQ~RAX', 'CMOVAL~EAX,ECX', 106 | 'PSHUFD~XMM2,XMM1,0', 'PUNPCKLDQ~XMM2,[RIP+]', 'SUBPD~XMM2,[RIP+]', 'MOVAPD~XMM5,XMM1', 'XORPS~XMM1,XMM1', 107 | 'MOVAPS~XMM0,[RIP+]', 'NOTB~[R15]', 'CALLQ~FOO', 'MOVABSQ~RAX,0', 'POPQ~R13', 108 | 'MOVDQU~XMM0,[R12]', 'PSHUFLW~XMM0,XMM0,0', 'PSRAD~XMM0,0', 'PUNPCKLBW~XMM5,', 'PADDQ~XMM2,XMM10', 109 | 'SUBL~ESI,EBP', 'SUBL~EAX,R15D', 'SUBL~ESI,[RSP+0]', 'SUBQ~RSP,0', 'SUBL~EAX,R12D', 110 | 'ADDL~R12D,-0', 'ORL~EBP,R12D', 'TESTQ~RAX,RAX', 'CMPB~[R15,RBX],0', 'ORB~DL,R8B', 111 | 'ANDL~EAX,0', 'XORL~ESI,ESI', 'POPQ~R15', 'MOVB~R15B,0', 'MOVZBL~ECX,[R14+0]', 112 | 'SHLQ~R8,0', 'POPQ~RBX', 'POPQ~R14', 'JE~', 'JA~', 113 | 'TESTB~[RBX+0],0', 'TESTL~EAX,EAX', 'TESTQ~RDI,RDI', 'ANDL~R11D,0', 'ANDL~R9D,0', 114 | 'ADDQ~R14,RBX', 'XORL~ESI,ESI', 'JAE~', 'JMP~', 'TESTQ~RAX,RAX', 115 | 'ANDL~ECX,-0', 'RETQ', 'CMPB~[R12],0', 'POPQ~RBX', 'TESTL~EAX,EAX', 116 | 'CMPQ~RBX,0', 'TESTB~BPL,0', 'SETNE~AL', 'JMP~', 'CMPQ~RBP,0', 117 | 'POPQ~RBX', 'POPQ~RBP', 'POPQ~R15', 'POPQ~R12', 'XORL~EAX,EAX', 118 | 'SHRQ~R8,0', 'MOVUPS~[RCX],XMM0', 'SUBQ~RBP,RDI', 'JB~', 'CALLQ~FOO'] 119 | true_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 120 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 121 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 122 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 123 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 124 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] 125 | num = 100 126 | 127 | # cross-architecture instructions 128 | # 25 similar, 25 dissimilar 129 | arm_cross = ['ADD~R0,R5,0', 'ADD~R6,R0,0', 'ADC~R7,R1,0', 'ADD~SP,SP,0', 'ADDS~R0,R2,R0', 130 | 'ORR~R0,R0,0', 'ORR~R9,R9,0', 'AND~R0,R9,0', 'AND~R2,R2,0', 'ORRS~R3,R1,R2', 131 | 'SUBS~R0,R6,0', 'SUB~R3,R3,R8,LSL0', 'SUBS~R0,R7,R10', 'SUB~SP,SP,0', 'SUBS~R2,R10,0', 132 | 'CMP~R0,0', 'MOV~R0,R5', 'MOV~R6,0', 'LDR~R0,[R8]', 'LDR~R0,[R10+0]', 133 | 'BL~FOO', 'BNE~', 'EOR~R1,R4,R9,ASR0', 'B~', 'STR~R8,[R4]', 134 | 135 | 'ADDNE~R0,R12,0', 'ADC~R6,R6,0', 'ADC~R1,R1,0', 'ADDS~R9,R9,0', 'ADDS~R0,R0,R7', 136 | 'ORR~R1,R1,R0,LSR0', 'ORRS~R0,R6,R7', 'AND~R0,R4,0', 'AND~R0,R3,R2', 'ORRS~R0,R0,R11', 137 | 'SUB~R11,R0,R3', 'SUBS~R2,R0,0', 'SUBS~R6,R0,R2', 'LDR~R2,[SP+0]', 'LDR~R0,[R6]', 138 | 'POP~{R4,LR}', 'PUSH~{R4,R5,R6,R7,R8,R9,R10,R11,LR}', 'MOV~R9,R2', 'MOVEQ~R2,R1', 'CMP~R5,0', 139 | 'TST~R0,0', 'BL~FOO', 'BEQ~', 'SBC~R7,R7,0', 'SMULL~R6,R5,R7,LR'] 140 | 141 | x86_cross = ['ADDQ~RAX,0', 'ADDQ~RBX,RAX', 'ADDL~ECX,-0', 'ADDQ~RSP,0', 'ADDL~EBP,0', 142 | 'XORL~EBP,EBP', 'ORQ~RCX,RDX', 'ANDL~EDX,0', 'TESTQ~RCX,RCX', 'ORQ~RSI,RBX', 143 | 'SUBQ~RAX,[RSP+0]', 'SUBQ~R12,RAX', 'SUBL~ESI,EBX', 'SUBQ~RSP,0', 'SUBL~ESI,R12D', 144 | 'CMPQ~[R14],0', 'MOVQ~RBX,RAX', 'MOVQ~RDI,RAX', 'LEAQ~RBX,[R14,RBP]', 'LEAQ~RSI,[RSP+0]', 145 | 'CALLQ~FOO', 'JNE~', 'XORL~EDX,EDX', 'JMP~', 'MOVQ~RAX,[RBX]', 146 | 147 | 'CMOVLL~ECX,EAX', 'PUSHQ~RBX', 'PSUBQ~XMM0,XMM1', 'XORL~EAX,EAX', 'SUBQ~RSP,0', 148 | 'POPQ~RBP', 'MOVQ~RDI,R13', 'TESTB~AL,AL', 'CMOVNEQ~RBX,RCX', 'IMULQ~RBX,R8', 149 | 'CMPQ~[RSP+0],0', 'CMPL~EAX,0', 'MOVZWL~EAX,[R13+0]', 'PUSHQ~R15', 'JS~', 150 | 'BTQ~R14,RAX', 'JA~', 'DIVB~[RSP+0]', 'JNE~', 'MOVSLQ~R13,EBP', 151 | 'SUBL~EAX,[RIP+]', 'LEAL~EAX,[R14,RBP]', 'RETQ', 'MOVQ~RDI,RAX', 'ADDQ~RBX,RBP'] 152 | 153 | cross_labels = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 154 | 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 155 | -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1] 156 | 157 | armdict = {} 158 | x86dict = {} 159 | 160 | 161 | # Method to read from embedding files 162 | # Stored as [instruction]:embedding pairs 163 | def read_file(fname, dictionary, dim): 164 | with open(fname) as f: 165 | for line in f: 166 | items = line.split(" ") 167 | 168 | # remove header row 169 | if len(items) == 2: 170 | continue 171 | 172 | if "" in items: 173 | continue 174 | 175 | if "" in items: 176 | continue 177 | 178 | if "\n" in items: 179 | endline = int(dim)+1 180 | del items[endline] 181 | 182 | instr = items[0] 183 | emb = numpy.array([float(feat) for feat in items[1:]]) 184 | 185 | # turn corrupted values to 0 186 | if numpy.any(numpy.isnan(emb)): 187 | print("Corrupted!: %s" % instr) 188 | emb = numpy.nan_to_num(emb) 189 | 190 | dictionary[instr] = emb 191 | 192 | 193 | ## ## 194 | # Construct dictionaries # 195 | ## ## 196 | print("Constructing dictionaries from files...") 197 | read_file(arm_emb, armdict, dim) 198 | read_file(x86_emb, x86dict, dim) 199 | 200 | 201 | ## ## 202 | # Instr Cosine Similarity # 203 | ## ## 204 | print("Calculating cosine similarities...") 205 | 206 | def cos_sim(out_list, dictry, codes1, codes2): 207 | for x, y in zip(codes1, codes2): 208 | num1 = dictry[x] 209 | num2 = dictry[y] 210 | 211 | emb1 = num1.reshape(1, -1) 212 | emb2 = num2.reshape(1, -1) 213 | 214 | sim = pairwise.cosine_similarity(emb1, emb2) # returns array 215 | 216 | print("%s %s %f" % (x, y, sim)) 217 | out_list.append(sim[0,0]) 218 | 219 | arm_cos = [] 220 | cos_sim(arm_cos, armdict, arm1codes, arm2codes) 221 | 222 | x86_cos = [] 223 | cos_sim(x86_cos, x86dict, x861codes, x862codes) 224 | 225 | cross_cos = [] 226 | for x,y in zip(arm_cross, x86_cross): 227 | num1 = armdict[x] 228 | num2 = x86dict[y] 229 | 230 | emb1 = num1.reshape(1,-1) 231 | emb2 = num2.reshape(1,-1) 232 | sim = pairwise.cosine_similarity(emb1, emb2) # returns array 233 | 234 | print("%s %s %f" % (x, y, sim)) 235 | cross_cos.append(sim[0,0]) 236 | 237 | # ROC, AUC 238 | fpra, tpra, thresholdsa = metrics.roc_curve(true_labels, arm_cos, pos_label=1) 239 | fprx, tprx, thresholdsx = metrics.roc_curve(true_labels, x86_cos, pos_label=1) 240 | fprc, tprc, thresholdsc = metrics.roc_curve(cross_labels, cross_cos, pos_label=1) 241 | 242 | auca = metrics.auc(fpra, tpra) 243 | aucx = metrics.auc(fprx, tprx) 244 | aucc = metrics.auc(fprc, tprc) 245 | 246 | plt.title("Mono-Architecture Instruction-Level Similarity") 247 | plt.plot(fpra, tpra, label="ARM Instructions, AUC=%f" % auca) 248 | plt.plot(fprx, tprx, label="X86 Instructions, AUC=%f" % aucx) 249 | 250 | plt.legend(loc = 'lower right') 251 | plt.plot([0,1], [0,1], 'r--') 252 | plt.xlim([0,1]) 253 | plt.ylim([0,1]) 254 | plt.ylabel("True Positive Rate") 255 | plt.xlabel("False Positive Rate") 256 | plt.show() 257 | 258 | plt.title("Cross-Architecture Instruction-Level Similarity") 259 | plt.plot(fprc, tprc, label="ARM-X86 Instructions, AUC=%f" % aucc) 260 | 261 | plt.legend(loc = 'lower right') 262 | plt.plot([0,1], [0,1], 'r--') 263 | plt.xlim([0,1]) 264 | plt.ylim([0,1]) 265 | plt.ylabel("True Positive Rate") 266 | plt.xlabel("False Positive Rate") 267 | plt.show() 268 | -------------------------------------------------------------------------------- /output/out.arm.We.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.arm.We.zip -------------------------------------------------------------------------------- /output/out.arm.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.arm.zip -------------------------------------------------------------------------------- /output/out.x86.We.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.x86.We.zip -------------------------------------------------------------------------------- /output/out.x86.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlp-code-analysis/cross-arch-instr-model/008caecddadb76c495aabd0b00fc17ceb7c877f4/output/out.x86.zip -------------------------------------------------------------------------------- /query.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Query Program for Word Similarity. 5 | - Query the top-10 most similar instructions (according to 6 | cosine similarity) within and across architectures: 7 | ARM-ARM 8 | ARM-x86 9 | x86-x86 10 | x86-ARM 11 | - Return 5 most similar instructions and their sim scores. 12 | """ 13 | 14 | import math 15 | import numpy 16 | from scipy import spatial 17 | from sklearn import metrics 18 | from sklearn.metrics import pairwise 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | ### CHANGE FILES HERE ### 23 | 24 | # Files containing embeddings 25 | arm_out_emb = "output/out.arm" 26 | x86_out_emb = "output/out.x86" 27 | 28 | # Dimension size 29 | dim = 200 30 | 31 | # Instructions to query 32 | arm_query = ['ADD~SP,SP,0', 'SUB~SP,SP,0', 33 | 'LDR~R0,[R5+0]', 'BL~FOO', 34 | 'LDRNE~R4,[SP+0]', 'ADD~R1,R0,R7', 35 | 'BLT~', 'BEQ~', 'MOV~R0,R5', 36 | 'MOV~R8,R2', 'ADD~R1,R0,R7', 'SUB~SP,SP,0', 37 | 'LDR~R0,[R5+0]', 'MVN~R0,0', 'CMP~R8,0'] 38 | x86_query = ['ADDQ~RSP,0', 'SUBQ~RSP,0', 39 | 'MOVQ~RDI,[R12+0]','CALLQ~FOO', 40 | 'CMOVNEQ~RCX,RAX', 'ADDQ~RSI,R12', 41 | 'JLE~', 'JE~', 'MOVL~EAX,R14D', 42 | 'MOVQ~R13,RDX', 'ADDQ~R13,RBX', 'SUBQ~RSP,0', 43 | 'MOVQ~RDI,[R12+0]', 'MOVL~EAX,-0', 'CMPL~R13D,0'] 44 | noinstr = 15 45 | 46 | 47 | ## ## 48 | # ARM/x86 Dictionaries # 49 | ## ## 50 | armdict = {} 51 | x86dict = {} 52 | armlist = [] 53 | x86list = [] 54 | 55 | 56 | # Method to read from embedding files 57 | # Stored as [instruction]:embedding pairs 58 | def read_file(fname, dictionary, dim, lst): 59 | with open(fname) as f: 60 | for line in f: 61 | items = line.split(" ") 62 | 63 | # remove header row 64 | if len(items) == 2: 65 | continue 66 | 67 | if "" in items: 68 | continue 69 | 70 | if "" in items: 71 | continue 72 | 73 | if "\n" in items: 74 | endline = int(dim)+1 75 | del items[endline] 76 | 77 | instr = items[0] 78 | emb = numpy.array([float(feat) for feat in items[1:]]) 79 | 80 | # turn corrupted values to 0 81 | if numpy.any(numpy.isnan(emb)): 82 | print("Corrupted!: %s" % instr) 83 | #emb = numpy.nan_to_num(emb) 84 | 85 | dictionary[instr] = emb 86 | lst.append(instr) 87 | 88 | 89 | ## ## 90 | # Construct dictionaries # 91 | ## ## 92 | print("Constructing dictionaries from files...") 93 | read_file(arm_out_emb, armdict, dim, armlist) 94 | read_file(x86_out_emb, x86dict, dim, x86list) 95 | 96 | 97 | ## ## 98 | # Calculate similarity scores # 99 | ## ## 100 | print("Calculating similarity scores...") 101 | 102 | arm = len(armlist) 103 | x86 = len(x86list) 104 | 105 | arm2arm = numpy.ndarray([noinstr, arm]) 106 | arm2x86 = numpy.ndarray([noinstr, x86]) 107 | x862x86 = numpy.ndarray([noinstr, x86]) 108 | x862arm = numpy.ndarray([noinstr, arm]) 109 | 110 | def cosinesim(instr1, instr2): 111 | emb1 = instr1.reshape(1,-1) 112 | emb2 = instr2.reshape(1,-1) 113 | sim = pairwise.cosine_similarity(emb1, emb2) 114 | return sim 115 | 116 | 117 | # Calc similarities for each query instruction 118 | row = 0 119 | for instr in arm_query: 120 | emb = armdict[instr] 121 | index = 0 122 | for instr2 in armdict.values(): 123 | arm2arm[row,index] = cosinesim(emb, instr2) 124 | index += 1 125 | 126 | index = 0 127 | for instr2 in x86dict.values(): 128 | arm2x86[row,index] = cosinesim(emb, instr2) 129 | index += 1 130 | 131 | row += 1 132 | 133 | row = 0 134 | for instr in x86_query: 135 | emb = x86dict[instr] 136 | index = 0 137 | for instr2 in x86dict.values(): 138 | x862x86[row,index] = cosinesim(emb, instr2) 139 | index += 1 140 | 141 | index = 0 142 | for instr2 in armdict.values(): 143 | x862arm[row,index] = cosinesim(emb, instr2) 144 | index += 1 145 | 146 | row += 1 147 | 148 | 149 | ## ## 150 | # Top 5 Similar Instructions # 151 | ## ## 152 | 153 | print("Returning top 5 similar instructions...") 154 | 155 | row = 0 156 | for instr in arm_query: 157 | armrow = arm2arm[row] 158 | x86row = arm2x86[row] 159 | 160 | print("\tARM-ARM\t%s" % instr) 161 | for x in range(6): 162 | cos = numpy.max(armrow) # top cos value 163 | index = numpy.argmax(armrow) # index of that value 164 | word = armlist[index] # find word in dictionary 165 | 166 | print("%s\t\t\t%f" % (word, cos)) 167 | armrow = numpy.delete(armrow, index) 168 | 169 | print("\n") 170 | 171 | print("\tARM-X86\t%s" % instr) 172 | for x in range(5): 173 | cos = numpy.max(x86row) 174 | index = numpy.argmax(x86row) 175 | word = x86list[index] 176 | 177 | print("%s\t\t\t%f" % (word, cos)) 178 | x86row = numpy.delete(x86row, index) 179 | 180 | print("\n") 181 | 182 | row += 1 183 | 184 | row = 0 185 | for instr in x86_query: 186 | armrow = x862arm[row] 187 | x86row = x862x86[row] 188 | 189 | print("\tX86-X86\t%s" % instr) 190 | for x in range(6): 191 | cos = numpy.max(x86row) 192 | index = numpy.argmax(x86row) 193 | word = x86list[index] 194 | 195 | print("%s\t\t\t%f" % (word, cos)) 196 | x86row = numpy.delete(x86row, index) 197 | 198 | print("\n") 199 | 200 | print("\tX86-ARM\t%s" % instr) 201 | for x in range(5): 202 | cos = numpy.max(armrow) 203 | index = numpy.argmax(armrow) 204 | word = armlist[index] 205 | 206 | print("%s\t\t\t%f" % (word, cos)) 207 | armrow = numpy.delete(armrow, index) 208 | 209 | print("\n") 210 | 211 | row += 1 212 | -------------------------------------------------------------------------------- /senvec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | """ 4 | Sentence vector program. 5 | Takes word embeddings and raw instruction file. 6 | Stores word embeddings in dictionary. 7 | Scans raw instruction file and queries each embedding. 8 | Embeddings are summed per basic block. 9 | Basic blocks are compared across architecture languages. 10 | """ 11 | 12 | import math 13 | import numpy 14 | from numpy import linalg 15 | from scipy import spatial 16 | from sklearn import metrics 17 | from sklearn.metrics import confusion_matrix 18 | import matplotlib.pyplot as plt 19 | 20 | 21 | ### CHANGE FILES HERE ### 22 | 23 | # files containing word + embedding 24 | param_file1a = "output/out.arm" 25 | param_file1x = "output/out.x86" 26 | 27 | # files containing test set of instructions 28 | test_arm = "data/test.arm" 29 | test_x86 = "data/test.x86" 30 | 31 | # ground truth labels for test set 32 | test_labels = "data/test_labels.txt" 33 | 34 | # dimension size 35 | dim = 200 36 | 37 | 38 | # test set raw-text basic blocks 39 | armtest = [] 40 | x86test = [] 41 | 42 | # dictionary for each trained lang/model 43 | armdict1 = {} 44 | x86dict1 = {} 45 | 46 | 47 | """ 48 | Store the raw instructions from test files. 49 | Each line is a basic block. 50 | """ 51 | with open(test_arm) as f: 52 | for line in f: 53 | armtest.append(line.split()) 54 | 55 | with open(test_x86) as f: 56 | for line in f: 57 | x86test.append(line.split()) 58 | 59 | 60 | """ 61 | Each model will have its words + embeddings 62 | stored into a dictionary. 63 | """ 64 | 65 | def read_file(fname, dictionary, dim): 66 | with open(fname) as f: 67 | for line in f: 68 | 69 | # separate instructions and features 70 | items = line.split(" ") 71 | 72 | # remove header row 73 | if len(items) == 2: 74 | continue 75 | 76 | if "" in items: 77 | continue 78 | 79 | if "" in items: 80 | continue 81 | 82 | if "\n" in items: 83 | endline = int(dim)+1 84 | del items[endline] 85 | 86 | instr = items[0] 87 | emb = numpy.array([float(feat) for feat in items[1:]]) 88 | 89 | # turn corrupted read-in values to 0 90 | if numpy.any(numpy.isnan(emb)): 91 | print("Corrupted!: %s" % instr) 92 | emb = numpy.nan_to_num(emb) 93 | 94 | dictionary[instr] = emb 95 | 96 | 97 | # Read in embeddings from each architecture file 98 | print("Reading in embedding files...") 99 | read_file(param_file1a, armdict1, dim) 100 | read_file(param_file1x, x86dict1, dim) 101 | 102 | 103 | # create sentence embedding for each bb 104 | # find an instruction, query the dict 105 | # sum instr embeddings to form a bb 106 | def calculateSentenceVectors(testset, dim, dictry): 107 | finalout = [] 108 | for bb in testset: 109 | count = 0 # number of embeddings found 110 | senvec = numpy.zeros([1, int(dim)]) 111 | 112 | for word in bb: 113 | if word in dictry: 114 | wordvec = dictry[word] 115 | senvec += wordvec 116 | count += 1 117 | # else: unknown instructions are skipped 118 | 119 | if count > 0: 120 | 121 | # if averaging embeddings: 122 | # senvec /= float(count) 123 | finalout.append(senvec) 124 | 125 | else: 126 | print("Empty BB: no instructions found in dictionary.") 127 | 128 | # cannot use all zeros to compute cos similarity 129 | if numpy.all(senvec == 0.0): 130 | senvec.fill(0.1) 131 | 132 | finalout.append(senvec) 133 | 134 | return finalout 135 | 136 | 137 | print("Calculating sentence vectors...") 138 | bbfinal_a1 = calculateSentenceVectors(armtest, dim, armdict1) 139 | bbfinal_x1 = calculateSentenceVectors(x86test, dim, x86dict1) 140 | 141 | 142 | ### ### 143 | # Cosine Similarity # 144 | ### ### 145 | 146 | """ 147 | - Calculate cosine similarity between ARM/X86 148 | basic blocks 149 | - Plot ROC based on true labels 150 | """ 151 | 152 | true_labels = [] 153 | 154 | # ground truth labels 155 | with open(test_labels) as f: 156 | for line in f: 157 | true_labels.append(line.split()) 158 | 159 | true_labels = [int(''.join((str(i) for i in a))) for a in true_labels] 160 | 161 | 162 | # method to calculate cos similarity / ROC curve 163 | def ROC(bbfinal_a, bbfinal_x): 164 | cos_sim = [] 165 | for bb1, bb2 in zip(bbfinal_a, bbfinal_x): 166 | sim = 1 - spatial.distance.cosine(numpy.asarray(bb1), numpy.asarray(bb2)) 167 | cos_sim.append(sim) 168 | 169 | return metrics.roc_curve(true_labels, cos_sim, pos_label=1) 170 | 171 | 172 | print("Calculating similarity scores, ROC...") 173 | fpr1, tpr1, thresholds1 = ROC(bbfinal_a1, bbfinal_x1) 174 | auc1 = metrics.auc(fpr1, tpr1) 175 | print("AUC avg cos: %f" % auc1) 176 | 177 | 178 | plt.title('ARM-X86 Basic Block Similarity') 179 | plt.plot(fpr1, tpr1, label="Cosine Similarity, AUC=%f" %auc1) 180 | 181 | plt.legend(loc = 'lower right') 182 | plt.plot([0,1], [0,1], 'r--') 183 | plt.xlim([0,1]) 184 | plt.ylim([0,1]) 185 | plt.ylabel('True Positive Rate') 186 | plt.xlabel('False Positive Rate') 187 | plt.show() 188 | -------------------------------------------------------------------------------- /tsne2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | t-SNE to create a visualization of x86, arm embeddings. 5 | Output: 2 t-SNE plots 6 | 1) unlabeled plot displaying all instructions 7 | 2) labeled plot displaying selected instructions 8 | """ 9 | 10 | from sklearn.manifold import TSNE 11 | import matplotlib 12 | import matplotlib.pyplot as plt 13 | from sklearn.decomposition import PCA 14 | 15 | import math 16 | import numpy 17 | from numpy import dot 18 | from numpy.linalg import norm 19 | from adjustText import adjust_text 20 | 21 | 22 | ## ## 23 | # data # 24 | ## ## 25 | 26 | 27 | ### CHANGE FILES HERE ### 28 | 29 | DATA_PATH_ARM = 'output/out.arm' 30 | DATA_PATH_X86 = 'output/out.x86' 31 | VECTOR_SIZE = 200 32 | 33 | 34 | # select instructions to plot only a few 35 | armcodes = ['ADD~R1,R0,R7', 'SUB~SP,SP,0', 36 | 'LDR~R0,[R5+0]', 37 | 'CMP~R8,0', 'POP~{R4,LR}', 38 | 'B~', 'BL~FOO'] 39 | x86codes = ['ADDQ~R13,RBX', 'SUBQ~RSP,0', 40 | 'MOVQ~RDI,[R12+0]', 41 | 'CMPL~R13D,0', 'POPQ~RBP', 42 | 'JMP~', 'CALLQ~FOO'] 43 | instr_ct = 7 44 | 45 | 46 | all_arm_codes = [] 47 | all_x86_codes = [] 48 | 49 | armdict = {} 50 | x86dict = {} 51 | 52 | arm_embedding_matrix = numpy.empty([instr_ct, VECTOR_SIZE]) 53 | x86_embedding_matrix = numpy.empty([instr_ct, VECTOR_SIZE]) 54 | 55 | 56 | ## ## 57 | # code # 58 | ## ## 59 | 60 | # construct ARM dictionary 61 | print("Constructing arm dictionary...") 62 | with open(DATA_PATH_ARM) as f: 63 | for line in f: 64 | items = line.split(" ") 65 | 66 | # remove header row 67 | if len(items) == 2: 68 | dim1 = items[1] 69 | continue 70 | 71 | if "" in items: 72 | continue 73 | 74 | if "\n" in items: 75 | endline = int(dim1)+1 76 | del items[endline] 77 | 78 | instr = items[0] 79 | emb = numpy.array([float(feat) for feat in items[1:]]) 80 | 81 | armdict[instr] = emb 82 | 83 | # construct X86 dictionary 84 | print("Constructing x86 dictionary...") 85 | with open(DATA_PATH_X86) as f: 86 | for line in f: 87 | items = line.split(" ") 88 | 89 | # remove header row 90 | if len(items) == 2: 91 | dim1 = items[1] 92 | continue 93 | 94 | if "" in items: 95 | continue 96 | 97 | if "\n" in items: 98 | endline = int(dim1)+1 99 | del items[endline] 100 | 101 | instr = items[0] 102 | emb = numpy.array([float(feat) for feat in items[1:]]) 103 | 104 | x86dict[instr] = emb 105 | 106 | 107 | ## ## 108 | # TSNE and PLOT # 109 | ## ## 110 | 111 | print("Compiling all embeddings together...") 112 | 113 | arm_matrix = numpy.empty([len(armdict), VECTOR_SIZE]) 114 | x86_matrix = numpy.empty([len(x86dict), VECTOR_SIZE]) 115 | arm_index = 0 116 | x86_index = 0 117 | 118 | for embedding in armdict.values(): 119 | arm_matrix[arm_index] = embedding 120 | arm_index += 1 121 | 122 | for embedding in x86dict.values(): 123 | x86_matrix[x86_index] = embedding 124 | x86_index += 1 125 | 126 | # compile all embeddings together 127 | final_len = len(armdict) + len(x86dict) 128 | final_matrix = numpy.empty([final_len, VECTOR_SIZE]) 129 | index = 0 130 | 131 | for elem in arm_matrix: 132 | final_matrix[index] = elem 133 | index += 1 134 | 135 | for elem in x86_matrix: 136 | final_matrix[index] = elem 137 | index += 1 138 | 139 | ### final_matrix contains ALL embeddings 140 | 141 | # check length vs matrix size 142 | print("Final len: %d Matrix size: %d" % (final_len, index)) 143 | 144 | # dimension reduction 145 | print("Running PCA...") 146 | pca = PCA(n_components = 50) 147 | new_final_matrix = pca.fit_transform(final_matrix) 148 | 149 | print("Running TSNE...") 150 | tsne_matrix = TSNE(n_components=2).fit_transform(new_final_matrix) 151 | 152 | 153 | # Plot 154 | matplotlib.rcParams.update({'font.size': 7}) 155 | 156 | # We do not want to print out all possible instructions 157 | # when plotting the queried instructions. 158 | # This creates a new matrix for our small sample. 159 | plot_arm_matrix = numpy.zeros([instr_ct,2]) 160 | plot_x86_matrix = numpy.zeros([instr_ct,2]) 161 | plot_all_matrix = numpy.zeros([(instr_ct*2),2]) 162 | plot_arm_index = 0 163 | plot_x86_index = 0 164 | plot_all_index = 0 165 | 166 | armlabels = [] 167 | x86labels = [] 168 | alllabels = [] 169 | 170 | # compile all instruction names 171 | instr_list = [] 172 | 173 | for arminstr in armdict.keys(): 174 | instr_list.append(arminstr) 175 | 176 | for x86instr in x86dict.keys(): 177 | instr_list.append(x86instr) 178 | 179 | 180 | ### TSNE - All Instructions ### 181 | 182 | print("Plotting ALL instructions:") 183 | print("Setting scatter plot...") 184 | 185 | final_arm_matrix = numpy.zeros([len(armdict), 2]) 186 | final_x86_matrix = numpy.zeros([len(x86dict), 2]) 187 | armcount = 0 188 | x86count = 0 189 | 190 | for x, y in zip(tsne_matrix[:,0], tsne_matrix[:,1]): 191 | if armcount > (len(armdict) - 1): 192 | final_x86_matrix[x86count,] = x,y 193 | x86count += 1 194 | else: 195 | final_arm_matrix[armcount,] = x,y 196 | armcount += 1 197 | 198 | plt.scatter(final_arm_matrix[:,0], final_arm_matrix[:,1], s=50, marker="o", c='blue', alpha=0.1) 199 | plt.scatter(final_x86_matrix[:,0], final_x86_matrix[:,1], s=50, marker="^", c='red', alpha=0.1) 200 | 201 | plt.xticks([]) 202 | plt.yticks([]) 203 | plt.show() 204 | 205 | ### TSNE - Selected Instructions ### 206 | 207 | for key, x, y in zip(instr_list, tsne_matrix[:,0], tsne_matrix[:,1]): 208 | if key in armcodes: 209 | plot_arm_matrix[int(plot_arm_index),] = x,y 210 | armlabels.append(key) 211 | plot_arm_index += 1 212 | 213 | if key in x86codes: 214 | plot_x86_matrix[int(plot_x86_index),] = x,y 215 | x86labels.append(key) 216 | plot_x86_index += 1 217 | 218 | # Plot selected instructions only 219 | print("Plotting selected instructions only:") 220 | print("Setting scatter plot...") 221 | 222 | for label, x, y in zip(armlabels, plot_arm_matrix[:,0], plot_arm_matrix[:,1]): 223 | plt.annotate( 224 | label, 225 | size = 12, 226 | xy = (x, y), 227 | ha = 'left', 228 | va = 'top') 229 | 230 | for label, x, y in zip(x86labels, plot_x86_matrix[:,0], plot_x86_matrix[:,1]): 231 | plt.annotate( 232 | label, 233 | size = 12, 234 | xy = (x, y), 235 | ha = 'right', 236 | va = 'bottom') 237 | 238 | plt.scatter(plot_arm_matrix[:,0], plot_arm_matrix[:,1], s=50, marker="o", c='blue', alpha=0.8) 239 | plt.scatter(plot_x86_matrix[:,0], plot_x86_matrix[:,1], s=50, marker="^", c='red', alpha=0.8) 240 | 241 | 242 | plt.xticks([]) 243 | plt.yticks([]) 244 | plt.show() 245 | 246 | --------------------------------------------------------------------------------