├── risc-v
    ├── .gitignore
    ├── Makefile
    ├── src
    │   └── hello.s
    └── README.md
├── gas
    ├── .gitignore
    ├── linux
    │   ├── something.h
    │   ├── instructions.s
    │   └── hello.S
    ├── func.cc
    ├── func.s
    ├── jump.s
    ├── cli.s
    ├── i386
    │   ├── Makefile
    │   └── learning_i386.s
    ├── avx_mul.c
    ├── avx.c
    ├── loop.s
    ├── jz.s
    ├── jne.s
    ├── cli_lea.s
    ├── Makefile
    ├── 64bit.s
    └── README.md
├── inline-assembly
    ├── .gitignore
    └── basic.c
├── c
    ├── inline.h
    ├── empty.c
    ├── rel.c
    ├── sleep.c
    ├── pic.c
    ├── stack.c
    ├── args.c
    ├── sp.c
    ├── ptr.c
    ├── simple.c
    ├── func.c
    ├── null.cc
    ├── inline.c
    ├── overflow.c
    ├── apple.c
    ├── loop.c
    ├── while.cc
    └── vmmap-example.c
├── .gitignore
├── nasm
    ├── linux
    │   ├── .gitignore
    │   ├── libsample.c
    │   ├── align.asm
    │   ├── jnz.asm
    │   ├── printf.asm
    │   ├── hello.asm
    │   ├── func-args.asm
    │   ├── function.asm
    │   └── Makefile
    └── macos
    │   ├── Makefile
    │   └── hello.asm
├── arm
    ├── Dockerfile
    ├── src
    │   ├── bic.s
    │   ├── uxtb.s
    │   ├── xzr.s
    │   ├── ldr.s
    │   ├── func.s
    │   ├── space.s
    │   ├── branch.s
    │   ├── rename.s
    │   ├── psr.s
    │   ├── add.s
    │   ├── ldmia.s
    │   └── first.s
    ├── .gitignore
    ├── fedora_aarch64.repo
    ├── Makefile
    └── README.md
├── cdecl.c
├── stdcall.c
├── fastcall.c
├── linux
    ├── zero-flag.s
    ├── cmov.s
    ├── first.s
    ├── direction-flag.s
    ├── arr.s
    ├── .gitignore
    ├── check_zero.s
    ├── bcd.s
    ├── div.s
    ├── setne.s
    ├── mul.s
    ├── mmx.s
    ├── function.s
    ├── exec-stack.s
    ├── test.s
    ├── carry.s
    ├── float.s
    ├── write.s
    ├── jle.s
    ├── cfi.s
    ├── Makefile
    ├── sse.s
    ├── multi.s
    ├── notes
    │   ├── mmx.md
    │   ├── bcd.md
    │   ├── float.md
    │   └── sse.md
    └── README.md
├── Makefile
├── cache.c
└── aix
    └── README.md


/risc-v/.gitignore:
--------------------------------------------------------------------------------
1 | out
2 | 


--------------------------------------------------------------------------------
/gas/.gitignore:
--------------------------------------------------------------------------------
1 | out
2 | *.dSYM
3 | 


--------------------------------------------------------------------------------
/inline-assembly/.gitignore:
--------------------------------------------------------------------------------
1 | basic
2 | 


--------------------------------------------------------------------------------
/c/inline.h:
--------------------------------------------------------------------------------
1 | static inline void doit(int b);
2 | 


--------------------------------------------------------------------------------
/c/empty.c:
--------------------------------------------------------------------------------
1 | int main(void) {
2 |     return 0;
3 | }
4 | 


--------------------------------------------------------------------------------
/gas/linux/something.h:
--------------------------------------------------------------------------------
1 | #define SOMETHING "Hellr, world@\n"
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.swp
3 | cdecl
4 | stdcall
5 | fastcall
6 | cache
7 | 


--------------------------------------------------------------------------------
/c/rel.c:
--------------------------------------------------------------------------------
1 | extern int something;
2 | 
3 | int function(void) {
4 |   return something;
5 | }
6 | 


--------------------------------------------------------------------------------
/c/sleep.c:
--------------------------------------------------------------------------------
1 | #include <unistd.h>
2 | int main () {
3 |     usleep(1000); // will sleep for 1 ms
4 | }
5 | 


--------------------------------------------------------------------------------
/nasm/linux/.gitignore:
--------------------------------------------------------------------------------
1 | hello
2 | printf
3 | function
4 | align
5 | func-args
6 | libsample
7 | jnz
8 | 


--------------------------------------------------------------------------------
/c/pic.c:
--------------------------------------------------------------------------------
1 | int myglob = 42;
2 | 
3 | /* Sum of both arguments and the global myglob. */
4 | int ml_func(int a, int b)
5 | {
6 |     int total = myglob + a;
7 |     return total + b;
8 | }
9 | 


--------------------------------------------------------------------------------
/c/stack.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int main (int argc, char **argv)
4 | {
5 |     return 0;
6 | }
7 | 


--------------------------------------------------------------------------------
/arm/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM arm64v8/alpine
2 | 
3 | # binutils and gdb plus common command-line utilities; --no-cache keeps the package index out of the image layer
4 | RUN apk add --no-cache util-linux pciutils usbutils coreutils binutils findutils grep gdb
5 | 


--------------------------------------------------------------------------------
/c/args.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int main (int argc, char **argv)
4 | {
5 |     printf ("%d\n", argc);
6 |     return 0;
7 | }
8 | 


--------------------------------------------------------------------------------
/c/sp.c:
--------------------------------------------------------------------------------
1 | // gcc -fomit-frame-pointer -S sp.c
2 | // Three locals and nothing else, to inspect how they are placed on the stack.
3 | void function(void)
4 | {
5 |     int i = 100;
6 |     int j = 200;
7 |     int k = 300;
8 | }
9 | 


--------------------------------------------------------------------------------
/arm/src/bic.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | 
 3 | .global _start
 4 | 
 5 | _start:
 6 |   mov r0, #15
 7 |   bic r1, r0, #4
 8 |   bic r1, r0, #12
 9 |   b .
10 | 


--------------------------------------------------------------------------------
/c/ptr.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int main (int argc, char **argv) {
4 |   int* p = new int(10);
5 |   printf ("%p\n", p);
6 |   return 0;
7 | }
8 | 


--------------------------------------------------------------------------------
/c/simple.c:
--------------------------------------------------------------------------------
1 | // CFLAGS="-g -O0"
2 | // gcc -g -O0 simple.c -o simple
3 | int main() {
4 |     int a = 5;
5 |     int b = a + 6;  // b is never read afterwards
6 |     return 0;
7 | }
8 | 


--------------------------------------------------------------------------------
/arm/src/uxtb.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | 
 3 | .global _start
 4 | 
 5 | _start:
 6 |   ldr r0, =#0xFFFFFF65
 7 |   uxtb r1, r0
 8 |   uxth r1, r0
 9 |   b .
10 | 
11 | 


--------------------------------------------------------------------------------
/c/func.c:
--------------------------------------------------------------------------------
1 | // clang -g -o func func.c
2 | int doit(int i) {
3 |   return i;
4 | }
5 | 
6 | int main(int argc, char** argv) {
7 |   int i = doit(6);
8 | }
9 | 


--------------------------------------------------------------------------------
/arm/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | src/first
 3 | first
 4 | add
 5 | xzr
 6 | branch
 7 | ldr
 8 | func
 9 | space
10 | rename
11 | ldmia
12 | bic
13 | uxtb
14 | psr
15 | 


--------------------------------------------------------------------------------
/nasm/linux/libsample.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | 
4 | int main(int argc, char** argv) {
5 |     puts("hello\n");
6 |     exit(0);
7 | }
8 | 


--------------------------------------------------------------------------------
/arm/src/xzr.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | 
 3 | .global _start
 4 | 
 5 | _start: 
 6 |   mov x0, #0
 7 |   mov x0, xzr
 8 | 
 9 |   mov x0, xzr
10 |   mov x8, #93
11 |   svc #0
12 | 


--------------------------------------------------------------------------------
/gas/linux/instructions.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | .data
 3 | 
 4 | .text
 5 | _start:
 6 |   movl  $1, %eax
 7 |   REX.W movl  $1, %eax
 8 |   movl  $0, %ebx
 9 |   int   $0x80
10 | 


--------------------------------------------------------------------------------
/arm/src/ldr.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | 
 3 | .global _start
 4 | 
 5 | _start:
 6 |   mov x0, #3
 7 |   str x0, [SP, #-16]!   /* pre-index: SP = SP - 16, then store x0 at [SP] */
 8 |   ldr x0, [SP], #16     /* post-index: load x0 from [SP], then SP = SP + 16 */
 9 | 
10 |   mov x8, #93           /* exit syscall number; x0 (3) is the status */
11 |   svc #0
12 | 


--------------------------------------------------------------------------------
/cdecl.c:
--------------------------------------------------------------------------------
1 | 
2 | void __attribute__ ((__cdecl__)) something(int a, int b, int c) {
3 | }
4 | 
5 | int main(int argc, char** argv) {
6 |   something(1, 2, 3);
7 |   return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/stdcall.c:
--------------------------------------------------------------------------------
1 | 
2 | void __attribute__ ((__stdcall__)) something(int a, int b, int c) {
3 | }
4 | 
5 | int main(int argc, char** argv) {
6 |   something(1, 2, 3);
7 |   return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/fastcall.c:
--------------------------------------------------------------------------------
1 | 
2 | void __attribute__ ((__fastcall__)) something(int a, int b, int c) {
3 | }
4 | 
5 | int main(int argc, char** argv) {
6 |   something(1, 2, 3);
7 |   return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/gas/func.cc:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | extern "C" int dot(int x);
3 | 
4 | int main(int argc, char** argv) {
5 |   int ret = dot(10);
6 |   std::cout << ret << '\n';
7 |   return 0;
8 | }
9 | 


--------------------------------------------------------------------------------
/linux/zero-flag.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .text
 4 | 
 5 | _start:
 6 |   mov $2, %rax
 7 |   mov $2, %rcx
 8 |   sub %rax, %rcx        # rcx = rcx - rax = 0, so the zero flag is set
 9 | 
10 |   mov $60, %rax         # exit syscall number
11 |   mov $0, %rdi          # exit status 0
12 |   syscall
13 | 


--------------------------------------------------------------------------------
/nasm/macos/Makefile:
--------------------------------------------------------------------------------
 1 | # Link the Mach-O object into the final executable.
 2 | hello: hello.o
 3 | 	ld $< -o $@
 4 | 
 5 | # $@ is already hello.o, so it is used as the output name unchanged.
 6 | hello.o: hello.asm
 7 | 	nasm -f macho64 -o $@ $<
 8 | 
 9 | .PHONY: clean
10 | clean:
11 | 	$(info "cleaning...")
12 | 	rm -rf hello hello.o
13 | 
14 | 


--------------------------------------------------------------------------------
/arm/src/func.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | 
 3 | .global _start
 4 | 
 5 | _start:
 6 |   mov x0, #1
 7 |   mov x1, #2
 8 |   bl func               /* branch with link: return address goes to x30 */
 9 | 
10 |   mov x8, #93           /* exit syscall number; x0 as left by func is the status */
11 |   svc #0                /* same immediate as the other aarch64 examples */
12 | 
13 | func:
14 |   mov x0, #3
15 |   ret
16 | 


--------------------------------------------------------------------------------
/c/null.cc:
--------------------------------------------------------------------------------
 1 | // $ clang++ -c null.cc -std=c++11
 2 | extern "C" {
 3 |   int* something = nullptr;
 4 | }
 5 | 
 6 | int main(int argc, char** argv) {
 7 |   int* one = nullptr;
 8 |   return 0;
 9 | }
10 | 


--------------------------------------------------------------------------------
/arm/src/space.s:
--------------------------------------------------------------------------------
 1 | 
 2 | .text
 3 | .global _start
 4 | 
 5 | _start:
 6 |   ldr r0, =A
 7 |   mov r1, #2
 8 |   str r1, [r0]
 9 |   //mov r0, #1
10 | 
11 | stop:
12 |   b   stop
13 | 
14 | .data
15 | A: .space 4
16 | 


--------------------------------------------------------------------------------
/linux/cmov.s:
--------------------------------------------------------------------------------
 1 | 
 2 | .global _start
 3 | 
 4 | .text
 5 | _start:
 6 |   mov $10, %rax
 7 |   mov $10, %rcx
 8 |   mov $3, %rsi
 9 |   cmp %rcx, %rax       # set the flags that cmovae is about to test
10 |   cmovae %rsi, %rdi    # rdi = rsi when rax >= rcx (unsigned), i.e. CF = 0
11 | 
12 |   mov $60, %rax        # exit syscall number
13 |   mov $1, %rdi         # exit status 1 (overwrites the value moved above)
14 |   syscall
15 | 


--------------------------------------------------------------------------------
/c/inline.c:
--------------------------------------------------------------------------------
 1 | // clang -g -o inline inline.c
 2 | #include "inline.h"
 3 | 
 4 | static inline void doit(int k) {
 5 |   int i = k;
 6 | }
 7 | 
 8 | int main(int argc, char** argv) {
 9 |   int i = 8;
10 |   doit(i);
11 | }
12 | 


--------------------------------------------------------------------------------
/arm/src/branch.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | .global _start
 3 | 
 4 | _start:
 5 |   mov x0, #10           /* loop limit */
 6 |   mov x1, #0            /* counter */
 7 | loop:
 8 |   add x1, x1, #1
 9 |   cmp x0, x1
10 |   b.NE loop             /* repeat until the counter equals the limit */
11 | 
12 |   mov x0, x1            /* exit status = final counter value (10) */
13 |   mov x8, #93           /* exit syscall number */
14 |   svc #0
15 | 


--------------------------------------------------------------------------------
/c/overflow.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | void return_input (void) {
 4 |    char array[30];
 5 |    gets (array);  /* NOTE(review): unbounded read into a 30-byte buffer; presumably the deliberate overflow this file demonstrates */
 6 |    printf("%s\n", array);
 7 | }
 8 | 
 9 | int main() {
10 |    return_input();
11 |    return 0;
12 | }
13 | 


--------------------------------------------------------------------------------
/linux/first.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | something:
 3 |   .byte 2
 4 | 
 5 | .text
 6 | .globl _start
 7 | _start:
 8 |   movl $something, %eax
 9 |   call _exit
10 | 
11 | _exit: 
12 |   mov $60, %rax
13 |   xor %rdi, %rdi
14 |   syscall
15 | 


--------------------------------------------------------------------------------
/arm/src/rename.s:
--------------------------------------------------------------------------------
 1 | ZEROR .req r0
 2 | /* .equ does not work for registers which is the reason for .req */
 3 | 
 4 | .global _start
 5 | 
 6 | .text
 7 | 
 8 | _start:
 9 |   mov ZEROR, #4
10 | 
11 | stop:
12 |   b stop
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/c/apple.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int main (int argc, char **argv, char **envp, char **apple)
 4 | {
 5 |     int i = 0;
 6 |     for (i=0; i < 4; i++)
 7 |         printf ("%s\n", apple[i]);
 8 | 
 9 |     return 0;
10 | }
11 | 


--------------------------------------------------------------------------------
/gas/func.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | .section __TEXT, __text
 3 | .globl _dot
 4 | 
 5 | _dot:
 6 |   pushq %rbp
 7 |   movq %rsp,%rbp
 8 | 
 9 |   addq $22, %rdi
10 |   movq %rdi, %rax # return code
11 |   popq %rbp
12 |   retq
13 | 


--------------------------------------------------------------------------------
/gas/jump.s:
--------------------------------------------------------------------------------
 1 | .section __TEXT, __text
 2 | .globl _main
 3 | 
 4 | _main:
 5 |   jmp overhere
 6 |   mov $0x2000001, %eax
 7 |   mov $0, %rdi
 8 |   syscall
 9 | overhere:
10 |   mov $0x2000001, %eax
11 |   mov $2, %rdi
12 |   syscall
13 | 


--------------------------------------------------------------------------------
/nasm/linux/align.asm:
--------------------------------------------------------------------------------
 1 | section .data
 2 |   dummy db 3
 3 |   nr dw 10
 4 | 
 5 | section .bss
 6 | section .text
 7 |   global main
 8 | 
 9 | main: 
10 |   push rbp
11 |   mov rbp, rsp
12 |   mov ax, [nr]
13 |   leave
14 |   ret
15 | 


--------------------------------------------------------------------------------
/linux/direction-flag.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .text
 4 | _start:
 5 |   // set the direction flag
 6 |   std
 7 |   // clear the direction flag
 8 |   cld
 9 | 
10 |   nop
11 | 
12 |   mov $60, %rax
13 |   mov $0, %rdi
14 |   syscall
15 | 


--------------------------------------------------------------------------------
/c/loop.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | // $ clang -O3 loop.c -S
 3 | int testarr [] = {1, 2, 3};
 4 | 
 5 | int main (int argc, char **argv)
 6 | {
 7 |     for (int i = 0; i <= 2; i++) {
 8 |         printf("i=%d\n", testarr[i]);
 9 |     }
10 |     return 0;
11 | }
12 | 


--------------------------------------------------------------------------------
/linux/arr.s:
--------------------------------------------------------------------------------
 1 | .text
 2 |   .global _start
 3 |   .type _start, @function
 4 | 
 5 | _start:
 6 |   nop
 7 |   sub  $1, %rsp
 8 |   movb $1, 0(%rsp)
 9 |   movb $2, -1(%rsp)
10 |   movb $3, -2(%rsp)
11 | 
12 |   mov $60, %rax
13 |   xor %rdi, %rdi
14 |   syscall
15 | 


--------------------------------------------------------------------------------
/nasm/linux/jnz.asm:
--------------------------------------------------------------------------------
 1 | section .data
 2 | section .bss
 3 | section .text
 4 |   global main
 5 | 
 6 | main:
 7 |   push rbp
 8 |   mov rbp, rsp
 9 |   mov eax, 4        ; loop counter
10 |   doit: 
11 |     dec eax         ; dec sets ZF when eax reaches 0
12 |     jnz doit        ; keep looping while eax != 0
13 | 
14 |   mov rax, 60       ; exit syscall number
15 |   mov rdi, 0        ; exit status 0
16 |   syscall
17 | 


--------------------------------------------------------------------------------
/arm/src/psr.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | .global _start
 3 | 
 4 | _start:
 5 |   mov r2, #4
 6 |   mov r3, #2
 7 |   subs r1, r3, r2       /* r1 = r3 - r2 = -2; the s suffix updates the flags */
 8 |   bmi negative          /* N flag set: the result is negative (bne would also fire for positive results) */
 9 | positive:
10 |   mov r0, #1
11 |   b end
12 | 
13 | negative:
14 |   mov r0, #-1
15 |   b end
16 | 
17 | end:
18 | 
19 |   b .
20 | 


--------------------------------------------------------------------------------
/linux/.gitignore:
--------------------------------------------------------------------------------
 1 | *.o
 2 | first
 3 | check_zero
 4 | write
 5 | jle
 6 | cfi
 7 | arr
 8 | multi
 9 | exec-stack
10 | carry
11 | direction-flag
12 | zero-flag
13 | div
14 | cmov
15 | function
16 | float
17 | sse
18 | test
19 | setne
20 | bcd
21 | mmx
22 | mul
23 | 


--------------------------------------------------------------------------------
/linux/check_zero.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | .globl _start
 3 | 
 4 | _start:
 5 |   mov $0, %rcx
 6 |   test %rcx, %rcx
 7 |   je _zero_func
 8 |   movq $60, %rax
 9 |   movq $1, %rdi
10 |   syscall
11 | 
12 | _zero_func:
13 |   movq $60, %rax
14 |   movq $2, %rdi
15 |   syscall
16 | 
17 | 


--------------------------------------------------------------------------------
/linux/bcd.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | unpacked1: .byte 8
 4 | unpacked2: .byte 4
 5 | 
 6 | .global _start
 7 | 
 8 | .text
 9 | _start: 
10 |   mov unpacked1, %al
11 |   mov unpacked2, %bl
12 |   add %bl, %al
13 |   aaa
14 | 
15 |   mov $1, %eax
16 |   mov $0, %ebx
17 |   int $0x80
18 | 


--------------------------------------------------------------------------------
/c/while.cc:
--------------------------------------------------------------------------------
 1 | // CFLAGS="-g -O0"
 2 | // clang++ -g -O0 while.cc -o while
 3 | int main() {
 4 |     bool flag = true;
 5 |     int a = 5;
 6 |     do {
 7 |         if (a == 5) {  // true on the first pass, so the loop body runs exactly once
 8 |             flag = false;
 9 |         }
10 |         a++;
11 |     } while(flag == true);
12 |     return 0;
13 | }
14 | 


--------------------------------------------------------------------------------
/risc-v/Makefile:
--------------------------------------------------------------------------------
 1 | AS=riscv64-linux-gnu-as
 2 | LD=riscv64-linux-gnu-ld
 3 | ASFLAGS=-g
 4 | OUT_DIR=out
 5 | 
 6 | # Assemble and link the hello program. The order-only prerequisite creates
 7 | # the output directory without making its timestamp trigger rebuilds.
 8 | $(OUT_DIR)/hello: src/hello.s | $(OUT_DIR)
 9 | 	$(AS) $(ASFLAGS) $< -o $@.o
10 | 	$(LD) $@.o -o $@
11 | 
12 | # Named through OUT_DIR so the rule still matches if the variable is overridden.
13 | $(OUT_DIR):
14 | 	@mkdir -p $(OUT_DIR)
15 | 
16 | .PHONY: clean
17 | clean: 
18 | 	@${RM} -rf $(OUT_DIR)
19 | 


--------------------------------------------------------------------------------
/arm/src/add.s:
--------------------------------------------------------------------------------
 1 | /* armasm spelling would be: AREA |.text|, CODE, READONLY -- GNU as uses the .text directive below */
 2 | 
 3 | .text
 4 | 
 5 | .globl _start
 6 | _start:
 7 |   mov x1, #2
 8 |   mov x3, #3
 9 |   add x4, x1, x3      /* x4 = 2 + 3 */
10 | 
11 |   /* syscall exit(int status) */
12 |   mov     x0, x4      /* status */
13 |   mov     w8, #93     /* exit syscall #93 */
14 |   svc     #0          
15 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | cdecl: cdecl.c
 2 | 	${CC} -m32 -o $@ $<
 3 | 
 4 | stdcall: stdcall.c
 5 | 	${CC} -m32 -o $@ $<
 6 | 
 7 | fastcall: fastcall.c
 8 | 	${CC} -m32 -o $@ $<
 9 | 	#${CC} -m32 -s $<
10 | 
11 | cache: cache.c
12 | 	${CC} -g -O0 -o $@ $<
13 | 
14 | # Remove every binary built by the rules above, plus fastcall.s.
15 | .PHONY: clean
16 | clean:
17 | 	${RM} -f cdecl stdcall fastcall fastcall.s cache
18 | 


--------------------------------------------------------------------------------
/linux/div.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | .global _start
 3 | 
 4 | # Divide 23 by 2, add 48 to the quotient and write the resulting byte to stdout.
 5 | _start:
 6 |   mov $23, %rax
 7 |   xor %rdx, %rdx     # div divides rdx:rax, so the high half must be zero
 8 |   mov $2, %rbx
 9 |   div %rbx           # rax = 11 (quotient), rdx = 1 (remainder)
10 | 
11 |   add $48, %rax      # 48 is ASCII '0'; 11 + 48 = 59, which is ';'
12 |   push %rax
13 | 
14 |   mov $1, %rax       # write syscall number
15 |   mov $1, %rdi       # fd 1 = stdout
16 |   lea (%rsp), %rsi   # buffer = the value just pushed
17 |   mov $1, %rdx       # count = 1; set the whole register, not only dx
18 |   syscall
19 | 
20 |   mov $60, %rax      # exit syscall number
21 |   mov $1, %rdi       # exit status 1
22 |   syscall
23 | 


--------------------------------------------------------------------------------
/linux/setne.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .text
 4 | _start:
 5 |   mov $1, %rax
 6 |   mov $2, %rbx
 7 |   cmp %rbx, %rax        # compares 1 with 2: not equal, ZF = 0
 8 |   setne %cl             # cl = 1 because ZF is clear
 9 | 
10 |   xor %rax, %rax
11 |   xor %rbx, %rbx
12 |   xor %cl, %cl
13 |   mov $1, %rbx
14 |   cmp %rbx, %rax        # compares 0 with 1: not equal again
15 |   setnz %cl             # setnz is another mnemonic for setne
16 | 
17 |   mov $60, %rax         # exit syscall number
18 |   mov $0, %rdi          # exit status 0
19 |   syscall
20 | 


--------------------------------------------------------------------------------
/gas/linux/hello.S:
--------------------------------------------------------------------------------
 1 | #include "something.h"
 2 | 
 3 | .global	_start
 4 | 
 5 | .text
 6 | _start:
 7 | 	movl  $4, %eax
 8 | 	movl  $1, %ebx
 9 | 	movl  $msg, %ecx
10 | 	movl  $len, %edx
11 | 	int   $0x80
12 | 
13 | 	movl  $1, %eax
14 | 	movl  $0, %ebx
15 | 	int   $0x80
16 | .data
17 | msg:
18 | 	.ascii  SOMETHING
19 | 	len =   . - msg
20 | 


--------------------------------------------------------------------------------
/arm/src/ldmia.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | .text
 4 | array:
 5 |   .word 0x000000000 /* 4 bytes */
 6 |   .word 0x000000001 /* 4 bytes */
 7 |   .word 0x000000010 /* 4 bytes */
 8 |   .word 0x000000011 /* 4 bytes */
 9 |   .word 0x000000100 /* 4 bytes */
10 | 
11 | .global _start
12 | 
13 | _start:
14 |   adr r0, array
15 |   ldmia r0, {r1, r2, r3, r4, r5}
16 |   b .
17 | 


--------------------------------------------------------------------------------
/linux/mul.s:
--------------------------------------------------------------------------------
 1 | .text
 2 | .global _start
 3 | 
 4 | _start:
 5 |   mov $33, %al
 6 |   mov $2, %dl
 7 |   imul %dl        # one-operand byte form: ax = al * dl = 66
 8 | 
 9 |   push %rax
10 | 
11 |   mov $1, %rax    # write syscall number
12 |   mov $1, %rdi    # fd 1 = stdout
13 |   lea (%rsp), %rsi
14 |   mov $1, %rdx    # count = 1; set the whole register, not only dx
15 |   syscall         # Will print B which is the ascii character for 66
16 | 
17 |   mov $60, %rax
18 |   mov $1, %rdi
19 |   syscall
20 | 


--------------------------------------------------------------------------------
/gas/cli.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | argc:
 3 |   .asciz "There are %d parameters\n"
 4 | 
 5 | .section __TEXT, __text
 6 | .globl _main
 7 | 
 8 | # Print the argument count with printf, then leave through the exit syscall.
 9 | _main:
10 |   pushq %rbp
11 |   movq %rsp,%rbp
12 |   movq %rdi, %rsi                 # main's 1st argument becomes printf's 2nd
13 |   movq argc@GOTPCREL(%rip), %rdi  # format string is printf's 1st argument
14 |   xorl %eax, %eax                 # variadic call: al = number of vector registers used (none)
15 |   callq _printf
16 |   movl $0x2000001, %eax # exit code
17 |   movq $0, %rdi # return code
18 |   syscall
19 | 


--------------------------------------------------------------------------------
/linux/mmx.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | v1: .quad 1              # 64-bit
 4 | v2: .long 1, 2           # 2 32-bit values (.double would emit 64-bit floats)
 5 | v3: .word 1, 2, 3, 4     # 4 16-bit values
 6 | 
 7 | .global _start
 8 | 
 9 | .text
10 | 
11 | _start:
12 |   movq v1, %mm0
13 |   movd v2, %mm1          # movd loads 32 bits: the first value of v2
14 |   movq v3, %mm2
15 |   paddw %mm2, %mm2       # packed 16-bit add: each word of v3 is doubled
16 | 
17 |   mov $60, %rax
18 |   mov $1, %rdi
19 |   syscall
20 | 


--------------------------------------------------------------------------------
/inline-assembly/basic.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /* Compute z = x + y with two instructions of extended inline assembly. */
 4 | int main(int argc, char** argv) {
 5 |     printf("main...\n");
 6 | 
 7 |     int x = 10;
 8 |     int y = 20;
 9 |     int z;
10 |     /* %0 is written by the mov before %2 is read by the add, so the output
11 |        needs the early-clobber modifier (&); without it the compiler may put
12 |        z and y in the same register and the result becomes x + x. */
13 |     asm("mov %1, %0\n\t"
14 |         "add %2, %0"
15 |         : "=&r" (z)
16 |         : "r" (x), "r" (y)
17 |         : "cc");
18 | 
19 |     printf("x=%d, y=%d, z=%d\n", x, y, z);
20 |     return 0;
21 | }
22 | 


--------------------------------------------------------------------------------
/linux/function.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .type print_msg, @function
 4 | 
 5 | .data
 6 | 
 7 | msg: .ascii "something\n"
 8 | len: .int . - msg
 9 | 
10 | .text
11 | _start:
12 |   call print_msg
13 |   mov $60, %rax
14 |   mov $1, %rdi
15 |   syscall
16 | 
17 | print_msg:
18 |   mov $1, %rax
19 |   mov $1, %rdi
20 |   lea msg, %rsi
21 |   mov len, %edx
22 |   syscall
23 |   ret
24 | 


--------------------------------------------------------------------------------
/nasm/macos/hello.asm:
--------------------------------------------------------------------------------
 1 | %define SYSCALL_WRITE 0x2000004
 2 | %define SYSCALL_EXIT  0x2000001
 3 | 
 4 | global start
 5 | start:
 6 |   mov rdi, 1
 7 |   mov rsi, str
 8 |   mov rdx, strlen
 9 |   mov rax, SYSCALL_WRITE
10 |   syscall
11 | 
12 |   mov rax, SYSCALL_EXIT
13 |   mov rdi, 0
14 |   syscall
15 | 
16 | section .data
17 | str:
18 |   db `Hey, assembly!\n` 
19 | strlen equ $ - str
20 | 


--------------------------------------------------------------------------------
/gas/i386/Makefile:
--------------------------------------------------------------------------------
 1 | MACOSX := 10.11
 2 | OUT := out
 3 | 
 4 | PROGS := $(addprefix out/,learning_i386.o)
 5 | 
 6 | $(OUT)/%.o : %.s
 7 | 	as -W -g -arch i386 -o $@ $<
 8 | 	ld  -no_pie -arch i386 -macosx_version_min $(MACOSX) -o $(OUT)/$* $@ -lSystem
 9 | 
10 | all: $(PROGS)
11 | 
12 | $(PROGS) : | out
13 | 
14 | out: 
15 | 	mkdir $(OUT)
16 | 
17 | .PHONY: clean
18 | 
19 | clean: 
20 | 	rm -fr $(OUT)
21 | 


--------------------------------------------------------------------------------
/nasm/linux/printf.asm:
--------------------------------------------------------------------------------
 1 | extern printf
 2 | 
 3 | section .data
 4 |   msg db "hello world",0
 5 |   fmt db "Using printf to write: %s", 10, 0
 6 | 
 7 | section .bss 
 8 | section .text
 9 |   global main
10 | 
11 | main:
12 |   push rbp
13 |   mov rbp, rsp
14 |   mov rdi, fmt     ; 1st argument: format string
15 |   mov rsi, msg     ; 2nd argument: the string for %s
16 |   mov rax, 0       ; variadic call: no vector register arguments
17 |   call printf
18 | 
19 |   mov rsp, rbp
20 |   pop rbp
21 |   mov rax, 0       ; return 0 from main so the C runtime's exit path flushes stdout
22 |   ret              ; a raw exit syscall here would drop still-buffered printf output
23 | 


--------------------------------------------------------------------------------
/linux/exec-stack.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | code: .long 0xCC
 3 | 
 4 | .text
 5 | .global _start
 6 | 
 7 | _start:
 8 |   jmp code
 9 | 
10 |   mov $60, %rax
11 |   mov $0, %rdi
12 |   syscall
13 | 
14 | /* Notice that with the following directive it is not possible to execute
15 |    instructions in the data section */
16 | #if defined(__linux__) && defined(__ELF__)
17 | #.section .note.GNU-stack,"",%progbits
18 | #endif
19 | 


--------------------------------------------------------------------------------
/gas/avx_mul.c:
--------------------------------------------------------------------------------
 1 | #include <immintrin.h>
 2 | #include <stdio.h>
 3 | 
 4 | int main() {
 5 | 
 6 |   // setr = "set reversed": arguments are given lowest element first (the reverse of _mm256_set_pd); this is argument order, not endianness
 7 |   __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
 8 |   __m256d b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
 9 | 
10 |   __m256d result = _mm256_mul_pd(a, b);  // element-wise product of the four doubles
11 | 
12 |   double* d = (double*)&result;
13 |   printf("%f %f %f %f\n", d[0], d[1], d[2], d[3]);
14 | 
15 |   return 0;
16 | }
17 | 


--------------------------------------------------------------------------------
/gas/avx.c:
--------------------------------------------------------------------------------
 1 | #include <immintrin.h>
 2 | #include <stdio.h>
 3 | 
 4 | int main() {
 5 | 
 6 |   // setr = "set reversed": arguments are given lowest element first (the reverse of _mm256_set_pd); this is argument order, not endianness
 7 |   __m256d evens = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);  // NOTE(review): despite the names, both vectors hold 1.0..4.0
 8 |   __m256d odds = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
 9 | 
10 |   __m256d result = _mm256_mul_pd(evens, odds);  // element-wise product of the four doubles
11 | 
12 |   double* d = (double*)&result;
13 |   printf("%f %f %f %f\n", d[0], d[1], d[2], d[3]);
14 | 
15 |   return 0;
16 | }
17 | 


--------------------------------------------------------------------------------
/linux/test.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | zero: .ascii "zero\n"
 3 | zero_len: .int . - zero
 4 | 
 5 | .global _start
 6 | .text
 7 | 
 8 | _start:
 9 |   nop
10 |   mov $1, %rax
11 |   test $2, %rax     # test will perform 0010 & 0001 = 0000 
12 |   jz _zero          # the AND result is 0, so ZF is set and the jump is taken
13 |   jmp _exit
14 | 
15 | _zero:
16 |   mov $1, %rax      # write syscall number
17 |   mov $1, %rdi      # fd 1 = stdout
18 |   lea zero, %rsi
19 |   mov zero_len, %edx  # zero_len is a 4-byte .int; loading into edx also clears the upper half of rdx
20 |   syscall
21 | 
22 | _exit:
23 |   mov $60, %rax
24 |   mov $1, %rdi
25 |   syscall
26 | 
27 | 
28 | 


--------------------------------------------------------------------------------
/gas/loop.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | val:
 3 |   .asciz "Value =  %d\n"
 4 | values:
 5 |   .int 10, 20, 30, 40, 50
 6 | .section __TEXT, __text
 7 | .globl _main
 8 | 
 9 | # Print each of the five ints in `values` with printf, then exit.
10 | _main:
11 |   subq $8, %rsp
12 |   movabsq $0, %r12                # r12 = element index
13 |   leaq values(%rip), %r13         # r13 = base address of the array
14 | loop:
15 |   movl (%r13, %r12, 4), %esi      # elements are 4-byte .int values, so load exactly 32 bits
16 |   movq val@GOTPCREL(%rip), %rdi
17 |   xorl %eax, %eax                 # variadic call: no vector register arguments
18 |   callq _printf
19 |   incq %r12
20 |   cmpq $5, %r12
21 |   jne loop
22 |   movl $0x2000001, %eax # exit code
23 |   movq $0, %rdi # return code
24 |   syscall
25 | 


--------------------------------------------------------------------------------
/gas/jz.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | res:
 3 |   .asciz "Zero \n"
 4 | .section __TEXT, __text
 5 | .globl _main
 6 | 
 7 | # Subtract two equal values and print a message when the zero flag is set.
 8 | _main:
 9 |   pushq %rbp
10 |   movq %rsp,%rbp
11 | 
12 |   movq $10, %rax
13 |   movq $10, %rbx
14 |   subq %rax, %rbx                 # rbx = rbx - rax = 0, which sets ZF
15 |   jz zero
16 |   jmp end
17 | 
18 | zero:
19 |   movq res@GOTPCREL(%rip), %rdi
20 |   xorl %eax, %eax                 # variadic call: no vector register arguments
21 |   callq _printf
22 | 
23 | end: 
24 |   movl $0x2000001, %eax # exit code
25 |   movq $0, %rdi # return code
26 |   syscall
27 |   # Only reached if the exit syscall returns: restore rsp from rbp, then pop.
28 |   movq %rbp, %rsp
29 |   popq %rbp
30 |   ret
31 | 


--------------------------------------------------------------------------------
/nasm/linux/hello.asm:
--------------------------------------------------------------------------------
 1 | section .data
 2 |   msg db "hello world",10,0
 3 |   ; note that 10 is the new line character in decimal
 4 |   len equ $-msg-1
 5 |   ; note that $ is the current address; we subtract the address
 6 |   ; of msg and then take that value -1 to ignore the 0/null byte
 7 | 
 8 | section .bss 
 9 | section .text
10 |   global main
11 | 
12 | main:
13 |   mov rax, 1      ; syscall 1 = write
14 |   mov rdi, 1      ; fd 1 = stdout
15 |   mov rsi, msg    ; address of the string
16 |   mov rdx, len    ; number of bytes to write
17 |   syscall
18 | 
19 |   mov rax, 60     ; syscall 60 = exit
20 |   mov rdi, 0      ; exit status 0
21 |   syscall
22 | 


--------------------------------------------------------------------------------
/gas/jne.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | res:
 3 |   .asciz "Equal\n"
 4 | .section __TEXT, __text
 5 | .globl _main
 6 | 
 7 | _main:
 8 |   pushq %rbp
 9 |   movq %rsp,%rbp
10 | 
11 |   # compare two constants; jne skips the printf when they are not equal
12 |   movq $10, %rax
13 |   movq $10, %rbx
14 |   cmpq %rax, %rbx   # computes rbx - rax and only sets the flags
15 |   jne end           # not taken here: the values are equal so the zero flag is set
16 |   movq res@GOTPCREL(%rip), %rdi
17 |   callq _printf
18 | 
19 | end:
20 |   movl $0x2000001, %eax # macOS exit syscall number
21 |   movq $0, %rdi # exit status
22 |   syscall
23 | 


--------------------------------------------------------------------------------
/linux/carry.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .text
 4 | 
 5 | _start:
 6 |   mov $10, %rax
 7 |   mov $9, %rsi
 8 |   // AT&T operand order is source, destination
 9 |   // cmp and sub both compute destination - source: 9 - 10 = -1, the borrow sets the carry flag
10 |   // (lldb) register read rflags -f b
11 |   //rflags = 0b0000000000000000000000000000000000000000000000000000001010010111
12 |   cmp %rax, %rsi    // only sets the flags, rsi is unchanged
13 |   sub %rax, %rsi    // sets the same flags and stores the result in rsi
14 | 
15 |   // set the carry flag
16 |   stc
17 |   // clear the carry flag
18 |   clc
19 |   nop
20 | 
21 |   mov $60, %rax
22 |   mov $0, %rdi
23 |   syscall
24 | 
25 | 


--------------------------------------------------------------------------------
/arm/fedora_aarch64.repo:
--------------------------------------------------------------------------------
 1 | [copr:copr.fedorainfracloud.org:lantw44:aarch64-linux-gnu-toolchain]
 2 | name=Copr repo for aarch64-linux-gnu-toolchain owned by lantw44
 3 | baseurl=https://download.copr.fedorainfracloud.org/results/lantw44/aarch64-linux-gnu-toolchain/fedora-$releasever-$basearch/
 4 | type=rpm-md
 5 | skip_if_unavailable=True
 6 | gpgcheck=1
 7 | gpgkey=https://download.copr.fedorainfracloud.org/results/lantw44/aarch64-linux-gnu-toolchain/pubkey.gpg
 8 | repo_gpgcheck=0
 9 | enabled=1
10 | enabled_metadata=1
11 | 


--------------------------------------------------------------------------------
/nasm/linux/func-args.asm:
--------------------------------------------------------------------------------
 1 | section .data
 2 | section .bss
 3 | section .text
 4 |   global main
 5 | 
 6 | doit:
 7 |   push rbp ; save the caller's frame pointer (rsp moves down by 8 bytes)
 8 |   mov rbp, rsp
 9 |   ; call pushed the return address onto the stack, so that is 8 bytes (64 bits)
10 |   ; push rbp pushed another 8 bytes. So to access the first arg I have to
11 |   ; specify 8+8=16
12 |   mov rax, [rbp+16] ; rax = the value main pushed before the call
13 |   leave
14 |   ret
15 | 
16 | main: 
17 |   push rbp
18 |   mov rbp, rsp
19 |   push $18 ; NOTE(review): NASM parses $18 as hexadecimal (0x18 = 24) - confirm decimal 18 was not intended
20 |   call doit
21 |   leave    ; restores rsp from rbp, which also discards the pushed argument
22 |   ret
23 | 


--------------------------------------------------------------------------------
/gas/cli_lea.s:
--------------------------------------------------------------------------------
 1 | .section __DATA, __data
 2 | msg:
 3 |   .asciz "There are %d parameters\n"
 4 | 
 5 | .section __TEXT, __text
 6 | .globl _main
 7 | 
 8 | _main:
 9 |   pushq %rbp
10 |   movq %rsp,%rbp
11 | 
12 |   # move the value of rdi (argc) into rsi, which is used to pass the 2nd argument to functions
13 |   movq %rdi, %rsi
14 |   # lea loads a pointer to msg, mov would load the actual value stored there
15 |   leaq msg(%rip), %rdi
16 |   callq _printf
17 | 
18 |   movl $0x2000001, %eax # macOS exit syscall number
19 |   movq $0, %rdi # exit status
20 |   syscall
21 | 


--------------------------------------------------------------------------------
/nasm/linux/function.asm:
--------------------------------------------------------------------------------
 1 | section .data
 2 | section .bss
 3 | section .text
 4 |       global main
 5 | 
 6 | main:
 7 | section .text
 8 |   push  rbp
 9 |   mov   rbp, rsp
10 |   mov   rdi, $8          ; argument for doit: number of bytes to write ($8 is a hex literal in NASM, still 8)
11 |   call  doit
12 |   leave
13 |   ret
14 | 
15 | doit:
16 | section .data
17 |   .msg db "doit...",10,0 ; not visible outside of doit.
18 |   .len equ $-.msg-1      ; not visible outside of doit. Unused below: the length comes from rdi.
19 | section .text
20 |   push  rbp
21 |   mov   rbp, rsp
22 |   mov rdx, rdi           ; write length = the value passed in rdi
23 |   mov rax, 1             ; syscall 1 = write
24 |   mov rdi, 1             ; fd 1 = stdout
25 |   mov rsi, .msg          ; local label, belongs to doit
26 |   syscall
27 |   leave
28 |   ret
29 | 


--------------------------------------------------------------------------------
/arm/src/first.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | msg:
 4 |     .ascii        "Bajja\n"
 5 | len = . - msg
 6 | 
 7 | .text
 8 | 
 9 | .globl _start
10 | _start:
11 |     /* syscall write(int fd, const void *buf, size_t count) */
12 |     mov     x0, #1      /* fd */
13 |     ldr     x1, =msg    /* buf */
14 |     ldr     x2, =len    /* count */
15 |     mov     w8, #64     /* write syscall #64 */
16 |     svc     #0          
17 | 
18 |     /* syscall exit(int status) */
19 |     mov     x0, #0      /* status */
20 |     mov     w8, #93     /* exit syscall #93 */
21 |     svc     #0          
22 | 


--------------------------------------------------------------------------------
/linux/float.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | .data
 4 |   radius: .float 3.14   # .float is 32 bits (single precision)
 5 |   m: .float 2.1
 6 |   result: .float 0
 7 |   s: .short             # .short is 16 bits
 8 |   age: .word 46
 9 | 
10 | .text
11 | _start:
12 |   nop
13 |   mov %rsp, %rbp
14 | 
15 |   fld radius    # load onto the FPU stack
16 |   fld m         # load onto the FPU stack
17 |   fmulp         # multiply the two values on the FPU stack and pop, leaving the product on top
18 |   fstp result   # store floating point value in result and pop the stack
19 | 
20 |   fild age      # load integer and convert to floating point, then push onto the stack
21 | 
22 |   mov $60, %rax # syscall 60 = exit
23 |   mov $1, %rdi  # exit status 1
24 |   syscall
25 | 


--------------------------------------------------------------------------------
/risc-v/src/hello.s:
--------------------------------------------------------------------------------
 1 | .global _start
 2 | 
 3 | _start:
 4 |   addi  a0, x0, 1      # 1 = standard out
 5 |   la    a1, msg        # load address of message
 6 |   addi  a2, x0, 7      # length of our string: "Bajja!\n" is 7 bytes (8 wrote one byte past it)
 7 |   addi  a7, x0, 64     # linux write system call
 8 |   ecall                # Call linux to output the string
 9 | 
10 | # Setup the parameters to exit the program
11 | # and then call Linux to do it.
12 | 
13 |   addi    a0, x0, 0   # Use 0 return code
14 |   addi    a7, x0, 93  # Service command code 93 terminates
15 |   ecall               # Call linux to terminate the program
16 | 
17 | .data
18 | msg:      .ascii "Bajja!\n"
19 | 


--------------------------------------------------------------------------------
/linux/write.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | msg: .ascii "something something\n"
 3 | len: .int . - msg
 4 | 
 5 | .text
 6 |   .globl _start
 7 |   .type	_start, @function
 8 | 
 9 | _start:
10 |   /*
11 |      long syscall(long number, ...);
12 |      ssize_t write(int fd, const void* buf, size_t nbytes)
13 |   */
14 |   mov $1, %rax   /* syscall number */
15 |   mov $1, %rdi   /* file descriptor (stdout) */
16 |   lea msg, %rsi  /* load the address of msg into rsi */
17 |   #mov $msg, %rsi  /* also works to place the address of msg into rsi */
18 |   mov len, %edx /* length to write */
19 |   syscall
20 | 
21 |   mov $60, %rax
22 |   xor %rdi, %rdi
23 |   syscall
24 | 


--------------------------------------------------------------------------------
/linux/jle.s:
--------------------------------------------------------------------------------
 1 | 
 2 | .data
 3 | msg: .ascii "bajja\n"
 4 | msg_len = . - msg
 5 | 
 6 | done: .ascii "done...\n"
 7 | .set done_len, . - done
 8 | 
 9 | .text
10 | .global _start
11 | 
12 | _start:
13 |   push %rbp
14 |   mov %rsp, %rbp
15 | 
16 |   xor %rcx, %rcx          # rcx = index of the next character of msg
17 |   mov $1, %rdx            # write length: one character per call
18 | lp: 
19 |   lea msg(,%rcx), %rsi    # rsi = address of msg[rcx]
20 |   inc %rcx
21 |   push %rcx               # the syscall instruction overwrites rcx, so save the index
22 |   call print
23 |   pop %rcx
24 |   cmp $msg_len, %rcx
25 |   jl lp                   # loop while rcx < msg_len
26 | 
27 |   lea done, %rsi
28 |   mov $done_len, %rdx
29 |   call print
30 | 
31 |   mov $60, %rax           # syscall 60 = exit
32 |   xor %rdi, %rdi          # exit status 0
33 |   syscall
34 | 
35 | print:                    # write rdx bytes starting at rsi to stdout
36 |   push %rbp
37 |   mov %rsp, %rbp
38 |   mov $1, %rax            # syscall 1 = write
39 |   mov $1, %rdi            # fd 1 = stdout
40 |   syscall
41 |   leave
42 |   ret
43 | 


--------------------------------------------------------------------------------
/gas/Makefile:
--------------------------------------------------------------------------------
 1 | MACOSX := 10.11
 2 | OUT := out
 3 | # Source and object lists come from the directory contents; func.s is built by its own rule.
 4 | SRC_FILES := $(filter-out func.s, $(wildcard *.s))
 5 | OBJ_FILES := $(patsubst %.s,$(OUT)/%.o,$(SRC_FILES))
 6 | 
 7 | $(OUT)/%.o : %.s | out
 8 | 	@echo "compile " $@
 9 | 	as -g -W -arch x86_64 -o $@ $(*F).s
10 | 	ld -e _main -macosx_version_min $(MACOSX) -lc -lSystem -arch x86_64 $@ -o $*
11 | 
12 | $(OUT)/func.o : func.s | out
13 | 	as -g -W -arch x86_64 -o $@ $<
14 | 
15 | $(OUT)/func: func.cc $(OUT)/func.o | out
16 | 	clang++ -std=c++11 -O0 -g $(OUT)/func.o func.cc -o $(OUT)/func
17 | 
18 | out:
19 | 	mkdir $(OUT)
20 | 
21 | $(OUT)/avx_mul : avx_mul.c | out
22 | 	clang -mavx -o $@ $<
23 | 
24 | .PHONY: clean
25 | 
26 | clean: 
27 | 	@rm -fr $(OUT)
28 | 


--------------------------------------------------------------------------------
/gas/i386/learning_i386.s:
--------------------------------------------------------------------------------
 1 | # Mach-O has segments that contain sections
 2 | # __TEXT is a segment and __text a section
 3 | .section __TEXT, __text
 4 | .globl _main
 5 | 
 6 | _main:
 7 |     pushl %ebp        # push the value of ebp onto the stack, to save it
 8 |     movl %esp, %ebp   # store the current stack pointer in ebp so we can use ebp
 9 |                       # with indirect addressing
10 |     
11 |     subl $0x4, %esp   # reserve 4 bytes on the stack below the saved ebp
12 |                       # NOTE(review): probably keeps esp 16-byte aligned at the call - confirm
13 |     pushl $msg        # push the address of msg as the argument to puts (push decrements esp, ebp is unchanged)
14 |     call _puts
15 |     addl $0x8, %esp   # clean up the stack: the 4 reserved bytes plus the 4-byte argument
16 |     
17 |     movl $1, %eax     # return value of main
18 |     popl %ebp
19 |     ret
20 | 
21 | .section __DATA, __data
22 | msg:
23 |     .ascii "Learning Assembler\0"
24 | 


--------------------------------------------------------------------------------
/gas/64bit.s:
--------------------------------------------------------------------------------
 1 | #as -g -arch x86_64 64bit.s -o 64bit.o
 2 | #ld -e _main -macosx_version_min 10.8 -lSystem -arch x86_64 64bit.o -o 64bit
 3 | 
 4 | .data  
 5 | msg:
 6 |   .ascii "Assembly x86_64!\n" 
 7 | len:
 8 |   .long . - msg  
 9 | 
10 | .text  
11 | .globl _main 
12 | 
13 | _main:
14 |   # write call (SYSCALL_CONSTRUCT_UNIX in /usr/include/sys/syscall.h).
15 |   movq $0x2000004, %rax   
16 |   # file descriptor (stdout). rdi is used for the first argument to functions
17 |   movq $1, %rdi   
18 |   # string to print. rsi is used for the second argument to functions in x86_64
19 |   movq msg@GOTPCREL(%rip), %rsi 
20 |   # length of string (third argument, rdx). len is a 4-byte .long, so load 32 bits; writing edx zero-extends into rdx
21 |   movl len(%rip), %edx  
22 |   # call write
23 |   syscall    
24 | 
25 |   # exit call
26 |   movq $0x2000001, %rax  
27 |   # return code
28 |   movq $0, %rdi   
29 |   # call exit
30 |   syscall    
31 | 


--------------------------------------------------------------------------------
/linux/cfi.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | msg: .ascii "cfi example\n"
 3 | .set len, . - msg
 4 | 
 5 | .text
 6 |   .globl _start
 7 |   .type	_start, @function
 8 | 
 9 | _start:
10 |   .cfi_startproc
11 |   push %rbp 
12 |   /* After the push the Canonical Frame Address (CFA) is at rsp + 16 */
13 |   .cfi_def_cfa_offset 16
14 |   /* Document that the value of register 6 (rbp) is saved at CFA - 16 */
15 |   .cfi_offset 6, -16
16 |   mov %rsp, %rbp /* AT&T order is src, dst: copy rsp into rbp (the reverse overwrote the stack pointer) */
17 |   /* Document that rbp will be used as the CFA from this point onwards */
18 |   .cfi_def_cfa_register 6
19 |   mov $1, %rax   /* syscall number */
20 |   mov $1, %rdi   /* file descriptor (stdout) */
21 |   lea msg, %rsi  /* load the address of msg into rsi */
22 |   mov $len, %rdx /* length to write */
23 |   syscall
24 | 
25 |   mov $60, %rax  /* syscall 60 = exit */
26 |   xor %rdi, %rdi /* exit status 0 */
27 |   syscall
28 |   pop %rbp
29 |   /* Document that CFA is now rsp and at offset 8 */
30 |   .cfi_def_cfa 7, 8
31 |   .cfi_endproc
32 | 


--------------------------------------------------------------------------------
/cache.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <string.h>
 3 | #include <stdint.h>
 4 | #include <x86intrin.h>
 5 | 
 6 | int main(int argc, char** argv) {
 7 |   const size_t cache_line_size = 64;  // assumed cache line size in bytes
 8 |   const size_t cache_size = 256 * cache_line_size;  // number of ints in the array
 9 |   printf("256 array/64 bytes: %zu\n", cache_size);  // %zu is the size_t conversion
10 | 
11 |   uint64_t start, end;
12 |   unsigned int core_id = 0;  // __rdtscp takes an unsigned int* for the aux value
13 | 
14 |   int array[cache_size];
15 |   for (size_t i = 0; i < cache_size; i++) {
16 |     start = __rdtscp(&core_id);
17 |     array[i] = 1;
18 |     end = __rdtscp(&core_id);
19 |     unsigned long duration = end - start;
20 |     printf("Duration of set operation: %lu core_id: %x\n", duration, core_id);
21 |   }
22 |   for (size_t i = 0; i < cache_size; i += cache_line_size / sizeof(int)) {
23 |     // advance by one cache line worth of ints; stepping by 64 ints skipped most lines
24 |     _mm_clflush(&array[i]);
25 |   }
26 | 
27 |   for (size_t i = 0; i < cache_size; i++) {
28 |     start = __rdtscp(&core_id);
29 |     array[i]++;
30 |     end = __rdtscp(&core_id);
31 |     unsigned long duration = end - start;
32 |     //printf("Duration of increment operation: %lu core_id: %x\n", duration, core_id);
33 |   }
34 |   return 0;
35 | }
36 | 


--------------------------------------------------------------------------------
/nasm/linux/Makefile:
--------------------------------------------------------------------------------
 1 | hello: hello.o
 2 | 	ld -no-pie -e main -o $@ $<
 3 | 
 4 | hello.o: hello.asm
 5 | 	nasm -f elf64 -g -F dwarf -o $@ $<
 6 | 
 7 | printf: printf.o
 8 | 	gcc -o $@ $< -no-pie
 9 | 
10 | printf.o: printf.asm
11 | 	nasm -f elf64 -g -F dwarf -o $@ $<
12 | 
13 | function: function.o
14 | 	gcc -o $@ $< -no-pie
15 | 
16 | function.o: function.asm
17 | 	nasm -f elf64 -g -F dwarf -o $@ $<
18 | 
19 | func-args: func-args.o
20 | 	gcc -o $@ $< -no-pie
21 | 
22 | func-args.o: func-args.asm
23 | 	nasm -f elf64 -g -F dwarf -o $@ $<
24 | 
25 | simd-intro: simd-intro.o
26 | 	gcc -o $@ $< -no-pie
27 | 
28 | simd-intro.o: simd-intro.asm
29 | 	nasm -f elf64 -g -F dwarf -o $@ $<
30 | 
31 | align: align.o
32 | 	gcc -o $@ $< -no-pie
33 | 
34 | align.o: align.asm
35 | 	nasm -f elf64 -g -F dwarf -o $@ $<
36 | 
37 | half-adder: half-adder.o
38 | 	gcc -o $@ $< -no-pie
39 | 
40 | half-adder.o: half-adder.asm
41 | 	nasm -f elf64 -g -F dwarf -o $@ $<
42 | 
43 | jnz: jnz.o
44 | 	gcc -o $@ $< -no-pie
45 | 
46 | jnz.o: jnz.asm
47 | 	nasm -f elf64 -g -F dwarf -o $@ $<
48 | # Remove every binary built by the rules above, plus all object files.
49 | clean:
50 | 	$(info "cleaning...")
51 | 	rm -rf hello printf function func-args simd-intro align half-adder jnz *.o
52 | 
53 | 


--------------------------------------------------------------------------------
/linux/Makefile:
--------------------------------------------------------------------------------
 1 | first: first.o
 2 | 	${LD} -o $@ $<
 3 | 
 4 | check_zero: check_zero.o
 5 | 	${LD} -o $@ $<
 6 | 
 7 | write: write.o
 8 | 	${LD} -o $@ $<
 9 | 
10 | jle: jle.o
11 | 	${LD} -o $@ $<
12 | 
13 | cfi: cfi.o
14 | 	${LD} -o $@ $<
15 | 
16 | arr: arr.o
17 | 	${LD} -o $@ $<
18 | 
19 | multi: multi.o
20 | 	${LD} -o $@ $<
21 | 
22 | div: div.o
23 | 	${LD} -o $@ $<
24 | 
25 | mul: mul.o
26 | 	${LD} -o $@ $<
27 | 
28 | exec-stack: exec-stack.o
29 | 	${LD} -o $@ $<
30 | 
31 | carry: carry.o
32 | 	${LD} -o $@ $<
33 | 
34 | zero-flag: zero-flag.o
35 | 	${LD} -o $@ $<
36 | 
37 | direction-flag: direction-flag.o
38 | 	${LD} -o $@ $<
39 | 
40 | cmov: cmov.o
41 | 	${LD} -o $@ $<
42 | 
43 | function: function.o
44 | 	${LD} -o $@ $<
45 | 
46 | 
47 | float: float.o
48 | 	${LD} -o $@ $<
49 | 
50 | sse: sse.o
51 | 	${LD} -o $@ $<
52 | 
53 | mmx: mmx.o
54 | 	${LD} -o $@ $<
55 | 
56 | test: test.o
57 | 	${LD} -o $@ $<
58 | 
59 | setne: setne.o
60 | 	${LD} -o $@ $<
61 | 
62 | bcd: bcd.o
63 | 	${LD} -melf_i386 -o $@ $<
64 | 
65 | bcd.o: bcd.s
66 | 	${AS} -g --32 -o bcd.o bcd.s
67 | 
68 | %.o: %.s
69 | 	${AS} -g -o $@ $<
70 | # Remove every linked example built by the rules above, plus all object files.
71 | .PHONY: clean
72 | clean:
73 | 	${RM} -f first check_zero write jle cfi arr multi div mul exec-stack carry zero-flag direction-flag cmov function float sse mmx test setne bcd *.o
74 | 


--------------------------------------------------------------------------------
/c/vmmap-example.c:
--------------------------------------------------------------------------------
 1 | // gcc -o vmmap-example vmmap-example.c
 2 | // ./vmmap-example &
 3 | // vmmap <pid>
 4 | #include <stdio.h>
 5 | #include <stdlib.h>
 6 | #include <unistd.h>
 7 | 
 8 | //int global_j;
 9 | //const int ci = 24;
10 | 
11 | int main (int argc, char **argv)
12 | {
13 |     //int local_stack = 0;
14 |     //char *const_data = "This data is constant";
15 |     //char *tiny = malloc (32);            /* allocate 32 bytes */
16 |     //char *small = malloc (2*1024);       /* Allocate 2K */
17 |     //char *large = malloc (1*1024*1024);  /* Allocate 1MB */
18 |         
19 |     //printf("Text is %p\n", main);
20 |     //printf("Global Data is %p\n", &global_j);
21 |     //printf("Local (Stack) is %p\n", &local_stack);
22 |     //printf("Constant data is %p\n", &ci);
23 |     //printf("Hardcoded string (also constant) are at %p\n", const_data);
24 |     //printf("Tiny allocations from %p\n", tiny);
25 |     //printf("Small allocations from %p\n", small);
26 |     //printf("Large allocations from %p\n", large);
27 |     //printf("Malloc (i.e. libSystem) is at %p\n", malloc);
28 |     //sleep(100); /* so we can use vmmap on this process before it exits */
29 |     usleep(1000);
30 |     return 1;
31 | }
32 | 


--------------------------------------------------------------------------------
/linux/sse.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | .balign 16                     # movaps/movapd/movdqa fault on memory operands that are not 16-byte aligned
 3 | v1: .float 1.0, 2.0, 3.0, 4.0  # .float is 4 bytes. 16 bytes in total
 4 | v2: .float 5.0, 6.0, 7.0, 8.0  
 5 | v3: .double 1.0, 2.0           # .double is 8 bytes. 16 bytes in total
 6 | v4: .double 3.0, 4.0
 7 | 
 8 | i1: .int 1, 2, 3, 4
 9 | i2: .int 5, 6, 7, 8
10 | 
11 | i3: .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
12 | 
13 | .global _start
14 | 
15 | .text
16 | _start:
17 |   nop
18 |   movaps v1, %xmm0    # move aligned packed single precision
19 |   movaps v2, %xmm1    # move aligned packed single precision
20 |   addps %xmm1, %xmm0  # add packed single precision src, dest (result in dest)
21 | 
22 |   xorps %xmm0, %xmm0   # xor packed single precision
23 |   xorps %xmm1, %xmm1   # xor packed single precision
24 | 
25 |   movapd v3, %xmm0    # move aligned packed double precision
26 |   movapd v4, %xmm1    # move aligned packed double precision
27 |   addpd %xmm1, %xmm0  # add packed double precision src, dest (result in dest)
28 | 
29 |   xorps %xmm0, %xmm0   # xor packed single precision
30 |   movdqa i1, %xmm0     # move aligned packed integers
31 |   paddd i2, %xmm0      # add packed 32-bit integers src, dest (addps would treat them as floats)
32 | 
33 |   xorps %xmm0, %xmm0   # xor packed single precision
34 |   movdqa i3, %xmm0     # move aligned packed integers
35 |   paddb i3, %xmm0      # add packed bytes src, dest (result in dest)
36 |   
37 |   mov $60, %rax
38 |   mov $0, %rdi
39 |   syscall
40 | 


--------------------------------------------------------------------------------
/linux/multi.s:
--------------------------------------------------------------------------------
 1 | 
 2 | .text
 3 | .global _start
 4 | 
 5 | _start:
 6 |   mov $28, %rax
 7 |   mov $2, %rbx
 8 |   mul %rbx        /* rdx:rax = rax * rbx = 56, the number printed below */
 9 | 
10 |   xor %rcx, %rcx  /* rcx counts the digits pushed onto the stack */
11 | 
12 | loop:
13 |   xor %rdx, %rdx  /* div divides rdx:rax, so clear the upper half first */
14 |   mov $10, %rbx
15 |   div %rbx        /* divides what is in %rax */
16 |   add $48, %rdx   /* add ascii 48 to the remainder which is in rdx */
17 |   push %rdx       /* push the right most digit onto the stack */
18 |   inc %rcx        /* number of digits counter */
19 |   cmp $0, %rax    /* keep dividing until the quotient in %rax is 0 */
20 |   jz next         /* jz will jump if the zero flag is set, so if %rax is 0 */
21 |   jmp loop        /* else continue loop */
22 | 
23 | next:
24 |   cmp $0, %rcx    /* check that there are digits left to print, %rcx is the counter */
25 |   jz  exit        /* if there are none jump to exit */
26 |   dec %rcx
27 | 
28 |   mov $1, %rax     /* syscall 1 write                  */
29 |   mov $1, %rdi     /* fd                               */
30 |   lea (%rsp), %rsi /* current number on stack to print */
31 |   mov $1, %dx      /* number of characters to print    */
32 |   push %rcx        /* save the value of rcx on the stack as it may be clobbered by the syscall */
33 |   syscall
34 |   pop %rcx         /* restore rcx */
35 |   add $8, %rsp     /* drop the digit that was just printed */
36 |   jmp next
37 | 
38 | exit:
39 |   mov $1, %rax     /* syscall 1 write */
40 |   mov $1, %rdi     /* fd */
41 |   push $10         /* ascii 10 is the newline character */
42 |   lea (%rsp), %rsi /* rdx is still 1 from the digit loop above */
43 |   syscall
44 |   
45 |   mov $60, %rax
46 |   mov $0, %rdi
47 |   syscall
48 | 


--------------------------------------------------------------------------------
/linux/notes/mmx.md:
--------------------------------------------------------------------------------
 1 | ### MultiMedia eXtension (MMX)
 2 | Introduced new instructions and data types.
 3 | 
 4 | There are 8 new registers, 57 new instructions and four new data types.
 5 | 
 6 | #### Registers
 7 | Unlike the FPU the mmx registers are freely addressable (they are not stack based
 8 | like in x87 FPU). Note that the mmx registers cannot be used to perform
 9 | floating point arithmetic.
10 | 
11 | ```
12 | MM0...MM7
13 | 
14 |    64                   0
15 |    +--------------------+
16 |    |      MM0           |
17 |    +--------------------+
18 |    ...
19 | ```
20 | This is done while still maintaining compatibility with existing operating systems by
21 | mapping these new registers to the ones in the FPU (see notes
22 | [float.md](./float.md)) which if we recall were 8 80-bit registers.
23 | 
24 | So these new registers map to those but only to the lower 64 bits. If one wants to mix
25 | FPU and MMX one needs to be careful and call the `emms` instruction before
26 | switching.
27 | 
28 | #### Data types
29 | So we saw above that the registers are 64 bits in size. So we can place one
30 | 64-bit value (Quadword) in a register, or 2 32-bit (Packed doubleword), or
31 | 4 16-bit values (Packed word), or 8 8-bit (Packed byte).
32 | 
33 | Each value is a separate fixed point integer.
34 | 
35 | ### mov
36 | So lets start simple by moving a value into a mmx register:
37 | ```assembly
38 | v1: .double 1, 2
39 | 
40 | movd v1, %mm0
41 | 
42 | ```
43 | 
44 | ### paddb, paddw
45 | Packed add byte/word/doubleword/quadword is used for adding packed integers.
46 | 
47 | ```assembly
48 | v3: .word 1, 2, 3, 4
49 | 
50 |   movq v3, %mm2
51 |   paddw %mm2, %mm2
52 | ```
53 | ```console
54 | (lldb) register read --format int16 mm2
55 |      mm2 = {2 4 6 8}
56 | ```
57 | 


--------------------------------------------------------------------------------
/arm/Makefile:
--------------------------------------------------------------------------------
  1 | first: first.o
  2 | 	aarch64-linux-gnu-ld -g -o $@ $<
  3 | 
  4 | first.o: src/first.s
  5 | 	aarch64-linux-gnu-as -g -o $@ $<
  6 | 
  7 | add: add.o
  8 | 	aarch64-linux-gnu-ld -g -o $@ $<
  9 | 
 10 | add.o: src/add.s
 11 | 	#aarch64-linux-gnu-as -g -o $@ $<
 12 | 	armasm -g -o $@ $<
 13 | 
 14 | branch: branch.o
 15 | 	aarch64-linux-gnu-ld -g -o $@ $<
 16 | 
 17 | branch.o: src/branch.s
 18 | 	aarch64-linux-gnu-as -g -o $@ $<
 19 | 
 20 | xzr: xzr.o
 21 | 	aarch64-linux-gnu-ld -g -o $@ $<
 22 | 
 23 | xzr.o: src/xzr.s
 24 | 	aarch64-linux-gnu-as -g -o $@ $<
 25 | 
 26 | stack: stack.o
 27 | 	aarch64-linux-gnu-ld -g -o $@ $<
 28 | 
 29 | stack.o: src/stack.s
 30 | 	aarch64-linux-gnu-as -g -o $@ $<
 31 | 
 32 | ldr: ldr.o
 33 | 	aarch64-linux-gnu-ld -g -o $@ $<
 34 | 
 35 | ldr.o: src/ldr.s
 36 | 	aarch64-linux-gnu-as -g -o $@ $<
 37 | 
 38 | func: func.o
 39 | 	aarch64-linux-gnu-ld -g -o $@ $<
 40 | 
 41 | func.o: src/func.s
 42 | 	aarch64-linux-gnu-as -g -o $@ $<
 43 | 
 44 | space: space.o
 45 | 	arm-none-eabi-ld -g -o $@ $<
 46 | 
 47 | space.o: src/space.s
 48 | 	arm-none-eabi-as -g -o $@ $<
 49 | 
 50 | rename: rename.o
 51 | 	arm-none-eabi-ld -g -o $@ $<
 52 | 
 53 | rename.o: src/rename.s
 54 | 	arm-none-eabi-as -g -o $@ $<
 55 | 
 56 | mov: mov.o
 57 | 	arm-none-eabi-ld -g -o $@ $<
 58 | 
 59 | mov.o: src/mov.s
 60 | 	arm-none-eabi-as -g -o $@ $<
 61 | 
 62 | ldmia: ldmia.o
 63 | 	arm-none-eabi-ld -g -o $@ $<
 64 | 
 65 | ldmia.o: src/ldmia.s
 66 | 	arm-none-eabi-as -g -o $@ $<
 67 | 
 68 | bic: bic.o
 69 | 	arm-none-eabi-ld -g -o $@ $<
 70 | 
 71 | bic.o: src/bic.s
 72 | 	arm-none-eabi-as -g -o $@ $<
 73 | 
 74 | uxtb: uxtb.o
 75 | 	arm-none-eabi-ld -g -o $@ $<
 76 | 
 77 | uxtb.o: src/uxtb.s
 78 | 	arm-none-eabi-as -g -o $@ $<
 79 | 
 80 | psr: psr.o
 81 | 	arm-none-eabi-ld -g -o $@ $<
 82 | 
 83 | psr.o: src/psr.s
 84 | 	arm-none-eabi-as -g -o $@ $<
 85 | 
 86 | .PHONY: run-first
 87 | run-first: first
 88 | 	qemu-aarch64 $<
 89 | 
 90 | .PHONY: run-add
 91 | run-add: add
 92 | 	qemu-aarch64 $<
 93 | 
 94 | # Remove every linked example built by the rules above, plus all object files.
 95 | .PHONY: clean
 96 | clean:
 97 | 	${RM} first add branch xzr stack ldr func space rename mov ldmia bic uxtb psr *.o
 98 | 
 99 | 
100 | 


--------------------------------------------------------------------------------
/risc-v/README.md:
--------------------------------------------------------------------------------
 1 | ## RISC-V
 2 | RISC-V is open source and began its development in 2010 (ARM started in 1990) at
 3 | Berkeley's Parallel Computing Laboratory. Later 36 tech companies came together
 4 | to form the RISC-V Foundation, which was renamed to RISC-V International
 5 | in 2020. RISC-V is pronounced RISC-Five, where the five comes from this
 6 | being Berkeley's fifth RISC ISA design.
 7 | 
 8 | As the name hints, RISC-V is a reduced instruction set computer instruction
 9 | set architecture (ISA). An ISA can be thought of as the design of a computer
10 | in terms of the basic operations that it must support. It does not address
11 | implementation-specific details, so even if you have two processors that
12 | support the same ISA they can be implemented very differently.
13 | 
14 | ### Background
15 | Both ARM and RISC-V are instruction set architectures (ISA) and they are both
16 | reduced instruction set computers (RISC).
17 | 
18 | 
19 | ### ISA Specification
20 | https://github.com/riscv/riscv-isa-manual/releases/download/Ratified-IMAFDQC/riscv-spec-20191213.pdf
21 | 
22 | ### Hardware Thread
23 | In the ISA they talk about Harts which is the same thing as a core in other
24 | ISAs.
25 | 
26 | ### Base Module
27 | If I've understood this correctly the base modules are RV32I, RV64I, and RV128I
28 | where RV stands for RISC-V, 32/64/128 is the bus/register size, and `I` stands for
29 | integer which refers to the operations available.
30 | 
31 | ### Extension Modules
32 | Extension modules add to the base and add more operations. For example, the `M`
33 | extension stands for multiplication and adds multiply/divide instructions. I
34 | thought this was a bit odd and that mul/div should be included in the base, but
35 | there are many places where only the simplest of operations are required, like
36 | small "stupid" gadgets/devices. So it allows the manufacturers to only
37 | include what they actually will use.
38 | 
39 | ### add instruction
40 | The following adds the contents of two registers and stores the result in the
41 | destination register:
42 | ```assembly
43 |   add rd rs1 rs2
44 | 
45 | rd  = destination register
46 | rs1 = source register 1
47 | rs2 = source register 2
48 | ```
49 | 
50 | 
51 | ### Compiling
52 | ```
53 | $ make out/hello
54 | $ ./out/hello
55 | Bajja!
56 | ```
57 | 
58 | ### Debugging
59 | ```console
60 | $ qemu-riscv64 -g 7777 out/hello
61 | ```
62 | ```console
63 | $ riscv64-unknown-linux-gnu-gdb ./out/hello
64 | Reading symbols from ./out/hello...
65 | (gdb) target remote localhost:7777
66 | ```
67 | 


--------------------------------------------------------------------------------
/linux/notes/bcd.md:
--------------------------------------------------------------------------------
 1 | ### Binary Coded Decimal
 2 | This is where a decimal (base 10) number is divided into individual bytes (so 8 bits
 3 | per digit). For example 123 would become:
 4 | ```
 5 |   1         2        3
 6 | 00000001 00000010 00000011
 7 | ```
 8 | So we are representing one decimal digit, that is 0-9, as a byte. But we might
 9 | notice that we are only using 4 bits to do this and 4 bits (1 nibble) can represent
10 | 0-15.
11 | 
12 | So what happens if we add two BCD numbers say 5 and 6:
13 | ```
14 |  00000101
15 | +00000110
16 | -----
17 |  00001011
18 | ```
19 | That is 11 in decimal but notice that 11 in BCD is:
20 | ```
21 | 00000001 00000001
22 | ```
23 | So something needs to be done after the addition to bring the result back into
24 | BCD.
25 | 
26 | ### Unpacked BCD
27 | Lets take our example 123:
28 | ```
29 |   1         2        3
30 | 00000001 00000010 00000011
31 | ```
32 | Notice that we are only using the lower nibble, the upper is left unused and
33 | wasted.
34 | 
35 | ### Packed BCD
36 | Saves space by packing two digits into a byte.
37 | ```
38 |   1         2 
39 | 00000001 00000010 Unpacked BCD
40 | 00010010          Packed BCD
41 | ```
42 | So instead of two bytes we can fit the same information in a single byte.
43 | Instructions need to understand if the data it is operating on is in unpacked
44 | or packed format to be able to perform the correct operations.
45 | 
46 | So we would use a .byte type to store either a unpacked single BCD number or
47 | the same .byte could be used to store two packed BCD numbers.
48 | 
49 | ### Adjust After Addition (aaa)
50 | This instruction adjusts the al register. It is added after an add instruction
51 | which adds two unpacked bcd values together and places the sum in `al`.
52 | 
53 | Lets take a simple example where we add two unpacked values.
54 | The examples can be found in [bcd.s](../bcd.s).
55 | ```console
56 |    3   	unpacked1: .byte 8
57 |    4   	unpacked2: .byte 4
58 |    5   	
59 |    6   	.global _start
60 |    7   	
61 |    8   	.text
62 |    9   	_start: 
63 |    10  	  mov unpacked1, %al
64 |    11  	  mov unpacked2, %bl
65 | -> 12  	  add %bl, %al
66 |    13  	  aaa
67 | ```
68 | Now if we inspect the values before the addition:
69 | ```console
70 | (lldb) register read -f b al
71 |       al = 0b00001000
72 | (lldb) register read -f b bl
73 |       bl = 0b00000100
74 | ```
75 | And after the addition we have the value in `al`:
76 | ```console
77 | (lldb) register read -f b al
78 |       al = 0b00001100
79 | ```
80 | Notice that this addition generated 1100 which is 12. But since we are working
81 | with unpacked bcd values we want the result in that format. We can get that by
82 | using the aaa instruction:
83 | 
84 | And after the ASCII adjust after addition (aaa) instruction:
85 | ```console
86 | (lldb) register read -f b ax
87 |       ax = 0b0000 0001 0000 0010
88 |                     1         2
89 | ```
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/aix/README.md:
--------------------------------------------------------------------------------
 1 | ### AIX Assembler/Assembly notes
 2 | Advanced Interactive eXecutive (AIX) is a proprietary UNIX operating system sold
 3 | by IBM and introduced in 1986.
 4 | 
 5 | Servers that run this OS are AS/400, later known as iSeries, System i, OS/400
 6 | later known as i5/OS and now IBMi. And also IBM POWER and POWERPC in the
 7 | RS/6000 later known as pSeries, then System p.
 8 | 
 9 | 
10 | #### Assembly syntax
11 | Being a RISC processor all instructions take a register as their first operand.
12 | And unlike CISC processors the registers use numbers instead of names. For
13 | general purpose data there are 32 registers.
14 | 
15 | Because there are so many registers all arguments to functions can be passed
16 | in registers, starting from register 3 which would be the first argument for
17 | a function call.
18 | For system calls the syscall number goes into gpr0 and the args begin in
19 | gpr3.
20 | 
21 | #### load immediate (li)
22 | Loads a constant into a register, similar to `mov $1, %rax` one would do:
23 | ```assembly
24 | .data 
25 | message:
26 |   .string 'Bajja\n'
27 |   len = . - message
28 | 
29 | .global _start
30 | 
31 | .text
32 | _start:
33 |    li 0, 1           # load constant 1 into register 0 (syscall number)
34 | ```
35 | This is actually not an instruction but a mnemonic which can be thought of as a
36 | preprocessor macro. The assembler will interpret it and generate the correct
37 | instructions for the mnemonic. For example the `li` instruction above will
38 | become:
39 | ```assembler
40 | addi 0, 0, 1
41 | ```
42 | This might look like it is adding 1 to register 0 and then storing that in
43 | register 0 but gpr0 is sometimes read as 0 depending on the context and in the
44 | case of addi the spec says that it is 0 in this case.
45 | 
46 | #### add
47 | ```assembly
48 | add 4, 3, 5
49 | ```
50 | The above will add the value of register 5 to the contents of register 3 and
51 | then store the result in register 4.
52 | 
53 | #### addi
54 | Add immediate
55 | ```assembly
56 | addi 4, 3, 5
57 | ```
58 | The above will add 5 to the contents of register 3 and then store the result
59 | in register 4. Notice that we are adding the constant 5 and not the contents of
60 | register 5.
61 | 
62 | #### or 
63 | ```assembly
64 | or rA, rS, rB
65 | ```
66 | This will OR register S with register B and then store the result in register A.
67 | There is also a simplified mnemonic named `mr` (move register):
68 | ```assembly
69 | mr rA, rS
70 | ```
71 | Which is the same as:
72 | ```assembly
73 | or rA, rS, rS
74 | ```
75 | 
76 | Example:
77 | ```assembly
78 | mr 9,3
79 | ```
80 | So the contents of gpr3 is or:ed with itself and the result is stored in
81 | gpr9. This is a way to move (copy) the contents between registers, since or
82 | with itself will not alter the content (unlike xor with itself, which would be
83 | the same thing as setting it to zero).
84 | 
85 | 
86 | #### mtvsrd
87 | ```assembly
88 | mtvsrd  32,14
89 | ```
90 | 
91 | 


--------------------------------------------------------------------------------
/linux/notes/float.md:
--------------------------------------------------------------------------------
  1 | ## Floating points in assembly
  2 | The Floating Point Unit (FPU) is a dedicated hardware component that implements
  3 | operations like addition, subtraction, multiplication and division on floating
  4 | point numbers. There are also instructions for more advanced operations like
  5 | square root, trig functions, logarithm functions.
  6 | It supports multiple data types including single/double precision
  7 | floating-point, signed integers, and BCD.
  8 | 
  9 | This unit has its own registers:
 10 | ```
 11 |   79                                   0
 12 |    +-----------------------------------+
 13 | R7 |                                   |
 14 |    +-----------------------------------+
 15 | R6 |                                   |
 16 |    +-----------------------------------+
 17 | R5 |                                   |
 18 |    +-----------------------------------+
 19 | R4 |                                   |
 20 |    +-----------------------------------+
 21 | R3 |                                   |
 22 |    +-----------------------------------+
 23 | R2 |                                   |
 24 |    +-----------------------------------+
 25 | R1 |                                   |
 26 |    +-----------------------------------+
 27 | R0 |                                   |
 28 |    +-----------------------------------+
 29 | ```
 30 | These 8 registers make up a stack. Data that can be pushed onto this stack
 31 | are signed integers of sizes 16, 32, or 64 bits. Floating point values of sizes
 32 | 32, 64, or 80 bits. BCD packed quantities.
 33 | There is no way to transfer data from one of these registers to the general
 34 | purpose x86 registers.
 35 | 
 36 | ST(0) denotes the top of the stack. ST(i) denotes the i-th register from the
 37 | current top. Most of the instructions use ST(0) as an implicit operand.
 38 | 
 39 | There are also registers for general purpose in this unit. These are 16-bit
 40 | registers.
 41 | 
 42 | ### Control register
 43 | ```
 44 |              Control register
 45 |   15                                         0
 46 |    +-----------------------------------------+
 47 |    |                                         |
 48 |    +-----------------------------------------+
 49 | ```
 50 | 
 51 | ### Status/Word register
 52 | The status register is sometimes called the status word.
 53 | ```
 54 |              Status register
 55 |   15                                         0
 56 |    +-----------------------------------------+
 57 |    |B|C3|TOP|C2|C1|C0|ES|SF|PE|UE|OE|ZE|DE|IE|
 58 |    +-----------------------------------------+
 59 | ```
 60 | These values are cleared using `fclex` or `fnclex` (Clear Exceptions)
 61 | instructions.
 62 | The values cannot be used directly but instead have to be copied to memory
 63 | or to register `ax` using `fstsw` or `fnstsw` (Store FPU status word).
 64 | 
 65 | ### Tag register
 66 | This register describes the content of the data on the stack registers.
 67 | ```
 68 |              Tag register
 69 |   15                                         0
 70 |    +-----------------------------------------+
 71 |    |                                         |
 72 |    +-----------------------------------------+
 73 | ```
 74 | 
 75 | 
 76 | ### Defining a float
 77 | ```assembly
 78 | .data
 79 |   radius: .float 3.14
 80 |   m: .float 2.2
 81 | ```
 82 | A .float is a 32 bit (single precision) value. A .double is a 64 bit (double precision) value.
 83 | 
 84 | ### Data transfer
 85 | These instructions deal with pushing and popping values to/from the stack.
 86 | These instructions are named differently depending on the data being
 87 | pushed/popped (floating-point, signed integer, or packed BCD).
 88 | 
 89 | `fld` (floating point unit load) pushes a floating point value onto the register
 90 | stack. The operand can be a memory location but it can also be a stack register
 91 | st(i), for example st(0) (the value on the top of the stack).
 92 | 
 93 | `fild` (floating point unit integer load) reads a signed integer from memory and
 94 | converts it to a double extended precision value and then pushes that value onto
 95 | the register stack.
 96 | 
 97 | `fst` (floating point unit store) copies st(0) to st(i) or a memory location.
 98 | `fstp` (floating point unit store and pop) copies st(0) to st(i) or a memory
 99 | location and also pops the stack (removes the entry or perhaps just adjusts
100 | the stack pointer to the slot before it).
101 | 
102 | `fist` floating point unit convert to Integer and store the result in a memory
103 | location.
104 | `fistp` floating point unit convert to Integer and store the result in a memory
105 | location and pops the stack.
106 | `fisttp` converts the value in st(0) to an integer using truncation, and saves
107 | the result in the specified memory location and then pops the stack. This
108 | instruction is only available on processors that support SSE3.
109 | 
110 | `fxch` exchanges the content of register st(0) and st(i).
111 | 
112 | `fcmovcc` conditionally copies the content of st(i) to st(0) if the condition
113 | is true.
114 | 
115 | ### Push a float onto the FPU stack
116 | ```assembly
117 |   fld radius
118 | ```
119 | We can inspect this by using:
120 | ```console
121 | (lldb) expr -f b -- radius
122 | (void *) $2 = 0b0000000000000000000000000000000001000000010010001111010111000011
123 | (lldb) register read st0
124 | ```
125 | 
126 | ###  Copy value st(0)
127 | Floating point store and then pop (the FPU stack that is):
128 | ```console
129 |   fstp result
130 | ```
131 | 
132 | ### Multiplication
133 | ```assembly
134 |   fld radius
135 |   fld m
136 |   fmulp
137 | 
138 | (lldb) expr -f f -- result
139 | (void *) $2 = 5.3733119381271126E-315
140 | ```
141 | 


--------------------------------------------------------------------------------
/linux/notes/sse.md:
--------------------------------------------------------------------------------
  1 | ### Streaming SIMD Extension (SSE)
  2 | This makes it possible to perform arithmetic operations on four pairs of 32-bit
  3 | floating point numbers at a time. This is done with 16 128 bit registers:
  4 | ```
  5 | XMM0...XMM15
  6 | 
  7 |   127                   0
  8 |    +--------------------+
  9 |    |      XMM0          |
 10 |    +--------------------+
 11 |    ...
 12 | ```
 13 | So each register can hold 128 bits which means we can store one 128 bit value,
 14 | or 2 64-bit values, or 4 32-bit values, or 8 16-bit values, or 16 8 bit values.
 15 | 
 16 | 
 17 | SSE was introduced in 1999, SSE2 in 2000, and SSE3 in 2004.
 18 | 
 19 | Advanced Vector Extension (AVX) was introduced in 2011 and expanded the
 20 | 16 registers to 256 bits and the names for them are
 21 | ```
 22 | XMM0...XMM15  128-bits
 23 | YMM0...YMM15  256-bits
 24 | 
 25 | 255                    127                    0
 26 |    +------------------------------------------+
 27 |    |      YMM0          |     XMM0            |
 28 |    +------------------------------------------+
 29 |    ...
 30 | ```
 31 | Now, these registers can store 8 32-bit floating point values, or 4 64-bit
 32 | values.
 33 | In 2013 Intel released AVX-2 which also allowed the values to be integers in
 34 | addition to floating point values.
 35 | 
 36 | AVX-512 increased the number of registers to 32 and also increased the size of
 37 | the registers to 512 bits.
 38 | ```
 39 | XMM0...XMM15  128-bits
 40 | YMM0...YMM15  256-bits
 41 | ZMM0...ZMM31  512-bits
 42 | 
 43 |  511                   255                    127                    0
 44 |    +------------------------------------------+----------------------+
 45 |    |      ZMM0          |        YMM0         |          XMM0        |
 46 |    +------------------------------------------+----------------------+
 47 |    ...
 48 | ```
 49 | 
 50 | ### Scalar operations
 51 | Only operate on the least significant data element. So a scalar single
 52 | precision (32-bit) operation only operates on bits 0-31 of the register, and
 53 | a scalar double precision (64-bit) operation only on bits 0-63.
 54 | 
 55 | ### Packed operations
 56 | These operate on all the data elements in the register in parallel.
 57 | 
 58 | 
 59 | ### Single Instruction Multiple Data (SIMD)
 60 | It is what it says on the box: a single instruction (add, sub, mul, div,
 61 | shift, compare, data load/store)
 62 | operates on multiple data elements
 63 | at the same time. So one instruction can
 64 | replace multiple.
 65 | 
 66 | ### movaps
 67 | This instruction will move an aligned packed single precision value into an
 68 | xmm register.
 69 | 
 70 | So lets start very simple and see how we can place data in a register.
 71 | ```assembly
 72 | .data
 73 | v1: .float 1.0, 2.0, 3.0, 4.0 
 74 | ...
 75 |   movaps v1, %xmm0
 76 | ```
 77 | 
 78 | ```console
 79 | (lldb) expr -f float32 -- $xmm0
 80 | (unsigned char __attribute__((ext_vector_type(16)))) $2 = (1, 2, 3, 4)
 81 | ```
 82 | So that looks like what we expect. We have declared v1 to be the address where
 83 | there will be four floats which each are 4 bytes. 
 84 | Just printing the register `xmm0` will give us:
 85 | ```console
 86 | (lldb) register read xmm0
 87 |     xmm0 = {0x00 0x00 0x80 0x3f 0x00 0x00 0x00 0x40 0x00 0x00 0x40 0x40 0x00 0x00 0x80 0x40}
 88 | ```
 89 | Notice that this is showing the data in this register as 16 bytes (16x8=128
 90 | bits), but our data is floats which are 4 bytes (32 bits) each: 128/32=4 floats.
 91 | Next we can add two of these vectors together:
 92 | ```assembly
 93 |   addps %xmm1, %xmm0
 94 | ```
 95 | This is using add packed (the p) single precision (the s which is 32 bits). This
 96 | is a destructive operation and adds the values in source xmm1 to the values
 97 | in destination xmm0 and then overwrites the values in the destination register.
 98 | We can inspect the values in xmm0 after this operation:
 99 | ```console
100 | (lldb) expr -f float32 -- $xmm0
101 | (unsigned char __attribute__((ext_vector_type(16)))) $15 = (2, 4, 6, 8)
102 | ```
103 | 
104 | ### movapd
105 | This will move an aligned packed double precision value into a xmm register.
106 | ```assembly
107 |   movapd v3, %xmm0    # move aligned packed double precision
108 | ```
109 | ```console
110 | (lldb) expr -f float64 -- $xmm0
111 | (unsigned char __attribute__((ext_vector_type(16)))) $0 = (1, 2)
112 | ```
113 | 
114 | 
115 | ### xor xmm register
116 | To clear/set to zero a register we can use `xorps` for packed single precision:
117 | ```assembly
118 |   xorps %xmm0, %xmm0   # xor packed single precision
119 | ```
120 | 
121 | ### Integers
122 | We can also work with integers instead of floating point values. Remember that
123 | we have xmm registers that are 128 bits long. How we divide/interpret/pack
124 | that data is up to us. We could have two .quad  (8 bytes, 64 bits), 4 .int/.long
125 | (4 bytes, 32 bits), or 8 .word/.short (2 bytes, 16 bits), or 16 .byte (1 byte,
126 | 8 bits).
127 | 
128 | For example we can have a packed 128 bit value with 4 32 bit ints:
129 | ```assembly
130 | i1: .int 1, 2, 3, 4
131 | 
132 |   movaps i1, %xmm0  # mov aligned packed single precision (32 bit values).
133 | ```
134 | Remember that we have to specify the correct instruction for the type of the
135 | data that we are moving.
136 | ```console
137 | (lldb) expr -f uint32_t -- $xmm0
138 | (unsigned char __attribute__((ext_vector_type(16)))) $2 = (1, 2, 3, 4)
139 | ```
140 | To add two of these vectors we use `addps` (`paddd` is the dedicated packed 32-bit integer add):
141 | ```assembly
142 |   addps i2, %xmm0  # add packed single precision src, dest (result in dest)
143 | ```
144 | 
145 | We can also use the space to add 16 bytes together:
146 | ```assembly
147 | i3: .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
148 | 
149 |   xorps %xmm0, %xmm0   # xor packed single precision
150 |   movapd i3, %xmm0
151 |   paddb i3, %xmm0
152 | ```
153 | ```console
154 | (lldb) expr -f uint8_t --  $xmm0
155 | (unsigned char __attribute__((ext_vector_type(16)))) $12 = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)
156 | ```
157 | 


--------------------------------------------------------------------------------
/gas/README.md:
--------------------------------------------------------------------------------
  1 | ## GNU Assembler examples
  2 | 
  3 | ### Building
  4 | 
  5 |     make
  6 | 
  7 | All binaries will be placed in the `out` directory.
  8 | 
  9 | 
 10 | ### arch type
 11 | When using `as` you can specify the architecture type using `-arch`
 12 | 
 13 |     man 3 arch
 14 | 
 15 | If no target architecture is specified, it defaults to the architecture of the host it is running on.
 16 | 
 17 | ### Segments
 18 | In Mach-O there are segments that contain sections, for example:
 19 | 
 20 |     .section __TEXT, __text
 21 | 
 22 | `__TEXT` is the segment and `__text` the section.
 23 | 
 24 | 
 25 | ### System calls
 26 | This is done using the `syscall` instruction.
 27 | 
 28 | /usr/include/sys/syscall.h
 29 | 
 30 | 
 31 | ### Pointers
 32 | Using parentheses around a register, for example (%eax), means to
 33 | dereference the pointer in eax and use it.
 34 | To get the address of a label you can use `$` before the label. If you are on a
 35 | 64 bit machine you may have to load the effective address instead
 36 | `(leaq label(%rip), %rdi)` if direct addressing is not supported.
 37 | 
 38 | ### learning_i386
 39 | Just a hello world example that prints something to standard out.
 40 | 
 41 | ### 64Bit.s
 42 | This is a example of using system calls in a x86_64 arch. There are a few
 43 | interesting/different things that I ran into when trying to get this working.
 44 | The first was the way a system call is specified. 
 45 | 
 46 | You can find the system calls in [syscalls.master](http://www.opensource.apple.com/source/xnu/xnu-1504.3.12/bsd/kern/syscalls.master):
 47 | ```
 48 |     ...
 49 |     4    AUE_NULL    ALL { user_ssize_t write(int fd, user_addr_t cbuf, user_size_t nbyte); }
 50 | ```
 51 | We can see that `write` has the value `4`, but when we specify this in
 52 | [64bit.s](./64bit.s) we use:
 53 | ```
 54 |     movq $0x2000004, %rax
 55 | ```
 56 | Why `0x2000004` instead of simply `4`?  
 57 | The reason for this can be found in [syscall_sw.h](http://www.opensource.apple.com/source/xnu/xnu-792.13.8/osfmk/mach/i386/syscall_sw.h).
 58 | This is not a public header so it will probably not be available on your system. 
 59 | 
 60 | In XNU, the POSIX system calls make up only one of four system call classes (SYSCALL_CLASS): 
 61 | 
 62 | 1. MACH (1)
 63 | 2. UNIX (2)
 64 | 3. MDEP (3)
 65 | 4. DIAG (4) 
 66 | 
 67 | In 64-bit, all call types are positive, but the most significant byte contains
 68 |  the value of SYSCALL_CLASS from the preceding table.
 69 | The value is checked by shifting the system call number 
 70 | 
 71 | 	SYSCALL_CLASS_SHIFT (=24) bits.
 72 | 	2 << 24 = 2000000 hex
 73 | 
 74 | 
 75 | The next thing that I did not understand was this line:
 76 | 
 77 |     movq msg@GOTPCREL(%rip), %rsi # string to print. rsi is used for the second argument to functions in x86_64
 78 | 
 79 | ### Relocations
 80 | If you have multiple object files and want to link them together one options is to add the code sections together, then the 
 81 | data sections etc. But if you have a function at address 0 in both object files which one will get invoked? It would depend on 
 82 | which was linked first as the other would have it's address shifted.
 83 | 
 84 | * When you load/store data you need to know the location.
 85 | * When you branch/jump you need to be able to specify the location to branch/jump to.
 86 | 
 87 | Lets take a look at the following c program:
 88 | 
 89 |     extern int something;
 90 | 
 91 |     int function(void) {
 92 |       return something;
 93 |     }
 94 | 
 95 | Next, we can compile and then display the relocation section:
 96 | 
 97 |     $ clang -c rel.c
 98 |     $ otool -r rel.o
 99 |     RELOCATION RECORDS FOR [__text]:
100 |     0000000000000007 X86_64_RELOC_GOT_LOAD _something@GOTPCREL
101 | 
102 |     RELOCATION RECORDS FOR [__compact_unwind]:
103 |     0000000000000000 X86_64_RELOC_UNSIGNED __text
104 | 
105 | You can find information about [X86_64_RELOC_GOT_LOAD](https://opensource.apple.com/source/xnu/xnu-1699.22.73/EXTERNAL_HEADERS/mach-o/x86_64/reloc.h.auto.html).
106 | During compilation _something is not known to the compiler so a relocation entry is left for the linker to resolve. The entry is specified as address `0000000000000007`:
107 | 
108 |     $ objdump -disassemble rel.o
109 | 
110 |     rel.o:file format Mach-O 64-bit x86-64
111 | 
112 |     Disassembly of section __TEXT,__text:
113 |     _function:
114 |        0: 55                      pushq%rbp
115 |        1: 48 89 e5                movq%rsp, %rbp
116 |        4: 48 8b 05 00 00 00 00    movq(%rip), %rax
117 |        b: 8b 00                   movl(%rax), %eax
118 |        d: 5d                      popq%rbp
119 |        e: c3                      retq
120 | 
121 | If we look at address 7:
122 | 
123 |        4: 48 8b 05 00 00 00 00    movq(%rip), %rax
124 |                    /\
125 | 
126 | We can see that at address 7 there are four bytes that will be filled by the linker with the correct address.
127 | 
128 | 
129 | GOTPCREL is short for Global Offset Table PC-RELative, meaning the address of the symbol's entry in the Global Offset Table expressed relative to the program counter (RIP). From what I understand this has to do with relocations. So lets look what relocation information can be found in the mach object file:
130 | 
131 |     $ otool -r out/64bit.o
132 |     $ otool -r out/cli.o
133 |     RELOCATION RECORDS FOR [__text]:
134 |     000000000000000f X86_64_RELOC_BRANCH _printf
135 |     000000000000000a X86_64_RELOC_GOT_LOAD argc@GOTPCREL
136 | 
137 |     RELOCATION RECORDS FOR [__debug_info]:
138 |     000000000000008b X86_64_RELOC_UNSIGNED __text
139 |     0000000000000018 X86_64_RELOC_UNSIGNED __text
140 |     0000000000000010 X86_64_RELOC_UNSIGNED __text
141 | 
142 |     RELOCATION RECORDS FOR [__debug_aranges]:
143 |     0000000000000010 X86_64_RELOC_UNSIGNED __text
144 | 
145 |     RELOCATION RECORDS FOR [__debug_line]:
146 |     0000000000000029 X86_64_RELOC_UNSIGNED __text
147 | 
148 | Relocation is the process of connecting symbolic references to symbolic definitions. For the __text section we can find the following:
149 | 
150 |     000000000000000a X86_64_RELOC_GOT_LOAD argc@GOTPCREL
151 | 
152 | This maps to [cli.s](./cli.s):
153 | 
154 |     movq argc@GOTPCREL(%rip), %rdi
155 | 
156 | Recall that RIP is the instruction pointer
157 |     
158 | ### Instruction Pointer Relative addressing (RIP)
159 | RIP addressing is a mode where address references are provided as 32-bit displacements from the current instruction pointer (RIP register value). 
160 | One of the advantages of RIP is that it makes it easier to generate Position Independent Code, which is code that is not dependent upon where the 
161 | code is loaded. This is important for shared objects as they don't know where they will be loaded. 
162 | In x64, references to code and data are done using instruction pointer relative (RIP) addressing modes.
163 | 
164 | ### Position Independent Code (PIC)
165 | When the linker creates a shared library it does not know where in the process's address space it might be loaded. This causes a problem for code and data references which need to point to the correct memory locations.
166 | 
167 | My view of this is that when the linker takes multiple object files and merges the sections, like .text, .data etc, merge might not be a good
168 | description but rather adds them sequentially to the resulting object file. If the source files refer to absolute
169 | locations in their .data sections these might not be in the same place after linking into the resulting object file.
170 | Solving this problem can be done using position independent code (PIC) or load-time relocation.
171 | 
172 | There is an offset between the text and data sections. The linker combines all the text and data sections from all the object files and therefore knows the sizes of these sections. So the linker can rewrite the instructions using offsets and the sizes of the sections.
173 | 
174 | But x86 requires absolute addressing does it not?  
175 | You might come across the following compilation error using as on Mac:
176 | 
177 |     32-bit absolute addressing is not supported in 64-bit mode
178 | 
179 | If we need a relative address (relative to the current instruction pointer which there is no operation for) a way to get this is to use the `CALL some_label` like this:
180 | 
181 |       call some_label
182 |     some_label: 
183 |       pop eax
184 | 
185 | `call` causes the address of the next instruction to be saved on the stack and then it will jump to some_label. `pop eax` pops the address into eax which is now the value of the instruction pointer.
186 | 
187 | PIC is implemented using a Global Offset Table (GOT) which is a table of addresses in the .data section. When an instruction refers to a variable it does not use an absolute address (would require relocation) but instead refers to an entry in the GOT which is located at a well known place in the data section. The entry in the GOT refers to an absolute address.
188 | So this is a sort of relocation but in the data section instead of in the code section which is what was done for load-time relocation. But doing this in the data section, which is not shared and is writable does not cause any issues.
189 | Also relocations in the code section have to be done per variable reference and not per variable as is the case when using a GOT.
190 | 
191 | So that covers variables but for function calls a Procedure Linkage Table (PLT) is used. This is part of the text section. Instead of calling a function directly a call is made to an entry in the PLT which performs the actual call. This is sometimes called a `trampoline` which I've seen on occasions when inspecting/dumping in lldb but did not know what it meant. This allows for lazy resolution of function calls. Also every PLT entry has an entry in the GOT.
192 | 
193 | 
194 | Only position independent code is supposed to be included into shared objects (SO) as they should have an ability to dynamically change their 
195 | location in RAM.
196 | 
197 | ### Load-time relocation
198 | This process might take some time during loading which might be a performance hit depending on the type of program being written.
199 | Since the text section needs to be modified during loading (needs to do the actual relocations) it is not possible to have it shared by multiple processes.
200 | 
201 | ### Instruction Pointer Relative addressing (RIP)
202 | References to code and data in x64 are done with instruction relative pointer addressing. So instructions can use references that are relative to the current instruction (or the next one) and don't require them to be absolute addresses. This works for offsets of up to 32bits but for programs that are larger than that this offset will not be enough. One could use absolute 64 bit addresses for everything but more instructions are required to perform simple operations and most programs will not require this.
203 | The solution is to introduce code models to cater for all needs. The compiler should be able to take an option where the programmer can say that this object file will not be linked into a large program. And also that this compilation unit will be included in a huge library and that 64-bit addressing should be used.
204 | 
205 | In (64-bit mode), the encoding for the old 32-bit immediate offset addressing mode, is now a 32-bit offset 
206 | from the current RIP, not from 0x00000000 like before. 
207 | You only need to know how far away it is from the currently executing instruction (technically the next instruction)
208 | 
209 | 
210 | #### Assemble 64Bit.s
211 | 
212 |     as -g -arch x86_64 64bit.s -o 64bit.o
213 | 
214 | #### Link 64Bit.s
215 | 
216 |     ld -e _start -macosx_version_min 10.8 -lSystem -arch x86_64 64bit.o -o 64bit
217 | 
218 | ### Mach Object file (mach-o)
219 | 
220 |     otool -h 64bit.o
221 |     64bit.o:
222 |     Mach header
223 |            magic cputype cpusubtype  caps    filetype ncmds sizeofcmds      flags
224 |      0xfeedfacf 16777223          3  0x00          1     3        656 0x00000000
225 | 
226 | The magic number can be found in ```/usr/include/mach-o/loader.h```:
227 | 
228 |     #define MH_MAGIC_64 0xfeedfacf /* the 64-bit mach magic number */
229 |     
230 | The ```cputype``` can be located in ```/usr/include/mach/machine.h```:
231 | 
232 |     
233 | ### otool
234 | Dump sections:
235 | 
236 |     $ otool -s __TEXT __text jump.o
237 |     $ otool -s __DATA __data jump.o
238 |     
239 | 
240 | ### Redzone
241 | Put simply, the red zone is an optimization. Code can assume that the 128 bytes below rsp will not be asynchronously clobbered 
242 | by signals or interrupt handlers, and thus can use it for scratch data, without explicitly moving the stack pointer. The last 
243 | sentence is where the optimization lies - decrementing rsp and restoring it are two instructions that can be saved when using 
244 | the red zone for data.
245 | 
246 | However, keep in mind that the red zone will be clobbered by function calls, so it's usually most useful in leaf functions 
247 | (functions that call no other functions)
248 | 
249 | Preserving the base pointer
250 | The base pointer rbp (and its predecessor ebp on x86), being a stable "anchor" to the beginning of the stack frame throughout
251 | the execution of a function, is very convenient for manual assembly coding and for debugging [5]. However, some time ago it 
252 | was noticed that compiler-generated code doesn't really need it (the compiler can easily keep track of offsets from rsp), 
253 | and the DWARF debugging format provides means (CFI) to access stack frames without the base pointer.
254 | 
255 | This is why some compilers started omitting the base pointer for aggressive optimizations, thus shortening the function prologue 
256 | and epilogue, and providing an additional register for general-purpose use (which, recall, is quite useful on x86 with its 
257 | limited set of GPRs).
258 | 
259 | gcc keeps the base pointer by default on x86, but allows the optimization with the -fomit-frame-pointer compilation flag. 
260 | How recommended it is to use this flag is a debated issue - you may do some googling if this interests you.
261 | 
262 | Anyhow, one other "novelty" the AMD64 ABI introduced is making the base pointer explicitly optional, stating:
263 | 
264 | The conventional use of %rbp as a frame pointer for the stack frame may be avoided by using %rsp (the stack pointer) to index into 
265 | the stack frame. This technique saves two instructions in the prologue and epilogue and makes one additional general-purpose 
266 | register (%rbp) available.
267 | gcc adheres to this recommendation and by default omits the frame pointer on x64, when compiling with optimizations. It gives an 
268 | option to preserve it by providing the -fno-omit-frame-pointer flag. For clarity's sake, the stack frames showed above were 
269 | produced without omitting the frame pointer.
270 | 
271 | 
272 | ### Setting register to zero
273 | My first thought would be using a mov instruction, like `movq $0, %rax` for example.
274 | 
275 | You might come a cross something like the following:
276 | 
277 |     xorl  %eax, %eax
278 | 
279 | Which is simply a way of setting the register to zero. The xorl instruction uses fewer bytes than the mov. I found suggestions that it 
280 | might be more performant than using mov $0, %eax
281 | 
282 |     100000f72:48 c7 c0 00 00 00 00 movq $0, %rax
283 | 
284 |     100000f79:48 31 c0             xorq %rax, %rax
285 | 
286 | Notice that the byte code for xorq is smaller than for movq.
287 | Reducing instruction sizes will reduce instruction-cache misses, and therefore improve performance.
288 | 
289 | ### System calls
290 | You can use dtruss to see what system call are being done:
291 | 
292 |     $ sudo dtruss `pwd`/malloc
293 | 
294 | 
295 | ### cmp
296 | Compares two values and sets the EFLAGS register. It performs subtraction on the operands:
297 | 
298 |     cmp op1, op2
299 | 
300 | That is, it computes op2 - op1. Neither operand is modified, but the EFLAGS register is set as if the subtraction took
301 | place.
302 | 
303 | 
304 | ###  Preprocessing assembler files
305 | Assembler files whose extension is a capital S (`.S`) indicate that the file needs to be
306 | preprocessed as it contains #include/#define which have to be processed. A lower
307 | case s (`.s`) assembly file is just normal assembly. 
308 | 
309 | So we have to preprocess such files with `cpp`:
310 | ```console                                                                          
311 | $ cpp hello.S | as -o hello.o -                                                     
312 | $ ld -o hello hello.o                                                               
313 | $ ./hello                                                                           
314 | Hello, world!                                                                       
315 | ```          
316 | 


--------------------------------------------------------------------------------
/arm/README.md:
--------------------------------------------------------------------------------
  1 | ### ARM Assembly
  2 | ARM is a Reduced Instruction Set Computing (RISC) processor architecture, which is
  3 | different from Intel processors, which are Complex Instruction Set Computing (CISC) processors.
  4 | Simpler instructions tend to consume less power, which is one reason for ARM being
  5 | used in smaller embedded devices.
  6 | 
  7 | It has more general purpose registers than CISC processors and has around 100
  8 | instructions.
  9 | 
 10 | ARM uses a LOAD/STORE memory model for memory access so an operation will first
 11 | have to load a value into a register, operate on that value, and then store it
 12 | back to memory.
 13 | 
 14 | ARM has two modes, ARM mode and Thumb mode. 
 15 | 
 16 | Before version 3 ARM processors were little-endian but after that the ARM
 17 | processors have become bi-endian, which allows the endianness to be toggled.
 18 | 
 19 | ### ARM mode
 20 | This is the traditional instruction set where instructions are 32 bits long.
 21 | 
 22 | ### Thumb mode
 23 | This mode supports higher code density where instructions can be 16 bits long,
 24 | although some are still 32 bits long.
 25 | 
 26 | ### Thumb2 mode
 27 | My understanding of this is that you have to choose if you use ARM mode or
 28 | Thumb mode when writing code. Thumb2 adds more instructions to Thumb mode so
 29 | that it is almost on par with ARM mode, but also adds a new assembly syntax
 30 | to allow for writing code in a unified way and then deciding on the mode at
 31 | assemble time. This is called Unified Assembly Language (UAL).
 32 | 
 33 | ### ARM versions
 34 | ```
 35 | ARM Family                ARM architecture
 36 | --------------------------------------------------------
 37 | ARM7                      ARM v4
 38 | ARM9                      ARM v5
 39 | ARM11                     ARM v6
 40 | Cortex-A                  ARM v7-A   (A=Application)
 41 | Cortex-R                  ARM v7-R   (R=Realtime)
 42 | Cortex-M                  ARM v7-M   (M=Microcontroller)
 43 | ```
 44 | 
 45 | ### ARMv8
 46 | Introduced AArch64, which is a new instruction set (64 bit support).
 47 | 
 48 | ### Container for assembly development
 49 | ```console
 50 | $ docker build -t arm-assembly .
 51 | ```
 52 | 
 53 | ### Run the container
 54 | ```console
 55 | $ docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -ti -v${PWD}/src:/src:Z -w="/src" arm-assembly sh
 56 | ```
 57 | 
 58 | ### Compiling and linking
 59 | ```console
 60 | /src # as first.s -o first.o
 61 | /src # ld -o first first.o 
 62 | /src # ./first
 63 | Hello, ARM64!
 64 | ```
 65 | 
 66 | ### objdump (aarch64-linux-gnu)
 67 | ```console
 68 | $ aarch64-linux-gnu-objdump -s -d first
 69 | 
 70 | first:     file format elf64-littleaarch64
 71 | 
 72 | Contents of section .text:
 73 |  4000b0 200080d2 e1000058 02010058 080880d2   ......X...X....
 74 |  4000c0 010000d4 000080d2 a80b8052 010000d4  ...........R....
 75 |  4000d0 e0004100 00000000 06000000 00000000  ..A.............
 76 | Contents of section .data:
 77 |  4100e0 42616a6a 610a                        Bajja.          
 78 | 
 79 | Disassembly of section .text:
 80 | 
 81 | 00000000004000b0 <_start>:
 82 |   4000b0:	d2800020 	mov	x0, #0x1                   	// #1
 83 |   4000b4:	580000e1 	ldr	x1, 4000d0 <_start+0x20>
 84 |   4000b8:	58000102 	ldr	x2, 4000d8 <_start+0x28>
 85 |   4000bc:	d2800808 	mov	x8, #0x40                  	// #64
 86 |   4000c0:	d4000001 	svc	#0x0
 87 |   4000c4:	d2800000 	mov	x0, #0x0                   	// #0
 88 |   4000c8:	52800ba8 	mov	w8, #0x5d                  	// #93
 89 |   4000cc:	d4000001 	svc	#0x0
 90 |   4000d0:	004100e0 	.word	0x004100e0
 91 |   4000d4:	00000000 	.word	0x00000000
 92 |   4000d8:	00000006 	.word	0x00000006
 93 |   4000dc:	00000000 	.word	0x00000000
 94 | ```
 95 | Notice the `=msg` here:
 96 | ```assembly
 97 |     ldr     x1, =msg    /* buf */
 98 | ```
 99 | The `=` sign in this case indicates an LDR pseudo instruction. msg is defined
100 | in first.s as:
101 | ```assembler
102 | msg:
103 |     .ascii        "Bajja\n"
104 | ```
105 | And `msg` is a label which is an address so it would be 64 bits when using a
106 | 64-bit processor. So without the `=` sign the assembler would see a value that
107 | is not a 16-bit immediate value trying to be loaded into x1. But with the `=`
108 | sign the assembler will change this instruction to:
109 | ```
110 |   4000b4:	580000e1 	ldr	x1, 4000d0 <_start+0x20>
111 | ```
112 | The max size of an immediate value is 16 bits, and that becomes an issue when
113 | we need to use 64-bit addresses and move them into registers. The register can
114 | handle 64 bits but not the opcode parameter. But what it can do is use a value
115 | relative to the current instruction pointer and this is what is happening here.
116 | We are telling the processor to load the value at 4000d0, which contains a pointer
117 | to the data, in this case the string "Bajja\n".
118 | Notice that this is in the .text segment following the code of the function.
119 | 
120 | ### Instructions
121 | Are 32 bits in size (for both 32 and 64 bit processors).
122 | 
123 | ### Registers
124 | A64 provides 31 general purpose registers and each can be used as a 64-bit
125 | register in which case the name of the register starts with an `x`. So we have
126 | x0-x30 (can be upper or lower case) to use.
127 | These registers can also be used as 32-bit registers, in which case `w` is used
128 | as the prefix of their names (w0-w30).
129 | 
130 | Note that the type of register used will impact the instruction in which the
131 | register is used.
132 | 
133 | ```
134 | x0-x7                Arguments to functions and return values.
135 | x8                   For syscalls the number goes into this register.
136 | x9-x15               For local variable.
137 | x16-x18              Used for IPC and platform values.
138 | x19-x28              Callee saved
139 | x29                  Frame register (like rbp I think)
140 | x30                  Link Register (return address for function calls)
141 | SP/XZR               The stack pointer for instruction dealing with the stack
142 |                      and zero register otherwise.
143 | PC                   The program counter.
144 | ```
145 | 
146 | 
147 | ### Calling conventions
148 | ```
149 |              syscall nr  return  arg0  arg1  arg2  arg3 arg4 arg5
150 | arm          r7          r0      r0    r1    r2    r3   r4   r5
151 | arm64        x8          x0      x0    x1    x2    x3   x4   x5
152 | ```
153 | 
154 | ### System calls
155 | See [64-bit table](https://chromium.googlesource.com/chromiumos/docs/+/master/constants/syscalls.md#tables)
156 | for system call numbers.
157 | 
158 | The instruction for system calls (system interrupts) is `svc`
159 | (supervisor call). The system call number from the table above is placed in x8.
160 | The arguments the system call takes can also be seen in the table above in the
161 | additional columns for each call.
162 | 
163 | 
164 | ### xzr register
165 | Is a register and its value is always zero.
166 | ```console
167 | Disassembly of section .text:
168 | 
169 | 0000000000400078 <_start>:
170 |   400078:	d2800000 	mov	x0, #0x0                   	// #0
171 |   40007c:	aa1f03e0 	mov	x0, xzr
172 |   400080:	aa1f03e0 	mov	x0, xzr
173 |   400084:	d2800ba8 	mov	x8, #0x5d                  	// #93
174 |   400088:	d4000001 	svc	#0x0
175 | ```
176 | Note that `movz` will move the immediate into the destination register and
177 | set the rest of the bits in the destination register to zero.
178 | And without aliases we get:
179 | ```console
180 | $ aarch64-linux-gnu-objdump -d -M no-aliases xzr
181 | 
182 | xzr:     file format elf64-littleaarch64
183 | 
184 | 
185 | Disassembly of section .text:
186 | 
187 | 0000000000400078 <_start>:
188 |   400078:	d2800000 	movz	x0, #0x0
189 |   40007c:	aa1f03e0 	orr	x0, xzr, xzr
190 |   400080:	aa1f03e0 	orr	x0, xzr, xzr
191 |   400084:	d2800ba8 	movz	x8, #0x5d
192 |   400088:	d4000001 	svc	#0x0
193 | ```
194 | 
195 | ### svc (supervisor call)
196 | This is used for system interrupt, for example calling exit:
197 | ```assembly
198 |     mov     x0, #0      /* status */
199 |     mov     x8, #93     /* exit syscall #93 */
200 |     svc     #0          
201 | ```
202 | To me it makes sense that the exit status code is passed in register x0, but
203 | it is not clear to me why the system call number, #93 above, is passed in
204 | register x8. This is just a calling convention and we can see the conventions
205 | in the syscall man page:
206 | ```console
207 | Arch/ABI    Instruction           System  Ret  Ret  Error    Notes
208 |                                          call #  val  val2
209 |        ───────────────────────────────────────────────────────────────────
210 |        alpha       callsys               v0      v0   a4   a3       1, 6
211 |        arc         trap0                 r8      r0   -    -
212 |        arm/OABI    swi NR                -       r0   -    -        2
213 |        arm/EABI    swi 0x0               r7      r0   r1   -
214 |        arm64       svc #0                w8      x0   x1   -
215 |        blackfin    excpt 0x0             P0      R0   -    -
216 |        i386        int $0x80             eax     eax  edx  -
217 |        ia64        break 0x100000        r15     r8   r9   r10      1, 6
218 |        m68k        trap #0               d0      d0   -    -
219 |        microblaze  brki r14,8            r12     r3   -    -
220 |        mips        syscall               v0      v0   v1   a3       1, 6
221 |        nios2       trap                  r2      r2   -    r7
222 |        parisc      ble 0x100(%sr2, %r0)  r20     r28  -    -
223 |        powerpc     sc                    r0      r3   -    r0       1
224 |        powerpc64   sc                    r0      r3   -    cr0.SO   1
225 |        riscv       ecall                 a7      a0   a1   -
226 |        s390        svc 0                 r1      r2   r3   -        3
227 |        s390x       svc 0                 r1      r2   r3   -        3
228 |        superh      trap #0x17            r3      r0   r1   -        4, 6
229 |        sparc/32    t 0x10                g1      o0   o1   psr/csr  1, 6
230 |        sparc/64    t 0x6d                g1      o0   o1   psr/csr  1, 6
231 |        tile        swint1                R10     R00  -    R01      1
232 |        x86-64      syscall               rax     rax  rdx  -        5
233 |        x32         syscall               rax     rax  rdx  -        5
234 |        xtensa      syscall               a2      a2   -    -
235 | ```
236 | 
237 | 
238 | The argument to `svc` is mandatory but it is up to the handler how to use it (I
239 | think).
240 | 
241 | 
242 | ### mov
243 | ```
244 | mov{S}{cond} Rd, Operand2
245 | mov{cond} Rd, #imm16
246 | ```
247 | If `S` is specified then the conditional flags are updated as part of the
248 | operation.
249 | 
250 | 
251 | Apparently mov is not an arm instruction but an alias. So when we write
252 | ```assembly
253 |   mov   x0, #4
254 |   mov   x1, x0
255 | ```
256 | The assembler will expand that to:
257 | ```assembly
258 |   4000c4:	d2800080 	movz	x0, #0x4
259 |   4000c8:	aa0003e1 	orr	x1, xzr, x0
260 | ```
261 | 
262 | ### add
263 | A very basic [add](./src/add.s):
264 | ```
265 | $ make add
266 | $ qemu-aarch64 add
267 | $ echo $?
268 | 5
269 | ```
270 | 
271 | ### load address (ldr)
272 | This is used to load the address, like leaq in x86_64. The `=` sign is used
273 | in this case:
274 | ```assembly
275 |     ldr     x1, =msg
276 | ```
277 | The `=` sign in this case means to use the LDR pseudo instruction.
278 | 
279 | The following example loads the value found in the memory location in r0 into
280 | ra:
281 | ```assembly
282 |     ldr     ra, [r0]
283 | ```
284 | 
285 | ### branch
286 | An example of conditional branching can be found in [branch](./src/branch.s).
287 | 
288 | ### Store Register (STR)
289 | This command stores the contents of a register into memory:
290 | ```assembly
291 |   str x0, [SP, #-16]!
292 | ```
293 | Notice the `!` which is there for register write-back. So SP is used as the base
294 | register and 16 is subtracted from that, and SP is also updated with that value.
295 | 
296 | So what this is doing is subtracting 16 from SP and updating SP, then
297 | copying x0 into that location.
298 | 
299 | 
300 | ### QEMU
301 | Machine emulator.
302 | 
303 | #### Setup
304 | ```console
305 | $ sudo cp fedora_aarch64.repo /etc/yum.repos.d/
306 | $ dnf install aarch64-linux-gnu-{binutils,gcc,glibc}
307 | ```
308 | 
309 | #### Compiling and linking
310 | ```console
311 | $ aarch64-linux-gnu-as -o first.o src/first.s
312 | $ aarch64-linux-gnu-ld -o first first.o
313 | $ file first
314 | first: ELF 64-bit LSB executable, ARM aarch64, version 1 (SYSV), statically linked, not stripped
315 | ```
316 | 
317 | #### Run using QEMU
318 | ```console
319 | $ qemu-aarch64 first
320 | Hello, ARM64!
321 | ```
322 | 
323 | ### Microprocessor without Interlocked Pipelined Stages (MIPS)
324 | Is a RISC instruction set architecture (ISA).
325 | 
326 | 
327 | ### armasm
328 | Needs to be [downloaded](https://developer.arm.com/tools-and-software/embedded/arm-compiler/downloads/version-6)
329 | and installed.
330 | 
331 | To start using Arm Compiler for Embedded 6.17:
332 | ```console
333 | - Create a suite sub-shell using /home/danielbevenius/ArmCompilerforEmbedded6.17/bin/suite_exec bash
334 | ```
335 | 
336 | 
337 | #### Directives
338 | Just keep in mind that these are directives that the specific assembler uses
339 | and are not part of the instruction set. So we can choose to use either armasm
340 | or as (GNU assembler) to write our programs and they only understand their own
341 | directives.
342 | 
343 | armasm and GNU as directives:
344 | ```
345 | ASM                    GNU
346 | AREA                   .sect
347 | EQU                    .equ
348 | DCB                    .byte
349 | DCW                    .half
350 | DCD                    .word
351 | SPACE                  .space
352 | END                    .end
353 | RN                     .asg
354 | ```
355 | EQU comes from equate directive.
356 | 
357 | 
358 | ### arm-none-eabi-as
359 | This can be used for cross compilation of arm assembly programs and allows for
360 | exploring 32 bit arm code.
361 | 
362 | For example, there is a [space.s](./src/space.s) program that we can compile
363 | using:
364 | ```console
365 | $ make space
366 | arm-none-eabi-as -g -o space.o src/space.s
367 | arm-none-eabi-ld -g -o space space.o
368 | ```
369 | This can then be run using:
370 | ```console
371 | $ qemu-arm ./space
372 | ```
373 | That is not very helpful though as nothing will happen. What we can do instead
374 | is specify that the emulator should halt:
375 | ```console
376 | $ qemu-arm -singlestep -g 7777 space
377 | ```
378 | `7777` is the port that we can then use to connect using gdb:
379 | ```console
380 | $ arm-none-eabi-gdb
381 | (gdb) file space
382 | (gdb) target remote localhost:7777
383 | Remote debugging using localhost:7777
384 | _start () at src/space.s:6
385 | 6	  ldr r0, =A
386 | 
387 | // stepping/inspecting...
388 | 
389 | (gdb) disassemble 
390 | Dump of assembler code for function _start:
391 |    0x00008000 <+0>:	ldr	r0, [pc, #8]	; 0x8010 <_start+16>
392 |    0x00008004 <+4>:	mov	r1, #2
393 |    0x00008008 <+8>:	str	r1, [r0]
394 | => 0x0000800c <+12>:	b	0x8000 <_start>
395 |    0x00008010 <+16>:	andeq	r8, r1, r4, lsl r0
396 | End of assembler dump.
397 | 
398 | (gdb)
399 | (gdb) x $r0
400 | 0x18014:	0x00000002
401 | 
402 | (gdb) kill
403 | ```
404 | 
405 | ### ldr (arm)
406 | Takes a value in memory and writes it to a register:
407 | ```
408 | LDR{size}{cond} <Rd>, <addressing mode>
409 | ```
410 | Without a size specified this will be a 32-bit load.
411 | Size can also be `LDRB` for a 8-bits, `LDRH` for 16-bits (Halfword), `LDRSB`
412 | for signed byte, `LDRSH` signed halfword, and `LDM` for multiple words.
413 | The addressing modes can have a base register, and offset, and a shift
414 | operation:
415 | ```assembly
416 |     dest           base     shift operation
417 |        ↓             ↓       ↓
418 |   ldr r9,         [r12, r8, LSR #2]
419 |                          ↑       ↑
420 |                        offset   immediate value
421 | ```
422 | The shifted offset is added to the base, so r12 + (r8 >> 2) since LSR shifts
423 | right (`LSL #2` would give r12 + r8 * 4); this is the effective address. So lets
424 | say the base address points to a struct and r8 is a member of the struct which
425 | is an array, then we could index values in the array using the shift I think.
426 | 
427 | ### str (arm)
428 | Takes a value from a register and stores it in memory.
429 | ```assembly
430 | 
431 |   str r0, [r1]
432 | 
433 | r0: 0xaabbccdd             r1: 0x00008000 ------> 0x0000800: 0xdd
434 |                                                   0x0000801: 0xcc
435 |                                                   0x0000802: 0xbb
436 |                                                   0x0000803: 0xaa
437 | ```
438 | We can also add an increment operand to the str instruction:
439 | ```
440 |   str r0, [r1], #4
441 | 
442 | r0: 0xaabbccdd             r1: 0x00008004 --+     0x0000800: 0xdd
443 |                                             |     0x0000801: 0xcc
444 |                                             |     0x0000802: 0xbb
445 |                                             |     0x0000803: 0xaa
446 |                                             +---→ 0x0000804: 0x00
447 | ```
448 | Notice that after the instruction completes r1 has now been incremented.
449 | 
450 | ```assembly
451 |   ldr r1, [r0, #4]!
452 | ```
453 | r1 will contain the value at address r0+4, and r0 will be updated to contain r0+4.
454 | 
455 | ```assembly
456 |   ldr r1, [r0], #4
457 | ```
458 | r1 will contain the value at the address in r0, and r0 will be updated to contain r0+4.
459 | 
460 | ### pre-indexed addressing
461 | ```
462 |   ldr{size}{cond} <Rd>, [<Rn>, <offset>] {!}
463 |                          {effective addr}
464 | 
465 | ! = should the effective address be written back into Rn, without this Rn will
466 |     be unchanged.
467 | ```
468 | 
469 | ### post-indexed addressing
470 | This is the same as we saw previously where after the str (or ldr) completes
471 | Rn is incremented. Notice that this only says incremented, and this differs
472 | from `!` where the effective address is written back into `Rn`.
473 | ```
474 |   str{size}{cond} <Rd>, [<Rn>], <offset>
475 | 
476 |   str r0, [r1], #4
477 | 
478 | r0: 0xaabbccdd             r1: 0x00008000 ------> 0x0000800: 0xdd
479 |                                                   0x0000801: 0xcc
480 |                                                   0x0000802: 0xbb
481 |                                                   0x0000803: 0xaa
482 | ```
483 | 
484 | ### instruction encoding
485 | 
486 | The instruction encoding for a 32-bit instruction looks like this:
487 | ```
488 |  31   29  27  25  23  21  19  17  15  13  11  9  7  5  3  1
489 |  +---------------------------------------------------------+
490 |  | Cond  |0|0|I| opcode|S| Op1 | Dest |   Operand 2        |
491 |  +---------------------------------------------------------+
492 |     30  28  26  24  22  20  18 16  14  12  10  8  6  4  2  0
493 | 
494 | Data processing instruction: bit 26 and 27: 00
495 | opcode: the instruction, add, sub, mov, cmp etc.
496 | I: is the immediate bit. If this is 0 then Operand 2 is a register, and if this
497 |    bit is 1 Operand 2 is an immediate value.
498 | Operand 2: 12-bits. 2¹²=4096, so we would only have values in the range 0-4095 but
499 | ARM does not use this value as a 12-bit number!
500 | Instead what it does is that it uses an 8-bit value with a 4-bit rotate value
501 | 2⁴ = 16.
502 | 
503 |  11   9   7   5   3   1
504 |  +-----------------------+
505 |  |Rotate| Immediate      |
506 |  +-----------------------+
507 |    10   8   6   4   2   0
508 | 
509 | 11-8 Rotate bits
510 | 7-0  Immediate bits
511 | ```
512 | For example:
513 | ```
514 |                         mov r0, #3, 2
515 | 
516 | 3 = 0000 0000 0000 0000 0000 0000 0000 0011
517 | Rotating that binary value right by 2 bits gives us:
518 | 1100 0000 0000 0000 0000 0000 0000 0000
519 | which is -1073741824 decimal
520 | ```
521 | And we can inspect the generated instruction using objdump:
522 | ```console
523 | 800c:	e3a00103 	mov	r0, #-1073741824	; 0xc0000000
524 | ```
525 | 
526 | ### dot
527 | The dot “.” represents the current location in program memory. So:
528 | ```assembly
529 |   b .
530 | ```
531 | Tells the processor to jump to this very instruction, that is, execute it
532 | over and over in an endless loop. This is used often at the end of
533 | microcontroller programs, as it prevents the processor from executing random
534 | data that is located in flash memory after the program.
535 | 
536 | The following instruction is also an unconditional branch:
537 | ```assembly
538 |   b.n .
539 | ```
540 | The `.n` suffix is telling the assembler to encode this instruction as 16 bits.
541 | 
542 | 
543 | ### STMIA/LDMIA
544 | Store and load multiple registers.
545 | Syntax:
546 | ```assembly
547 |  ldmia Rn!, {reg_list}
548 |  stmia Rn!, {reg_list}
549 | ```
550 | Example:
551 | ```assembly
552 | 8000358:       c302            stmia   r3!, {r1}
553 | ```
554 | 
555 | ```console
556 | $ make ldmia
557 | $ qemu-arm ldmia
558 | $ qemu-arm -g 7777 ./ldmia
559 | ```
560 | ```console
561 | $ arm-none-eabi-gdb ldmia
562 | GNU gdb (GNU Arm Embedded 
563 | (gdb) br ldmia.s:15
564 | Breakpoint 1 at 0x8000: file src/ldmia.s, line 15.
565 | (gdb) target remote localhost:7777
566 | Remote debugging using localhost:7777
567 | _start () at src/ldmia.s:15
568 | (gdb) si
569 | (gdb) i r $r0 $r1 $r2 $r3 $r4 $r5
570 | r0             0x8000              32768
571 | r1             0x0                 0
572 | r2             0x1                 1
573 | r3             0x10                16
574 | r4             0x11                17
575 | r5             0x100               256
576 | ```
577 | So notice that we have loaded all of the registers, r1, r2, r3 r4, and r5 with
578 | the values of the `array`. So with a single instruction we have loaded
579 | `multiple` registers.
580 | 
581 | ### bic (bit clear)
582 | ```console
583 | $ make bic
584 | $ qemu-arm -g 7777 ./bic
585 | ```
586 | ```console
587 | $ arm-none-eabi-gdb bic
588 | (gdb) target remote localhost:7777
589 | Remote debugging using localhost:7777
590 | (gdb) si
591 | (gdb) p/t $r0
592 | $1 = 1111
593 | (gdb) f
594 | #0  _start () at src/bic.s:7
595 | 7	  bic r1, r0, #4
596 | (gdb) p/t $r1
597 | $3 = 1011
598 | ```
599 | Notice how this instruction can be used to mask out single bits. If we wanted
600 | to mask out the two topmost bits we could use 1100 (12 decimal).
601 | 
602 | 
603 | ### Unsigned Extend type (uxtx)
604 | Where type can be `b` for byte, `h` for halfword, `w` for word.
605 | So lets say we have a 4 bit binary value 1010 and want to extend this to
606 | 8 bits this would just be 0000 1010. 
607 | 
608 | Example: [uxtb.s](./src/uxtb.s)
609 | 
610 | ```console
611 | (gdb) p/x $r0
612 | $2 = 0xffffff65
613 | 
614 | (gdb) si
615 | (gdb) p/t $r1
616 | $4 = 1100101
617 | 
618 | (gdb) si
619 | (gdb) p/x $r1
620 | $6 = 0x65
621 | (gdb) p/x $r1
622 | $2 = 0xff65
623 | ```
624 | Notice that this allows us to copy/cut/mask a portion of register. This would
625 | be useful when reading from a register. 
626 | 
627 | ### Current Program Status Register
628 | Example: [psr.s](./src/psr.s)
629 | 
630 | ```console
631 | (gdb) x/tw $cpsr
632 | 0x10:	00000000001010000000000000000010
633 | ```
634 | Bit 31, is the negative condition flag and we can see that it is currently not
635 | set.  
636 | Bit 30, is the zero condition flag and we can see that it is currently not set.
637 | ```console
638 | (gdb) p/t $cpsr
639 | $3 = 10000000000000000000000000010000
640 | (gdb) p/t ($cpsr & (1 << 31)) != 0
641 | $13 = 1
642 | ```
643 | 
644 | ### Wait For Interrupt (wfi)
645 | Allows the core to enter low power mode and stop executing code.
646 | ```assembly
647 |   wfi
648 | ```
649 | 
650 | ### Wait For Event (wfe)
651 | Allows the core to enter low power mode and stop executing code.
652 | ```assembly
653 |   wfe
654 | ```
655 | 
656 | ### Signal Event (SEV)
657 | Causes an event to be signaled to all cores. So I think that if there are
658 | multiple cores one core could issue a wfe and then calling SEV would wake it
659 | up.
660 | 


--------------------------------------------------------------------------------
/linux/README.md:
--------------------------------------------------------------------------------
  1 | # Linux assembly language exploration
  2 | 
  3 | The examples here are sometimes not really useful except for inspecting
  4 | object code and understanding how things get linked.
  5 | 
  6 | ## Variables in data section
  7 | 
  8 | Take `first.s` as an example and look at the variable that is defined in the
  9 | .data section:
 10 | 
 11 | ```
 12 | .data
 13 | something:
 14 |   .byte 2
 15 | ```
 16 | 
 17 | If we use objdump to inspect the data section we find:
 18 | 
 19 | ```console
 20 | $ objdump -d -j .data first
 21 | 
 22 | first:     file format elf64-x86-64
 23 | 
 24 | 
 25 | Disassembly of section .data:
 26 | 
 27 | 0000000000402000 <something>:
 28 |   402000:	02 00                	.byte 0x2
 29 | 	...
 30 | ```
 31 | 
 32 | ## Check if zero/null
 33 | [check_zero.s](./check_zero.s) contains an example of checking a register if
 34 | it is zero by using the `test` opcode:
 35 | ```
 36 |   test %rcx, %rcx
 37 |   je zero_func
 38 | ``` 
 39 | So I'm thinking that this would be similar to checking whether there is a value
 40 | or not.
 41 | 
 42 | 
 43 | ## Load Effective Address
 44 | 
 45 | Take the following instructions:
 46 | ```
 47 | .data
 48 | msg:
 49 |    .ascii "bajja\n"
 50 | ...
 51 | 
 52 |   mov msg, %rsi
 53 |   lea msg, %rsi
 54 | ```
 55 | 
 56 | Now, if we take a look at `msg` it contains:
 57 | ```console
 58 | (lldb) expr msg
 59 | (void *) $5 = 0x00000a616a6a6162
 60 | ```
 61 | 
 62 | This looked a little strange to me at first, but this is actually the value
 63 | contained in the memory location:
 64 | 
 65 | ```console
 66 | (lldb) memory read -f x -s 8 -c 1 0x0000000000402000
 67 | 0x00402000: 0x00000a616a6a6162
 68 | 
 69 | (lldb) memory read -c 5 0x0000000000402000
 70 | 0x00402000: 62 61 6a 6a 61                                   bajja
 71 | ```
 72 | And remember that x86 is little endian, so the 8-byte value `00000a616a6a6162`
 73 | is stored in memory least significant byte first: `62 61 6a 6a 61 0a 00 00`.
 74 | 
 75 | Using mov with `msg` will only copy the value `00000a616a6a6162` into a register
 76 | for example. But to pass the msg to a function like write we would need to
 77 | pass a pointer. For this we use the command lea which is like `&` in c/c++
 78 | to get the address:
 79 | ```
 80 | (lldb) expr (char*)&msg
 81 | (char *) $26 = 0x0000000000402000 "bajja\n"
 82 | ```
 83 | 
 84 | ### .set directive
 85 | Can be used to set a symbol to the value of an expression.
 86 | ```
 87 | done: .ascii "done...\n"
 88 | .set done_len, . - done
 89 | ```
 90 | If we take a look at the binary we will find:
 91 | ```
 92 |   40102e:	48 c7 c2 08 00 00 00 	mov    $0x8,%rdx
 93 | ```
 94 | So this works sort of like a `#define` in C/C++ which would be replaced by the
 95 | preprocessor (not that there is one when using as). There won't be any symbol
 96 | for done_len.
 97 | 
 98 | This can also be written as:
 99 | ```
100 | msg_len = . - msg
101 | ```
102 | 
103 | ### GAS section directive
104 | This directive has the following format:
105 | ```
106 | .section name [, "flags"[, @type[, flag_specific_args]]]
107 | ```
108 | 
109 | ### syscall
110 | syscall is used to make an indirect system call and has the following signature:
111 | ```
112 |      long syscall(long number, ...);
113 | ```
114 | An example is when calling exit which has sys number 60:
115 | ```
116 |   mov $60, %rax
117 |   xor %rdi, %rdi
118 |   syscall
119 | ```
120 | The system calls can be found using `man syscalls` and the actual numbers can
121 | be found in `/usr/include/asm/unistd_64.h`
122 | 
123 | So the system call number is passed in rax, and the following arguments to the
124 | actual system call are passed in rdi, rsi, rdx, r10, r8, r9. And the result is
125 | stored in rax.
126 | 
127 | ### execve
128 | This section will look closer at the execve system call and calling it from
129 | assembly code.
130 | 
131 | ```c
132 | #include <unistd.h>
133 | 
134 | int execve(const char *pathname, char *const argv[],
135 |            char *const envp[]);
136 | ```
137 | 
138 | System call nr is `59` which is the value that goes into `%rax`.
139 | ```assembly
140 | mov $59, %rax
141 | ```
142 | 
143 | The first argument which is the file name is passed in `%rdi`.
144 | ```assembly
145 |   lea msg, %rdi  
146 | ```
147 | 
148 | The second argument which is argv is passed in `%rsi`. Now this is an array
149 | of char pointers which we need to create.
150 | ```assembly
151 | ```
152 | And the last argument which is envp is passed in `%rdx`.
153 | 
154 | ### array on the stack
155 | So it was not obvious to me how to create an array on the stack in assembly and
156 | adding elements to it.
157 | ```c
158 | int main() {                                                                       
159 |   int array[2] = {1, 2};                                                           
160 |   int* ptr = array;
161 | } 
162 | ```
163 | 
164 | ```console
165 | $ gcc -o arr arr.c -fomit-frame-pointer
166 | ```
167 | 
168 | ```console
169 | $ objdump --disassemble=main arr
170 | 
171 | arr:     file format elf64-x86-64
172 | 
173 | 
174 | Disassembly of section .init:
175 | 
176 | Disassembly of section .text:
177 | 
178 | 0000000000401106 <main>:
179 |   401106:	c7 44 24 f0 01 00 00 	movl   $0x1,-0x10(%rsp)
180 |   40110d:	00 
181 |   40110e:	c7 44 24 f4 02 00 00 	movl   $0x2,-0xc(%rsp)
182 |   401115:	00 
183 |   401116:	48 8d 44 24 f0       	lea    -0x10(%rsp),%rax
184 |   40111b:	48 89 44 24 f8       	mov    %rax,-0x8(%rsp)
185 |   401120:	b8 00 00 00 00       	mov    $0x0,%eax
186 |   401125:	c3                   	retq   
187 | 
188 | Disassembly of section .fini:
189 | ```
190 | So this is interesting, we are just placing the 1 on the stack relative to the
191 | stack pointer (now remember that the position is given in hex! I keep forgetting
192 | this when debugging):
193 | ```console
194 | (lldb) memory read -f x -c 1 -s 4 '$rsp - 16'
195 | 0x7fffffffd0e8: 0x00000001
196 | ```
197 | So the whole array would be at:
198 | ```console
199 | (lldb) memory read -f x -c 2 -s 4 '$rsp - 16'
200 | 0x7fffffffd0e8: 0x00000001 0x00000000
201 | ```
202 | And we can verify this by stepping over the next instruction using si
203 | ```console
204 | (lldb) si
205 | (lldb) memory read -f x -c 2 -s 4 '$rsp - 16'
206 | 0x7fffffffd0e8: 0x00000001 0x00000002
207 | ```
208 | Now, we have the `ptr` local variable which is loading the effective address
209 | of `%rsp - 16` into rax, which is the address of the first entry of the array.
210 | Next, this value is stored in location `%rsp - 8`. 
211 | ```console
212 | (lldb) register read rax
213 |      rax = 0x0000000000401106  arr`main at arr.c:2:7
214 | (lldb) register read rax
215 |      rax = 0x00007fffffffd0e8
216 | (lldb) memory read -f x -c 1 -s 4 '$rsp - 8'
217 | 0x7fffffffd0f0: 0x00000000
218 | (lldb) si
219 | (lldb) memory read -f x -c 1 -s 4 '$rsp - 8'
220 | 0x7fffffffd0f0: 0xffffd0e8
221 | ```
222 | Now this might seem really trivial but it can be good to know how to actually
223 | create an array on the stack in assembly without having to first disassemble
224 | c code.
225 | 
226 | ### Stack addressing
227 | I need to remind myself that the stack is just a part of memory, but handled
228 | in a different way. The stack is there in the allocated memory for the process
229 | and we can use memory locations below the stack pointer value (rsp). Remember
230 | that rsp just points to a memory location that happens to be what some
231 | instructions update when they are executed, for example, push/pop will subtract
232 | and add to value in rsp. But if we don't use those instructions we are free
233 | to just store values by using mov and placing values in specific locations
234 | relative to rsp (if rsp moves we would be in trouble which in those cases we
235 | would use a base pointer/frame pointer in rbp).
236 | 
237 | We have to know the size of the data we are going to move so that the move
238 | instruction knows how many bytes to copy; the size is given by a suffix:
239 | ```
240 | movb     1 bytes (8 bits)
241 | movs     single (32-bit floating point)
242 | movw     word (16 bits)
243 | movl     long (32-bit integer or 64-bit floating point)
244 | movq     quad (64-bit)
245 | movt     ten bytes (80-bit floating point)
246 | ```
247 | Lets explore this a little using [arr.s](./arr.s):
248 | ```console
249 | (lldb) br s -n _start
250 | ```
251 | Now, lets say we want to see the stack from the current rsp and 64 bytes
252 | down which is the stack where we can place values.
253 | To do this we have to remember that the stack grows downward, so we want to
254 | look at from the current rsp down 64 bytes which means subtracting 64 from rsp:
255 | ```console
256 | (lldb) memory read -f x -c 10 -s 8 '$rsp - 64'
257 | 0x7fffffffd190: 0x0000000000000000 0x0000000000000000
258 | 0x7fffffffd1a0: 0x0000000000000000 0x0000000000000000
259 | 0x7fffffffd1b0: 0x0000000000000000 0x0000000000000000
260 | 0x7fffffffd1c0: 0x0000000000000000 0x0000000000000000
261 | 
262 | (lldb) memory read -f x -c 10 -s 8 '$rsp - 64'
263 | 0x7fffffffd190: 0x0000000000000000 0x0000000000000000
264 | 0x7fffffffd1a0: 0x0000000000000000 0x0000000000000000
265 | 0x7fffffffd1b0: 0x0000000000000000 0x0000000000000000
266 | 0x7fffffffd1c0: 0x0000000000000000 0x0000000000000000
267 | 0x7fffffffd1d0: 0x0000000000000001 0x00007fffffffd5a1
268 | ```
269 | 
270 | Now, depending on the data stored we might be interested in looking at bytes, 
271 | words, etc. So we need to adjust the `size` `-s` and also the `count` `-c`.
272 | The size is the granularity in which the memory will be displayed, which makes
273 | it easier to see what belongs to which memory locations.
274 | ```console
275 | (lldb) memory read -f x -c 20 -s 4 '$rsp - 64'
276 | 0x7fffffffd190: 0x00000000 0x00000000 0x00000000 0x00000000
277 | 0x7fffffffd1a0: 0x00000000 0x00000000 0x00000000 0x00000000
278 | 0x7fffffffd1b0: 0x00000000 0x00000000 0x00000000 0x00000000
279 | 0x7fffffffd1c0: 0x00000000 0x00000000 0x00000002 0x00000000
280 | 0x7fffffffd1d0: 0x00000004 0x00000000 0xffffd5a1 0x00007fff
281 | ```
282 | ```
283 | Bytes: size: 1 count: 64/1 = 64  (add one for rsp)
284 | Word:  size: 2 count: 64/2 = 32  (add one for rsp)
285 | Long:  size: 4 count: 64/4 = 16  (add one for rsp)
286 | 
287 | ```
288 | Byte example
289 | ```console
290 | (lldb) memory read -f x -c 65 -s 1 '$rsp - 64'
291 | 0x7fffffffd190: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
292 | 0x7fffffffd198: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
293 | 0x7fffffffd1a0: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
294 | 0x7fffffffd1a8: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
295 | 0x7fffffffd1b0: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
296 | 0x7fffffffd1b8: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
297 | 0x7fffffffd1c0: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
298 | 0x7fffffffd1c8: 0x02 0x00 0x00 0x00 0x00 0x00 0x00 0x00
299 | 0x7fffffffd1d0: 0x04
300 | ```
301 | 
302 | 
303 | Word example:
304 | ```console
305 | (lldb) memory read -f x -c 33 -s 2 '$rsp - 64'
306 | 0x7fffffffd190: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
307 | 0x7fffffffd1a0: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
308 | 0x7fffffffd1b0: 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000 0x0000
309 | 0x7fffffffd1c0: 0x0000 0x0000 0x0000 0x0000 0x0002 0x0000 0x0000 0x0000
310 | 0x7fffffffd1d0: 0x0004
311 | ```
312 | 
313 | Long example:
314 | ```console
315 | (lldb) memory read -f x -c 17 -s 4 '$rsp - 64'
316 | 0x7fffffffd190: 0x00000000 0x00000000 0x00000000 0x00000000
317 | 0x7fffffffd1a0: 0x00000000 0x00000000 0x00000000 0x00000000
318 | 0x7fffffffd1b0: 0x00000000 0x00000000 0x00000000 0x00000000
319 | 0x7fffffffd1c0: 0x00000000 0x00000000 0x00000002 0x00000000
320 | 0x7fffffffd1d0: 0x00000004
321 | ```
322 | 
323 | Now, if we specify addressing relative to a register we use a number (the
324 | offset) before the register and put the register in parentheses:
325 | ```
326 |   movb $4, 0(%rsp)
327 |   movb $2, -1(%rsp)
328 |   movb $3, -2(%rsp)
329 | ```
330 | After doing this we can inspect the memory just below and at rsp:
331 | ```console
332 | (lldb) memory read -f x -c 4 -s 1 '$rsp - 3'
333 | 0x7fffffffd1cd: 0x00 0x03 0x02 0x01
334 | ```
335 | 
336 | And we can use the address from rsp to address the different values, just
337 | like an array of ints:
338 | ```console
339 | (lldb) register read rsp
340 |      rsp = 0x00007fffffffd1d0
341 | (lldb) memory read -f x -c 1 -s 1 '0x00007fffffffd1d0 - 2'
342 | 0x7fffffffd1ce: 0x03
343 | (lldb) memory read -f x -c 1 -s 1 '0x00007fffffffd1d0 - 1'
344 | 0x7fffffffd1cf: 0x02
345 | (lldb) memory read -f x -c 1 -s 1 '0x00007fffffffd1d0 - 0'
346 | 0x7fffffffd1d0: 0x01
347 | ```
348 | 
349 | One thing to note is that when we run the example program it will be invoked
350 | by `execve`:
351 | ```console
352 | $ strace ./arr 
353 | execve("./arr", ["./arr"], 0x7fffde884fb0 /* 74 vars */) = 0
354 | exit(0)                                 = ?
355 | +++ exited with 0 +++
356 | ```
357 | And when we break in the debugger we can inspect the existing stack:
358 | ```console
359 | (lldb) memory read -f x -c 8 -s 8 '$rsp'
360 | 0x7fffffffd1d0: 0x0000000000000001 0x00007fffffffd5a1
361 | 0x7fffffffd1e0: 0x0000000000000000 0x00007fffffffd5e0
362 | 0x7fffffffd1f0: 0x00007fffffffd5f4 0x00007fffffffd62a
363 | 0x7fffffffd200: 0x00007fffffffd641 0x00007fffffffd660
364 | (lldb) memory read -f s 0x00007fffffffd5a1
365 | 0x7fffffffd5a1: "/home/danielbevenius/work/assembly/learning-assembly/linux/arr"
366 | ```
367 | The value in the topmost position on the stack is argc which above is 1. So really
368 | our program should not overwrite that value but instead subtract the size of
369 | an int (32-bits/4-bytes) from rsp and then add our values.
370 | ```assembly
371 |   sub  $1, %rsp
372 | ```
373 | Notice that the register we have specified is a 64 bit register
374 | 
375 | ```
376 | (lldb) register read rsp
377 |      rsp = 0x00007fffffffd1b0
378 | (lldb) si
379 | (lldb) register read rsp
380 |      rsp = 0x00007fffffffd1af
381 | ```
382 | In decimal, rsp went from 140737488343472 to 140737488343471, i.e. down by 1.
383 | 
384 | Try to add an alias for this which will take the size of the stack to show:
385 | ```
386 | (lldb) command alias showstack memory read -f x -c 10 -s 8 `$rsp - 64`
387 | ```
388 | 
389 | ### mov label 
390 | When you move something you need to think about how much data you are
391 | moving. For example, take the following:
392 | ```assembly
393 | .data
394 | msg: .ascii "something\n"
395 | len: .int . - msg
396 | ...
397 | 
398 |   mov len, %rdx
399 | ```
400 | This is moving 64 bits into rdx starting from the memory location len. In our
401 | case the first 32 bits of len contain our message length which is 10 (a in hex)
402 | and the rest is whatever follows in the data section.
403 | If we only want to move our int (32 bits) we can use:
404 | ```assembly
405 | mov len, %edx
406 | ```
407 | or, if only the lowest byte is needed:
408 | ```assembly
409 | mov len, %dl
410 | ```
411 | 
412 | ### mul
413 | Take the [example](./multi.s) and if we run this in the debugger we find:
414 | ```console
415 | $ lldb -- ./multi 
416 | (lldb) target create "./multi"
417 | Current executable set to './multi' (x86_64).
418 | (lldb) br s -n _start
419 | (lldb) r
420 | (lldb) register read rax rbx
421 |      rax = 0x0000000000000008
422 |      rbx = 0x0000000000000004
423 | (lldb) si
424 | (lldb) register read rax rbx
425 |      rax = 0x0000000000000020
426 |      rbx = 0x0000000000000004
427 | (lldb) register read -f d rax rbx
428 |      rax = 32
429 |      rbx = 4
430 | ```
431 | Notice that the result of the multiplication is placed in `rax`.
432 | 
433 | ### div
434 | This operation generates two output values, the quotient and a remainder.
435 | 
436 | Take the [example](./div.s) and if we run this in the debugger we find:
437 | ```console
438 |    4   	_start:
439 |    5   	  mov $21, %ax
440 |    6   	  mov $2, %bx
441 | -->7   	  div %bx
442 | (lldb) register read -f d al ah bx
443 |       al = 21
444 |       ah = 0
445 |       bx = 2
446 | (lldb) si
447 | (lldb) register read -f d al ah dx
448 |       al = 10
449 |       ah = 0
450 |       dx = 1
451 | ```
452 | Notice the quotient is in `al` (the low byte of `ax`) and the remainder is in `dx`.
453 | 
454 | ### integer to string
455 | To do this in assembly we need to take a look at how this is done.
456 | We have an integer which consists of a number of digits. We take the remainder
457 | of dividing the number by 10 to get the rightmost digit, and then continue
458 | dividing by 10 until there are no more digits.
459 | 
460 | ```
461 | 234 % 10 = 4-------------+
462 | 234 / 10 = 23            |
463 |                          |
464 |            +------------↓↓
465 | 23 % 10  = 3           234
466 | 23 / 10  = 2           ↑
467 |                        |
468 | 2  % 10  = 2-----------+
469 | 2  / 10  = 0
470 | ```
471 | So when we get 0 as the quotient (the result of dividing the number by 10) we
472 | are finished, and if we take the remainders (shown as separate operations above
473 | but the `div` operation in assembly produces both), we have the digits in
474 | reverse order (432 instead of 234). Now we still want to print these and that
475 | will be done with ascii. In ascii the zero digit has the value 48:
476 | ```console
477 | $ man ascii
478 | Oct   Dec   Hex   Char                        Oct   Dec   Hex   Char
479 | ────────────────────────────────────────────────────────────────────────
480 | 000   0     00    NUL '\0' (null character)   100   64    40    @
481 | ...
482 | 012   10    0A    LF  '\n' (new line)         112   74    4A    J
483 | ...
484 | 060   48    30    0                           160   112   70    p
485 | 061   49    31    1                           161   113   71    q
486 | 062   50    32    2                           162   114   72    r
487 | 063   51    33    3                           163   115   73    s
488 | 064   52    34    4                           164   116   74    t
489 | 065   53    35    5                           165   117   75    u
490 | 066   54    36    6                           166   118   76    v
491 | 067   55    37    7                           167   119   77    w
492 | 070   56    38    8                           170   120   78    x
493 | 071   57    39    9                           171   121   79    y
494 | ```
495 | So if we take our digit `4` and add `48` we get `52`. And if we do that with
496 | all the digits and arrange them after each other in memory we can then pass
497 | a pointer to that memory to the write system call.
498 | 
499 | 
500 | ### executable stack
501 | The background here is that linux has allowed the stack to be executable in the
502 | past, for example for nested functions and trampoline code. I think this was
503 | before exploits were a real issue but nowadays this is something that is
504 | prevented as allowing the programs data or stack to be executable allows for
505 | code to be placed in these regions of memory and then jumped to which can be
506 | used in exploits.
507 | 
508 | The solution is that compilers can add a section to the object file which the
509 | linker can then detect to make the data/stack non-executable. But if this
510 | section is missing the data/stack area will be executable. Remember that the
511 | assembler will create an object file which contains information for the linker,
512 | and the linker will create an object file that is used by the operating system
513 | to load it into memory. If I recall the permission for memory is set in the page
514 | table so there would have to be some information in the object file created by the
515 | linker that causes the OS to make the stack be executable.
516 | 
517 | When ld does its linking, it will not add this specific program header
518 | if a single object file is not marked as non-executable. So one object file,
519 | for example one from an assembly source that is not marked as non-executable,
520 | would make the entire library/executable become marked as having an executable
521 | stack.
522 | 
523 | For an executable file (or a shared object file) this information is in the
524 | program header, which is an array of structures that describes a segment that
525 | the system needs to handle to make a process for the program. The program
526 | header is only meaningful for executable and shared object files.
527 | This struct looks like this:
528 | ```c
529 | typedef struct {
530 |         Elf64_Word      p_type;
531 |         Elf64_Word      p_flags;
532 |         Elf64_Off       p_offset;
533 |         Elf64_Addr      p_vaddr;
534 |         Elf64_Addr      p_paddr;
535 |         Elf64_Xword     p_filesz;
536 |         Elf64_Xword     p_memsz;
537 |         Elf64_Xword     p_align;
538 | } Elf64_Phdr;
539 | ```
540 | Notice the `p_flags` which can be:
541 | ```
542 | PF_X         0x1          Execute 
543 | PF_W         0x2          Write
544 | PF_R         0x4          Read
545 | PF_MASKPROC  0xf0000000   Unspecified
546 | ```
547 | `PT_GNU_STACK` is a program header type whose `p_flags` member specifies the
548 | permissions on the segment containing the stack and is used to indicate whether
549 | the stack should be executable. `The absence of this header indicates that the
550 | stack will be executable`. This is not exactly true, as the header can be present
551 | and still have `PF_X` set in its p_flags, in which case the stack is executable.
552 | For example:
553 | Using the example without the section in the source we assemble and link like this:
554 | ```console
555 | $ as -o exec-stack.o exec-stack.s 
556 | $ ld -z execstack -o exec-stack exec-stack.o
557 | $ readelf -l exec-stack | grep -A1 GNU_STACK
558 |   GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
559 |                  0x0000000000000000 0x0000000000000000  RWE    0x10
560 | ```
561 | Notice that we have the program header but the flags are `RWE`. 
562 | ```console
563 | $ ld -z noexecstack -o exec-stack exec-stack.o
564 | $ readelf -l exec-stack | grep -A1 GNU_STACK
565 |   GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
566 |                  0x0000000000000000 0x0000000000000000  RW     0x10
567 | ```
568 | 
569 | When assembling with as (gnu assembler) this can be specified using:
570 | ```
571 | --execstack or --noexecstack assembler options 
572 | ```
573 | Or you can add the section to the assembly source file:
574 | 
575 | ```assembly
576 | .section .note.GNU-stack,"",@progbits
577 | ```
578 | Specifying `--noexecstack` would be the same as adding the section to the
579 | assembly source code.
580 | 
581 | It is also possible to specify this flag to the linker:
582 | ```console
583 | $ as -o exec-stack.o exec-stack.s
584 | $ ld -z noexecstack -o exec-stack exec-stack.o
585 | ```
586 | So if we have a source file without the .note.GNU-stack section this would still
587 | add the `GNU_STACK` program header to the generated executable.
588 | 
589 | The example  [exec-stack.s](./exec-stack.s) will be used below and if we take
590 | a look at the program headers for it without the section `.note.GNU-stack` added
591 | we see:
592 | ```console
593 | $ readelf -w --program-headers exec-stack
594 | Elf file type is EXEC (Executable file)
595 | Entry point 0x401000
596 | There are 3 program headers, starting at offset 64
597 | 
598 | Program Headers:
599 |   Type           Offset             VirtAddr           PhysAddr
600 |                  FileSiz            MemSiz              Flags  Align
601 |   LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000
602 |                  0x00000000000000e8 0x00000000000000e8  R      0x1000
603 |   LOAD           0x0000000000001000 0x0000000000401000 0x0000000000401000
604 |                  0x0000000000000015 0x0000000000000015  R E    0x1000
605 |   LOAD           0x0000000000002000 0x0000000000402000 0x0000000000402000
606 |                  0x0000000000000004 0x0000000000000004  RW     0x1000
607 | ```
608 | Notice that there are only three Program Headers!
609 | 
610 | And if we add the section we can find:
611 | ```console
612 | $ readelf -w --program-headers exec-stack
613 | Elf file type is EXEC (Executable file)
614 | Entry point 0x401000
615 | There are 4 program headers, starting at offset 64
616 | 
617 | Program Headers:
618 |   Type           Offset             VirtAddr           PhysAddr
619 |                  FileSiz            MemSiz              Flags  Align
620 |   LOAD           0x0000000000000000 0x0000000000400000 0x0000000000400000
621 |                  0x0000000000000120 0x0000000000000120  R      0x1000
622 |   LOAD           0x0000000000001000 0x0000000000401000 0x0000000000401000
623 |                  0x0000000000000015 0x0000000000000015  R E    0x1000
624 |   LOAD           0x0000000000002000 0x0000000000402000 0x0000000000402000
625 |                  0x0000000000000004 0x0000000000000004  RW     0x1000
626 |   GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
627 |                  0x0000000000000000 0x0000000000000000  RW     0x10
628 | ```
629 | So when we have the section there will be a program header named `GNU_STACK` and
630 | this executable will not be allowed to execute code in the data or stack
631 | section.
632 | 
633 | This program header will be checked when the binary is loaded in 
634 | [load_elf_binary](https://elixir.bootlin.com/linux/latest/source/fs/binfmt_elf.c#L929):
635 | ```c
636 | for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
637 | 		switch (elf_ppnt->p_type) {
638 | 		case PT_GNU_STACK:
639 | 			if (elf_ppnt->p_flags & PF_X)
640 | 				executable_stack = EXSTACK_ENABLE_X;
641 | 			else
642 | 				executable_stack = EXSTACK_DISABLE_X;
643 | 			break;
644 | 
645 | ...
646 | 
647 | if (elf_read_implies_exec(*elf_ex, executable_stack))
648 | 		current->personality |= READ_IMPLIES_EXEC;
649 | ...
650 | ```
651 | Notice that we only enter this case if `PT_GNU_STACK` is present. So, going by
652 | this code, an executable has a non-executable stack when `PT_GNU_STACK` is
653 | present and its flags do not include `PF_X`.
654 | 
655 | For example, without the section added:
656 | ```console
657 | $ readelf -w -l exec-stack | grep GNU_STACK
658 | ```
659 | And with it added:
660 | ```console
661 | $ readelf -w -l exec-stack | grep GNU_STACK
662 |   GNU_STACK      0x0000000000000000 0x0000000000000000 0x0000000000000000
663 | ```
664 | 
665 | We can also inspect the object file using:
666 | ```console
667 | $ readelf -WS exec-stack.o | grep .note.GNU-stack
668 |   [ 5] .note.GNU-stack   PROGBITS        0000000000000000 000059 000000 00      0   0  1
669 | ```
670 | And without the section in the source this would not match, unless `--noexecstack` is used:
671 | ```console
672 | $ as --noexecstack -o exec-stack.o exec-stack.s
673 | $ readelf -WS exec-stack.o | grep .note.GNU-stack
674 |   [ 5] .note.GNU-stack   PROGBITS        0000000000000000 000059 000000 00      0   0  1
675 | ```
676 | 
677 | ## No Execute (NX)
678 | The no execute bit is used in CPUs to separate memory areas for either data
679 | storage or instructions storage. An operating system with NX support will mark
680 | certain areas of memory as non-executable.
681 | 
682 | 
683 | ### sections
684 | We can add sections as we wish:
685 | ```assembly
686 | .section bajja_section
687 | ```
688 | ```console
689 | $ objdump -h tmp.o
690 | 
691 | tmp.o:     file format elf64-x86-64
692 | 
693 | Sections:
694 | Idx Name          Size      VMA               LMA               File off  Algn
695 |   0 .text         00000030  0000000000000000  0000000000000000  00000040  2**0
696 |                   CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
697 |   1 .data         0000000a  0000000000000000  0000000000000000  00000070  2**0
698 |                   CONTENTS, ALLOC, LOAD, DATA
699 |   2 .bss          00000000  0000000000000000  0000000000000000  0000007a  2**0
700 |                   ALLOC
701 |   3 bajja_section 00000000  0000000000000000  0000000000000000  0000007a  2**0
702 |                   CONTENTS, READONLY
703 | ```
704 | 
705 | ### cmp and sub
706 | The cmp instruction will take a source and a destination and subtract the
707 | source from the destination:
708 | ```assembly
709 |   mov $11, %rax
710 |   mov $10, %rsi
711 |   cmp %rax, %rsi
712 | ```
713 | This is similar to `sub %rax, %rsi` only the destination %rsi will not be modified.
714 | So the source is subtracted from the destination, so this will perform
715 | `%rsi - %rax = 10 - 11 = -1`.
716 | 
717 | Remember to display rflags using `--binary or -b`:
718 | ```console
719 | (lldb) expr -f b -- $rflags
720 | (unsigned long) $11 = 0b0000000000000000000000000000000000000000000000000000001010010111
721 | ```
722 | Using `expr` allows us to use a mask to find the values that we might be
723 | interested in. For example, to see only the value of the carry flag:
724 | ```console
725 | (lldb) expr -f b -- $rflags & 0x0001
726 | (unsigned long) $11 = 0b0000000000000000000000000000000000000000000000000000000000000001
727 | ```
728 | When subtracting, and hence this is also true for cmp, the carry flag is set
729 | when the result becomes too large of a negative value. For example
730 | ```
731 | 0000 - 0001 = 1111 = -1
732 | ```
733 | 
734 | The flags affected by cmp/sub are CF, OF, SF, ZF, AF, and PF.
735 | 
736 | The carry flag is used to determine when subtracting unsigned integers produce
737 | a negative value.
738 | How about signed integers and subtracting them, these could have valid negative
739 | numbers. In this case the carry flag is not useful. Instead one has to use
740 | the overflow flag.
741 | 
742 | ### test
743 | This instruction is very similar to `cmp` and affects the same flags in almost
744 | the same way with the exception of `AF`.
745 | 
746 | ### rflags
747 | 
748 | ```
749 |                                            11 10 9  8  7  6  5  4  3  2  1  0
750 | +-----------------------------------------------------------------------------+
751 | |                                         |OF|DF|IF|TF|SF|ZF|  |AF|  |PF|  |CF|
752 | +-----------------------------------------------------------------------------+
753 | 
754 | CF = Carry        0x0001   00000000000000000000000000000001
755 | PF = Parity       0x0004   00000000000000000000000000000100
756 | AF = Adjust       0x0010   00000000000000000000000000010000
757 | ZF = Zero         0x0040   00000000000000000000000001000000
758 | SF = Sign         0x0080   00000000000000000000000010000000
759 | TF = Trap         0x0100   00000000000000000000000100000000
760 | IF = Interrupt    0x0200   00000000000000000000001000000000
761 | DF = Direction    0x0400   00000000000000000000010000000000
762 | OF = Overflow     0x0800   00000000000000000000100000000000
763 | ```
764 | The above masks can be used to AND the rflags register to check values:
765 | ```console
766 | (lldb) expr -f b -- $rflags & 0x0001
767 | ```
768 | 
769 | ### Parity Flag
770 | The parity flag on x86 only looks at one byte, the least significant, and if
771 | the number of bits set is even the parity flag is set, and if it is odd then it is 0.
772 | 
773 | ### Adjust/Auxiliary Carry Flag
774 | The adjust flag is also called the Auxiliary flag or Auxiliary Carry flag. This
775 | is set if an arithmetic operation causes a carry or borrow out of the four least
776 | significant bits. This was used for BCD arithmetic and is not really used anymore, but
777 | the cmp/sub instructions can affect this flag.
778 | 
779 | ### Zero flag 
780 | ```
781 | $ lldb -- zero-flag
782 | (lldb) target create "zero-flag"
783 | Current executable set to 'zero-flag' (x86_64).
784 | (lldb) br s -n _start
785 | (lldb) disassemble 
786 | zero-flag`_start:
787 |     0x401000 <+0>:  mov    rax, 0x2
788 | ->  0x401007 <+7>:  mov    rcx, 0x2
789 |     0x40100e <+14>: sub    rcx, rax
790 | (lldb) expr -f b -- $rflags & 0x0040
791 | (unsigned long) $0 = 0b0000000000000000000000000000000000000000000000000000000000000000
792 | (lldb) si
793 | (lldb) expr -f b -- $rflags & 0x0040
794 | (unsigned long) $5 = 0b0000000000000000000000000000000000000000000000000000000001000000
795 | ```
796 | 
797 | ### Direction flag
798 | 
799 | ## conditional set instruction (setcc)
800 | This instruction will conditionally set the destination operand to 0 or 1
801 | depending on the status flags (CF, SF, OF, ZF, and PF).
802 | 
803 | An example can be found in [setne.s](./setne.s).
804 | 
805 | For a real example this usage can be found in Node.js:
806 | ```console
807 |   a4c1f3:	e8 58 ab fd ff       	callq  a26d50 <getauxval@plt>
808 |   a4c204:	48 85 c0             	test   %rax,%rax
809 |   a4c207:	0f 95 05 73 c8 ba 03 	setne  0x3bac873(%rip)        # 45f8a81 <_ZN4node11per_process15linux_at_secureE>
810 | ```
811 | Notice that we are calling `getauxval` and it returns a value which on x64 will
812 | be in register rax. The test is anding that register with itself, if the result
813 | is not zero then we set the value at the address relative to the value in the
814 | instruction pointer register.
815 | 
816 | 
817 | ### Conditional mov
818 | TODO: add an example of this.
819 | 
820 | ### dereferencing
821 | ```assembly
822 |   mov msg, %rcx
823 | ```
824 | This will move the contents located at the address msg, the size of the data
825 | will be 64-bits as we are moving into rcx.
826 | ```console
827 | (lldb) disassemble -F att
828 | 0x401000 <+0>:  movq   0x402000, %rcx
829 | ```
830 | 
831 | We can put parentheses around it which is the same thing as saying that we
832 | want to copy the data located at the address msg.
833 | ```assembly
834 |   mov (msg), %rax
835 | ```
836 | A reason for doing this is perhaps we want to add an offset to the address:
837 | ```assembly
838 |   mov (msg+0), %rcx
839 | ```
840 | But that is actually possible without the parenthesis.
841 | ```console
842 |     0x401011 <+17>: movq   0x402000, %rcx
843 | ```
844 | But this can be useful when we want to specify an offset for a register perhaps
845 | like :
846 | ```assembly
847 |   0x4016cd <+11>: movq   %rsi, -0x20(%rbp)
848 | ```
849 | 
850 | Now if we use `$`, which is also used for immediate values (`$4` would be the
851 | constant 4), then remember that msg is just an address and we are specifying
852 | that address as an immediate value.
853 | ```assembly
854 |   mov $msg, %rcx
855 | ```
856 | We also use $ with constants, like $4 would be the constant 4 and remember
857 | that msg is just an address 
858 | ```console
859 | 0x401008 <+8>:  movq   $0x402000, %rax           ; imm = 0x40200
860 | ```
861 | 
862 | And this is the same as using `lea` to load the effective address.
863 | ```assembly
864 |   lea msg, %rbx
865 | ```
866 | ```console
867 |     0x401020 <+32>: leaq   0x402000, %rbx
868 | ```
869 | 
870 | ### .p2align directive
871 | This directive is used to pad the location counter
872 | ```assembly
873 | .p2align 5,,31
874 | ```
875 | This pads to align on a 32-byte boundary.
876 | 
877 | 


--------------------------------------------------------------------------------