├── 8_asm_c
    ├── README.md
    ├── Makefile
    ├── main.c
    └── funcs.asm
├── .gitignore
├── Makefile
├── att.md
├── 1_io.asm
├── 2_addr.asm
├── 6_libc.asm
├── 3_jump.asm
├── 0_basic.asm
├── 4_leaf.asm
├── 5_nonleaf.asm
├── LICENSE
├── README.md
└── 7_float.asm


/8_asm_c/README.md:
--------------------------------------------------------------------------------
1 | A small appendix to show how C code can call assembly code.
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | todo
 2 | *.o
 3 | 0_basic
 4 | 1_io
 5 | 2_addr
 6 | 3_jump
 7 | 4_leaf
 8 | 5_nonleaf
 9 | 6_libc
10 | 7_float
11 | 8_asm_c/fact
12 | 


--------------------------------------------------------------------------------
/8_asm_c/Makefile:
--------------------------------------------------------------------------------
 1 | NASM     := nasm
 2 | ASMFLAGS += -g -f elf64
 3 | CC       := gcc
 4 | CCFLAGS  := -g -no-pie
 5 | LD       := ld
 6 | LDFLAGS  :=
 7 | LDLIBS   :=
 8 | RM       := rm
 9 | 
10 | C_OBJ    := main.o
11 | OBJ      := funcs.o
12 | PROG     := fact
13 | 
14 | .PHONY: all clean
15 | all: $(PROG)
16 | # Link through $(CC) so that the C library is linked in; -no-pie only matters at this step.
17 | $(PROG): $(OBJ) $(C_OBJ)
18 | 	$(CC) -o $@ $^ $(CCFLAGS) $(LDFLAGS) $(LDLIBS)
19 | # Static pattern rules: each object is built from the source file sharing its stem.
20 | $(OBJ): %.o: %.asm
21 | 	$(NASM) -o $@ $(ASMFLAGS) $<
22 | 
23 | $(C_OBJ): %.o: %.c
24 | 	$(CC) -c -o $@ $(CCFLAGS) $<
25 | 
26 | clean:
27 | 	$(RM) -f $(PROG) $(OBJ) $(C_OBJ)
28 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | NASM     := nasm
 2 | ASMFLAGS += -g -f elf64
 3 | CC       := gcc
 4 | CCFLAGS  := -g -no-pie
 5 | LD       := ld
 6 | LDFLAGS  :=
 7 | LDLIBS   :=
 8 | RM       := rm
 9 | # MAKE is not assigned here: make itself defines it as the make binary being run.
10 | 
11 | TARGETS  := 0_basic 1_io 2_addr 3_jump 4_leaf 5_nonleaf
12 | CTARGETS := 6_libc 7_float
13 | OBJ      := $(addsuffix .o, $(TARGETS) $(CTARGETS))
14 | 
15 | EXTRA_DIRS := 8_asm_c
16 | 
17 | .PHONY: all clean
18 | # all/clean also recurse into EXTRA_DIRS; "|| exit 1" makes a failing sub-make fail the loop.
19 | all: $(TARGETS) $(CTARGETS)
20 | 	@for i in $(EXTRA_DIRS); do $(MAKE) -C $$i $@ || exit 1; done
21 | 
22 | clean:
23 | 	$(RM) -f $(TARGETS) $(CTARGETS) $(OBJ)
24 | 	@for i in $(EXTRA_DIRS); do $(MAKE) -C $$i $@ || exit 1; done
25 | # TARGETS define their own _start, so they are linked with ld directly.
26 | $(TARGETS): %: %.o
27 | 	$(LD) -o $@ $< $(LDFLAGS) $(LDLIBS)
28 | # CTARGETS are linked through the C compiler driver instead.
29 | $(CTARGETS): %: %.o
30 | 	$(CC) -o $@ $< $(CCFLAGS) $(LDFLAGS) $(LDLIBS)
31 | 
32 | $(OBJ): %.o: %.asm
33 | 	$(NASM) -o $@ $< $(ASMFLAGS)


--------------------------------------------------------------------------------
/att.md:
--------------------------------------------------------------------------------
 1 | AT&T syntax (used by default in GNU assembler) has a few differences from
 2 | Intel syntax:
 3 | 
 4 | - immediate values must be prefixed with a dollar sign `$`
 5 | 
 6 | - registers must be prefixed with a percent sign `%`
 7 | 
 8 | - for instructions that have a source and a destination operand,
 9 | their order is reversed: `instr SOURCE, DEST`
10 | 
11 | - the addressing mode is in the form `displacement(base,index,scale)`.
12 | Only the base is required.  
13 | (The address is calculated as base + scale \* index + displacement)
14 | 
15 | - whenever specifying the operand's size is required, this is done as
16 | a single-letter suffix in the instruction's name: `byte`, `word`,
17 | `dword` and `qword` become suffixes `b`, `w`, `l` (long) and `q`
18 | respectively.
19 | 
20 | Those differences are summarized by the table below.
21 | 
22 | | Intel syntax                         | AT&T syntax |
23 | |:------------------------------------:|:------------------------------------:|
24 | | mov rax, 1                           | mov $1, %rax |
25 | | mov dword [rsp], eax                 | movl %eax, (%rsp) |
26 | | mov word [rbp - 8], dx               | movw %dx, -8(%rbp) |
27 | | lea rcx, qword [rsp + 2 \* rax + 3]  | leaq 3(%rsp,%rax,2), %rcx |
28 | 


--------------------------------------------------------------------------------
/8_asm_c/main.c:
--------------------------------------------------------------------------------
 1 | // Copyright 2019 Luana Carmo M de F Barbosa
 2 | //
 3 | // This file is licensed under the CC-BY-SA 2.0 license.
 4 | // See LICENSE for details.
 5 | //
 6 | // main.c: C code calling assembly code.
 7 | //
 8 | 
 9 | #include <stdio.h>
10 | #include <stdlib.h>
11 | #include <stdint.h>
12 | 
13 | #define BUFFER_SIZE 1024
14 | 
15 | // declare the functions that were defined in the other file.
16 | // We use integer types with explicit sizes, to be on the safe side.
17 | //
18 | extern uint64_t factorial(uint64_t n);
19 | extern uint64_t uint2str(uint64_t n, uint8_t *buf, uint64_t bufsize);
20 | 
21 | int main(int argc, char **argv)
22 | {
23 | 	if (argc != 2) {
24 | 		fprintf(stderr, "usage: %s number\n", argv[0]);
25 | 		exit(EXIT_FAILURE);
26 | 	}
27 | 	// Parse strictly: atoi() cannot report errors, and a negative or huge n
28 | 	// would make the recursive factorial() overflow the stack. 20! is the
29 | 	// largest factorial that fits in a uint64_t, so larger n is rejected.
30 | 	char *end;
31 | 	long num = strtol(argv[1], &end, 10);
32 | 	if (end == argv[1] || *end != '\0' || num < 0 || num > 20) {
33 | 		fprintf(stderr, "%s: expected an integer from 0 to 20\n", argv[0]);
34 | 		exit(EXIT_FAILURE);
35 | 	}
36 | 	// the function call works normally. Only an extra cast is needed
37 | 	uint64_t fact = factorial((uint64_t) num);
38 | 	// note that buf is passed to uint2str as a pointer, which the assembly
39 | 	// code will access through the addressing mode.
40 | 	uint8_t buf[BUFFER_SIZE];
41 | 	uint64_t last_index = uint2str(fact, buf, BUFFER_SIZE);
42 | 	if (last_index >= BUFFER_SIZE) {
43 | 		fprintf(stderr, "result is too large to fit in buffer\n");
44 | 		exit(EXIT_FAILURE);
45 | 	}
46 | 	buf[last_index] = '\0'; // uint2str does not null-terminate its output
47 | 	printf("%ld! = %s", num, (char *) buf); // buf already ends in a newline
48 | 	return 0;
49 | }
50 | 


--------------------------------------------------------------------------------
/8_asm_c/funcs.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; funcs.asm: implementation of functions required by main.c.
  8 | ; These were borrowed from other files in here, but we couldn't use those files
  9 | ; directly because most contain a _start definition, which conflicts
 10 | ; with C library's _start.
 11 | ; Also, we need to mark those functions as global.
 12 | 
 13 | ;Arguments from C are passed in these registers, in order:
 14 | ;Integers/pointers:  rdi, rsi, rdx, rcx, r8, r9
 15 | ;Floats:             xmm0, xmm1, xmm2, xmm3, ..., xmm7
 16 | ;Additional arguments are pushed on the stack
 17 | 
 18 | ;Return values:
 19 | ;Ints/pointers   rax or rdx:rax
 20 | ;Floats          xmm0 or xmm1:xmm0
 21 | 
 22 | ;We must preserve rbp, rbx, r12, r13, r14, r15, so we push/pop them
 23 | 
 24 | global uint2str
 25 | global factorial
 26 | 
 27 | ;extern uint64_t uint2str(uint64_t n, uint8_t *buf, uint64_t bufsize)
 28 | ;so   n       maps to rdi
 29 | ;     *buf    maps to rsi
 30 | ;     bufsize maps to rdx
 31 | ;we write n in decimal plus a trailing newline under *buf, never more than
 32 | ;bufsize bytes. we return in rax the number of bytes written.
 33 | ;   this function doesn't add a NULL BYTE: C must check that the return value
 34 | ;   is < bufsize (else buf is full, and n maybe truncated) before adding it.
 35 | uint2str:
 36 | 	push rbp
 37 | 	mov rbp, rsp
 38 | 
 39 | 	push rbx
 40 | 	push r12
 41 | 	push r13
 42 | 	push r14
 43 | 	push r15
 44 | 
 45 | 	mov r12, rdi ; n
 46 | 	mov r13, rsi ; buf
 47 | 	mov r14, rdx ; bufsize
 48 | 
 49 | 	xor r15, r15 ; i = 0 (counter)
 50 | 	mov rax, r12 ; rax = n
 51 | 
 52 | 	test r14, r14
 53 | 	jz .return	; if(bufsize == 0) there is no room for anything
 54 | 	lea rcx, [r14 - 1] ; at most bufsize - 1 digits: keep a byte for the newline
 55 | .loop:
 56 | 	; divide by 10 repeatedly to get the digits in reverse order. The n != 0
 57 | 	; test is at the bottom, so that n == 0 still produces the digit '0'.
 58 | 	cmp r15, rcx
 59 | 	jae .done	; if(i >= bufsize - 1) break; (sizes are unsigned: jae, not jge)
 60 | 
 61 | 	xor rdx, rdx
 62 | 	mov rdi, 10
 63 | 	div rdi
 64 | 	; now n%10 is in rdx and n/10 is in rax.
 65 | 	add dl, '0'
 66 | 	mov byte [r13 + r15], dl ; write that value to the string
 67 | 	inc r15		; i++
 68 | 	test rax, rax
 69 | 	jnz .loop	; while(n != 0)
 70 | .done:
 71 | 	; append a newline to the string (i <= bufsize - 1 here, so it fits)
 72 | 	mov byte [r13 + r15], 0x0a
 73 | 	inc r15
 74 | 
 75 | 	; now we've written the string to buf, but it's reversed
 76 | 	; (the last digit appears first), so we call revstr to fix it.
 77 | 	mov rdi, r13 ; buf
 78 | 	lea rsi, [r15 - 1] ; number of chars written -1 (don't include the newline)
 79 | 	call revstr
 80 | .return:
 81 | 	mov rax, r15 ; return value: the counter
 82 | 
 83 | 	pop r15
 84 | 	pop r14
 85 | 	pop r13
 86 | 	pop r12
 87 | 	pop rbx
 88 | 
 89 | 	mov rsp, rbp
 90 | 	pop rbp
 91 | 	ret
 92 | 
 93 | ; extern uint64_t factorial(uint64_t n);
 94 | ; so n maps to rdi
 95 | ; and we return via rax
 96 | factorial:
 97 | 	push rbp
 98 | 	mov rbp, rsp
 99 | 
100 | 	; rbx is the only callee-saved register changed here, so it's the only
101 | 	; one that needs saving: it keeps n alive across the recursive call.
102 | 	push rbx
103 | 
104 | 	; At function entry rsp is 8 bytes off a 16-byte boundary (the return
105 | 	; address was just pushed), and after the two pushes above it's 8 off
106 | 	; again. The ABI requires rsp to be 16-byte aligned at every call,
107 | 	; so we pad the frame with 8 more bytes.
108 | 	sub rsp, 8
109 | 
110 | 	; recursion base: 0! = 1
111 | 	mov rax, 1
112 | 	test rdi, rdi
113 | 	jz .done
114 | 
115 | 	mov rbx, rdi ; n
116 | 	dec rdi
117 | 	call factorial ; rax = (n - 1)!
118 | 
119 | 	; mul multiplies rax by its operand and writes the 128-bit product
120 | 	; to rdx:rax, so there's no need to clear rdx beforehand.
121 | 	mul rbx
122 | 
123 | 	; ideally, we should check if the multiplication above overflowed
124 | 	; (rdx != 0): the returned value is only exact for n <= 20.
125 | .done:
126 | 	add rsp, 8 ; drop the alignment padding
127 | 	pop rbx
128 | 
129 | 	mov rsp, rbp
130 | 	pop rbp
131 | 	ret
132 | 
133 | revstr: ; reverse in place the rsi bytes starting at address rdi (not exported)
134 | 	xor r8, r8 ; r8: index moving up from the first byte
135 | 	lea r9, [rsi - 1] ; r9: index moving down from the last byte
136 | 	; nothing is preserved or returned: r8, r9 and cl are overwritten.
137 | .loop:
138 | 	cmp r8, r9
139 | 	jge .done ; signed comparison: a length of 0 makes r9 == -1, so we stop at once
140 | 
141 | 	; swap s[r8] and s[r9]
142 | 	mov cl, byte [rdi + r8]
143 | 	xchg cl, byte [rdi + r9] ; cl <-> s[r9]: cl now holds the old s[r9]
144 | 	mov byte [rdi + r8], cl
145 | 
146 | 	inc r8
147 | 	dec r9
148 | 	jmp .loop
149 | .done:
150 | 	ret
151 | 
152 | ; vim: set ft=nasm:


--------------------------------------------------------------------------------
/1_io.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (1) io.asm: I/O, system calls
  8 | ; 
  9 | ; This file shows how to read from stdin and write to stdout in assembly,
 10 | ; with a simple program that writes a prompt, reads a string and writes it back.
 11 | ;
 12 | 
 13 | global _start
 14 | 
 15 | section .data
 16 | 	; The directive 'db' means 'declare bytes', while the label 'str'
 17 | 	; stores the address of the beginning of that string.
 18 | 	; Note the string declared is not null terminated, so we need
 19 | 	; to keep track of its size, which we do next (with STRSIZE).
 20 | 	; Also note that 0xA is the ascii value for Line Feed (newline, "\n").
 21 | 	;
 22 | 	str: db "Type something and I'll repeat it! (max 64 bytes)", 0xA
 23 | 
 24 | 	; The lone '$' is the address the assembler is currently at,
 25 | 	; so by subtracting 'str' from it we get the number of bytes in
 26 | 	; the declared string (and that still works if we change the string (*),
 27 | 	; since it's not a hardcoded size.)
 28 | 	;
 29 | 	STRSIZE: equ $ - str
 30 | 
 31 | 	; File descriptors.
 32 | 	; A file descriptor is a number used in Unix to refer to an open file.
 33 | 	; Some special files are always open, and have fixed file descriptors:
 34 | 	;	stdin: 0
 35 | 	;	stdout: 1
 36 | 	;	stderr: 2
 37 | 	;
 38 | 	STDIN: equ 0
 39 | 	STDOUT: equ 1
 40 | 
 41 | ; We haven't used this section before: this is for uninitialized space
 42 | ; (meaning we can't tell its contents at first), which gets reserved
 43 | ; when the program starts.
 44 | ;
 45 | section .bss
 46 | 	; resb: reserve bytes. (The argument is the number of bytes.)
 47 | 	buf: resb 64
 48 | 	BUF_SIZE: equ 64 ; keep the same number as above
 49 | 
 50 | 
 51 | section .text
 52 | _start:
 53 | 	; If you want to do anything actually useful in assembly,
 54 | 	; you'll need the OS's blessing through a system call.
 55 | 	; That's also the case for I/O: in order to write something
 56 | 	; to stdout, we'll need to use the 'write' system call.
 57 | 	; You can see the full list of system calls with
 58 | 	;	$ man 2 syscalls
 59 | 	; And you can see more info about the syscall <foo> with
 60 | 	;	$ man 2 <foo>
 61 | 	; From write's manpage:
 62 | 	;	ssize_t write(int fd, const void *buf, size_t count);
 63 | 	;	[...]
 64 | 	;	write() writes up to count bytes from the buffer starting
 65 | 	;	at buf to the file referred to by the file descriptor fd.
 66 | 	;
 67 | 	; All these mov instructions place the system call arguments
 68 | 	; where they should be (more on this later).
 69 | 	;
 70 | 	mov rax, 1 ; __NR_write (more on this later)
 71 | 	mov rdi, STDOUT ; fd
 72 | 	mov rsi, str ; buf
 73 | 	mov rdx, STRSIZE ; count
 74 | 
 75 | 	; In 64-bit mode, a system call is done with a dedicated 'syscall'
 76 | 	; instruction. In 32-bit, we'd need to use 'int 0x80'
 77 | 	; ('int' is the instruction for interrupt, 0x80 is the Linux
 78 | 	; kernel's interruption handler).
 79 | 	; The kernel knows which system call we want seeing the number
 80 | 	; in rax: it must be the number matching the system call.
 81 | 	; In 64 bits, those are defined in
 82 | 	;	/usr/include/asm/unistd_64.h
 83 | 	; as macros __NR_<foo>, where <foo> is the system call.
 84 | 	; The arguments to the system call are also passed in registers:
 85 | 	;	"The kernel interface uses %rdi, %rsi, %rdx, %r10, %r8
 86 | 	;	and %r9." (ABI, appendix A, section A.2.1)
 87 | 	; Which explains the previous 'mov' instructions.
 88 | 	;
 89 | 	syscall
 90 | 
 91 | 	; output works through a system call, and so does input,
 92 | 	; through the read system call. From read's manpage:
 93 | 	;	ssize_t read(int fd, void *buf, size_t count);
 94 | 	;	[...]
 95 | 	;	read() attempts to read up to count bytes from file
 96 | 	;	descriptor fd into the buffer starting at buf.
 97 | 	;
 98 | 	; Also, if the input has more bytes than count, the extra bytes
 99 | 	; won't be read. (*)
100 | 	;
101 | 
102 | 	; note: instead of
103 | 	;	mov rax, 0
104 | 	; we use the equivalent
105 | 	;	xor rax, rax
106 | 	; which makes an XOR of the two operands and stores the result
107 | 	; in the first one, as is usual in the Intel syntax.
108 | 	; XOR'ing a value with itself always gives zero, so those are in fact
109 | 	; equivalent.
110 | 	;
111 | 	xor rax, rax ; __NR_read == 0
112 | 	mov rdi, STDIN ; fd
113 | 	mov rsi, buf ; buf
114 | 	mov rdx, BUF_SIZE ; count
115 | 	syscall
116 | 
117 | 	; again from the read's manpage:
118 | 	;	On success, the number of bytes read is returned [...]
119 | 	;	It is not an error if this number is smaller than the
120 | 	;	number of bytes requested; this may happen for example
121 | 	;	because fewer bytes are actually available right now [...]
122 | 	;
123 | 	; We want to keep track of the return value so we can only
124 | 	; write the number of bytes we've read.
125 | 	; System calls always write their return value to rax.
126 | 	; Since we need to write the number of the next system call
127 | 	; to that register, we store the value in a different one.
128 | 	; (Note: the return value is not checked for errors here.)
129 | 	mov rbx, rax
130 | 
131 | 	; now, write it back to stdout
132 | 	mov rax, 1 ; __NR_write
133 | 	mov rdi, STDOUT
134 | 	mov rsi, buf
135 | 	mov rdx, rbx ; get the size from where we stored it
136 | 	syscall
137 | 
138 | 	; now we quit. Even that requires a system call: exit.
139 | 	mov rax, 60 ; the system call number for exit
140 | 	; The argument to exit is the status code: 0 indicates success,
141 | 	; non-zero indicates failure.
142 | 	;
143 | 	xor rdi, rdi ; status = 0
144 | 	syscall
145 | 
146 | ; Exercises
147 | ;
148 | ; === St Thomas' Wisdom ===
149 | ; Verify all claims marked with (*).
150 | ;
151 | ; === Changing Stuff and Seeing What Happens ===
152 | ;	- Scramble the mov instructions before the first SYSCALL and see
153 | ;	if it still works.
154 | ;
155 | ; === Your Turn ===
156 | ;	- Write a program that writes a prompt asking for the user's name,
157 | ;	reads it, then prints back "Hello, " followed by the name that was read.
158 | ;	Note: you can quickly debug system calls with the "strace" command.
159 | 
160 | ; vim: set ft=nasm:
161 | 


--------------------------------------------------------------------------------
/2_addr.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (2) addr.asm: addressing mode, mov and lea, etc
  8 | global _start
  9 | 
 10 | section .data
 11 | 	str:	db '0123456789',0xA
 12 | 	strsiz:	equ $ - str
 13 | 	STDOUT:	equ 1
 14 | 
 15 | section .text
 16 | _start:
 17 | 	; we want to print str several times after making changes
 18 | 	; to some of its bytes, to make sure the code works.
 19 | 	;
 20 | 	mov rdx, strsiz
 21 | 	; copy the address of str to rsi. mov "doesn't know" it's an address
 22 | 	; because the addressing mode wasn't used.
 23 | 	;
 24 | 	mov rsi, str
 25 | 	mov rdi, STDOUT
 26 | 	mov rax, 1	; __NR_write
 27 | 
 28 | 	; rdx, rsi, and rdi will not be changed through most of this code,
 29 | 	; because we'll make a lot of syscalls using these same arguments,
 30 | 	; so we might as well not touch these registers.
 31 | 	; (syscalls do not change the contents of general purpose registers,
 32 | 	; except for rcx, r11 and rax).
 33 | 	;
 34 | 	syscall
 35 | 
 36 | 	; In Intel syntax, the addressing mode is denoted by
 37 | 	; square brackets []. For example,
 38 | 	;	mov [rsi], 3
 39 | 	; This would be equivalent, in C, to
 40 | 	;	*rsi = 3;
 41 | 	; (assuming rsi were a variable you could use in C).
 42 | 	; This means we write 3 to the address whose value is in rsi,
 43 | 	; as if we were dereferencing a pointer. Needless to say,
 44 | 	; if the address is not valid, you'll get a segmentation fault.
 45 | 	;
 46 | 	; There's a problem with this, though. How many bytes do we want
 47 | 	; to write? (Keep in mind that, though rsi is 8 bytes, that's the
 48 | 	; size of the address, not the operand.)
 49 | 	; In C, we never have to ask that question, because C has types,
 50 | 	; and each type has a size in bytes. For instance:
 51 | 	;	int *x;
 52 | 	;	x = /* some valid address */
 53 | 	;	*x = 3;
 54 | 	; Here the answer to our question is obvious: the assignment writes
 55 | 	; sizeof(int) bytes. However, there's no such thing as types
 56 | 	; in assembly, so we must write the operand's size explicitly, say
 57 | 	;	mov byte [rsi], 3
 58 | 	; we could also use 'word' for 2 bytes and 'dword' (double word)
 59 | 	; for 4 bytes.
 60 | 	; (as a side note: intel syntax generally requires one to write
 61 | 	;	mov byte ptr [rsi], 3
 62 | 	; the 'ptr' is to make it explicit it's an address, but that's
 63 | 	; redundant since we're using the addressing mode anyway!
 64 | 	; So nasm decided to remove the 'ptr' keyword altogether.)
 65 | 	;
 66 | 
 67 | 	mov byte [rsi], 'a'
 68 | 	; syscalls use rax as return register, so we need to restore
 69 | 	; rax to __NR_write = 1 every time.
 70 | 	mov rax, 1
 71 | 	syscall
 72 | 
 73 | 	mov word [rsi], 'bc'
 74 | 	mov rax, 1
 75 | 	syscall
 76 | 
 77 | 	mov dword [rsi], 'defg'
 78 | 	mov rax, 1
 79 | 	syscall
 80 | 
 81 | 	; So far we've simply accessed the address at one register,
 82 | 	; but you can make arithmetic in the addressing mode; that's
 83 | 	; the main reason why this mode is useful.
 84 | 	; From the basic architecture manual:
 85 | 	; "In 64-bit mode, a memory operand can be referenced by
 86 | 	; a segment selector and an offset. [...]
 87 | 	; The offset part of a memory address in 64-bit mode can be
 88 | 	; specified directly as a static value or through an address
 89 | 	; computation made up of one or more of the following components:
 90 | 	;	Displacement -- An 8-bit, 16-bit, or 32-bit value.
 91 | 	;	Base -- The value in a 64-bit general-purpose register.
 92 | 	;	Index -- The value in a 64-bit general-purpose register.
 93 | 	;	Scale factor -- A value of 2, 4, or 8 that is multiplied
 94 | 	;	by the index value."
 95 | 	; In intel syntax, this is written as:
 96 | 	;	[base + scale * index + displacement]
 97 | 	; (most of these are optional; see below).
 98 | 	;
 99 | 	mov byte [rsi], 'h'		; base only
100 | 
101 | 	mov rax, 1
102 | 	syscall
103 | 
104 | 	mov byte [str], 'i'		; displacement only (constant)
105 | 	; displacement only
106 | 	; (constant as well; will be computed when assembling)
107 | 	mov byte [str+1], 'j'
108 | 	mov rax, 1
109 | 	syscall
110 | 
111 | 	mov rbx, 3
112 | 	mov byte [rsi+2], 'k'		; base and displacement
113 | 	mov byte [rsi+rbx], 'l'		; base and index
114 | 	mov byte [rsi+2*rbx], 'm'	; base, scale and index
115 | 	mov byte [rsi+2*rbx+2], 'n'	; base, scale, index and displacement
116 | 
117 | 	mov rax, 1
118 | 	syscall
119 | 
120 | 	; We know that
121 | 	;	mov rax, [addr_expr]
122 | 	; would be equivalent, in C, to
123 | 	;	rax = *(addr_expr);
124 | 	; meaning it copies whatever is in the address addr to rax.
125 | 	; But what if we needed to use the arithmetic provided by the
126 | 	; addressing mode, but copy the address itself, instead of its
127 | 	; contents? That's when we use the lea (load effective address):
128 | 	;	lea rax, [addr_expr]
129 | 	; would be equivalent to
130 | 	;	rax = (addr_expr);
131 | 	;
132 | 	mov rbx, 2
133 | 	lea rsi, [str+2*rbx]	; str+4
134 | 
135 | 	; because of the arithmetic possible in the addressing mode,
136 | 	; lea is sometimes used with expressions
137 | 	; which aren't addresses at all. For instance, we know that
138 | 	;	lea rcx, [rax + 2 * rbx + 8]
139 | 	; will load rcx with rax + 2 * rbx + 8. Whether this is a
140 | 	; valid address or not doesn't change anything: unlike mov,
141 | 	; lea never tries to access that location.
142 | 	; It wouldn't be possible to do such calculation in a single
143 | 	; instruction without using lea. That's not its original purpose,
144 | 	; but hey, it works!
145 | 	;
146 | 	mov rbx, -2
147 | 	lea rdx, [strsiz+2*rbx]	; use strsiz-4 as size in the write syscall
148 | 
149 | 	mov rax, 1
150 | 	syscall
151 | 
152 | 	; TODO segment registers
153 | 
154 | 	xor rdi, rdi	; rdi = 0, the first argument of the system call below
155 | 	mov rax, 60	; system call number 60
156 | 	syscall
157 | 
158 | ; Exercises
159 | ;
160 | ; === Your Turn ===
161 | ;	- Write a program that reads a string from stdin, changes the character
162 | ;	in the middle of that string to a newline, then prints it again.
163 | ;	(The "middle" can be obtained halving the string's size, which is
164 | ;	returned by the read system call.)
165 | ;
166 | ;	- Use the addressing mode to calculate 3*n + 1, where n is the value of
167 | ;	a register of your choice.
168 | ;
169 | 
170 | ; vim: set ft=nasm:
171 | 


--------------------------------------------------------------------------------
/6_libc.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (6) libc.asm: using C library functions
  8 | ;
  9 | ; Here we'll revisit our old friends main() and printf().
 10 | ; Since C files turn into assembly at some point, it makes sense to expect it
 11 | ; to be possible to call C library functions in assembly. But to do so,
 12 | ; we must link our object file against the C library, and a few things must
 13 | ; change for us to do that.
 14 | ;
 15 | ; Note: you'll need to use the -no-pie option to link this file with gcc.
 16 | ;
 17 | 
 18 | ; again, the 'global' directive tells nasm to export a symbol to the linker.
 19 | ; When we were using _start, this just saved us from a warning when linking,
 20 | ; because the linker is generally smart enough to find where _start is.
 21 | ; Now, however, if this directive isn't used, the linker will complain that
 22 | ; the C library has an undefined reference to main, and will fail. (*)
 23 | ; (See below for why we're using main instead of _start.)
 24 | ;
 25 | global main
 26 | 
 27 | ; these are the symbols we'll be using from the standard library.
 28 | ; The linker doesn't need this, but nasm does, so it won't complain about
 29 | ; undefined symbols. (*)
 30 | ;
 31 | extern printf
 32 | extern atoi
 33 | 
 34 | section .data
 35 | 	; note how we add the null byte at the end: this string will be used by
 36 | 	; printf(), so we need to do that.
 37 | 	;
 38 | 	fmt: db '%d + %d = %d',0xA,0x0
 39 | 	; also, because the string is null terminated, we don't need to
 40 | 	; keep track of its size.
 41 | 
 42 | 	usageMsg: db 'usage: %s first_number second_number',0xA,0x0
 43 | 
 44 | ; this simple program adds two numbers given as command-line arguments
 45 | ; and prints the result.
 46 | ;
 47 | section .text
 48 | ;
 49 | ; Note how we use main() here instead of _start.
 50 | ; The C language defines main() as the first function to be called, but other
 51 | ; than that - and unlike _start - main() is a normal function, which needs to be
 52 | ; called with arguments, and which returns. So, who calls main()?
 53 | ; And where is _start now?
 54 | ; The answer to both of those is: the C library.
 55 | ; Even if a C program does not use anything from the C library, it still must be
 56 | ; linked against it because that library is what provides a _start definition,
 57 | ; which will make all necessary preparations and call main().
 58 | ;
 59 | ; So if we were to use _start here, the linker could complain twice: first
 60 | ; because it can't find the definition of the 'main' symbol that the C library
 61 | ; references, second because there's two definitions of _start. (*)
 62 | ;
 63 | main:
 64 | 	push rbp
 65 | 	mov rbp, rsp
 66 | 	push rbx
 67 | 	push r12
 68 | 	push r13
 69 | 	push r14
 70 | 	push r15
 71 | 
 72 | 	; main has the prototype
 73 | 	;	int main(int argc, char **argv);
 74 | 	; Those arguments are where you would expect them: argc, being the first
 75 | 	; parameter, is in rdi, and argv, the second one, is in rsi.
 76 | 	; We store those in callee-saved registers, as usual.
 77 | 	;
 78 | 	mov r12, rdi ; argc
 79 | 	mov r13, rsi ; argv
 80 | 
 81 | 	; allocate space for two ints (4 bytes each)
 82 | 	sub rsp, 8
 83 | 
 84 | 	; Since argv[0] is the program name, the numbers we want to add were
 85 | 	; passed as strings in argv[1] and argv[2].
 86 | 	; make sure that argc == 3
 87 | 	cmp r12, 3
 88 | 	jne .fail
 89 | 
 90 | 	; convert argv[1] and argv[2] to integers
 91 | 	mov rdi, qword [r13 + 8] ; argv[1]
 92 | 	call atoi
 93 | 	mov dword [rsp], eax
 94 | 
 95 | 	mov rdi, qword [r13 + 16] ; argv[2]
 96 | 	call atoi
 97 | 	mov dword [rsp+4], eax
 98 | 
 99 | 	; add the numbers.
100 | 	; ADD won't take two memory locations as arguments, so we need to load
101 | 	; one of them into a register.
102 | 	mov ecx, dword [rsp]
103 | 	add ecx, dword [rsp+4]
104 | 
105 | 	; finally, print the value!
106 | 	; we want to call printf as:
107 | 	;	printf(fmt, num1, num2, sum);
108 | 	; (where num1 and num2 are the ints converted from argv)
109 | 	mov rdi, fmt
110 | 	mov esi, [rsp] ; the argument is 4 bytes, so we use esi instead of rsi
111 | 	mov edx, [rsp+4]
112 | 	; the sum of the two numbers is already in ecx
113 | 
114 | 	; Calling a library function is no different than calling any other
115 | 	; function: the linker figures everything out for you. But printf() is
116 | 	; variadic, so the ABI wants al = number of vector registers used.
117 | 	xor eax, eax ; no floating point arguments
118 | 	call printf
119 | 
120 | 	xor rax, rax ; exit status
121 | 	jmp .end
122 | .fail:
123 | 	mov rdi, usageMsg
124 | 	mov rsi, [r13] ; argv[0]
125 | 	xor eax, eax ; printf() is variadic: al = 0 vector registers used
126 | 	call printf
127 | 	mov rax, 1 ; exit status
128 | .end:
129 | 	add rsp, 8 ; free the two ints, so that rsp points at the saved r15 again
130 | 	pop r15
131 | 	pop r14
132 | 	pop r13
133 | 	pop r12
134 | 	pop rbx
135 | 	mov rsp, rbp
136 | 	pop rbp
137 | 	ret
137 | 
138 | ; Exercises
139 | ;
140 | ; === St Thomas' Wisdom ===
141 | ; Verify all claims marked with (*).
142 | ;
143 | ; === Reverse engineering ===
144 | ; Feed a C compiler the following code:
145 | ;
146 | ;	struct pair {
147 | ;		int a;
148 | ;		int b;
149 | ;	};
150 | ;
151 | ;	int main(int argc, char **argv)
152 | ;	{
153 | ;		struct pair p = {2, 5};
154 | ;		/* do something with p */
155 | ;		return 0;
156 | ;	}
157 | ;
158 | ; And see the resulting program's instructions. Where is p stored?
159 | ; Now, change the main function to:
160 | ;
161 | ;	int main(int argc, char **argv)
162 | ;	{
163 | ;		struct pair *p = malloc(sizeof(struct pair));
164 | ;		p->a = 2;
165 | ;		p->b = 5;
166 | ;		/* do something with p */
167 | ;		free(p);
168 | ;		return 0;
169 | ;	}
170 | ;
171 | ; Where is p stored now?
172 | ;
173 | ; === Your Turn ===
174 | ;	- Write a program that takes any number of command line arguments,
175 | ;	and prints their characters interleaved: the first character of each
176 | ;	argument must be printed in sequence, then the second, and so on.
177 | ;	For simplicity's sake, your program can refuse arguments of different
178 | ;	lengths.
179 | ;
180 | ;	- Write a program that reads a sequence of integers from standard input
181 | ;	and sorts it using the qsort() function.
182 | ;
183 | ; === Fly Higher ===
184 | ; We've shown how to use the standard C library with assembly, but you can use
185 | ; any C library in a similar fashion (as long as you link it against the final
186 | ; executable.) Write a simple assembly program that uses some library you're
187 | ; familiar with.
188 | ; If you're out of ideas, here are a few, with the suggested libraries in
189 | ; parenthesis:
190 | ;	- Create a blank GUI window, then destroy it after N seconds (SDL, XCB)
191 | ;	- Play a WAV file, or a single note with fixed duration (SDL, openAL?)
192 | ;	- Encrypt user input with AES, or compute its MD5 hash (openSSL)
193 | ;	- Fetch an HTML file from an HTTP URL (libcurl)
194 | ;
195 | ; (Note: we haven't yet explained how to deal with floating point numbers.
196 | ; If you believe you'll need to use them, feel free to skip to the next file
197 | ; and come back later.)
198 | 
199 | ; vim: set ft=nasm:
200 | 


--------------------------------------------------------------------------------
/3_jump.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (3) jump.asm: conditional and unconditional jumps, eip and eflags.
  8 | ;
  9 | ; All code written so far had instructions that were executed sequentially.
 10 | ; But almost any program needs loops (while, for) and branches (if, else, ...);
 11 | ; under the hood, those are all jumps.
 12 | ;
 13 | global _start
 14 | 
 15 | section .data
 16 | 	prompt_str: db 'Write something! (max 32 bytes)',0xA
 17 | 	PROMPT_STRSIZE: equ $ - prompt_str
 18 | 
 19 | 	unused_str: db "This string won't be printed",0xA
 20 | 	UNUSED_STRSIZE: equ $ - unused_str
 21 | 
 22 | 	less_str: db 'The input string has less than 16 bytes',0xA
 23 | 	LESS_STRSIZE: equ $ - less_str
 24 | 
 25 | 	more_str: db 'The input string has 16 bytes or more',0xA
 26 | 	MORE_STRSIZE: equ $ - more_str
 27 | 
 28 | section .bss
 29 | 	buf: resb 32
 30 | 	BUF_SIZE: equ 32 ; keep the same as above
 31 | 
 32 | section .text
 33 | _start:
 34 | 	; write the prompt string
 35 | 	mov rax, 1 ; __NR_write
 36 | 	mov rdi, 1 ; stdout
 37 | 	mov rsi, prompt_str
 38 | 	mov rdx, PROMPT_STRSIZE
 39 | 	syscall
 40 | 
 41 | 	; jmp: unconditional jump. This instruction makes execution deviate from
 42 | 	; its usual path, in such a way that the instruction after my_label
 43 | 	; will be executed next. (This is the same as 'goto' in C.)
 44 | 	; But how does that work?
 45 | 	; From the basic architecture manual, section 3.5:
 46 | 	;
 47 | 	; "The instruction pointer (EIP) register contains the offset in the
 48 | 	; current code segment for the next instruction to be executed.
 49 | 	; It is advanced from one instruction boundary to the next
 50 | 	; in straight-line code or it is moved ahead or backwards by a number
 51 | 	; of instructions [...]"
 52 | 	;
 53 | 	; So the address of the next instruction is stored in this special
 54 | 	; register, eip. The manual goes on:
 55 | 	;
 56 | 	; "The EIP register cannot be accessed directly by software; it is
 57 | 	; controlled implicitly by control-transfer instructions
 58 | 	; (such as JMP, Jcc, CALL, and RET), interrupts, and exceptions."
 59 | 	;
 60 | 	; This means we can't use eip with the mov instruction, i.e.
 61 | 	; 'mov eip, <label>' is illegal. However, 'jmp <label>' has the same
 62 | 	; effect as that.
 63 | 	;
 64 | 	jmp my_label
 65 | 
 66 | 	; dead code: the jmp above always skips this write, so unused_str
 67 | 	; never reaches stdout. (Without the jump, it would be printed.)
 68 | 	mov rdx, UNUSED_STRSIZE
 69 | 	mov rsi, unused_str
 70 | 	mov rdi, 1
 71 | 	mov rax, 1
 72 | 	syscall
 73 | 
 74 | 	; a jump needs a label as its destination.
 75 | 	; In the .text section, a label stands for the address of the
 76 | 	; instruction that comes right after it.
 77 | my_label:
 78 | 	; read the user's input: read(stdin, buf, BUF_SIZE)
 79 | 	mov rdx, BUF_SIZE
 80 | 	mov rsi, buf
 81 | 	xor rdi, rdi ; stdin == 0
 82 | 	xor rax, rax ; __NR_read == 0
 83 | 	syscall
 84 | 
 85 | 	; read returns the number of bytes it stored, in rax.
 86 | 	; Keep a copy in rbx, which the system calls that follow
 87 | 	; leave untouched.
 88 | 	mov rbx, rax
 89 | 
 90 | 	; now we see whether the input has less than 16 bytes.
 91 | 	; we compare the input size in rbx with 16, through 'cmp'.
 92 | 	;
 93 | 	; cmp subtracts the operands, but discards the result, storing only
 94 | 	; some info about it in a special register: eflags. (*)
 95 | 	; eflags stores several flags, but we're mostly interested in two:
 96 | 	; the zero flag and the sign flag.
 97 | 	; From the basic architecture manual, section 3.4.3.1:
 98 | 	;
 99 | 	; "ZF (bit 6) Zero flag - Set if the result is zero; cleared otherwise.
100 | 	; SF (bit 7) Sign flag - Set equal to the most-significant bit of
101 | 	; the result, which is the sign bit of a signed integer.
102 | 	; (0 indicates a positive value and 1 indicates a negative value.)"
103 | 	; 
104 | 	; These two flags, along with 'cmp', are enough to know whether
105 | 	; x - y == 0 <=> x == y and whether x - y < 0 <=> x < y.
106 | 	; This is enough to compare two integers in any possible way.
107 | 	;
108 | 	cmp rbx, 16
109 | 
110 | 	; conditional jump: if the previous comparison instruction gives a
111 | 	; 'less than' result - that is, if the sign flag is 1 - jump to the
112 | 	; address stored in the operand label; otherwise, execute the
113 | 	; next instruction as usual. (*)
114 | 	; Just like with 'jmp', this also changes the value of eip.
115 | 	; There are several instructions for conditional jumps, which are
116 | 	; grouped as 'Jcc' in the instruction set. For the 'jl' instruction,
117 | 	; the instruction set says:
118 | 	; "JL rel8	[...]		Jump short if less (SF != OF)"
119 | 	jl less_16
120 | 
121 | 	; if we're here, the previous jump didn't happen, so the input must
122 | 	; have 16 bytes or more.
123 | 	mov rax, 1 ; __NR_write
124 | 	mov rdi, 1 ; stdout
125 | 	mov rsi, more_str
126 | 	mov rdx, MORE_STRSIZE
127 | 	syscall
128 | 
129 | 	; if it weren't for the jump below, the instructions after the label
130 | 	; less_16 would be executed too, so the program would print
131 | 	; that the input has less than 16 bytes, but also that it has
132 | 	; 16 or more. That's not what we want.
133 | 	; Note that these two jumps effectively implement an if-else; in C,
134 | 	; this would be something like
135 | 	;	if(size >= 16) {
136 | 	;		/* print more_str */
137 | 	;	} else {
138 | 	;		/* print less_str */
139 | 	;	}
140 | 	;
141 | 	jmp size_printed
142 | 
143 | less_16:
144 | 	; reached only through 'jl less_16', i.e. when fewer than 16 bytes
145 | 	; were read: write(stdout, less_str, LESS_STRSIZE)
146 | 	mov rdx, LESS_STRSIZE ; number of bytes
147 | 	mov rsi, less_str ; address of the text
148 | 	mov rdi, 1 ; stdout
149 | 	mov rax, 1 ; __NR_write
150 | 	syscall
151 | 
152 | size_printed:
153 | 	; iterate over the input in buf, adding 1 to each byte.
154 | 	; but first, we temporarily decrease the size held in rbx by 1,
155 | 	; because we don't want to touch its last character (the newline).
156 | 	dec rbx ; dec: decrement
157 | 
158 | 	; The following code is equivalent to:
159 | 	;	for(i = 0; i < size; i++) {
160 | 	;		buf[i]++;
161 | 	;	}
162 | 	; where i lives in rcx and size in rbx.
163 | 	xor rcx, rcx ; i = 0
164 | loop_begin:
165 | 	; if the condition i < size is false, i.e. if i >= size,
166 | 	; get out of the loop at once. If we only made this comparison
167 | 	; at the bottom of the loop, we would have something akin to a
168 | 	; do ... while instead.
169 | 	;
170 | 	cmp rcx, rbx
171 | 	jge loop_done ; signed comparison: taken when rcx >= rbx
172 | 
173 | 	inc byte [buf + rcx] ; inc: increment (++)
174 | 
175 | 	inc rcx ; i++
176 | 
177 | 	; jump to loop_begin's address, starting a new iteration of the loop
178 | 	jmp loop_begin
179 | 
180 | loop_done:
181 | 	; undo the earlier 'dec rbx': rbx is the full input size again
182 | 	inc rbx
183 | 
184 | 	; echo the modified buffer: write(stdout, buf, rbx)
185 | 	mov rdx, rbx ; number of bytes
186 | 	mov rsi, buf ; address of the data
187 | 	mov rdi, 1 ; STDOUT
188 | 	mov rax, 1 ; __NR_write
189 | 	syscall
190 | 
191 | 	; all done: exit(0)
192 | 	xor rdi, rdi ; exit status 0
193 | 	mov rax, 60 ; __NR_exit
194 | 	syscall
195 | 
196 | ; Exercises
197 | ;
198 | ; === St Thomas' Wisdom ===
199 | ; Verify all claims marked with (*).
200 | ;	- You can see how CMP affects the eflags register by printing its
201 | ;	value in gdb, and you can see that JL works as stated by putting a
202 | ;	breakpoint on it and stepping one instruction.
203 | ;
204 | ; === Learn to Love Your Compiler ===
205 | ; Write the following pseudocode in your favorite *compiled* language:
206 | ;	n <- 0
207 | ;	for i in range 1 to 9 inclusive do
208 | ;		n += i
209 | ;	end
210 | ;	print n
211 | ;
212 | ; Now inspect the program generated with "objdump -d".
213 | ; Do you see any jumps in the corresponding assembly code? If you do, try out
214 | ; a more aggressive optimization level (consult your compiler's documentation).
215 | ; If you don't, that's because the compiler can figure out that the loop's
216 | ; index values are all known at compile time, so it transforms the code into
217 | ;	n <- 0
218 | ;	n += 1
219 | ;	n += 2
220 | ;	...
221 | ;	n += 9
222 | ;	print n
223 | ;
224 | ; This is called *loop unrolling*. Then, it figures that all those computations
225 | ; can also be done in compile time, and finally turns that into
226 | ;	n <- 45
227 | ;	print n
228 | ;
229 | ; === Your Turn ===
230 | ;	- Max: write a program that, given an array of integers, places the
231 | ;	largest of them in a certain register. (The array can be hardcoded,
232 | ;	i.e. declared in the .data section)
233 | ;
234 | ;	- ROT13: write a program that reads a string, then replaces all
235 | ;	of its characters in range a-z (lowercase) with its ROT13 equivalent,
236 | ;	and prints the string. Bonus points if you do it for uppercase A-Z too.
237 | ;	Characters not in range a-z (possibly A-Z as well) must be unchanged.
238 | ;
239 | ; Bonus
240 | ;
241 | ; === Race Against the Compiler ===
242 | ; Write the same program, both in assembly and in your favourite *compiled*
243 | ; programming language, and compare the execution times of both.
244 | ; (Interpreters have the overhead of parsing the code at runtime, so it
245 | ; wouldn't be fair. Java counts as interpreted here, because it translates
246 | ; its bytecode to native instructions at runtime.)
247 | ; Use hardcoded values as input and do NOT print any results, so the timing
248 | ; measurements won't be tainted by I/O.
249 | ;	- The program must compute the exponentiation of two unsigned integers,
250 | ;	using the square and multiply algorithm.
251 | ;	
252 | 
253 | ; vim: set ft=nasm:
254 | 


--------------------------------------------------------------------------------
/0_basic.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | 
  6 | ; (0) basic: basic assembly stuff.
  7 | ; The program doesn't do anything useful, it just explains the outline
  8 | ; of assembly code.
  9 | ;
 10 | ; Please read (at least) the sections 'System requirements' and
 11 | ; 'References' of README first.
 12 | ;
 13 | 
 14 | ; This is an assembler directive, i.e. an action to be taken when assembling,
 15 | ; not when executing.
 16 | ; This tells the assembler to export the '_start' symbol so the
 17 | ; linker will be able find it later.
 18 | ;
 19 | global _start
 20 | 
 21 | ; Every program in ELF (Executable and Linking Format, default in Linux)
 22 | ; has several sections. You can see them with
 23 | ;       $ objdump -h <path_to_program>
 24 | ; There are many, but the sections we'll use are
 25 | ;
 26 | ;       data: global variables
 27 | ;	rodata: global constants (read-only data)
 28 | ;       bss: space reserved at program startup
 29 | ;       text: CPU instructions (more on that later)
 30 | ;
 31 | ; These directives are used to specify their respective sections.
 32 | ;
 33 | section .data
 34 | 	; Anything that's at the beginning of a line and is followed by
 35 | 	; a colon ':' is a label. Labels generally store addresses.
 36 | 	; here my_arr stores the address of the first byte declared by
 37 | 	; the 'db' (declare bytes) directive.
 38 | 	; In other words, this would be equivalent to the C code
 39 | 	;
 40 | 	;	char my_arr[] = {0x12,0x34,0x56,0x78,0x90};
 41 | 	;
 42 | 	; Note that db writes its arguments to the resulting program,
 43 | 	; i.e. a hexdump of the executable would show these bytes. (*)
 44 | 	;
 45 | 	my_arr: db 0x12,0x34,0x56,0x78,0x90
 46 | 
 47 | 	; just as there is db, there's also
 48 | 	;
 49 | 	;	dw: declare word (2 bytes),
 50 | 	;	dd: declare doubleword (4 bytes)
 51 | 	;	dq: declare quadword (8 bytes)
 52 | 	;
 53 | 	; ...among others, but these are the ones we'll use most often.
 54 | 	; (See the NASM manual, section 3.2.1 for the full list.)
 55 | 	;
 56 | 	; It's important to note that all those are little-endian,
 57 | 	; meaning that the bytes' order gets "reversed": the least significant
 58 | 	; byte of the multi-byte value goes first. (*)
 59 | 	;
 60 | 	little_endian_beef: dw 0xbeef ; becomes 0xef 0xbe, in that order
 61 | 
 62 | 	; if we use dw, dd, ..., with fewer bytes than they expect, the rest
 63 | 	; of the bytes get filled with zeroes.
 64 | 	;
 65 | 	filled_with_zero: dw 0x42 ; becomes 0x42 0x00, in that order
 66 | 
 67 | 	; becomes 0x76 0x98 0x32 0x54 0xAA 0x00, in that order
 68 | 	my_arr2: dw 0x9876, 0x5432, 0xAA
 69 | 	my_arr3: dd 0xdeadbeef, 0xc0ffee ; 0xc0ffee -> 0xee 0xff 0xc0 0x00
 70 | 	my_arr4: dq 0x0102030405060708, 0x090a0b0c0d0e0f00 ; 0x08 0x07 ... 0x01, then 0x00 0x0f ... 0x09
 71 | 
 72 | 	; the equ directive gives a name to the value of an expression.
 73 | 	; Because this is an assembler directive, UNUSED is not written to
 74 | 	; the resulting program. This is similar to #define in C.
 75 | 	;
 76 | 	UNUSED: equ 3
 77 | 
 78 | section .text
 79 | ; The _start label has a special meaning: it's the program's entry point,
 80 | ; i.e. the first instruction to be executed is at this address.
 81 | ;
 82 | _start:
 83 | 	; All these are instructions, i.e. operations the CPU knows how
 84 | 	; to carry out directly. There's a full list of them in the
 85 | 	; instruction set, but we'll only use a dozen or so.
 86 | 	; Instructions are separated from their operands by whitespace,
 87 | 	; and the operands are separated from each other with commas, like so:
 88 | 	;	<instr> <operand1>, <operand2>, ..., <operand_n>
 89 | 	; The following instructions are 'mov', which simply copies data.
 90 | 	; In case of such instructions, which have a source and
 91 | 	; a destination operand, the Intel syntax (which nasm uses)
 92 | 	; dictates the first operand is the destination, and the second is
 93 | 	; the source:
 94 | 	;	<instr> DEST, SOURCE
 95 | 	; Generally, the source and destination operands can be either
 96 | 	; an address or a register - a small storage that lives inside
 97 | 	; the CPU. The source can also be an immediate value, i.e.
 98 | 	; a simple number.
 99 | 	;
100 | 	mov rax, 0 ; moves the value 0 to register rax
101 | 
102 | 	; There are several registers available on x86_64. Some serve
103 | 	; specific purposes (e.g. registers for storing floating point
104 | 	; numbers), while others are called "general purpose" registers.
105 | 	; There are 16 of them:
106 | 	;
107 | 	;	rax: accumulator
108 | 	;	rbx: base
109 | 	;	rcx: counter
110 | 	;	rdx: data
111 | 	;	rsp and rbp: stack pointer and base pointer
112 | 	;	rsi and rdi: source and destination index
113 | 	;	r8 through r15: lack of creativity
114 | 	;
115 | 	; The prefix 'r' in all those means we want to use all 64 bits
116 | 	; in the registers. For all those, except r8 through r15,
117 | 	; it's possible to access:
118 | 	;	- the lowest 32 bits with 'e' prefix, e.g. eax, ebp
119 | 	;	- the lowest 16 bits without any prefix, e.g. ax, si
120 | 	; Also, for registers rax through rdx, it's possible to access:
121 | 	;	- the lowest byte with the 'l' suffix, replacing the
122 | 	;	trailing 'x', e.g. al
123 | 	;	- the highest byte in the 16 bits with the 'h' suffix, in
124 | 	;	the same way as above, e.g. ah
125 | 	; This is summarized in figure 3-5 of section 3.4.1,
126 | 	; basic architecture. These first 8 named registers and their extra
127 | 	; access modes are historical, dating back to the first Intel processors.
128 | 	;
129 | 	; (Note: we use 'byte' as a synonym of '8 bits', because it
130 | 	; indeed is in the x86 architecture.)
131 | 	;
132 | 	mov eax, 0x12345678 ; copies 4 bytes to eax
133 | 	; the next mov still copies 4 bytes to eax: the remaining 2 bytes are
134 | 	; filled with zeroes, i.e. it is the same as
135 | 	;	mov eax, 0x0000abcd
136 | 	; note that this is different from 'mov ax, 0xabcd'
137 | 	; in that the latter instruction would only change the lowest
138 | 	; 2 bytes of the register.
139 | 	;
140 | 	mov eax, 0xabcd
141 | 
142 | 	; copies to the lowest byte: now ax will be 0xab12 (*)
143 | 	mov al, 0x12 
144 | 	; copies to the higher byte of ax: now ax will be 0x3412 (*)
145 | 	mov ah, 0x34
146 | 
147 | 	; and of course, you can make arithmetic too.
148 | 
149 | 	mov rdi, 10
150 | 	mov rsi, 7
151 | 	mov rbx, 14
152 | 
153 | 	inc rdi		; INC: increment (rdi: 10 -> 11)
154 | 	dec rsi		; DEC: decrement (rsi: 7 -> 6)
155 | 
156 | 	; ADD: adds the two operands and stores the result in the destination
157 | 	; one (again, that's the first one, because we're using Intel syntax.)
158 | 	;
159 | 	add rdi, rbx	; Equivalent to rdi += rbx (11 + 14 = 25)
160 | 
161 | 	sub rsi, rbx	; SUB: subtract. Equivalent to rsi -= rbx (6 - 14 = -8)
162 | 
163 | 	; Naturally, we also have instructions for multiplying and dividing
164 | 	; integers, but they come with a few catches.
165 | 	;
166 | 	; First, there's two variants for each: MUL and DIV interpret their
167 | 	; operands as unsigned integers, while IMUL and IDIV interpret their
168 | 	; operands as signed integers in two's complement.
169 | 	; (This changes whether or not the operands' most significant bits are
170 | 	; interpreted as sign bits).
171 | 	;
172 | 	; Second, while both multiplication and division need two numbers,
173 | 	; the MUL and DIV instructions take a single operand because they use
174 | 	; fixed registers for the other number.
175 | 	; For example, when a 64-bit operand is used in
176 | 	;
177 | 	;	- MUL, the result is rax * <operand>, and it's a 128-bit value
178 | 	;	stored in rdx:rax - meaning the 64 upper bits are stored in rdx,
179 | 	;	while the 64 lower bits are stored in rax.
180 | 	;
181 | 	;	- DIV, the operand is the divisor and the dividend is rdx:rax,
182 | 	;	meaning it's a 128-bit value whose 64 upper bits are in rdx and
183 | 	;	whose 64 lower bits are in rax. The quotient is a 64-bit value
184 | 	;	stored in rax, and the remainder is also a 64-bit value, stored
185 | 	;	in rdx.
186 | 	;
187 | 	mov rax, 7 ; first factor: MUL always multiplies by rax
188 | 	mov rdx, 4 ; will be overwritten by MUL
189 | 	mov rdi, 3 ; second factor, passed as MUL's operand
190 | 
191 | 	mul rdi
192 | 	; here, rax <- 3 * 7 = 21, rdx <- 0 (*)
193 | 
194 | 	mov rax, 22 ; lower 64 bits of the dividend
195 | 	mov rdx, 0 ; upper 64 bits of the dividend rdx:rax
196 | 	mov rdi, 4 ; divisor, passed as DIV's operand
197 | 
198 | 	div rdi
199 | 	; here, rax is floor(22 / 4) = 5, and rdx is 22 mod 4 = 2 (*)
200 | 
201 | 	; finally, we have bitwise operations too.
202 | 	mov rdi, 0x35
203 | 	mov rsi, 0x44
204 | 
205 | 	and rdi, rsi	; bitwise AND: rdi = 0x35 & 0x44 = 0x04
206 | 	or rdi, rsi	; bitwise OR: rdi = 0x04 | 0x44 = 0x44
207 | 	xor rdi, rsi	; bitwise XOR: rdi = 0x44 ^ 0x44 = 0
208 | 
209 | 	shr rsi, 2	; right (logical) bitshift: equivalent to rsi >> 2 (0x44 -> 0x11)
210 | 	shl rsi, 3	; left (logical) bitshift: equivalent to rsi << 3 (0x11 -> 0x88)
211 | 
212 | 	; Note that there's SAR for arithmetic right shift.
213 | 	; There's also SAL, but it's equivalent to SHL.
214 | 
215 | 	; leave the program cleanly through a system call;
216 | 	; the next file explains how system calls work.
217 | 	;
218 | 	xor rdi, rdi ; rdi = 0
219 | 	mov rax, 60 ; system call number 60
220 | 	syscall
221 | 
222 | ; Exercises
223 | ;
224 | ; === First Things First ===
225 | ; Assemble and link this file into a program, then run it.
226 | ; (The program should do nothing other than exit cleanly)
227 | ;
228 | ; === St Thomas' Wisdom ===
229 | ; Verify all claims marked with (*).
230 | ;	- Print a hexdump of the program to verify db, dw, etc. work as stated,
231 | ;	including the endianness.
232 | ;	- Run the program in gdb to verify that the instructions work as stated,
233 | ;	stepping through each one and printing the affected registers' value
234 | ;	as needed. (Refer to the "Debugging" section of README.md to learn how.)
235 | ;
236 | ; === Changing Stuff and Seeing What Happens ===
237 | ;	- Comment out the syscall instruction and run again.
238 | ;	- Change DIV's operand to zero and run again.
239 | ;
240 | 
241 | ; vim: set ft=nasm:
242 | 


--------------------------------------------------------------------------------
/4_leaf.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (4) leaf.asm: leaf functions, calling conventions, and the stack
  8 | ;
  9 | ; We've already seen how branches and loops work under the hood;
 10 | ; now let's do the same for functions.
 11 | ; For now, we'll only deal with _leaf_ functions -- that is, functions that
 12 | ; do not call other functions. Non-leaf functions require some extra work and
 13 | ; will be covered in the next file.
 14 | 
 15 | global _start
 16 | 
 17 | section .data
 18 | 	STR: db 'this string will be reversed!',0xA
 19 | 	STRSIZE: equ $ - STR ; size of STR in bytes, trailing newline included
 20 | 
 21 | section .text
 22 | _start:
 23 | 	; So far, all the code we had was in _start; now some of it goes into
 24 | 	; a separate function. We'll call our function revstr to reverse STR
 25 | 	; before writing it to stdout.
 26 | 	; (You can see the parameters revstr takes and its return type below.)
 27 | 	;
 28 | 	; Calling a function is done simply with the 'call' instruction
 29 | 	; (more on this below). But where do the parameters go? And where does
 30 | 	; the function put its return value?
 31 | 	; Each operating system has a calling convention, which answers these
 32 | 	; questions; it is defined in the OS's Application Binary Interface
 33 | 	; (ABI). Linux follows the Unix System V ABI, which states, in section
 34 | 	; 3.2.3 (Parameter Passing):
 35 | 	;
 36 | 	; "2. If the class is INTEGER, the next available register of the sequence
 37 | 	;  %rdi, %rsi, %rdx, %rcx, %r8 and %r9 is used"
 38 | 	;
 39 | 	; (Note that the sequence is similar, but slightly different than that
 40 | 	; used for system calls.) The INTEGER class is defined as
 41 | 	;
 42 | 	; "INTEGER This class consists of integral types that fit into one of
 43 | 	; the general purpose registers."
 44 | 	;
 45 | 	; Since we're not dealing with floats yet, or with absurdly large
 46 | 	; values, all our parameters fit that definition.
 47 | 	; (Again, assembly doesn't have types, so pretty much everything can be
 48 | 	; seen as an "integral type", including pointers.)
 49 | 	; As for the return value:
 50 | 	;
 51 | 	; "3. If the class is INTEGER, the next available register of the
 52 | 	; sequence %rax, %rdx is used."
 53 | 	;
 54 | 	; Note that this allows us to return two 64-bit values, or a single
 55 | 	; 128-bit value, by placing its upper bits in rdx and its lower bits
 56 | 	; on rax. This isn't needed in most cases, though.
 57 | 	;
 58 | 	mov rsi, STRSIZE - 1	; 2nd arg in rsi (-1: don't include the newline)
 59 | 	mov rdi, STR		; 1st arg in rdi
 60 | 
 61 | 	; Now that the arguments are in the right place, we call the function.
 62 | 	; All it takes is using the CALL instruction, with the label that stores
 63 | 	; the address of the beginning of the function.
 64 | 	; But how does CALL work?
 65 | 	;
 66 | 	; Calling a function is a lot like an unconditional jump, because
 67 | 	; the next instruction we want to execute will be at the beginning of
 68 | 	; the function we want to call. There's only one catch: when the
 69 | 	; function returns, we have to go back to the instruction that was right
 70 | 	; after the call. So we could implement a call as a modified 'jmp' that
 71 | 	; stores the return address somewhere. And where is that?
 72 | 	;
 73 | 	; If you ever heard an explanation of how a recursive implementation
 74 | 	; of the Fibonacci sequence or the factorial function works,
 75 | 	; you may have heard that each function's variables are stored in a stack.
 76 | 	; Well, that's the one. The call stack, or execution stack for some.
 77 | 	; (From now on, we'll just refer to it as "the stack".)
 78 | 	; It's also used to store the return addresses: it makes sense to store
 79 | 	; return addresses in a stack, because if function A calls B
 80 | 	; which calls C which calls D, when D returns, we expect it to return
 81 | 	; to C, meaning we must jump to the return address saved last, which is
 82 | 	; a last-in-first-out policy.
 83 | 	;
 84 | 	; Therefore, calling a function means pushing the next instruction's
 85 | 	; address (i.e. the contents of RIP) to the stack, then jumping to
 86 | 	; (setting RIP to) the argument supplied. (*)
 87 | 	; Indeed, from the instruction set:
 88 | 	;
 89 | 	; "
 90 | 	;[...]
 91 | 	;ELSE (* Near absolute call *)
 92 | 	;	IF OperandSize = 64
 93 | 	;	THEN
 94 | 	;		tempRIP <- DEST; (* DEST is r/m64 *)
 95 | 	;	IF stack not large enough for a 8-byte return address
 96 | 	;	THEN #SS(0); FI;
 97 | 	;	Push(RIP);
 98 | 	;	RIP <- tempRIP;
 99 | 	;	FI;
100 | 	; "
101 | 	; (Note: we omit the difference between near/far and absolute/relative
102 | 	; calls for now. 'call <label>' is near and relative; Push(RIP) is the same.)
103 | 	;
104 | 	call revstr
105 | 
106 | 	; print the string after the call: write(stdout, STR, STRSIZE)
107 | 	mov rdx, STRSIZE ; number of bytes
108 | 	mov rsi, STR ; address of the data
109 | 	mov rdi, 1 ; stdout
110 | 	mov rax, 1 ; __NR_write
111 | 	syscall
112 | 
113 | 	; exit(0)
114 | 	xor rdi, rdi ; exit status 0
115 | 	mov rax, 60 ; __NR_exit
116 | 	syscall
117 | 
118 | ; void revstr(char *s, size_t size);
119 | ;	reverses the bytes of the array s, which has size bytes.
120 | ;	(s arrives in rdi and size in rsi; r8, r9 and cl are overwritten.)
121 | ; A function definition starts with a label, which we can use to call it.
122 | ; Note that the assembler doesn't know whether this label is a function
123 | ; or something else; labels are only meant to store addresses,
124 | ; and those addresses could be of anything.
125 | ; It's our use of this address (with the CALL instruction) that makes it
126 | ; a function.
127 | ;
128 | revstr:
129 | 	; There are some registers which functions are not allowed to change.
130 | 	; Those are the callee-saved registers, and which registers have that
131 | 	; property depends on the ABI.
132 | 	; From SysV ABI's figure 3.4, they are rbx, rbp, rsp and r12 through r15.
133 | 	; In leaf functions, all it takes to make sure these registers won't
134 | 	; change is not writing to them.
135 | 	; (Callee-saved registers will be explained further in the next file.)
136 | 	;
137 | 	xor r8, r8 ; r8: index of the left byte, starts at 0
138 | 	lea r9, [rsi - 1] ; r9: index of the right byte, starts at size - 1
139 | 
140 | 	; labels starting with a dot '.' are local labels.
141 | 	; You can define several local labels with the same name in separate
142 | 	; functions and they won't conflict.
143 | 	; From the NASM manual, section 3.9: "
144 | 	;
145 | 	; label1 ; some code
146 | 	; .loop
147 | 	; [...]
148 | 	;
149 | 	; label2 ; some code
150 | 	; .loop
151 | 	; [...]
152 | 	;
153 | 	; [...] This is achieved by means of defining a local label
154 | 	; in terms of the previous non-local label: the first definition of
155 | 	; .loop above is really defining a symbol called label1.loop,
156 | 	; and the second defines a symbol called label2.loop." (*)
157 | .loop:
158 | 	cmp r8, r9
159 | 	jge .done ; signed comparison: stop once r8 >= r9
160 | 
161 | 	; swap s[r8] and s[r9]
162 | 	; XCHG exchanges two values. Unfortunately, at least one of them has
163 | 	; to be a register, or else we could simply do
164 | 	; 'xchg byte [rdi + r8], byte [rdi + r9]'
165 | 	;
166 | 	mov cl, byte [rdi + r8] ; cl = s[r8]
167 | 	xchg cl, byte [rdi + r9] ; s[r9] = old s[r8], cl = old s[r9]
168 | 	mov byte [rdi + r8], cl ; s[r8] = old s[r9]
169 | 
170 | 	inc r8
171 | 	dec r9
172 | 	jmp .loop
173 | .done:
174 | 	; Return from the function: pop the address stored in the stack
175 | 	; and set RIP to that. From the instruction set:
176 | 	;
177 | 	;IF instruction = near return
178 | 	;THEN;
179 | 	;	IF OperandSize = 32
180 | 	; [...]
181 | 	;	ELSE IF OperandSize = 64
182 | 	;	THEN
183 | 	;		IF top 8 bytes of stack not within stack limits
184 | 	;			THEN #SS(0); FI;
185 | 	;			RIP <- Pop();
186 | 	;
187 | 	; Note that if a malicious agent manages to change the value
188 | 	; corresponding to the return address in the stack, they can deviate
189 | 	; execution to run malicious instructions: this is called stack smashing.
190 | 	; Many protection mechanisms have been devised to prevent it.
191 | 	;
192 | 	ret
193 | 
194 | ; Exercises
195 | ; (Note: all functions referred to here must be leaf functions.)
196 | ;
197 | ; === St Thomas' Wisdom ===
198 | ; Verify all claims marked with (*).
199 | ;
200 | ; === Learn to Love Your Compiler ===
201 | ; Write the following pseudocode in your favorite *compiled* language:
202 | ;
203 | ;	function halve_truncate(integer i) -> integer
204 | ;		return floor(i / 2);
205 | ;	end function
206 | ;	read integer n from standard input
207 | ;	print halve_truncate(n)
208 | ;
209 | ; Now inspect the program generated with "objdump -d". Do you see any call to
210 | ; halve_truncate? If you do, try out a more aggressive optimization level
211 | ; (consult your compiler's documentation). If you don't, that's because the
212 | ; compiler realized the function is so small and it's only called from one
213 | ; place, there's no good reason to pay the overhead of calling it.
214 | ; Instead, it "copy-pastes" the function's code and places it where the
215 | ; function call once was, like so:
216 | ;
217 | ;	read integer n from standard input
218 | ;	print floor(n / 2)
219 | ;
220 | ; This is called *inlining*.
221 | ;
222 | ; === Your Turn ===
223 | ;	- Write a function that receives an unsigned integer, and returns the
224 | ;	number of steps required for that number to become 1 with successive
225 | ;	applications of the Collatz function:
226 | ;		f(n) = n/2      if n is even
227 | ;		       3*n + 1  if n is odd
228 | ;
229 | ;	Bonus points if you use dynamic programming (storing the intermediate
230 | ;	results in a huge array, which can be declared in the .data section.)
231 | ;	Note: because the function must be leaf, you're required to implement
232 | ;	an iterative function (rather than a recursive one).
233 | ;
234 | ;	- Write a program that calls that function for several numbers,
235 | ;	at your choice. Run it in gdb to see your function's return values.
236 | ;
237 | ; === Pointless Constraints Make Amusing Puzzles ===
238 | ;	- Make a function call using only the instructions PUSH and RET,
239 | ;	and labels.
240 | ;	Note that the function is still expected to return to its caller in the
241 | ;	usual way - with a RET instruction at its end - and it must return to
242 | ;	the instruction immediately following the last one that caused
243 | ;	the function to be called.
244 | ;
245 | ;	- Write a program that reads a string, then calls a function that
246 | ;	replaces all its characters with '+' if the string's size is even,
247 | ;	but calls a different function that replaces them with '-' if the
248 | ;	size is odd. However, the program must contain *a single* CALL
249 | ;	instruction, and you're not allowed to use jumps instead of CALL,
250 | ;	or call the functions with any instruction that isn't CALL.
251 | ;	You'll probably need to read the instruction set's entry for CALL here.
252 | ;	(Hint: use two CALLs first, worry about the constraint later.)
253 | 
254 | ; vim: set ft=nasm:
255 | 


--------------------------------------------------------------------------------
/5_nonleaf.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (5) nonleaf.asm: non-leaf functions and callee-saved registers
  8 | ;
  9 | ; Non-leaf functions require some more work than leaf functions.
 10 | ; We present them here. With this, you should be able to write any function
 11 | ; in assembly x86_64.
 12 | 
 13 | global _start
 14 | 
 15 | section .bss
 16 | 	BUF_SIZ: equ 64 ; size in bytes of each buffer below
 17 | 	buf1: resb BUF_SIZ ; receives the first uint2str result
 18 | 	buf2: resb BUF_SIZ ; receives the second uint2str result
 19 | 
 20 | section .text
 21 | _start:
 22 | 	; First call: uint2str(40579, buf1, BUF_SIZ), which turns an unsigned
 23 | 	; integer into its string form.
 24 | 	; A non-leaf function is called exactly like a leaf one.
 25 | 	;
 26 | 	mov rdx, BUF_SIZ
 27 | 	mov rsi, buf1
 28 | 	mov rdi, 40579
 29 | 	call uint2str
 30 | 
 31 | 	; print buf1: write(stdout, buf1, <bytes written by uint2str>)
 32 | 	mov rdx, rax ; uint2str's return value: the number of bytes written
 33 | 	mov rsi, buf1
 34 | 	mov rdi, 1 ; stdout
 35 | 	mov rax, 1 ; __NR_write
 36 | 	syscall
 37 | 
 38 | 	; factorial takes a single argument.
 39 | 	mov rdi, 5
 40 | 	call factorial
 41 | 
 42 | 	; factorial's return value is in rax; hand it to uint2str as its
 43 | 	; first parameter so that it gets converted to a string.
 44 | 	mov rdx, BUF_SIZ
 45 | 	mov rsi, buf2
 46 | 	mov rdi, rax
 47 | 	call uint2str
 48 | 
 49 | 	; print buf2, in the same way as buf1.
 50 | 	mov rdx, rax
 51 | 	mov rsi, buf2
 52 | 	mov rdi, 1 ; stdout
 53 | 	mov rax, 1 ; __NR_write
 54 | 	syscall
 55 | 
 56 | 	; exit(0)
 57 | 	xor rdi, rdi
 58 | 	mov rax, 60
 59 | 	syscall
 60 | 
 61 | ; size_t uint2str(unsigned int n, char *buf, size_t bufsize);
 62 | ;	writes the string corresponding to the number n in buf, followed by
 63 | ;	a newline, writing no more than bufsize bytes (if the number doesn't
 64 | ;	fit, only its last digits are kept). Returns the number of bytes written.
 65 | ;
 66 | uint2str:
 67 | 	; The two instructions below appear at the beginning of nearly all
 68 | 	; non-leaf functions. These are often called the prologue of a function.
 69 | 	;
 70 | 	; When a function returns, some of the registers must have the same
 71 | 	; value they had when the function was called. Those are named
 72 | 	; "callee-saved registers". Which registers are callee-saved is defined
 73 | 	; by the OS's ABI.
 74 | 	; It doesn't matter whether the function changes these registers during
 75 | 	; its execution, but if it does, they must be restored to their original
 76 | 	; values. To that end, we must store their original values somewhere
 77 | 	; - and that somewhere is also the stack. We store them explicitly with
 78 | 	; the PUSH instruction, which explains the 'push rbp' below, because
 79 | 	; rbp is one of the callee saved registers.
 80 | 	;
 81 | 	push rbp ; store the previous value of rbp in the stack
 82 | 	;
 83 | 	; There is one crucial detail we haven't mentioned so far.
 84 | 	; We've mentioned the stack - but where is the stack top stored?
 85 | 	; It's in a register: rsp (SP meaning Stack Pointer).
 86 | 	; In fact, this is what the PUSH instruction does with a 64-bit operand
 87 | 	; (from the instruction set):
 88 | 	;
 89 | 	;"
 90 | 	;IF StackAddrSize = 64
 91 | 	;THEN
 92 | 	;	IF OperandSize = 64
 93 | 	;	THEN
 94 | 	;		RSP <- RSP - 8;
 95 | 	;		Memory[SS:RSP] <- SRC; (* push quadword *)
 96 | 	; [...]
 97 | 	;"
 98 | 	;
 99 | 	; Two things are noteworthy in this pseudocode. First, RSP actually has
100 | 	; its value subtracted when something is PUSH'd - meaning that lower
101 | 	; addresses are closer to the stack top, i.e. the stack is
102 | 	; "upside down". (*) Second, the address of RSP is accessed in the
103 | 	; segment register SS, which stands for Stack Segment.
104 | 	;
105 | 	; rsp is also a callee-saved register, so we also have to store its value
106 | 	; somewhere to restore it later. (Actually, this is not always needed,
107 | 	; but we usually store rsp anyway.)
108 | 	; Unlike rip, rsp is not a special register: it's a general purpose one.
109 | 	; It can therefore be used with all instructions that accept
110 | 	; general purpose registers as arguments, including MOV.
111 | 	
112 | 	; Instead of pushing rsp to the stack, we store its value on rbp.
113 | 	; (BP stands for Base Pointer.) We can do this because we've already
114 | 	; stored the previous value of rbp with 'push rbp', above.
115 | 	; We use rbp so we know that all addresses in the range of those
116 | 	; stored by rbp + 8 and rsp are addresses belonging to this function's
117 | 	; portion of the stack (rbp + 8 because of the PUSH above).
118 | 	; That portion is called stack frame.
119 | 	;
120 | 	mov rbp, rsp ; store the current stack top at rbp
121 | 
122 | 	; push the remaining callee saved registers to the stack, so we can
123 | 	; restore them before returning.
124 | 	; (Note that all of those PUSH instructions will change rsp's value.)
125 | 	;
126 | 	; Note: we can keep callee-saved registers unaltered by simply not
127 | 	; changing them. That's why we didn't need these PUSH instructions
128 | 	; in leaf functions: you only need to push the callee-saved registers
129 | 	; you'll change, so in leaf functions there's a good reason not to use
130 | 	; these registers, and no good reason to do so.
131 | 	; However, a non-leaf function calls another function, and we want to
132 | 	; make sure our local variables are not changed in the function we'll
133 | 	; call. To that end, we can either store those variables
134 | 	;
135 | 	;	(1) in the stack, or
136 | 	;	(2) in callee-saved registers.
137 | 	;
138 | 	; We've chosen (2), because accessing stuff from registers is quicker
139 | 	; than from memory; though if we had too many variables to store,
140 | 	; we'd have to resort to (1). This is a decision the compiler makes
141 | 	; for you when using high level languages. (*)
142 | 	;
143 | 	push rbx
144 | 	push r12
145 | 	push r13
146 | 	push r14
147 | 	push r15
148 | 
149 | 	; Store the arguments passed to us in callee-saved registers.
150 | 	; As per the SysV calling convention, these arguments will be in rdi,
151 | 	; rsi and rdx respectively.
152 | 	;
153 | 	; (Note that, since we're writing both this function and all the ones
154 | 	; that call it, we could have deviated from the calling convention
155 | 	; if we wanted to - as long as both the caller and the callee agree
156 | 	; on where the arguments should be, it really doesn't matter.
157 | 	; But there will be times when our code needs to call functions written
158 | 	; by someone else; then, the easiest way to make our caller agree
159 | 	; with their callee is to stick to the calling convention.)
160 | 	;
161 | 	mov r12, rdi ; n
162 | 	mov r13, rsi ; buf
163 | 	mov r14, rdx ; bufsize
164 | 
165 | 	xor r15, r15 ; i = 0 (counter)
166 | 	mov rax, r12 ; rax = n
167 | 
168 | 	; make sure there's room for the trailing newline, which is always written
169 | 	test r14, r14
170 | 	jz .return	; if(bufsize == 0) return 0;
171 | 	dec r14		; bufsize--, which sets one byte aside for the newline
172 | .loop:
173 | 	; iterate over the number mod 10 to get the digits in reverse order
174 | 	; (n == 0 is only checked at the bottom, so that 0 is written as "0")
175 | 	cmp r15, r14
176 | 	jae .done	; if(i >= bufsize) break; (unsigned: these are sizes)
177 | 	xor rdx, rdx
178 | 	mov rdi, 10
179 | 	div rdi
180 | 	; now n%10 is in rdx and n/10 is in rax.
181 | 	; Since we're using rax to store n, that was equivalent to
182 | 	;	rdx = n % 10; n /= 10;
183 | 	;
184 | 	; n%10 is between 0 and 9, so that fits in a byte.
185 | 	; we add '0' (the ascii value of the character 0) to get a value
186 | 	; from '0' to '9'
187 | 	;
188 | 	add dl, '0'
189 | 	mov byte [r13 + r15], dl ; write that value to the string
190 | 	inc r15		; i++
191 | 	test rax, rax
192 | 	jnz .loop	; if(n != 0) continue;
193 | .done:
194 | 	; append a newline to the string, in the byte we set aside for it
195 | 	mov byte [r13 + r15], 0x0a
196 | 	inc r15
197 | 
198 | 	; now we've written the string to buf, but it's reversed
199 | 	; (the last digit appears first), so we call revstr to fix it.
200 | 	mov rdi, r13 ; buf
201 | 	lea rsi, [r15 - 1] ; number of chars written -1 (don't include the newline)
202 | 	call revstr
203 | .return:
204 | 	mov rax, r15 ; return value: the counter
205 | 
206 | 	; restore callee saved registers, in the reverse order they were pushed,
207 | 	; with the POP instruction. As expected, POP retrieves a value from
208 | 	; the stack, stores it in its argument, and updates the stack top:
209 | 	; (again from the instruction set)
210 | 	;
211 | 	;ELSE IF StackAddrSize = 64
212 | 	;THEN
213 | 	;	IF OperandSize = 64
214 | 	;	THEN
215 | 	;		DEST <- SS:RSP; (* Copy quadword *)
216 | 	;		RSP <- RSP + 8;
217 | 	pop r15
218 | 	pop r14
219 | 	pop r13
220 | 	pop r12
221 | 	pop rbx
222 | 
223 | 	; undo the prologue: these two are often called the function's epilogue.
224 | 	; First, retrieve the old value of rsp from rbp...
225 | 	mov rsp, rbp
226 | 	; ...then restore rbp from the stack. This instruction also adds 8 to
227 | 	; the stack pointer, thus restoring rsp to the value it was right after the
228 | 	; CALL instruction, which is vital to make the following RET work.
229 | 	pop rbp
230 | 	; same as before.
231 | 	ret
232 | 
233 | ; unsigned int factorial(unsigned int n);
234 | ;	Returns n!, the factorial of n.
235 | ;
236 | ; Its purpose here is to give an example of a recursive function.
237 | ; Other than calling itself, it's a non-leaf function like any other.
238 | ;
239 | factorial:
240 | 	; prologue, followed by the saving of the callee-saved registers
241 | 	push rbp
242 | 	mov rbp, rsp
243 | 	push rbx
244 | 	push r12
245 | 	push r13
246 | 	push r14
247 | 	push r15
248 | 
249 | 	; as before, the argument goes to a callee-saved register, which
250 | 	; is what makes it survive the recursive call
251 | 	mov rbx, rdi ; n
252 | 
253 | 	; recursion base: 0! = 1. The result is loaded into rax up front,
254 | 	; so for n == 0 all that's left to do is return.
255 | 	mov rax, 1
256 | 	test rbx, rbx
257 | 	jz .done
258 | 
259 | 	; otherwise n! = n * (n - 1)!, so we get (n - 1)! with a
260 | 	; recursive call...
261 | 	lea rdi, [rbx - 1]
262 | 	call factorial ; call itself
263 | 
264 | 	; ...and, back from it, multiply its return value (in rax) by n.
265 | 	; again, "mul src" is equivalent to "rdx:rax = rax * src"
266 | 	mul rbx
267 | 
268 | 	; ideally, we should check if the multiplication above overflowed...
269 | 
270 | .done:
271 | 	; restore the callee-saved registers, in reverse order, and
272 | 	; undo the prologue
273 | 	pop r15
274 | 	pop r14
275 | 	pop r13
276 | 	pop r12
277 | 	pop rbx
278 | 	mov rsp, rbp
279 | 	pop rbp
280 | 	ret
281 | 
282 | ; void revstr(char *s, size_t size);
283 | ;	reverses, in place, the size bytes of the array s.
284 | ;
285 | ; It does the same job as the revstr from the previous file.
286 | revstr:
287 | 	xor r8, r8		; r8: index moving forward from the first byte
288 | 	lea r9, [rsi - 1]	; r9: index moving backward from the last byte
289 | 	jmp .check
290 | 
291 | .swap:
292 | 	; swap s[r8] and s[r9], with cl as the temporary
293 | 	mov cl, byte [rdi + r8]
294 | 	xchg cl, byte [rdi + r9]
295 | 	mov byte [rdi + r8], cl
296 | 	inc r8
297 | 	dec r9
298 | .check:
299 | 	; go on while r8 is still before r9. The comparison is signed, so
300 | 	; it also ends right away when size is 0 (and thus r9 is -1).
301 | 	cmp r8, r9
302 | 	jl .swap
303 | 	ret
304 | 
305 | ; Exercises
306 | ;
307 | ; === St Thomas' Wisdom ===
308 | ; Verify all claims marked with (*).
309 | ;
310 | ; === Changing Stuff and Seeing What Happens ===
311 | ;	- Comment out the prologue and epilogue of both non-leaf functions
312 | ;	shown. Do the functions still work? Think about it and explain it
313 | ;	to yourself.
314 | ;
315 | ;	- Do the same for all PUSH instructions with callee-saved registers
316 | ;	(after the prologue), on both non-leaf functions.
317 | ;
318 | ; === Learn to Love Your Compiler ===
319 | ; Write the following pseudocode in your favorite *compiled* language:
320 | ;
321 | ;	(* this function's code could be anything, we just make it complicated
322 | ;	 * hoping it won't be inlined.
323 | ;	 * If this is still being inlined: usually, there are compiler-specific
324 | ;	 * attributes you can put on your code to prevent it from doing so. *)
325 | ;	function foo(integer x) -> integer
326 | ;		n <- 0
327 | ;		i <- 0
328 | ;		while (i < x)
329 | ;			n += i
330 | ;			i += 1
331 | ;		end while
332 | ;		return n
333 | ;	end function
334 | ;
335 | ;	function bar(integer x) -> integer
336 | ;		y <- foo(x)
337 | ;		return x * y
338 | ;	end function
339 | ;
340 | ; Now inspect the program generated with "objdump -d". Do you see any PUSH
341 | ; instructions in the function foo at all? If you do, try a more aggressive
342 | ; optimization level. If you don't, that's because the compiler figured that
343 | ; foo is a leaf function and thus PUSHing its argument isn't necessary, whereas
344 | ; that's not the case for bar. (Its argument is being passed to another function
345 | ; and thus needs to be placed in the stack or in a callee-saved register.)
346 | ;
347 | ; That is the *leaf function optimization*.
348 | ;
349 | ; === Your turn ===
350 | ;	- We've shown a function that converts an unsigned int to string.
351 | ;	Write a function that does the opposite: given a string that represents
352 | ;	a number and the string size, return the corresponding number.
353 | ;	If the function receives a string that does not represent an unsigned
354 | ;	integer, it should print an error message.
355 | ;	A string that starts with a number but then derails, like "123abc",
356 | ;	may or may not be seen as invalid input: that's up to you.
357 | ;
358 | ;	- Write a program that reads a string from stdin, turns it into a number
359 | ;	with your function, multiplies it by two, turns the result into a string
360 | ;	with uint2str, and finally prints that string.
361 | ;
362 | ;	- Write a recursive function that receives an unsigned integer N
363 | ;	and returns the N-th number in the Fibonacci sequence.
364 | ;	Again, bonus points if you use dynamic programming.
365 | ;
366 | ;	- Write a program to test your Fibonacci function, by calling it with
367 | ;	several numbers and printing the return values (again using uint2str).
368 | ;
369 | 
370 | ; vim: set ft=nasm:
371 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright 2018-2019 Luana Martins Barbosa
 2 | 
 3 | All files in this project are licensed under the CC-BY-SA 2.0, which can be found in
 4 | <https://creativecommons.org/licenses/by-sa/2.0/legalcode>, and whose content follows.
 5 | 
 6 | Attribution-ShareAlike 2.0
 7 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
 8 | 
 9 | License
10 | 
11 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
12 | 
13 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
14 | 
15 | 1. Definitions
16 | 
17 |     "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.
18 |     "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.
19 |     "Licensor" means the individual or entity that offers the Work under the terms of this License.
20 |     "Original Author" means the individual or entity who created the Work.
21 |     "Work" means the copyrightable work of authorship offered under the terms of this License.
22 |     "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
23 |     "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.
24 | 
25 | 2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.
26 | 
27 | 3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
28 | 
29 |     to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;
30 |     to create and reproduce Derivative Works;
31 |     to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;
32 |     to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.
33 | 
34 |     For the avoidance of doubt, where the work is a musical composition:
35 |         Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.
36 |         Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).
37 |     Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).
38 | 
39 | The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.
40 | 
41 | 4. Restrictions.The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
42 | 
43 |     You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.
44 |     You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.0 Japan). You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.
45 |     If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.
46 | 
47 | 5. Representations, Warranties and Disclaimer
48 | 
49 | UNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
50 | 
51 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
52 | 
53 | 7. Termination
54 | 
55 |     This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
56 |     Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
57 | 
58 | 8. Miscellaneous
59 | 
60 |     Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
61 |     Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
62 |     If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
63 |     No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
64 |     This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
65 | 
66 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.
67 | 
68 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.
69 | 
70 | Creative Commons may be contacted at https://creativecommons.org/.
71 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ### What is this?
  2 | 
  3 | An introduction to x86 64-bit assembly, with thoroughly commented code,
  4 | and exercises at the end of each file.
  5 | 
  6 | ### Why?
  7 | 
  8 | Because most entry-level assembly code is 32-bit, yet most processors nowadays are 64-bit.
  9 | Also, information about assembly in general seems scarce.
 10 | 
 11 | ### But why learn assembly?
 12 | 
 13 | When I started learning C, I had a mental diagram that went
 14 | 
 15 | ```mermaid
 16 | graph LR
 17 | subgraph black magic
 18 | f(foo.c) -- compiler --> p(program)
 19 | end
 20 | ```
 21 | 
 22 | As I learned, this diagram was updated until it became something like this:
 23 | 
 24 | ```mermaid
 25 | graph LR
 26 | c1(foo1.c) -- preprocessor --> pp1(foo1.i)
 27 | c2(foo2.c) --> pp2(foo2.i)
 28 | cN(fooN.c) --> ppN(fooN.i)
 29 | subgraph black magic 
 30 | pp1 -- compiler --> o1(foo1.o)
 31 | pp2 --> o2(foo2.o)
 32 | ppN --> oN(fooN.o)
 33 | dl(dynamic libraries) -- linker, runtime --> p
 34 | sl(static libraries) -- linker --> p{program}
 35 | o1 -- linker --> p
 36 | o2 --> p
 37 | oN --> p
 38 | end
 39 | ```
 40 | 
 41 | There's still an awful lot of black magic in that. (The preprocessor was the only thing I actually understood,
 42 | because it's just text substitution.)  
 43 | It felt like the Wise Ones, from the top of Mount Turing, had granted us,
 44 | lowly apprentices, the all-mighty Compiler, so we could play around with our silly programming languages
 45 | and pretend we're like Them...  
 46 | "How can I call myself a programmer if I don't understand the tool I use most often?" I wondered.
 47 | So I decided to study how the compiler works, and that's what led me to assembly.
 48 | 
 49 | There are plenty of other reasons to learn it, though. You'll need to know assembly to:
 50 | 
 51 | - Deal with executables directly, which may be needed if you don't have the source code:
 52 | 	- Binary patches, i.e. changing the program directly
 53 | 	- Reverse engineering, i.e. retrieving source code from an executable
 54 | 	- Malware analysis, i.e. deciding whether a program is malware or not
 55 | 
 56 | - Implement some performance-critical systems, which use assembly to squeeze every bit (pun intended) of performance
 57 | out of the CPU
 58 | 
 59 | - Believe it or not, debugging. In some cases where you don't have a clue why your program is behaving the way it is,
 60 | looking at the assembly code generated can tell what the program is actually doing.
 61 | 
 62 | ### What is assembly?
 63 | 
 64 | Your CPU doesn't understand C, Python, Javascript, or whatever programming language you're used to.
 65 | What *does* it understand, then? There must be some set of basic operations which are understood directly
 66 | by the processor, and which are Turing complete, otherwise Turing complete languages wouldn't be able to run on it.  
 67 | Indeed, those basic operations are called *instructions*, and they're encoded by a bunch of bytes meant to be
 68 | fed directly to the processor. For instance:
 69 | 
 70 | ```
 71 | 	48 89 c8
 72 | ```
 73 | 
 74 | What instruction is that? You don't know? Neither do I. Obviously, looking at the bytes
 75 | of instructions directly isn't very effective unless you memorize which instruction corresponds
 76 | to which bytes.  
 77 | Now, if I tell you that instruction is
 78 | 
 79 | ```
 80 | 	mov rax, rcx
 81 | ```
 82 | 
 83 | Much better, isn't it? You may not know yet what this means, but at least you can read it.  
 84 | The previous raw bytes are in a way the processor would understand directly, so we say
 85 | it's written in a machine language. As for the human-readable form, it's in an **assembly language**.  
 86 | To use assembly languages, *assemblers* have been created: programs that convert
 87 | assembly code to machine code.
 88 | 
 89 | Naturally, machine language may vary from one processor to another.
 90 | Any CPU belongs to a certain *architecture*, which defines, among other things,
 91 | the *instruction set* that CPU understands.  
 92 | Therefore, assembly isn't a single programming language, but a family of languages,
 93 | each of which gives human-readable form to the machine code understood by a certain
 94 | CPU architecture.
 95 | 
 96 | Here, we'll show assembly code for the x86 architecture, to which most computers'
 97 | CPUs belong (as opposed to mobile devices, whose processors are generally ARM).
 98 | Specifically, the assembly code shown here is for the 64-bit version of the
 99 | x86 architecture, which is generally called x86-64, x64, or amd64.
100 | (That last name is used for Intel processors as well as AMD ones.)
101 | 
102 | Assembly languages are often called *low-level* languages, because they deal very
103 | closely with hardware. In contrast, all other programming languages are called
104 | *high-level* languages. (Some people say C is a low-level language, because it's
105 | much closer to the hardware than languages like Python, but throwing C in the same
106 | basket as assembly isn't right either. Perhaps we should compromise and call C
107 | a middle-level language?)
108 | 
109 | Assembly has significant short-comings, though, and that's what ultimately led to
110 | the creation of high-level languages. Namely, assembly
111 | 
112 | - isn't portable: code written for one processor won't necessarily work on a different one.
113 | Nowadays this may not be so much of an issue, since there's only half a dozen
114 | prevailing CPU architectures now, but back when the first high-level languages were created,
115 | this was a serious issue. Also, assembly code depends on which assembler you use, because
116 | each has its own syntax and directives; this is unlike writing C code, because C always
117 | has the same syntax regardless of which compiler will be used.
118 | (Assembly code also depends on the operating system, but so does most high-level code anyway.)
119 | 
120 | - is hard to read and maintain: ever heard of spaghetti code? In assembly that's the only way of
121 | doing anything. There's no `if` or `while` or anything of that sort.
122 | Even Dijkstra's famous paper "Go To Statement Considered Harmful" has a remark in that sense:
123 | 
124 | > [...] and I became convinced that the **go to** statement should be abolished from all
125 | > "higher level" programming languages (i.e. everything except, perhaps, plain machine code)
126 | 
127 | Not only that, but assembly allows doing computations in bizarre ways, which may well be faster
128 | to execute, but are much slower to understand.
129 | 
- is easy to make mistakes in: the closer you get to the hardware, the more annoying details
there are that you'll forget to put in your code. Also, since assembly is even closer to the hardware
132 | than C, you'll get all the segmentation faults you'd get there, plus a few more (no, really).
133 | So if you're used to Java levels of hand-holding, I warn you beforehand: this will be a royal pain.
134 | 
135 | - lacks abstractions that make coding easier. Classes, closures, smart pointers / garbage collection,
136 | you name it... none of this exists here.
137 | 
138 | It should be noted, however, that all this high-level code has to become machine code at some point
so it can be executed: usually, this is done by translating that code to assembly first
140 | (after which the assembler takes over).  
141 | And who does that? The all-mighty Compiler, granted to us by the Wise Ones... not for us, but for themselves,
142 | because they were fed up with assembly!
143 | 
144 | ### Full compilation diagram
145 | After learning all that, my mental compilation diagram was finally updated to this:
146 | 
147 | ```mermaid
148 | graph TB
149 | c1(foo1.c) -- preprocessor --> pp1(foo1.i)
150 | c2(foo2.c) --> pp2(foo2.i)
151 | cN(fooN.c) --> ppN(fooN.i)
152 | subgraph compiler
153 | pp1 -- frontend --> ir1(IR, foo1)
154 | pp2 --> ir2(IR, foo2)
ppN --> irN(IR, fooN)
156 | ir1 -- backend --> s1(foo1.S: assembly)
157 | ir2 --> s2(foo2.S)
158 | irN --> sN(fooN.S)
159 | end
160 | s1 -- assembler --> o1(foo1.o: machine code)
161 | s2 --> o2(foo2.o)
162 | sN --> oN(fooN.o)
163 | dl(dynamic libraries: machine code) -- linker, runtime --> p
164 | sl(static libraries: machine code) -- linker --> p{program: machine code}
165 | o1 -- linker --> p
166 | o2 --> p
167 | oN --> p
168 | ```
169 | 
170 | Note how the compiler is split into frontend and backend: compilers usually support writing
171 | machine code to more than one CPU architecture, so rather than converting the high-level code
172 | to that directly, it first converts it into some form of architecture-independent
173 | intermediate representation (IR).
174 | 
Also, it may seem weird that the linker receives and produces machine code. Indeed, the linker
176 | doesn't translate the code, it just takes the symbols (functions, global variables, etc)
177 | referenced (but not defined) in some file and ties them to their definition in some other file.
178 | If the input files are being linked against some dynamic library, the linker is called again at runtime.
179 | 
180 | ### System Requirements
181 | 
Assembly isn't portable at all. Therefore, the code here will only run on Linux,
on a CPU with the x86-64 architecture. If you don't know yours, run
184 | ```bash
185 | 	$ uname -m
186 | ```
187 | To assemble the code, you'll need `nasm` (Netwide Assembler), version 2.12 or greater.
188 | To link it, any linker will do. (ld and gold come with binutils, so chances are you have them already).
189 | 
190 | For assembly code that uses C library functions, you'll need an implementation
191 | of the C library; glibc will do (once again, you probably have it).
192 | gcc isn't strictly needed, but it makes it easier to link such code.
193 | 
194 | ### Human Requirements
195 | 
196 | I expect you to have a solid understanding of the basic programming language
197 | concepts: branches, loops, functions, arrays etc. If you're still struggling
198 | with these, come back later.  
199 | I do *not* expect you to know C - however, some things are explained in terms of
what they would look like in C, and some C functions are used in the code,
201 | so you may have to look these things up if you don't know them already.  
202 | 
203 | ### Assembling and linking
204 | 
205 | UPDATE: there's a Makefile now, so all you have to do is run 'make' to build
206 | all programs. Note that the Makefile requires a C compiler.
207 | 
208 | Assembling with `nasm`:
209 | 
210 | ```bash
211 | 	$ nasm -g -f elf64 foo.asm
212 | ```
213 | 
214 | The -g switch is to produce debugging information, for e.g. `gdb`, and it's optional.
215 | The -f switch specifies the format (in this case, the 64-bit Executable and Linking Format (ELF)).  
216 | If everything goes well, you should get a foo.o file.
217 | 
218 | Linking:
219 | ```bash
220 | 	$ ld -o foo foo.o		# default
	$ gcc [-no-pie] -o foo foo.o -v	# if your file contains C function calls
222 | ```
223 | 
224 | The -o switch (in both lines) specifies the executable's name;
225 | if omitted, it will be named a.out.
226 | The -v switch (in the 2nd line) is optional, and increases verbosity, in case you want to see
227 | what `gcc` is doing under the hood.
228 | You may need the -no-pie option when linking with `gcc` if your code is using absolute addresses.  
229 | If, upon trying to execute the executable, you get  
230 | ```
231 | 	bash: ./foo: No such file or directory
232 | ```
It means something went wrong when linking, but you've got a useless file nonetheless. Sweet.
234 | 
235 | ### Debugging
236 | 
237 | Debugging programs generated by assemblers might seem, in principle, the same as with any other program.
238 | However, debuggers are usually tied to the high-level code that generated that program
239 | (stepping through line numbers in that code, keeping track of variables, etc). Here, we normally want
240 | to step through instructions rather than source code lines, and to see the registers' and memory's
241 | contents instead of variables.  
242 | I don't know whether the debugger of your favourite IDE (if you have one) has such features:
243 | feel free to check that. What I do know is the GNU Debugger (`gdb`) has them:
244 | 
245 | | option                           | description |
246 | |:--------------------------------:|:------------------------------------------:|
247 | | `disas[semble]`                  | print the instructions of a function. |
248 | | `set disassembly-flavor intel`   | so that the output of `disas` will be in intel syntax, as opposed to AT&T (which is `gdb`'s default) |
| `si`, `ni`                       | analogous to `s` and `n` except they step through instructions rather than source code lines |
250 | | `info reg` `[reg1]` `[reg2]` ... | prints the value of the requested registers. (Omitting that argument will print all registers) |
251 | | `print $<reg>`                   | another way of printing a single register's value. The `$` is required. |
252 | | `x/<fmt>`                        | examine the contents of a memory location. |
253 | | `info stack`                     | print the stack. |
254 | 
255 | Also, there are a few debugging tools out there that you probably didn't need with high-level languages,
256 | but which are of great use in assembly:
257 | 
258 | - `strace`: prints your program's system calls. This can be used to quickly find errors in system calls.
259 | - `ltrace`: prints your program's library calls. Same idea.
260 | 
261 | And as usual, `valgrind` is wonderful for finding memory-related errors.
262 | 
263 | ### Intel vs AT&T syntax
264 | 
265 | In x86 assembly, there are two "flavors" of syntax: Intel and AT&T.
`nasm` uses Intel syntax, while the GNU assembler (`as`) uses AT&T syntax
267 | by default (though it has support for Intel syntax as well).
268 | All source files presented here were written for `nasm` and therefore use the
269 | Intel syntax.
270 | 
271 | The file `att.md` summarizes the differences between the two flavors.
272 | 
273 | ### References
274 | 
275 | There are several seemingly vague references in the code, such as 'ABI',
276 | 'instruction set', etc. Here are the documents I'm citing:
277 | 
278 | | Reference                  | Document
279 | |:--------------------------:|-----------------------------------------------|
280 | | ABI                        |  System V Application Binary Interface, AMD64 Architecture Processor Supplement, Draft Version 0.99.8 |
281 | | Basic architecture         |  Intel(R) 64 and IA-32 Architectures Software Developer's Manual Volume 1: Basic Architecture |
282 | | Instruction set            |  Intel(R) 64 and IA-32 Architectures Software Developer's Manual Volume 2 (2A, 2B & 2C): Instruction Set Reference, A-Z |
283 | | NASM manual                | NASM - The Netwide Assembler, version 2.12.01 |
284 | | System programming guide   |  Intel(R) 64 and IA-32 Architectures Software Developer's Manual Volume 3 (3A, 3B, 3C & 3D): System Programming Guide |
285 | 
286 | It's also noteworthy that the number corresponding to each system call can be found in
287 | `/usr/include/asm/unistd_64.h`.
288 | 
289 | ### Contributing
290 | 
291 | I am by no means an expert in assembly. If you see anything that's wrong
292 | or poorly explained, or if there's something that wasn't mentioned here
293 | but you think should be, please file an issue in Gitlab and/or send
294 | a pull request.
295 | 
296 | ### External Links
297 | 
298 | [This image](https://commons.wikimedia.org/wiki/File:Table_of_x86_Registers_svg.svg)
299 | shows all x86-64 registers.
300 | 
301 | ### Acknowledgements
302 | Thanks to Vitor Guidi for the suggestion of adding exercises.
303 | 
304 | ### Copyright 
305 | 
This file, as well as all other files in this project, is released under the CC-BY-SA 2.0 license.
307 | See the file called LICENSE for details.
308 | 


--------------------------------------------------------------------------------
/7_float.asm:
--------------------------------------------------------------------------------
  1 | ; Copyright 2018-2019 Luana Carmo M de F Barbosa
  2 | ;
  3 | ; This file is licensed under the CC-BY-SA 2.0 license.
  4 | ; See LICENSE for details.
  5 | ;
  6 | 
  7 | ; (7) float.asm: floating point numbers, x87/SSE/AVX, byte alignment
  8 | ;
  9 | ; We could've explained this sooner, but I wanted to be able to use printf()
 10 | ; and scanf() rather than making float <-> string conversions by hand.
 11 | ;
 12 | ; Note: in this file, we use "float" and "double" as synonyms with
 13 | ; "single-precision floating point number" (4 bytes) and
 14 | ; "double-precision floating point number" (8 bytes) respectively.
 15 | 
 16 | global main
 17 | 
 18 | extern printf
 19 | extern scanf
 20 | 
section .rodata
	; Read-only data: prompts, printf()/scanf() format strings and the
	; floating point constants used by sse, avx and x87 below.
	; All strings are NUL-terminated (the trailing 0x0) as C expects.
	;
	prompt_flt: db 'type a float (x, result will be x+1): ',0x0
	prompt_dbl: db 'type a double (x, result will be x-1): ',0x0

	; for scanf(), "%f" means float and "%lf" means double; for printf(),
	; "%f" means double, and there's no way of printing a float directly:
	; a float passed to a variadic function is always promoted to double
	; by C's default argument promotions.
	;
	scanf_flt_fmt: db '%f',0x0
	scanf_dbl_fmt: db '%lf',0x0
	printf_dbl_fmt: db 'result = %f',0xA,0x00

	printf_dbl_vec2_fmt: db '%f, %f',0xA,0x00
	printf_dbl_vec4_fmt: db '%f, %f, %f, %f',0xA,0x00

	; dt: declare extended precision (80-bit) floating point number.
	; (A long double is generally an 80-bit floating point number.)
	;
	ldbl1: dt 2.222222222222222222
	ldbl2: dt 7.777777777777777777

	; long double format.
	; We use 18 digits of precision here because in 80-bit floating point
	; numbers, the mantissa has 64 bits, and 2^(-64) < 10^(-18).
	;
	printf_ldbl_fmt: db '%.18Lf',0xA,0x00

	; Align the current address on a 16-byte boundary,
	; that is, make the current address divisible by 16, by emitting
	; a bunch of useless bytes for padding.
	; A few notes on align:
	; "Both macros require their first argument to be a power of two [...]
	; ALIGN and ALIGNB, being simple macros, perform no error checking:
	; they cannot warn you if their first argument fails to be a power of
	; two [...]
	; A final caveat: ALIGN and ALIGNB work relative to the beginning of
	; the section, not the beginning of the address space in the final
	; executable. Aligning to a 16-byte boundary when the section you're in
	; is only guaranteed to be aligned to a 4-byte boundary, for example,
	; is a waste of effort." (NASM manual, section 4.11.12)
	;
	align 16

	; dword = 32 bits, hence, a float.
	; Nasm understands a number literal with a dot as a floating point
	; number and writes the appropriate value.
	;
	flt1: dd 1.0
	; qword = 64 bits, hence, a double.
	; (dbl1 sits 4 bytes past the 16-byte boundary established above,
	; right after flt1.)
	dbl1: dq 1.0

	; we use this again because we also need flt_vec4_1 to be 16-byte
	; aligned. We've declared 1 float and 1 double, which adds up to
	; 4 + 8 = 12 bytes, so we need 4 more bytes of padding.
	align 16

	; Two vectors of 4 packed floats (16 bytes each), so flt_vec4_2 is
	; 16-byte aligned as well.
	flt_vec4_1: dd 2.0, -1.0, 3.5, 4.2
	flt_vec4_2: dd 1.2, 3.4, -1.2, 7.8

	; Two vectors of 2 packed doubles (16 bytes each); they follow 32 bytes
	; of floats, so both stay 16-byte aligned.
	dbl_vec2_1: dq 2.0, -1.0
	dbl_vec2_2: dq 1.2, 3.2

	; these two are accessed as 32-byte AVX operands, so they need to be
	; on a 32-byte alignment
	align 32
	dbl_vec4_1: dq 4.7, -6.8, 3.1, 6.7
	dbl_vec4_2: dq 8.4, 9.2, 4.9, -1.6
 87 | 
 88 | section .text
 89 | main:
 90 | 	push rbp
 91 | 	mov rbp, rsp
 92 | 
 93 | 	; these are separate functions not only for the organization's sake,
 94 | 	; but also to show that they actually work using different stack
 95 | 	; byte alignments.
 96 | 	;
 97 | 	call sse
 98 | 	call avx
 99 | 	call x87
100 | 
101 | 	xor rax, rax
102 | 
103 | 	mov rsp, rbp
104 | 	pop rbp
105 | 	ret
106 | 
; void sse(void);
;
; Reads a float and a double from stdin and prints x+1 and x-1 computed with
; scalar SSE instructions; then multiplies two packed vectors of 4 floats and
; divides two packed vectors of 2 doubles, printing both results.
; Takes no arguments and returns nothing.
sse:
	; SSE instructions deal with the XMM registers: xmm0 through xmm7
	; in 32-bit mode, and additionally xmm8 through xmm15 in 64-bit mode.
	; XMM registers are 128-bit (16-byte) long.
	; (Basic architecture, sections 10.2.1 and 10.2.2)

	push rbp
	mov rbp, rsp

	; Align rsp's address to a 16-byte alignment. The reason for this
	; is the same as for the use of those align macros back there: we want
	; to use rsp indexed addresses as arguments for SSE instructions,
	; and those must be 16-byte aligned.
	;
	; But how does that align rsp to a 16-byte boundary?
	; An address that's aligned to a (2^N)-byte boundary is one whose
	; value is divisible by 2^N.
	; It turns out, because of how two's complement works, that -2^N is
	; a value with all bits 1, except for the last N bits, which are all 0.
	; For instance, with -16:
	;
	; 16 = 0..010000 --(1's compl)--> 1..101111 --(+1)--> 1..110000 = -16
	;
	; So we could've written the instruction below as "AND rsp, 0xff..f0",
	; with 15 Fs (64 bits = 16 hex digits). We didn't, because if you leave
	; a single F out, it'd be equivalent to "AND rsp, 0x0f..f0" and no good
	; can possibly come out of that. Besides, though it may seem weird at
	; first, the intention is very clear when you write an AND with a
	; negative power of two like that.
	;
	; Therefore, the AND below is zero'ing out the last 4 bits of rsp,
	; which makes it divisible by 2^4, because that's how the positional
	; system works: if the N last digits of a number written in base B
	; are all 0, then the number is divisible by B^N. (The proof is trivial
	; - yes, I know everyone says that, but for real this time.)
	; Since we're talking about the last 4 bits being zero, N=4 and B=2.
	;
	; (As a side note: sometimes you can omit the prologue and epilogue
	; of non-leaf functions if you're careful enough.
	; We can save ourselves from storing rsp in rbp if we can undo all
	; changes to rsp's value throughout the function.
	; For instance, if rsp is only manipulated through PUSH and SUB
	; instructions, we can ADD the same value that was SUB'd and have the
	; same number of POP instructions as PUSH ones, and that's enough to
	; restore rsp to its original value. Here, though, this AND cannot be
	; easily reverted, so it forces us to store the previous value of rsp
	; somewhere.)
	;
	and rsp, -16

	; make room for 4 floats, or 2 doubles.
	; We need 16 bytes for that, but even if we needed less we'd still
	; have to subtract a multiple of 16 because the stack must remain
	; 16-byte aligned.
	;
	sub rsp, 16

	; printf() and scanf() are variadic functions. For calls to those, the
	; ABI (section 3.2.3) requires al to hold an upper bound on the number
	; of vector (XMM) registers used to pass arguments, so the callee knows
	; whether it has to save them. No floating point arguments here, so
	; that's 0. (Writing to eax zero-extends into rax.)
	; Leaving rax as whatever the previous call returned would only work
	; by luck.
	;
	mov rdi, prompt_flt
	xor eax, eax
	call printf

	; scanf() only receives pointers here (in rdi and rsi), so al = 0 too.
	mov rdi, scanf_flt_fmt
	lea rsi, [rsp]
	xor eax, eax
	call scanf

	; Since xmm0 is 128-bit long, and at first we only use its lowest bits,
	; we zero it out first.
	; We have to use PXOR instead of XOR because the latter expects
	; general purpose registers (rax, rbx, ...) while the former expects
	; XMM registers. (When in doubt, you can check the instruction set.)
	; (Strictly speaking, MOVSS from memory already clears the upper bits
	; of the destination register, so this is here for illustration.)
	;
	pxor xmm0, xmm0

	; most SSE and AVX instructions have one of the following suffixes:
	;
	;	SS = Scalar Single-precision
	;	PS = Packed Single-precision
	;	SD = Scalar Double-precision
	;	PD = Packed Double-precision
	;
	; As expected, single or double precision means float (4 bytes)
	; or double (8 bytes) respectively.
	; As for scalar versus packed: since XMM registers are 128-bit long
	; (16 bytes), we can fit 2 doubles or 4 floats in one of them.
	; When doing so, we say the numbers are packed. However, we can also
	; store just one float or double per register: when doing so, we say
	; that value is a scalar. (It's called like that as an analogy to the
	; mathematical concepts of scalar and vector, since packed values
	; can be seen as a vector of real numbers).
	;
	movss xmm0, dword [rsp]

	; add the two values. Again, we cannot use the ADD instruction:
	; we must use one of the SSE instructions, and because our operands
	; are both scalar and single-precision, then ADDSS it is.
	;
	; We had to declare flt1 in the first place because SSE instructions
	; do not accept immediate values as arguments.
	;
	; A note on alignment: SSE instructions that take a full 128-bit
	; memory operand (the packed ones further below, like MULPS or MOVAPS)
	; require it to be 16-byte aligned, because that's the size of XMM
	; registers. If that requirement isn't met, you'll get a segmentation
	; fault. This is why we needed those align macros back there.
	; Scalar instructions such as ADDSS only read 4 (or 8) bytes and have
	; no such requirement.
	;
	addss xmm0, [flt1]

	; printf()'s format string expects a double, so convert the result
	; to double before calling printf().
	;
	; (cvt = convert, ss2sd = SS to SD)
	cvtss2sd xmm0, xmm0

	; one floating point argument, passed in xmm0: al = 1.
	mov rdi, printf_dbl_fmt
	mov eax, 1
	call printf

	; now we do the same with doubles

	mov rdi, prompt_dbl
	xor eax, eax
	call printf

	mov rdi, scanf_dbl_fmt
	lea rsi, [rsp]
	xor eax, eax
	call scanf

	; same as before, but instead of scalar single-precision (SS)
	; we use scalar double-precision (SD)
	movsd xmm0, qword [rsp]
	subsd xmm0, [dbl1]

	; result is already a double, no conversion needed
	mov rdi, printf_dbl_fmt
	mov eax, 1
	call printf


	; Since we have MOVSS, one might expect that for packed floats the
	; instruction would be "movps". It's not, though, because it comes in
	; two flavors: MOVUPS and MOVAPS.
	; MOVUPS is one of the few packed SSE instructions that do not require
	; memory arguments to be aligned on a 16-byte boundary, whereas MOVAPS
	; does require that, as usual. (The 'A' and 'U' stand for "Aligned" and
	; "Unaligned", respectively.)
	; However, since we already had to align our vectors because of other
	; SSE instructions where we use them (like MULPS below), then we might
	; as well use MOVAPS.
	;
	movaps xmm0, [flt_vec4_1]
	mulps xmm0, [flt_vec4_2]

	; move result back to the stack...
	; (we can use MOVAPS here too because rsp is 16-byte aligned too)
	movaps [rsp], xmm0
	; ...so we can move the 4 floats back to separate registers,
	; as scalar values (i.e. unpack them)
	;
	movss xmm0, dword [rsp]
	movss xmm1, dword [rsp+4]
	movss xmm2, dword [rsp+8]
	movss xmm3, dword [rsp+12]
	; again, we have to convert all those values to double because of the
	; printf() format
	;
	cvtss2sd xmm0, xmm0
	cvtss2sd xmm1, xmm1
	cvtss2sd xmm2, xmm2
	cvtss2sd xmm3, xmm3

	; four floating point arguments, in xmm0 through xmm3: al = 4.
	mov rdi, printf_dbl_vec4_fmt
	mov eax, 4
	call printf

	; the same with doubles, again.
	; (There's also MOVAPD and MOVUPD)
	movapd xmm0, [dbl_vec2_1]
	divpd xmm0, [dbl_vec2_2]

	movapd [rsp], xmm0
	movsd xmm0, qword [rsp]
	movsd xmm1, qword [rsp+8]

	; two floating point arguments, in xmm0 and xmm1: al = 2.
	mov rdi, printf_dbl_vec2_fmt
	mov eax, 2
	call printf

	mov rsp, rbp
	pop rbp

	ret
291 | 
; void avx(void);
;
; Adds two packed vectors of 4 doubles using AVX and prints the sum, then
; computes (dbl_vec4_1 * dbl_vec4_2) + sum with a fused multiply-add and
; prints that as well. Takes no arguments and returns nothing.
avx:
	; AVX instructions deal with YMM registers: ymm0 through ymm7 in 32-bit
	; mode and also ymm8 through ymm15 in 64-bit mode.
	; YMM registers are 256-bit (32-byte) long.
	; Similar to eax/rax, when using AVX, xmmN is an alias to the lowest
	; 16 bytes of ymmN.
	; (Basic architecture, section 14.1)
	;
	; In the same vein, there's also AVX-512, which introduces 512-bit long
	; registers (zmm0 through zmm31 in 64-bit mode), and ymmN is the lowest
	; 32 bytes of zmmN.

	push rbp
	mov rbp, rsp

	; Since AVX uses 32-byte long registers, we'll need to align rsp
	; to 32-byte to use it in AVX instructions.
	and rsp, -32

	; make room for 4 doubles. Again, even if we needed less space,
	; we'd still have to subtract a multiple of 32 here.
	sub rsp, 32

	; AVX instructions are similar to SSE, except we need to prepend them
	; with 'v' so they're encoded using the VEX prefix, which allows using
	; the YMM registers as arguments (instruction set, section 2.3.1).
	;
	vmovapd ymm0, [dbl_vec4_1]
	vaddpd ymm0, ymm0, [dbl_vec4_2]

	; save the sum on the stack: we need it to unpack the values below,
	; and again later on for the fused multiply-add.
	vmovapd [rsp], ymm0

	; unpack the 4 doubles into separate XMM registers.
	; We can use the instructions prepended with 'v' with XMM registers too:
	; the only difference is the highest 16 bytes of the corresponding YMM
	; register are zero'd out.
	; If you're only using XMM registers, there's no difference between
	; them, but since we were using all of ymm0 I do want to clear its
	; highest bytes.
	;
	vmovsd xmm0, qword [rsp]
	; and we use 'v' here too, just because we can.
	vmovsd xmm1, qword [rsp+8]
	vmovsd xmm2, qword [rsp+16]
	vmovsd xmm3, qword [rsp+24]

	; printf() is variadic, so al must hold an upper bound on the number of
	; vector registers used to pass arguments (ABI, section 3.2.3): here,
	; 4 doubles in xmm0 through xmm3.
	mov rdi, printf_dbl_vec4_fmt
	mov eax, 4
	call printf

	; Reload the sum from the stack into ymm0: we overwrote the register
	; when unpacking, and besides, ymm0 is not callee-saved, so it may have
	; changed when calling printf(). Our stack slot, on the other hand, is
	; still intact.
	vmovapd ymm0, [rsp]
	vmovapd ymm1, [dbl_vec4_1]

	; There are also some more elaborate features like fused multiply-add
	; (strictly speaking a separate extension, FMA, which uses the same
	; VEX encoding and YMM registers):
	; this instruction multiplies the packed doubles in the 2nd and 3rd
	; operands then adds them to those in the 1st operand.
	vfmadd231pd ymm0, ymm1, [dbl_vec4_2]

	; unpack and print, same as before.
	vmovapd [rsp], ymm0
	vmovsd xmm0, qword [rsp]
	vmovsd xmm1, qword [rsp+8]
	vmovsd xmm2, qword [rsp+16]
	vmovsd xmm3, qword [rsp+24]
	mov rdi, printf_dbl_vec4_fmt
	mov eax, 4
	call printf

	mov rsp, rbp
	pop rbp
	ret
361 | 
; void x87(void);
;
; Adds two 80-bit long doubles with the x87 FPU and prints the sum, then
; prints cos(pi). The x87 register stack is left empty on return.
; Takes no arguments and returns nothing.
x87:
	push rbp
	mov rbp, rsp

	; And now for something different!
	; SSE, AVX and the like only have support for floats or doubles.
	; To use long doubles (80-bit), we need to use legacy x87 instructions.
	; (These date way back to the days of floating-point coprocessors...)
	;

	; x87 instructions do not require any form of byte alignment.
	; However, this is still needed because the printf() call may use SSE
	; instructions (it does on my libc implementation), and because
	; the stack has to be 16-byte aligned when calling a function anyway.
	;
	and rsp, -16

	; make room for one 80-bit floating point number.
	; (again, only 10 bytes are needed, but we must maintain the alignment)
	;
	sub rsp, 16

	; x87 operates on special "registers", st0 through st7, each of which
	; refers to a certain position in a floating point stack: st0 is the
	; top of that stack, st1 the value below it, and so on.
	;
	; x87 instructions are all prefixed with an 'f'.
	; Here, we use FLD (LD = load) to push the 80-bit (tword) value
	; at address ldbl1 to the register st0.
	;
	; Note that the TWORD prefix here is mandatory, because most x87
	; instructions can also be used with 32-bit and 64-bit memory locations:
	; "Almost any x87 floating-point instruction that references memory must
	; use one of the prefixes DWORD, QWORD or TWORD to indicate what size
	; of memory operand it refers to." (NASM manual, section 3.1)
	;
	fld tword [ldbl1]

	; Load the second long double.
	; Because FLD *pushes* onto the floating point stack, the first value
	; isn't lost: it simply becomes st1, and the new value is st0.
	; (So there's no need to copy st0 to st1 by hand with FST beforehand;
	; doing so would only leave a stale extra value on the stack.)
	;
	fld tword [ldbl2]

	; add the numbers. Note how we only provide one operand to this
	; instruction: the other one is implicitly st0. Nasm allows you to write
	; it explicitly, though:
	; "For x87 floating-point instructions, NASM accepts a wide range of
	; syntaxes: you can use two-operand forms like MASM supports, or you can
	; use NASM's native single-operand forms in most cases. For example,
	; you can code:
	;	fadd st1 ; this sets st0 := st0 + st1
	;	fadd st0,st1 ; so does this
	;
	; " (NASM manual, section 3.1)
	;
	; We use FADDP, the popping variant: it sets st1 := st1 + st0 and then
	; pops the floating point stack, so the sum ends up in st0 and it's the
	; only value left on the stack.
	;
	faddp st1

	; FSTP is similar to FST (ST = store), but also pops the floating point
	; stack. Store the value of st0 in the stack (again, as an 80-bit
	; location), so we can print it. After this, the floating point stack
	; is empty again.
	;
	fstp tword [rsp]

	; Long doubles are passed to functions by value in memory, at the top
	; of the stack, rather than in registers - and that's exactly where we
	; have just stored it, so printf() will read it from [rsp]; there's
	; nothing to put in rsi.
	; printf() is variadic, so al must hold an upper bound on the number of
	; vector registers used to pass arguments (ABI, section 3.2.3): none
	; in this case.
	;
	mov rdi, printf_ldbl_fmt
	xor eax, eax
	call printf

	; x87 also allows loading directly some famous constants.
	; This instruction loads pi.
	fldpi

	; And of course, what's the point of storing pi if you can't do
	; trigonometric stuff?
	; This calculates the cosine of st0 and stores the result in that same
	; "register".
	fcos

	; again, so we can print the value
	fstp tword [rsp]

	mov rdi, printf_ldbl_fmt
	xor eax, eax
	call printf

	mov rsp, rbp
	pop rbp
	ret
454 | 
455 | ; Exercises
456 | ;
457 | ; === Changing Stuff and Seeing What Happens ===
458 | ; - Comment out all align macros (only one at a time).
459 | ;
460 | ; - Comment out all AND rsp, ... instructions. (It may be that you don't get a
461 | ; segfault because rsp's address was already on a 2^N-byte alignment by chance.
462 | ; If this is the case, throw in a "SUB rsp, 8" so it won't happen.)
463 | ;
464 | ; - Comment out the CVT* instructions.
465 | ;
466 | ; === Your Turn ===
467 | ; - Write a program that prints the number 0.2 as a float, double, and
468 | ; long double, using the same number of decimal places for all of them so you
469 | ; can compare the rounding errors.
470 | ;
; - Write a program that takes in two vectors of 4 doubles and prints the scalar
; product between them. The vectors can be hardcoded in the data section
; or read using scanf().
474 | ;
; - Write an avx512 function that performs the same operations as avx, but using
; AVX-512 and the ZMM registers, then write a program to test it.
477 | ;
478 | ; Bonus
479 | ;
480 | ; Write an interactive RPN calculator which supports at least the 4 basic
481 | ; operations. Whether you'll use float, double or long double is up to you.
482 | ;
483 | 
484 | ; vim: set ft=nasm:
485 | 


--------------------------------------------------------------------------------