├── .gitignore ├── README.md ├── chapter01 ├── Makefile └── first.s ├── chapter02 ├── Makefile ├── sum01.s └── sum02.s ├── chapter03 ├── Makefile ├── load01.s ├── load02.s ├── store01.s └── store02.s ├── chapter05 ├── Makefile ├── branch01.s └── compare01.s ├── chapter06 ├── Makefile ├── collatz.s ├── loop01.s ├── loop02.s └── test.py ├── chapter07 ├── Makefile └── rol.s ├── chapter08 ├── Makefile └── array01.s ├── chapter09 ├── Makefile ├── hello01.s ├── printf01.s └── printf02.s ├── chapter10 ├── Makefile ├── factorial01.s ├── factorial02.s ├── factorial03.s └── test.c ├── chapter11 ├── Makefile ├── collatz02.s ├── collatz03.s ├── stats └── test ├── chapter12 ├── Makefile ├── mult64.s └── mult64_2.s ├── chapter13 ├── Makefile └── addf.s ├── chapter14 ├── Makefile ├── benchmark.s └── matmul.s ├── chapter15 ├── Makefile ├── benchmark.s ├── divideby14.s ├── division.s └── magic.py ├── chapter16 ├── Makefile ├── binsearch.s ├── calcjump.s ├── hybrid.s ├── ifstring.s └── jumptable.s ├── chapter17 ├── Makefile ├── array_by_ref.s ├── array_by_value.s ├── double_array.s ├── first_pointer.s ├── good_pointer.s └── wrong_pointer.s ├── chapter18 ├── Makefile ├── square └── square.s ├── chapter19 ├── Makefile ├── write_c.s └── write_sys.s ├── chapter20 ├── Makefile ├── direct.s ├── greeter_01.s ├── greeter_02.s └── indirect.s ├── chapter21 ├── Makefile ├── reinterpret.s ├── subword.s └── subword_signed.s ├── chapter22 ├── Makefile ├── back-to-arm.s ├── thumb-call.s └── thumb-first.s ├── chapter23 ├── Makefile ├── nested01.s └── nested02.s ├── chapter24 ├── Makefile ├── print-array.s ├── sort-array.s └── trampoline-sort-array.s └── chapter25 ├── Makefile ├── byte_array_add.s ├── clipped_add.s └── motivation.s /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | raspberry-pi-assembler 2 | ====================== 3 | 4 | Support files for the blog posts on Raspberry Pi Assembler 5 | 6 | http://thinkingeek.com/category/raspberry-pi/ 7 | -------------------------------------------------------------------------------- /chapter01/Makefile: -------------------------------------------------------------------------------- 1 | ASM_FILE=first.s 2 | 3 | all: first 4 | 5 | first: first.o 6 | gcc -o $@ $+ 7 | 8 | first.o : first.s 9 | as -o $@ $< 10 | 11 | clean: 12 | rm -vf first *.o 13 | -------------------------------------------------------------------------------- /chapter01/first.s: -------------------------------------------------------------------------------- 1 | /* -- first.s */ 2 | /* This is a comment. Comments are enclosed in slash* and *slash */ 3 | .global main /* 'main' is our entry point and must be global */ 4 | .func main /* 'main' is a function */ 5 | 6 | main: /* This is main */ 7 | mov r0, #2 /* Put a 2 inside the register r0 */ 8 | bx lr /* Return from main */ 9 | 10 | -------------------------------------------------------------------------------- /chapter02/Makefile: -------------------------------------------------------------------------------- 1 | EXES=sum01 sum02 2 | all: $(EXES) 3 | 4 | %: %.o 5 | gcc -o $@ $+ 6 | 7 | % : %.s 8 | 9 | %.o : %.s 10 | as -o $@ $< 11 | 12 | .PHONY: clean 13 | clean: 14 | rm -vf $(EXES) *.o 15 | -------------------------------------------------------------------------------- /chapter02/sum01.s: -------------------------------------------------------------------------------- 1 | /* -- sum01.s */ 2 | .global main 3 | 4 | main: 5 | mov r1, #3 /* r1 ← 3 */ 6 | mov r2, #4 /* r2 ← 4 */ 7 | add r0, r1, r2 /* r0 ← r1 + r2 */ 8 | bx lr /* Return from main */ 9 | 10 | -------------------------------------------------------------------------------- /chapter02/sum02.s: -------------------------------------------------------------------------------- 1 | /* 
-- sum02.s */ 2 | .global main 3 | 4 | main: 5 | mov r0, #3 /* r0 ← 3 */ 6 | mov r1, #4 /* r1 ← 4 */ 7 | add r0, r0, r1 /* r0 ← r0 + r1 */ 8 | bx lr /* Return from main */ 9 | 10 | -------------------------------------------------------------------------------- /chapter03/Makefile: -------------------------------------------------------------------------------- 1 | EXES=load01 load02 store01 store02 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter03/load01.s: -------------------------------------------------------------------------------- 1 | /* -- load01.s */ 2 | 3 | /* -- Data section */ 4 | .data 5 | 6 | /* Ensure variable is 4-byte aligned */ 7 | .balign 4 8 | /* Define storage for myvar1 */ 9 | myvar1: 10 | /* Contents of myvar1 is just '3' */ 11 | .word 3 12 | 13 | /* Ensure variable is 4-byte aligned */ 14 | .balign 4 15 | /* Define storage for myvar2 */ 16 | myvar2: 17 | /* Contents of myvar2 is just '4' */ 18 | .word 4 19 | 20 | /* -- Code section */ 21 | .text 22 | 23 | /* Ensure function section starts 4 byte aligned */ 24 | .balign 4 25 | .global main 26 | main: 27 | ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */ 28 | ldr r1, [r1] /* r1 ← *r1 */ 29 | ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */ 30 | ldr r2, [r2] /* r2 ← *r2 */ 31 | add r0, r1, r2 /* r0 ← r1 + r2 */ 32 | bx lr /* Return from main */ 33 | 34 | /* Labels needed to access data */ 35 | addr_of_myvar1 : .word myvar1 36 | addr_of_myvar2 : .word myvar2 37 | -------------------------------------------------------------------------------- /chapter03/load02.s: -------------------------------------------------------------------------------- 1 | /* -- load02.s */ 2 | 3 | /* -- Data section */ 4 | .data 5 | 6 | /* Ensure variable is 4-byte aligned */ 7 | .balign 4 8 | /* Define storage for myvar1 */ 9 | myvar1: 10 | /* Contents of myvar1 is just '3' */ 11 | 
.word 3 12 | 13 | /* Ensure variable is 4-byte aligned */ 14 | .balign 4 15 | /* Define storage for myvar2 */ 16 | myvar2: 17 | /* Contents of myvar2 is just '4' */ 18 | .word 4 19 | 20 | /* -- Code section */ 21 | .text 22 | 23 | /* Ensure function section starts 4 byte aligned */ 24 | .balign 4 25 | .global main 26 | main: 27 | ldr r1, .Laddr_of_myvar1 /* r1 ← &myvar1 */ 28 | ldr r1, [r1] /* r1 ← *r1 */ 29 | ldr r2, .Laddr_of_myvar2 /* r2 ← &myvar2 */ 30 | ldr r2, [r2] /* r2 ← *r2 */ 31 | add r0, r1, r2 /* r0 ← r1 + r2 */ 32 | bx lr /* Return from main */ 33 | 34 | /* Labels needed to access data */ 35 | .Laddr_of_myvar1 : .word myvar1 36 | .Laddr_of_myvar2 : .word myvar2 37 | -------------------------------------------------------------------------------- /chapter03/store01.s: -------------------------------------------------------------------------------- 1 | /* -- store01.s */ 2 | 3 | /* -- Data section */ 4 | .data 5 | 6 | /* Ensure variable is 4-byte aligned */ 7 | .balign 4 8 | /* Define storage for myvar1 */ 9 | myvar1: 10 | /* Contents of myvar1 is initially '0' */ 11 | .word 0 12 | 13 | /* Ensure variable is 4-byte aligned */ 14 | .balign 4 15 | /* Define storage for myvar2 */ 16 | myvar2: 17 | /* Contents of myvar2 is initially '0' */ 18 | .word 0 19 | 20 | /* -- Code section */ 21 | .text 22 | 23 | /* Ensure function section starts 4 byte aligned */ 24 | .balign 4 25 | .global main 26 | main: 27 | ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */ 28 | mov r3, #3 /* r3 ← 3 */ 29 | str r3, [r1] /* *r1 ← r3 */ 30 | ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */ 31 | mov r3, #4 /* r3 ← 4 */ 32 | str r3, [r2] /* *r2 ← r3 */ 33 | 34 | ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */ 35 | ldr r1, [r1] /* r1 ← *r1 */ 36 | ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */ 37 | ldr r2, [r2] /* r2 ← *r2 */ 38 | add r0, r1, r2 /* r0 ← r1 + r2 */ 39 | bx lr /* Return from main */ 40 | 41 | /* Labels needed to access data */ 42 | addr_of_myvar1 : .word myvar1 43 | addr_of_myvar2 : .word myvar2 44 | -------------------------------------------------------------------------------- 
/chapter03/store02.s: -------------------------------------------------------------------------------- 1 | /* -- store02.s */ 2 | 3 | /* -- Data section */ 4 | .data 5 | 6 | /* Ensure variable is 4-byte aligned */ 7 | .balign 4 8 | /* Define storage for myvar1 */ 9 | myvar1: 10 | /* Contents of myvar1 is just '3' */ 11 | .word 3 12 | 13 | /* Ensure variable is 4-byte aligned */ 14 | .balign 4 15 | /* Define storage for myvar2 */ 16 | myvar2: 17 | /* Contents of myvar2 is just '4' */ 18 | .word 4 19 | 20 | /* Ensure variable is 4-byte aligned */ 21 | .balign 4 22 | /* Define storage for myvar3 */ 23 | myvar3: 24 | /* Contents of myvar3 is just '0' */ 25 | .word 0 26 | 27 | /* -- Code section */ 28 | .text 29 | 30 | /* Ensure function section starts 4 byte aligned */ 31 | .balign 4 32 | .global main 33 | main: 34 | ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */ 35 | ldr r1, [r1] /* r1 ← *r1 */ 36 | ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */ 37 | ldr r2, [r2] /* r2 ← *r2 */ 38 | add r3, r1, r2 /* r3 ← r1 + r2 */ 39 | ldr r12, addr_of_myvar3 /* r12 ← &myvar3. r12 is a scratch register; r4 is callee-saved and would have to be saved and restored */ 40 | str r3, [r12] /* *r12 ← r3 */ 41 | /* Clear registers to prove that 42 | we are actually loading something 43 | previously stored */ 44 | mov r0, #0 /* r0 ← 0 */ 45 | mov r1, #0 /* r1 ← 0 */ 46 | mov r2, #0 /* r2 ← 0 */ 47 | mov r3, #0 /* r3 ← 0 */ 48 | mov r12, #0 /* r12 ← 0 */ 49 | 50 | ldr r0, addr_of_myvar3 /* r0 ← &myvar3 */ 51 | ldr r0, [r0] /* r0 ← *r0 */ 52 | bx lr /* Return from main */ 53 | 54 | /* Labels needed to access data */ 55 | addr_of_myvar1 : .word myvar1 56 | addr_of_myvar2 : .word myvar2 57 | addr_of_myvar3 : .word myvar3 58 | -------------------------------------------------------------------------------- /chapter05/Makefile: -------------------------------------------------------------------------------- 1 | EXES=branch01 compare01 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | 
-------------------------------------------------------------------------------- /chapter05/branch01.s: -------------------------------------------------------------------------------- 1 | /* -- branch01.s */ 2 | 3 | .text 4 | .global main 5 | main: 6 | case_a: 7 | mov r0, #2 /* r0 ← 2 */ 8 | b end /* branch to end, skipping case_b */ 9 | case_b : 10 | mov r0, #3 /* r0 ← 3 (never reached: nothing branches to case_b) */ 11 | end: 12 | bx lr /* return from main */ 13 | -------------------------------------------------------------------------------- /chapter05/compare01.s: -------------------------------------------------------------------------------- 1 | /* -- compare01.s */ 2 | 3 | .text 4 | .global main 5 | main: 6 | mov r1, #2 /* r1 ← 2 */ 7 | mov r2, #2 /* r2 ← 2 */ 8 | cmp r1, r2 /* compare r1 and r2: update the cpsr flags with r1 - r2 */ 9 | beq case_equal /* branch to case_equal if Z = 1 */ 10 | case_different : 11 | mov r0, #2 /* r0 ← 2 */ 12 | b end /* branch to end */ 13 | case_equal: 14 | mov r0, #1 /* r0 ← 1 */ 15 | end: 16 | bx lr /* return from main */ 17 | -------------------------------------------------------------------------------- /chapter06/Makefile: -------------------------------------------------------------------------------- 1 | EXES=loop01 loop02 collatz 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter06/collatz.s: -------------------------------------------------------------------------------- 1 | /* -- collatz.s */ 2 | 3 | .text 4 | .global main 5 | main: 6 | mov r1, #123 /* r1 ← 123 */ 7 | mov r2, #0 /* r2 ← 0 */ 8 | loop: 9 | cmp r1, #1 /* compare r1 and 1 */ 10 | beq end /* branch to end if r1 == 1 */ 11 | 12 | and r3, r1, #1 /* r3 ← r1 & 1 */ 13 | cmp r3, #0 /* compare r3 and 0 */ 14 | bne odd /* branch to odd if r3 != 0 */ 15 | even: 16 | mov r1, r1, ASR #1 /* r1 ← (r1 >> 1) */ 17 | b end_loop /* branch to end_loop, skipping the odd case */ 18 | odd: 19 | add r1, r1, r1, LSL #1 /* r1 ← r1 + (r1 << 1) */ 20 | add r1, r1, #1 /* r1 
← r1 + 1 */ 21 | 22 | end_loop: 23 | add r2, r2, #1 /* r2 ← r2 + 1 */ 24 | b loop /* branch to loop */ 25 | 26 | end: 27 | mov r0, r2 /* r0 ← r2 */ 28 | bx lr /* return from main */ 29 | -------------------------------------------------------------------------------- /chapter06/loop01.s: -------------------------------------------------------------------------------- 1 | /* -- loop01.s */ 2 | 3 | .text 4 | .global main 5 | main: 6 | mov r1, #0 /* r1 ← 0 */ 7 | mov r2, #1 /* r2 ← 1 */ 8 | loop: 9 | cmp r2, #22 /* compare r2 and 22 */ 10 | bgt end /* branch if r2 > 22 to end */ 11 | add r1, r1, r2 /* r1 ← r1 + r2 */ 12 | add r2, r2, #1 /* r2 ← r2 + 1 */ 13 | b loop /* branch back to loop */ 14 | end: 15 | mov r0, r1 /* r0 ← r1 */ 16 | bx lr /* return from main */ 17 | -------------------------------------------------------------------------------- /chapter06/loop02.s: -------------------------------------------------------------------------------- 1 | /* -- loop02.s */ 2 | 3 | .text 4 | .global main 5 | main: 6 | mov r1, #0 /* r1 ← 0 */ 7 | mov r2, #1 /* r2 ← 1 */ 8 | b check_loop /* unconditionally jump to the check at the end of the loop */ 9 | loop: 10 | add r1, r1, r2 /* r1 ← r1 + r2 */ 11 | add r2, r2, #1 /* r2 ← r2 + 1 */ 12 | check_loop: 13 | cmp r2, #22 /* compare r2 and 22 */ 14 | ble loop /* branch if r2 <= 22 to the beginning of the loop */ 15 | end: 16 | mov r0, r1 /* r0 ← r1 */ 17 | bx lr /* return from main */ 18 | -------------------------------------------------------------------------------- /chapter06/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import string 4 | # Walk the Collatz sequence starting at 123, printing every step and finally the whole sequence. 5 | n = 123 6 | step = 0 7 | nums = [str(n)] 8 | 9 | while n != 1: 10 | print("Step: %d -> %d" % (step, n)) 11 | if n % 2 == 0: 12 | n = n // 2 13 | else: 14 | n = 3 * n + 1 15 | nums.append(str(n)) 16 | step = step + 1 17 | 18 | print("Step: %d -> %d" % (step, n)) 19 | 20 | print(", ".join(nums)) 21 | -------------------------------------------------------------------------------- /chapter07/Makefile: 
-------------------------------------------------------------------------------- 1 | EXES=rol 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter07/rol.s: -------------------------------------------------------------------------------- 1 | /* -- rol.s */ 2 | 3 | .data 4 | 5 | .balign 4 6 | value: 7 | .int 0x12345678 8 | 9 | .global main 10 | .text 11 | main: 12 | ldr r1, .Lcvalue /* r1 ← &value */ 13 | ldr r1, [r1] /* r1 ← *r1 */ 14 | mov r1, r1, ROR #31 /* rotate left by 1: there is no ROL shift operator, so rotate right by 32 - 1 */ 15 | mov r1, r1, ROR #1 /* rotate left by 31: rotate right by 32 - 31 */ 16 | 17 | eor r0, r0, r0 /* r0 ← r0 ^ r0, that is r0 ← 0 */ 18 | bx lr /* return from main */ 19 | .Lcvalue: .word value 20 | -------------------------------------------------------------------------------- /chapter08/Makefile: -------------------------------------------------------------------------------- 1 | EXES=array01 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter08/array01.s: -------------------------------------------------------------------------------- 1 | /* -- array01.s */ 2 | .data 3 | 4 | .balign 4 5 | a: .skip 400 6 | 7 | .balign 4 8 | b: .skip 8 9 | 10 | .text 11 | 12 | .global main 13 | main: 14 | ldr r1, addr_of_a /* r1 ← &a */ 15 | mov r2, #0 /* r2 ← 0 */ 16 | loop: 17 | cmp r2, #100 /* Have we reached 100 yet? 
*/ 18 | beq end /* If so, leave the loop, otherwise continue */ 19 | add r3, r1, r2, LSL #2 /* r3 ← r1 + r2 * 4 */ 20 | str r2, [r3] /* *r3 ← r2 */ 21 | add r2, r2, #1 /* r2 ← r2 + 1 */ 22 | b loop /* Go to the beginning of the loop */ 23 | end: 24 | bx lr /* return from main */ 25 | 26 | addr_of_a: .word a 27 | addr_of_b: .word b 28 | -------------------------------------------------------------------------------- /chapter09/Makefile: -------------------------------------------------------------------------------- 1 | EXES=hello01 printf01 printf02 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter09/hello01.s: -------------------------------------------------------------------------------- 1 | /* -- hello01.s */ 2 | .data 3 | 4 | greeting: 5 | .asciz "Hello world" 6 | 7 | .balign 4 8 | return: .word 0 9 | 10 | .text 11 | 12 | .global main 13 | main: 14 | ldr r1, address_of_return /* r1 ← &return */ 15 | str lr, [r1] /* *r1 ← lr */ 16 | 17 | ldr r0, address_of_greeting /* r0 ← &greeting */ 18 | /* First parameter of puts */ 19 | 20 | bl puts /* Call to puts */ 21 | /* lr ← address of next instruction */ 22 | 23 | ldr r1, address_of_return /* r1 ← &return */ 24 | ldr lr, [r1] /* lr ← *r1 */ 25 | bx lr /* return from main */ 26 | 27 | address_of_greeting: .word greeting 28 | address_of_return: .word return 29 | 30 | /* External */ 31 | .global puts 32 | -------------------------------------------------------------------------------- /chapter09/printf01.s: -------------------------------------------------------------------------------- 1 | /* -- printf01.s */ 2 | .data 3 | 4 | /* First message */ 5 | .balign 4 6 | message1: .asciz "Hey, type a number: " 7 | 8 | /* Second message */ 9 | .balign 4 10 | message2: 
.asciz "I read the number %d\n" 11 | 12 | /* Format pattern for scanf */ 13 | .balign 4 14 | scan_pattern : .asciz "%d" 15 | 16 | /* Where scanf will store the number read */ 17 | .balign 4 18 | number_read: .word 0 19 | 20 | .balign 4 21 | return: .word 0 22 | 23 | .text 24 | 25 | .global main 26 | main: 27 | ldr r1, address_of_return /* r1 ← &return */ 28 | str lr, [r1] /* *r1 ← lr */ 29 | 30 | ldr r0, address_of_message1 /* r0 ← &message1 */ 31 | bl printf /* call to printf */ 32 | 33 | ldr r0, address_of_scan_pattern /* r0 ← &scan_pattern */ 34 | ldr r1, address_of_number_read /* r1 ← &number_read */ 35 | bl scanf /* call to scanf */ 36 | 37 | ldr r0, address_of_message2 /* r0 ← &message2 */ 38 | ldr r1, address_of_number_read /* r1 ← &number_read */ 39 | ldr r1, [r1] /* r1 ← *r1 */ 40 | bl printf /* call to printf */ 41 | 42 | ldr r0, address_of_number_read /* r0 ← &number_read */ 43 | ldr r0, [r0] /* r0 ← *r0 */ 44 | 45 | ldr lr, address_of_return /* lr ← &return */ 46 | ldr lr, [lr] /* lr ← *lr */ 47 | bx lr /* return from main using lr */ 48 | 49 | 50 | address_of_message1 : .word message1 51 | address_of_message2 : .word message2 52 | address_of_scan_pattern : .word scan_pattern 53 | address_of_number_read : .word number_read 54 | address_of_return : .word return 55 | 56 | /* External */ 57 | .global printf 58 | .global scanf 59 | -------------------------------------------------------------------------------- /chapter09/printf02.s: -------------------------------------------------------------------------------- 1 | /* -- printf02.s */ 2 | .data 3 | 4 | /* First message */ 5 | .balign 4 6 | message1: .asciz "Hey, type a number: " 7 | 8 | /* Second message */ 9 | .balign 4 10 | message2: .asciz "%d times 5 is %d\n" 11 | 12 | /* Format pattern for scanf */ 13 | .balign 4 14 | scan_pattern : .asciz "%d" 15 | 16 | /* Where scanf will store the number read */ 17 | .balign 4 18 | number_read: .word 0 19 | 20 | .balign 4 21 | return: .word 0 
22 | 23 | .balign 4 24 | return2: .word 0 25 | 26 | .text 27 | 28 | /* 29 | mult_by_5 function: r0 ← 5 * r0 30 | */ 31 | mult_by_5: 32 | ldr r1, address_of_return2 /* r1 ← &return2 */ 33 | str lr, [r1] /* *r1 ← lr */ 34 | 35 | add r0, r0, r0, LSL #2 /* r0 ← r0 + 4*r0 */ 36 | 37 | ldr lr, address_of_return2 /* lr ← &return2 */ 38 | ldr lr, [lr] /* lr ← *lr */ 39 | bx lr /* return from mult_by_5 using lr */ 40 | address_of_return2 : .word return2 41 | 42 | .global main 43 | main: 44 | ldr r1, address_of_return /* r1 ← &return */ 45 | str lr, [r1] /* *r1 ← lr */ 46 | 47 | ldr r0, address_of_message1 /* r0 ← &message1 */ 48 | bl printf /* call to printf */ 49 | 50 | ldr r0, address_of_scan_pattern /* r0 ← &scan_pattern */ 51 | ldr r1, address_of_number_read /* r1 ← &number_read */ 52 | bl scanf /* call to scanf */ 53 | 54 | ldr r0, address_of_number_read /* r0 ← &number_read */ 55 | ldr r0, [r0] /* r0 ← *r0 */ 56 | bl mult_by_5 /* call to mult_by_5 */ 57 | 58 | mov r2, r0 /* r2 ← r0 */ 59 | ldr r1, address_of_number_read /* r1 ← &number_read */ 60 | ldr r1, [r1] /* r1 ← *r1 */ 61 | ldr r0, address_of_message2 /* r0 ← &message2 */ 62 | bl printf /* call to printf */ 63 | 64 | ldr lr, address_of_return /* lr ← &return */ 65 | ldr lr, [lr] /* lr ← *lr */ 66 | bx lr /* return from main using lr */ 67 | 68 | 69 | address_of_message1 : .word message1 70 | address_of_message2 : .word message2 71 | address_of_scan_pattern : .word scan_pattern 72 | address_of_number_read : .word number_read 73 | address_of_return : .word return 74 | 75 | /* External */ 76 | .global printf 77 | .global scanf 78 | -------------------------------------------------------------------------------- /chapter10/Makefile: -------------------------------------------------------------------------------- 1 | EXES=factorial01 factorial02 factorial03 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -o $@ $< 12 | 
13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter10/factorial01.s: -------------------------------------------------------------------------------- 1 | /* -- factorial01.s */ 2 | .data 3 | 4 | message1: .asciz "Type a number: " 5 | format: .asciz "%d" 6 | message2: .asciz "The factorial of %d is %d\n" 7 | 8 | .text 9 | /* factorial: receives n in r0 and returns the factorial of n in r0 */ 10 | factorial: 11 | str lr, [sp,#-4]! /* Push lr onto the top of the stack */ 12 | str r0, [sp,#-4]! /* Push r0 onto the top of the stack */ 13 | 14 | cmp r0, #0 /* compare r0 and 0 */ 15 | bne is_nonzero /* if r0 != 0 then branch */ 16 | mov r0, #1 /* r0 ← 1. This is the return value */ 17 | b end 18 | is_nonzero: 19 | /* Prepare the call to factorial(n-1) */ 20 | sub r0, r0, #1 /* r0 ← r0 - 1 */ 21 | bl factorial 22 | /* After the call r0 contains factorial(n-1) */ 23 | /* Load r0 (that we kept in the stack) into r1 */ 24 | ldr r1, [sp] /* r1 ← *sp */ 25 | mul r0, r0, r1 /* r0 ← r0 * r1 */ 26 | 27 | end: 28 | add sp, sp, #+4 /* Discard the r0 we kept in the stack */ 29 | ldr lr, [sp], #+4 /* Pop the top of the stack and put it in lr */ 30 | bx lr /* Leave factorial */ 31 | 32 | .globl main 33 | main: 34 | str lr, [sp,#-4]! 
/* Push lr onto the top of the stack */ 35 | sub sp, sp, #4 /* Make room for one 4 byte integer in the stack */ 36 | /* In these 4 bytes we will keep the number entered by */ 37 | /* the user */ 38 | 39 | ldr r0, address_of_message1 /* Set &message1 as the first parameter of printf */ 40 | bl printf /* Call printf */ 41 | 42 | ldr r0, address_of_format /* Set &format as the first parameter of scanf */ 43 | mov r1, sp /* Set the top of the stack as the second parameter */ 44 | /* of scanf */ 45 | bl scanf /* Call scanf */ 46 | 47 | ldr r0, [sp] /* Load the integer read by scanf into r0 */ 48 | /* So we set it as the first parameter of factorial */ 49 | bl factorial /* Call factorial */ 50 | 51 | mov r2, r0 /* Get the result of factorial and move it to r2 */ 52 | /* So we set it as the third parameter of printf */ 53 | ldr r1, [sp] /* Load the integer read by scanf into r1 */ 54 | /* So we set it as the second parameter of printf */ 55 | ldr r0, address_of_message2 /* Set &message2 as the first parameter of printf */ 56 | bl printf /* Call printf */ 57 | 58 | 59 | add sp, sp, #+4 /* Discard the integer read by scanf */ 60 | ldr lr, [sp], #+4 /* Pop the top of the stack and put it in lr */ 61 | bx lr /* Leave main */ 62 | 63 | address_of_message1: .word message1 64 | address_of_message2: .word message2 65 | address_of_format: .word format 66 | -------------------------------------------------------------------------------- /chapter10/factorial02.s: -------------------------------------------------------------------------------- 1 | /* -- factorial02.s */ 2 | .data 3 | 4 | message1: .asciz "Type a number: " 5 | format: .asciz "%d" 6 | message2: .asciz "The factorial of %d is %d\n" 7 | 8 | .text 9 | /* factorial: receives n in r0 and returns the factorial of n in r0 */ 10 | factorial: 11 | str lr, [sp,#-4]! /* Push lr onto the top of the stack */ 12 | str r4, [sp,#-4]! 
/* Push r4 onto the top of the stack */ 13 | mov r4, r0 /* Keep a copy of the initial value of r0 in r4 */ 14 | 15 | 16 | cmp r0, #0 /* compare r0 and 0 */ 17 | bne is_nonzero /* if r0 != 0 then branch */ 18 | mov r0, #1 /* r0 ← 1. This is the return value */ 19 | b end 20 | is_nonzero: 21 | /* Prepare the call to factorial(n-1) */ 22 | sub r0, r0, #1 /* r0 ← r0 - 1 */ 23 | bl factorial 24 | /* After the call r0 contains factorial(n-1) */ 25 | /* Load initial value of r0 (that we kept in r4) into r1 */ 26 | mov r1, r4 /* r1 ← r4 */ 27 | mul r0, r0, r1 /* r0 ← r0 * r1 */ 28 | 29 | end: 30 | ldr r4, [sp], #+4 /* Restore r4 */ 31 | ldr lr, [sp], #+4 /* Pop the top of the stack and put it in lr */ 32 | bx lr /* Leave factorial */ 33 | 34 | .globl main 35 | main: 36 | str lr, [sp,#-4]! /* Push lr onto the top of the stack */ 37 | sub sp, sp, #4 /* Make room for one 4 byte integer in the stack */ 38 | /* In these 4 bytes we will keep the number entered by */ 39 | /* the user */ 40 | 41 | ldr r0, address_of_message1 /* Set &message1 as the first parameter of printf */ 42 | bl printf /* Call printf */ 43 | 44 | ldr r0, address_of_format /* Set &format as the first parameter of scanf */ 45 | mov r1, sp /* Set the top of the stack as the second parameter */ 46 | /* of scanf */ 47 | bl scanf /* Call scanf */ 48 | 49 | ldr r0, [sp] /* Load the integer read by scanf into r0 */ 50 | /* So we set it as the first parameter of factorial */ 51 | bl factorial /* Call factorial */ 52 | 53 | mov r2, r0 /* Get the result of factorial and move it to r2 */ 54 | /* So we set it as the third parameter of printf */ 55 | ldr r1, [sp] /* Load the integer read by scanf into r1 */ 56 | /* So we set it as the second parameter of printf */ 57 | ldr r0, address_of_message2 /* Set &message2 as the first parameter of printf */ 58 | bl printf /* Call printf */ 59 | 60 | 61 | add sp, sp, #+4 /* Discard the integer read by scanf */ 62 | ldr lr, [sp], #+4 /* Pop the top of the stack and put it in lr */ 63 | bx 
lr /* Leave main */ 64 | 65 | address_of_message1: .word message1 66 | address_of_message2: .word message2 67 | address_of_format: .word format 68 | -------------------------------------------------------------------------------- /chapter10/factorial03.s: -------------------------------------------------------------------------------- 1 | /* -- factorial03.s */ 2 | .data 3 | 4 | message1: .asciz "Type a number: " 5 | format: .asciz "%d" 6 | message2: .asciz "The factorial of %d is %d\n" 7 | 8 | .text 9 | /* factorial: receives n in r0 and returns the factorial of n in r0 */ 10 | factorial: 11 | stmdb sp!, {r4, lr} /* Push r4 and lr onto the stack with a single instruction */ 12 | mov r4, r0 /* Keep a copy of the initial value of r0 in r4 */ 13 | 14 | 15 | cmp r0, #0 /* compare r0 and 0 */ 16 | bne is_nonzero /* if r0 != 0 then branch */ 17 | mov r0, #1 /* r0 ← 1. This is the return value */ 18 | b end 19 | is_nonzero: 20 | /* Prepare the call to factorial(n-1) */ 21 | sub r0, r0, #1 /* r0 ← r0 - 1 */ 22 | bl factorial 23 | /* After the call r0 contains factorial(n-1) */ 24 | /* Load initial value of r0 (that we kept in r4) into r1 */ 25 | mov r1, r4 /* r1 ← r4 */ 26 | mul r0, r0, r1 /* r0 ← r0 * r1 */ 27 | 28 | end: 29 | ldmia sp!, {r4, lr} /* Pop r4 and lr from the stack with a single instruction */ 30 | bx lr /* Leave factorial */ 31 | 32 | .globl main 33 | main: 34 | str lr, [sp,#-4]! 
/* Push lr onto the top of the stack */ 35 | sub sp, sp, #4 /* Make room for one 4 byte integer in the stack */ 36 | /* In these 4 bytes we will keep the number entered by */ 37 | /* the user */ 38 | 39 | ldr r0, address_of_message1 /* Set &message1 as the first parameter of printf */ 40 | bl printf /* Call printf */ 41 | 42 | ldr r0, address_of_format /* Set &format as the first parameter of scanf */ 43 | mov r1, sp /* Set the top of the stack as the second parameter */ 44 | /* of scanf */ 45 | bl scanf /* Call scanf */ 46 | 47 | ldr r0, [sp] /* Load the integer read by scanf into r0 */ 48 | /* So we set it as the first parameter of factorial */ 49 | bl factorial /* Call factorial */ 50 | 51 | mov r2, r0 /* Get the result of factorial and move it to r2 */ 52 | /* So we set it as the third parameter of printf */ 53 | ldr r1, [sp] /* Load the integer read by scanf into r1 */ 54 | /* So we set it as the second parameter of printf */ 55 | ldr r0, address_of_message2 /* Set &message2 as the first parameter of printf */ 56 | bl printf /* Call printf */ 57 | 58 | add sp, sp, #+4 /* Discard the integer read by scanf */ 59 | ldr lr, [sp], #+4 /* Pop the top of the stack and put it in lr */ 60 | bx lr /* Leave main */ 61 | 62 | address_of_message1: .word message1 63 | address_of_message2: .word message2 64 | address_of_format: .word format 65 | -------------------------------------------------------------------------------- /chapter10/test.c: -------------------------------------------------------------------------------- 1 | void f(int n) /* stores n + 1 in the local m and returns nothing */ 2 | { 3 | int m; 4 | m = n + 1; 5 | } 6 | -------------------------------------------------------------------------------- /chapter11/Makefile: -------------------------------------------------------------------------------- 1 | EXES=collatz02 collatz03 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm 
-vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter11/collatz02.s: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rofirrim/raspberry-pi-assembler/75685f80a35318777fad9dc33837698c19952e89/chapter11/collatz02.s -------------------------------------------------------------------------------- /chapter11/collatz03.s: -------------------------------------------------------------------------------- 1 | /* -- collatz03.s */ 2 | .data 3 | 4 | message: .asciz "Type a number: " 5 | scan_format : .asciz "%d" 6 | message2: .asciz "Length of the Hailstone sequence for %d is %d\n" 7 | 8 | 9 | .text 10 | 11 | collatz2: 12 | /* r0 contains the first argument */ 13 | push {r4} /* keep r4, it is overwritten below */ 14 | mov r4, r0 /* r4 ← r0, copy of the argument used to restart every repetition */ 15 | mov r3, #4194304 /* r3 ← 4194304, number of times the whole computation is repeated */ 16 | collatz_repeat: 17 | mov r1, r4 /* r1 ← r4 */ 18 | mov r0, #0 /* r0 ← 0 */ 19 | collatz2_loop: 20 | cmp r1, #1 /* compare r1 and 1 */ 21 | beq collatz2_end /* if r1 == 1 branch to collatz2_end */ 22 | and r2, r1, #1 /* r2 ← r1 & 1 */ 23 | cmp r2, #0 /* compare r2 and 0 */ 24 | moveq r1, r1, ASR #1 /* if r2 == 0, r1 ← r1 >> 1. This is r1 ← r1/2 */ 25 | addne r1, r1, r1, LSL #1 /* if r2 != 0, r1 ← r1 + (r1 << 1). This is r1 ← 3*r1 */ 26 | addne r1, r1, #1 /* if r2 != 0, r1 ← r1 + 1. 
*/ 27 | collatz2_end_loop: 28 | add r0, r0, #1 /* r0 ← r0 + 1 */ 29 | b collatz2_loop /* branch back to collatz2_loop */ 30 | collatz2_end: 31 | sub r3, r3, #1 /* r3 ← r3 - 1 */ 32 | cmp r3, #0 /* compare r3 and 0 */ 33 | bne collatz_repeat /* if r3 != 0 repeat the whole computation */ 34 | pop {r4} /* restore r4 */ 35 | bx lr /* leave collatz2, the result is in r0 */ 36 | 37 | .global main 38 | main: 39 | push {lr} /* keep lr */ 40 | sub sp, sp, #4 /* make room for 4 bytes in the stack */ 41 | 42 | ldr r0, address_of_message /* first parameter of printf: &message */ 43 | bl printf /* call printf */ 44 | 45 | ldr r0, address_of_scan_format /* first parameter of scanf: &scan_format */ 46 | mov r1, sp /* second parameter of scanf: 47 | address of the top of the stack */ 48 | bl scanf /* call scanf */ 49 | 50 | ldr r0, [sp] /* first parameter of collatz2: 51 | the value stored (by scanf) in the top of the stack */ 52 | bl collatz2 /* call collatz2 */ 53 | 54 | mov r2, r0 /* third parameter of printf: 55 | the result of collatz2 */ 56 | ldr r1, [sp] /* second parameter of printf: 57 | the value stored (by scanf) in the top of the stack */ 58 | ldr r0, address_of_message2 /* first parameter of printf: &message2 */ 59 | bl printf /* call printf */ 60 | 61 | add sp, sp, #4 /* discard the integer read by scanf */ 62 | pop {lr} /* restore lr */ 63 | bx lr /* leave main */ 64 | 65 | 66 | address_of_message: .word message 67 | address_of_scan_format: .word scan_format 68 | address_of_message2: .word message2 69 | -------------------------------------------------------------------------------- /chapter11/stats: -------------------------------------------------------------------------------- 1 | 2 | Performance counter stats for './collatz03' (25 runs): 3 | 4 | 4,179080 task-clock # 0,766 CPUs utilized ( +- 0,36% ) 5 | 6 | 0,005459041 seconds time elapsed ( +- 0,55% ) 7 | 8 | -------------------------------------------------------------------------------- /chapter11/test: -------------------------------------------------------------------------------- 1 | cpu-cycles 2 | cycles 3 | stalled-cycles-frontend 4 | idle-cycles-frontend 5 | stalled-cycles-backend 6 | idle-cycles-backend 7 | instructions 8 | 
cache-references 9 | cache-misses 10 | branch-instructions 11 | branches 12 | branch-misses 13 | bus-cycles 14 | cpu-clock 15 | task-clock 16 | page-faults 17 | faults 18 | minor-faults 19 | major-faults 20 | context-switches 21 | cs 22 | cpu-migrations 23 | migrations 24 | alignment-faults 25 | emulation-faults 26 | L1-dcache-loads 27 | L1-dcache-load-misses 28 | L1-dcache-stores 29 | L1-dcache-store-misses 30 | L1-dcache-prefetches 31 | L1-dcache-prefetch-misses 32 | L1-icache-loads 33 | L1-icache-load-misses 34 | L1-icache-prefetches 35 | L1-icache-prefetch-misses 36 | LLC-loads 37 | LLC-load-misses 38 | LLC-stores 39 | LLC-store-misses 40 | LLC-prefetches 41 | LLC-prefetch-misses 42 | dTLB-loads 43 | dTLB-load-misses 44 | dTLB-stores 45 | dTLB-store-misses 46 | dTLB-prefetches 47 | dTLB-prefetch-misses 48 | iTLB-loads 49 | iTLB-load-misses 50 | branch-loads 51 | branch-load-misses 52 | node-loads 53 | node-load-misses 54 | node-stores 55 | node-store-misses 56 | node-prefetches 57 | node-prefetch-misses 58 | -------------------------------------------------------------------------------- /chapter12/Makefile: -------------------------------------------------------------------------------- 1 | EXES=mult64 mult64_2 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter12/mult64.s: -------------------------------------------------------------------------------- 1 | /* -- mult64.s */ 2 | 3 | .data 4 | 5 | .align 8 6 | message : .asciz "Multiplication of %d by %d is %lld\n" 7 | 8 | .align 4 9 | number_a: .word 987654321 10 | number_b: .word 1234567890 11 | 12 | .text 13 | 14 | mult64: 15 | /* 64-bit multiplication. The first number is passed in r0 (low part), r1 (high part) and the second one in r2 (low part), r3 (high part). The lower 64 bits of the product are returned in r0 (low part), r1 (high part) */ 16 | /* Keep the registers that we are going to write */ 17 | 
push {r4, r5, r6, r7, r8, lr} 18 | /* For convenience, copy r0,r1 into r4,r5 */ 19 | mov r4, r0 /* r4 ← r0 */ 20 | mov r5, r1 /* r5 ← r1 */ 21 | 22 | umull r0, r6, r2, r4 /* r0,r6 ← r2 * r4. Unsigned: the low parts carry no sign bit */ 23 | umull r7, r8, r3, r4 /* r7,r8 ← r3 * r4. Only the low part (r7) is needed */ 24 | umull r4, r5, r2, r5 /* r4,r5 ← r2 * r5. Only the low part (r4) is needed */ 25 | add r2, r7, r4 /* r2 ← r7 + r4. A carry out of this sum belongs to bit 64 of the product, so it is dropped */ 26 | add r1, r2, r6 /* r1 ← r2 + r6 */ 27 | 28 | /* Restore registers */ 29 | pop {r4, r5, r6, r7, r8, lr} 30 | bx lr /* Leave mult64 */ 31 | 32 | mult64_2: 33 | /* Same contract as mult64: the arguments are passed in r0, r1 and r2, r3 and the result is returned in r0, r1 */ 34 | /* Keep the registers that we are going to write */ 35 | push {r4, r5, r6, lr} 36 | 37 | /* For convenience, copy r0,r1 into r4,r5 */ 38 | mov r4, r0 /* r4 ← r0 */ 39 | mov r5, r1 /* r5 ← r1 */ 40 | umull r0, r1, r2, r4 /* r0,r1 ← r2 * r4. Unsigned, as in mult64 */ 41 | smlal r1, r6, r3, r4 /* r1 ← r1 + LO(r3*r4). r6 only receives the upper half, which is discarded */ 42 | smlal r1, r6, r2, r5 /* r1 ← r1 + LO(r2*r5). r6 only receives the upper half, which is discarded */ 43 | 44 | /* Restore registers */ 45 | pop {r4, r5, r6, lr} 46 | bx lr 47 | 48 | .global main 49 | main: 50 | push {r4, r5, r6, lr} /* Keep the registers we are going to modify */ 51 | /* We have to load the number from memory because the literal value would 52 | not fit the instruction */ 53 | ldr r4, addr_number_a /* r4 ← &a */ 54 | ldr r4, [r4] /* r4 ← *r4 */ 55 | ldr r5, addr_number_b /* r5 ← &b */ 56 | ldr r5, [r5] /* r5 ← *r5 */ 57 | 58 | /* Now prepare the call to mult64 */ 59 | /* 60 | The first number is passed in 61 | registers r0,r1 and the second one in r2,r3 62 | Note that we pass 32-bit signed numbers, this is why 63 | the higher register is the sign extension of the lower one 64 | */ 65 | mov r0, r4 /* r0 ← r4 */ 66 | mov r1, r4, ASR #31 /* r1 ← r4 >> 31. All zeros or all ones, depending on the sign of r4 */ 67 | 68 | mov r2, r5 /* r2 ← r5 */ 69 | mov r3, r5, ASR #31 /* r3 ← r5 >> 31. All zeros or all ones, depending on the sign of r5 */ 70 | 71 | bl mult64 /* call mult64 function */ 72 | /* The result of the multiplication is in r0,r1 */ 73 | 74 | /* Now prepare the call to printf */ 75 | /* We have to pass &message, r4, r5 and r0,r1 */ 76 | /* Because of 
the calling convention &message and 77 | r4, r5 will be passed in registers r0, r1 and r2. 78 | The result of mult64 (still in r0,r1) must be passed 79 | in the stack because we ran out of registers for passing 80 | parameters. Technically we still have r3 but 81 | it is not an even numbered register so it cannot have 82 | the lower part of a 64-bit number (by convention) */ 83 | /* Note that arguments passed in the stack must be pushed 84 | in reverse order because we want parameters of lower positions 85 | to be in the stack in lower addresses (by convention) */ 86 | push {r1} /* Push r1 onto the stack. High part of the 4th parameter */ 87 | push {r0} /* Push r0 onto the stack. Low part of the 4th parameter */ 88 | mov r2, r5 /* r2 ← r5. 3rd parameter */ 89 | mov r1, r4 /* r1 ← r4. 2nd parameter */ 90 | ldr r0, addr_of_message /* r0 ← &message 1st parameter */ 91 | bl printf /* Call printf */ 92 | add sp, sp, #8 /* sp ← sp + 8 */ 93 | /* Pop the two registers we pushed above */ 94 | 95 | mov r0, #0 /* r0 ← 0 */ 96 | pop {r4, r5, r6, lr} /* Restore registers we kept */ 97 | bx lr /* Leave main */ 98 | 99 | addr_of_message : .word message 100 | addr_number_a: .word number_a 101 | addr_number_b: .word number_b 102 | -------------------------------------------------------------------------------- /chapter12/mult64_2.s: -------------------------------------------------------------------------------- 1 | /* -- mult64_2.s */ 2 | .data 3 | 4 | .align 8 5 | message : .asciz "Multiplication of %lld by %lld is %lld\n" 6 | 7 | .align 4 8 | number_a_low: .word 3755744309 9 | number_a_high: .word 2 10 | 11 | number_b_low: .word 12345678 12 | number_b_high: .word 0 13 | 14 | .text 15 | 16 | mult64: 17 | /* The argument will be passed in r0, r1 and r2, r3 and returned in r0, r1 */ 18 | /* Keep the registers that we are going to write */ 19 | push {r4, r5, r6, r7, r8, lr} 20 | /* For convenience, copy r0,r1 into r4,r5 */ 21 | mov r4, r0 /* r4 ← r0 */ 22 | mov r5, r1 /* r5 ← r1 */ 23 | 24 | umull r0, r6, r2, r4 /* r0,r6 ← 
r2 * r4 */ 25 | umull r7, r8, r3, r4 /* r7,r8 ← r3 * r4. Only the low part (r7) is needed */ 26 | umull r4, r5, r2, r5 /* r4,r5 ← r2 * r5. Only the low part (r4) is needed */ 27 | add r2, r7, r4 /* r2 ← r7 + r4. A carry out of this sum belongs to bit 64 of the product, so it is dropped */ 28 | add r1, r2, r6 /* r1 ← r2 + r6 */ 29 | 30 | /* Restore registers */ 31 | pop {r4, r5, r6, r7, r8, lr} 32 | bx lr /* Leave mult64 */ 33 | 34 | mult64_2: 35 | /* The argument will be passed in r0, r1 and r2, r3 and returned in r0, r1 */ 36 | /* Keep the registers that we are going to write */ 37 | push {r4, r5, r6, lr} 38 | 39 | /* For convenience, copy r0,r1 into r4,r5 */ 40 | mov r4, r0 /* r4 ← r0 */ 41 | mov r5, r1 /* r5 ← r1 */ 42 | umull r0, r1, r2, r4 /* r0,r1 ← r2 * r4 */ 43 | umlal r1, r6, r3, r4 /* r1 ← r1 + LO(r3*r4). r6 only receives the upper half, which is discarded */ 44 | umlal r1, r6, r2, r5 /* r1 ← r1 + LO(r2*r5). r6 only receives the upper half, which is discarded */ 45 | 46 | /* Restore registers */ 47 | pop {r4, r5, r6, lr} 48 | bx lr 49 | 50 | .global main 51 | main: 52 | push {r4, r5, r6, r7, r8, lr} /* Keep the registers we are going to modify */ 53 | /* We have to load the number from memory because the literal value would 54 | not fit the instruction */ 55 | ldr r4, addr_number_a_low /* r4 ← &a_low */ 56 | ldr r4, [r4] /* r4 ← *r4 */ 57 | ldr r5, addr_number_a_high /* r5 ← &a_high */ 58 | ldr r5, [r5] /* r5 ← *r5 */ 59 | 60 | ldr r6, addr_number_b_low /* r6 ← &b_low */ 61 | ldr r6, [r6] /* r6 ← *r6 */ 62 | ldr r7, addr_number_b_high /* r7 ← &b_high */ 63 | ldr r7, [r7] /* r7 ← *r7 */ 64 | 65 | /* Now prepare the call to mult64 */ 66 | /* 67 | The first number is passed in 68 | registers r0,r1 and the second one in r2,r3 69 | Both are full 64-bit numbers: the lower part goes in 70 | the first register and the higher part in the second one 71 | */ 72 | mov r0, r4 /* r0 ← r4 */ 73 | mov r1, r5 /* r1 ← r5 */ 74 | 75 | mov r2, r6 /* r2 ← r6 */ 76 | mov r3, r7 /* r3 ← r7 */ 77 | 78 | bl mult64 /* call mult64 function */ 79 | /* The result of the multiplication is in r0,r1 */ 80 | 81 | /* Now prepare the call to printf */ 82 | /* We have to pass &message, {r4,r5}, {r6,r7} and 
{r0,r1} */ 83 | # push {r1} /* Push r1 onto the stack. 7th parameter */ 84 | # push {r0} /* Push r0 onto the stack. 6th parameter */ 85 | # push {r7} /* Push r7 onto the stack. 5th parameter */ 86 | # push {r6} /* Push r6 onto the stack. 4th parameter */ 87 | push {r0,r1} /* Push r0,r1 onto the stack. 4th parameter (the product) */ 88 | push {r6,r7} /* Push r6,r7 onto the stack. 3rd parameter */ 89 | mov r3, r5 /* r3 ← r5. High part of the 2nd parameter */ 90 | mov r2, r4 /* r2 ← r4. Low part of the 2nd parameter */ 91 | ldr r0, addr_of_message /* r0 ← &message 1st parameter */ 92 | bl printf /* Call printf */ 93 | add sp, sp, #16 /* sp ← sp + 16 */ 94 | /* Pop the four registers we pushed above */ 95 | 96 | mov r0, #0 /* r0 ← 0 */ 97 | pop {r4, r5, r6, r7, r8, lr} /* Restore registers we kept */ 98 | bx lr /* Leave main */ 99 | 100 | addr_of_message : .word message 101 | addr_number_a_low: .word number_a_low 102 | addr_number_a_high: .word number_a_high 103 | 104 | addr_number_b_low: .word number_b_low 105 | addr_number_b_high: .word number_b_high 106 | -------------------------------------------------------------------------------- /chapter13/Makefile: -------------------------------------------------------------------------------- 1 | EXES=addf 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -mcpu=arm1176jzf-s -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter13/addf.s: -------------------------------------------------------------------------------- 1 | /* -- addf.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | array_of_floats_1: 7 | .float 1.2, 3.4, 5.6, 7.8, 9.10, 10.11, 12.13, 14.15 8 | 9 | .align 4 10 | array_of_floats_2: 11 | .float 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 12 | 13 | .text 14 | 15 | .global main 16 | main: 17 | push {r4, r5, r6, lr} 18 | 19 | ldr r4, addr_of_array_of_floats_1 20 | fldmias r4, {s8-s15} /* Load 8 floats from [r4] to {s8-s15} */ 21 | 22 | ldr r4, 
addr_of_array_of_floats_2 23 | fldmias r4, {s16-s23} /* Load 8 floats from [r4] to {s16-s23} */ 24 | 25 | /* Set the LEN field of FPSCR to be 8 (value 7) */ 26 | mov r5, #0b111 /* r5 ← 7 */ 27 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 28 | fmrx r4, fpscr /* r4 ← fpscr */ 29 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 30 | fmxr fpscr, r4 /* fpscr ← r4 */ 31 | 32 | fadds s24, s8, s16 /* {s24-s31} ← {s8-s15} + {s16-s23}. NOTE(review): s16-s31 are written here but never saved or restored by this function; confirm this is acceptable for main */ 33 | 34 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 35 | mvn r5, r5 /* r5 ← ~r5. Every bit set except the three LEN bits */ 36 | fmrx r4, fpscr /* r4 ← fpscr */ 37 | and r4, r4, r5 /* r4 ← r4 & r5 */ 38 | fmxr fpscr, r4 /* fpscr ← r4 */ 39 | 40 | pop {r4, r5, r6, lr} 41 | mov r0, #0 /* r0 ← 0. Return value of main */ 42 | bx lr 43 | 44 | addr_of_array_of_floats_1 : .word array_of_floats_1 45 | addr_of_array_of_floats_2 : .word array_of_floats_2 46 | -------------------------------------------------------------------------------- /chapter14/Makefile: -------------------------------------------------------------------------------- 1 | EXES=matmul benchmark 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter14/benchmark.s: -------------------------------------------------------------------------------- 1 | /* -- benchmark.s */ 2 | .data 3 | mat_A: .float 0.1, 0.2, 0.0, 0.1 4 | .float 0.2, 0.1, 0.3, 0.0 5 | .float 0.0, 0.3, 0.1, 0.5 6 | .float 0.0, 0.6, 0.4, 0.1 7 | mat_B: .float 4.92, 2.54, -0.63, -1.75 8 | .float 3.02, -1.51, -0.87, 1.35 9 | .float -4.29, 2.14, 0.71, 0.71 10 | .float -0.95, 0.48, 2.38, -0.95 11 | mat_C: .float 0.0, 0.0, 0.0, 0.0 12 | .float 0.0, 0.0, 0.0, 0.0 13 | .float 0.0, 0.0, 0.0, 0.0 14 | .float 0.0, 0.0, 0.0, 0.0 15 | .float 0.0, 0.0, 0.0, 0.0 16 | 17 | format_result : .asciz "Matrix result is:\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f 
%5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n" 18 | 19 | .text 20 | 21 | naive_matmul_4x4: 22 | /* r0 address of A 23 | r1 address of B 24 | r2 address of C 25 | */ 26 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 27 | /* First zero the 16 single-precision floats of C */ 28 | /* In IEEE 754, all bits cleared means 0.0 */ 29 | mov r4, r2 30 | mov r5, #16 31 | mov r6, #0 32 | b .L0_loop_init_test 33 | .L0_loop_init : 34 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 35 | .L0_loop_init_test: 36 | subs r5, r5, #1 37 | bge .L0_loop_init /* the store also runs when r5 reaches 0: 16 stores in total */ 38 | 39 | /* We will use 40 | r4 as i 41 | r5 as j 42 | r6 as k 43 | */ 44 | mov r4, #0 /* r4 ← 0 */ 45 | .L0_loop_i: /* loop header of i */ 46 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 47 | beq .L0_end_loop_i 48 | mov r5, #0 /* r5 ← 0 */ 49 | .L0_loop_j: /* loop header of j */ 50 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 51 | beq .L0_end_loop_j 52 | /* Compute the address of C[i][j] and load it into s0 */ 53 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 54 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 55 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 56 | This is r7 ← j + i * 4. 57 | We multiply i by the row size (4 elements) */ 58 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 59 | This is r7 ← C + 4*(j + i * 4) 60 | We multiply (j + i * 4) by the size of the element. 61 | A single-precision floating point takes 4 bytes. 62 | */ 63 | vldr s0, [r7] /* s0 ← *r7 */ 64 | 65 | mov r6, #0 /* r6 ← 0 */ 66 | .L0_loop_k : /* loop header of k */ 67 | cmp r6, #4 /* if r6 == 4 goto end of the loop k */ 68 | beq .L0_end_loop_k 69 | 70 | /* Compute the address of a[i][k] and load it into s1 */ 71 | /* Address of a[i][k] is a + 4*(4 * i + k) */ 72 | mov r8, r6 /* r8 ← r6. This is r8 ← k */ 73 | adds r8, r8, r4, LSL #2 /* r8 ← r8 + (r4 << 2). This is r8 ← k + i * 4 */ 74 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). 
This is r8 ← a + 4*(k + i * 4) */ 75 | vldr s1, [r8] /* s1 ← *r8 */ 76 | 77 | /* Compute the address of b[k][j] and load it into s2 */ 78 | /* Address of b[k][j] is b + 4*(4 * k + j) */ 79 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 80 | adds r8, r8, r6, LSL #2 /* r8 ← r8 + (r6 << 2). This is r8 ← j + k * 4 */ 81 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + k * 4) */ 82 | vldr s2, [r8] /* s2 ← *r8 */ 83 | 84 | vmul.f32 s3, s1, s2 /* s3 ← s1 * s2 */ 85 | vadd.f32 s0, s0, s3 /* s0 ← s0 + s3 */ 86 | 87 | add r6, r6, #1 /* r6 ← r6 + 1 */ 88 | b .L0_loop_k /* next iteration of loop k */ 89 | .L0_end_loop_k: /* Here ends loop k */ 90 | vstr s0, [r7] /* Store s0 back to C[i][j] */ 91 | add r5, r5, #1 /* r5 ← r5 + 1 */ 92 | b .L0_loop_j /* next iteration of loop j */ 93 | .L0_end_loop_j: /* Here ends loop j */ 94 | add r4, r4, #1 /* r4 ← r4 + 1 */ 95 | b .L0_loop_i /* next iteration of loop i */ 96 | .L0_end_loop_i: /* Here ends loop i */ 97 | 98 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 99 | bx lr /* Leave function */ 100 | 101 | naive_vectorial_matmul_4x4: 102 | /* r0 address of A 103 | r1 address of B 104 | r2 address of C 105 | */ 106 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 107 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 108 | vpush {s24-s27} 109 | /* First zero 16 single floating point */ 110 | /* In IEEE 754, all bits cleared means 0 */ 111 | mov r4, r2 112 | mov r5, #16 113 | mov r6, #0 114 | b .L1_loop_init_test 115 | .L1_loop_init : 116 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 117 | .L1_loop_init_test: 118 | subs r5, r5, #1 119 | bge .L1_loop_init 120 | 121 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 122 | mov r5, #0b011 /* r5 ← 3 */ 123 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 124 | fmrx r4, fpscr /* r4 ← fpscr */ 125 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 126 | fmxr fpscr, r4 /* fpscr ← r4 */ 127 | 128 | /* We will use 129 | r4 as i 
130 | r5 as j 131 | */ 132 | mov r4, #0 /* r4 ← 0 */ 133 | .L1_loop_i: /* loop header of i */ 134 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 135 | beq .L1_end_loop_i 136 | mov r5, #0 /* r5 ← 0 */ 137 | .L1_loop_j: /* loop header of j */ 138 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 139 | beq .L1_end_loop_j 140 | /* Compute the address of C[i][j] in r7 */ 141 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 142 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 143 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 144 | This is r7 ← j + i * 4. 145 | We multiply i by the row size (4 elements) */ 146 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 147 | This is r7 ← C + 4*(j + i * 4) 148 | We multiply (j + i * 4) by the size of the element. 149 | A single-precision floating point takes 4 bytes. 150 | */ 151 | /* Compute the address of a[i][0] */ 152 | mov r8, r4, LSL #2 /* r8 ← r4 << 2. This is r8 ← i * 4 */ 153 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(i * 4) */ 154 | vldmia r8, {s8-s11} /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */ 155 | 156 | /* Compute the address of b[0][j] */ 157 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 158 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */ 159 | vldr s16, [r8] /* s16 ← *r8. This is s16 ← b[0][j] */ 160 | vldr s17, [r8, #16] /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */ 161 | vldr s18, [r8, #32] /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */ 162 | vldr s19, [r8, #48] /* s19 ← *(r8 + 48). 
This is s19 ← b[3][j] */ 163 | 164 | vmul.f32 s24, s8, s16 /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */ 165 | vmov.f32 s0, s24 /* s0 ← s24 */ 166 | vadd.f32 s0, s0, s25 /* s0 ← s0 + s25 */ 167 | vadd.f32 s0, s0, s26 /* s0 ← s0 + s26 */ 168 | vadd.f32 s0, s0, s27 /* s0 ← s0 + s27 */ 169 | 170 | vstr s0, [r7] /* Store s0 to C[i][j] */ 171 | add r5, r5, #1 /* r5 ← r5 + 1 */ 172 | b .L1_loop_j /* next iteration of loop j */ 173 | .L1_end_loop_j: /* Here ends loop j */ 174 | add r4, r4, #1 /* r4 ← r4 + 1 */ 175 | b .L1_loop_i /* next iteration of loop i */ 176 | .L1_end_loop_i: /* Here ends loop i */ 177 | 178 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 179 | mov r5, #0b011 /* r5 ← 3 */ 180 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 181 | fmrx r4, fpscr /* r4 ← fpscr */ 182 | and r4, r4, r5 /* r4 ← r4 & r5 */ 183 | fmxr fpscr, r4 /* fpscr ← r4 */ 184 | 185 | vpop {s24-s27} /* Restore preserved floating registers */ 186 | vpop {s16-s19} 187 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 188 | bx lr /* Leave function */ 189 | 190 | naive_vectorial_matmul_2_4x4: 191 | /* r0 address of A 192 | r1 address of B 193 | r2 address of C 194 | */ 195 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 196 | vpush {s16-s31} /* Floating point registers starting from s16 must be preserved */ 197 | /* First zero 16 single floating point */ 198 | /* In IEEE 754, all bits cleared means 0 */ 199 | mov r4, r2 200 | mov r5, #16 201 | mov r6, #0 202 | b .L2_loop_init_test 203 | .L2_loop_init : 204 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 205 | .L2_loop_init_test: 206 | subs r5, r5, #1 207 | bge .L2_loop_init 208 | 209 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 210 | mov r5, #0b011 /* r5 ← 3 */ 211 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 212 | fmrx r4, fpscr /* r4 ← fpscr */ 213 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 214 | fmxr fpscr, r4 /* fpscr ← r4 */ 215 | 216 | /* We will use 217 | r4 as i 218 | r5 as j 219 | 
*/ 220 | mov r4, #0 /* r4 ← 0 */ 221 | .L2_loop_i: /* loop header of i */ 222 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 223 | beq .L2_end_loop_i 224 | mov r5, #0 /* r5 ← 0 */ 225 | .L2_loop_j: /* loop header of j */ 226 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 227 | beq .L2_end_loop_j 228 | /* Compute the address of C[i][j] in r7 */ 229 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 230 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 231 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 232 | This is r7 ← j + i * 4. 233 | We multiply i by the row size (4 elements) */ 234 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 235 | This is r7 ← C + 4*(j + i * 4) 236 | We multiply (j + i * 4) by the size of the element. 237 | A single-precision floating point takes 4 bytes. 238 | */ 239 | /* Compute the address of a[i][0] */ 240 | mov r8, r4, LSL #2 /* r8 ← r4 << 2. This is r8 ← i * 4 */ 241 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(i * 4) */ 242 | vldmia r8, {s8-s11} /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */ 243 | 244 | /* Compute the address of b[0][j] */ 245 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 246 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */ 247 | vldr s16, [r8] /* s16 ← *r8. This is s16 ← b[0][j] */ 248 | vldr s17, [r8, #16] /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */ 249 | vldr s18, [r8, #32] /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */ 250 | vldr s19, [r8, #48] /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */ 251 | 252 | /* Compute the address of b[0][j+1] */ 253 | add r8, r5, #1 /* r8 ← r5 + 1. This is r8 ← j + 1 */ 254 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + 1) */ 255 | vldr s20, [r8] /* s20 ← *r8. This is s20 ← b[0][j + 1] */ 256 | vldr s21, [r8, #16] /* s21 ← *(r8 + 16). This is s21 ← b[1][j + 1] */ 257 | vldr s22, [r8, #32] /* s22 ← *(r8 + 32). This is s22 ← b[2][j + 1] */ 258 | vldr s23, [r8, #48] /* s23 ← *(r8 + 48). 
This is s23 ← b[3][j + 1] */ 259 | 260 | vmul.f32 s24, s8, s16 /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */ 261 | vmov.f32 s0, s24 /* s0 ← s24 */ 262 | vadd.f32 s0, s0, s25 /* s0 ← s0 + s25 */ 263 | vadd.f32 s0, s0, s26 /* s0 ← s0 + s26 */ 264 | vadd.f32 s0, s0, s27 /* s0 ← s0 + s27 */ 265 | 266 | vmul.f32 s28, s8, s20 /* {s28,s29,s30,s31} ← {s8,s9,s10,s11} * {s20,s21,s22,s23} */ 267 | 268 | vmov.f32 s1, s28 /* s1 ← s28 */ 269 | vadd.f32 s1, s1, s29 /* s1 ← s1 + s29 */ 270 | vadd.f32 s1, s1, s30 /* s1 ← s1 + s30 */ 271 | vadd.f32 s1, s1, s31 /* s1 ← s1 + s31 */ 272 | 273 | vstmia r7, {s0-s1} /* {C[i][j], C[i][j+1]} ← {s0, s1} */ 274 | 275 | add r5, r5, #2 /* r5 ← r5 + 2 */ 276 | b .L2_loop_j /* next iteration of loop j */ 277 | .L2_end_loop_j: /* Here ends loop j */ 278 | add r4, r4, #1 /* r4 ← r4 + 1 */ 279 | b .L2_loop_i /* next iteration of loop i */ 280 | .L2_end_loop_i: /* Here ends loop i */ 281 | 282 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 283 | mov r5, #0b011 /* r5 ← 3 */ 284 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 285 | fmrx r4, fpscr /* r4 ← fpscr */ 286 | and r4, r4, r5 /* r4 ← r4 & r5 */ 287 | fmxr fpscr, r4 /* fpscr ← r4 */ 288 | 289 | vpop {s16-s31} /* Restore preserved floating registers */ 290 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 291 | bx lr /* Leave function */ 292 | 293 | better_vectorial_matmul_4x4: 294 | /* r0 address of A 295 | r1 address of B 296 | r2 address of C 297 | */ 298 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 299 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 300 | vpush {s24-s27} 301 | /* First zero 16 single floating point */ 302 | /* In IEEE 754, all bits cleared means 0 */ 303 | mov r4, r2 304 | mov r5, #16 305 | mov r6, #0 306 | b .L3_loop_init_test 307 | .L3_loop_init : 308 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 309 | .L3_loop_init_test: 310 | subs r5, r5, #1 311 | bge .L3_loop_init 312 | 313 | /* Set 
the LEN field of FPSCR to be 4 (value 3) */ 314 | mov r5, #0b011 /* r5 ← 3 */ 315 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 316 | fmrx r4, fpscr /* r4 ← fpscr */ 317 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 318 | fmxr fpscr, r4 /* fpscr ← r4 */ 319 | 320 | /* We will use 321 | r4 as k 322 | r5 as i 323 | */ 324 | mov r4, #0 /* r4 ← 0 */ 325 | .L3_loop_k: /* loop header of k */ 326 | cmp r4, #4 /* if r4 == 4 goto end of the loop k */ 327 | beq .L3_end_loop_k 328 | mov r5, #0 /* r5 ← 0 */ 329 | .L3_loop_i: /* loop header of i */ 330 | cmp r5, #4 /* if r5 == 4 goto end of the loop i */ 331 | beq .L3_end_loop_i 332 | /* Compute the address of C[i][0] */ 333 | /* Address of C[i][0] is C + 4*(4 * i) */ 334 | add r7, r2, r5, LSL #4 /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */ 335 | vldmia r7, {s8-s11} /* Load {s8,s9,s10,s11} ← {c[i][0], c[i][1], c[i][2], c[i][3]} */ 336 | /* Compute the address of A[i][k] */ 337 | /* Address of A[i][k] is A + 4*(4*i + k) */ 338 | add r8, r4, r5, LSL #2 /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */ 339 | add r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */ 340 | vldr s0, [r8] /* Load s0 ← a[i][k] */ 341 | 342 | /* Compute the address of B[k][0] */ 343 | /* Address of B[k][0] is B + 4*(4*k) */ 344 | add r8, r1, r4, LSL #4 /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */ 345 | vldmia r8, {s16-s19} /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */ 346 | 347 | vmul.f32 s24, s16, s0 /* {s24,s25,s26,s27} ← {s16,s17,s18,s19} * {s0,s0,s0,s0} */ 348 | vadd.f32 s8, s8, s24 /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + {s24,s25,s26,s27} */ 349 | 350 | vstmia r7, {s8-s11} /* Store {c[i][0],c[i][1],c[i][2],c[i][3]} ← {s8,s9,s10,s11} */ 351 | 352 | add r5, r5, #1 /* r5 ← r5 + 1. This is i = i + 1 */ 353 | b .L3_loop_i /* next iteration of loop i */ 354 | .L3_end_loop_i: /* Here ends loop i */ 355 | add r4, r4, #1 /* r4 ← r4 + 1. 
This is k = k + 1 */ 356 | b .L3_loop_k /* next iteration of loop k */ 357 | .L3_end_loop_k: /* Here ends loop k */ 358 | 359 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 360 | mov r5, #0b011 /* r5 ← 3 */ 361 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 362 | fmrx r4, fpscr /* r4 ← fpscr */ 363 | and r4, r4, r5 /* r4 ← r4 & r5 */ 364 | fmxr fpscr, r4 /* fpscr ← r4 */ 365 | 366 | vpop {s24-s27} /* Restore preserved floating registers */ 367 | vpop {s16-s19} 368 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 369 | bx lr /* Leave function */ 370 | 371 | best_vectorial_matmul_4x4: 372 | /* r0 address of A 373 | r1 address of B 374 | r2 address of C 375 | */ 376 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 377 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 378 | 379 | /* First zero 16 single floating point */ 380 | /* In IEEE 754, all bits cleared means 0 */ 381 | mov r4, r2 382 | mov r5, #16 383 | mov r6, #0 384 | b .L4_loop_init_test 385 | .L4_loop_init : 386 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 387 | .L4_loop_init_test: 388 | subs r5, r5, #1 389 | bge .L4_loop_init 390 | 391 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 392 | mov r5, #0b011 /* r5 ← 3 */ 393 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 394 | fmrx r4, fpscr /* r4 ← fpscr */ 395 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 396 | fmxr fpscr, r4 /* fpscr ← r4 */ 397 | 398 | /* We will use 399 | r4 as k 400 | r5 as i 401 | */ 402 | mov r4, #0 /* r4 ← 0 */ 403 | .L4_loop_k: /* loop header of k */ 404 | cmp r4, #4 /* if r4 == 4 goto end of the loop k */ 405 | beq .L4_end_loop_k 406 | mov r5, #0 /* r5 ← 0 */ 407 | .L4_loop_i: /* loop header of i */ 408 | cmp r5, #4 /* if r5 == 4 goto end of the loop i */ 409 | beq .L4_end_loop_i 410 | /* Compute the address of C[i][0] */ 411 | /* Address of C[i][0] is C + 4*(4 * i) */ 412 | add r7, r2, r5, LSL #4 /* r7 ← r2 + (r5 << 4). 
This is r7 ← c + 4*4*i */ 413 | vldmia r7, {s8-s15} /* Load {s8,s9,s10,s11,s12,s13,s14,s15} 414 | ← {c[i][0], c[i][1], c[i][2], c[i][3], 415 | c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} */ 416 | /* Compute the address of A[i][k] */ 417 | /* Address of A[i][k] is A + 4*(4*i + k) */ 418 | add r8, r4, r5, LSL #2 /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */ 419 | add r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */ 420 | vldr s0, [r8] /* Load s0 ← a[i][k] */ 421 | vldr s1, [r8, #16] /* Load s1 ← a[i+1][k] */ 422 | 423 | /* Compute the address of B[k][0] */ 424 | /* Address of B[k][0] is B + 4*(4*k) */ 425 | add r8, r1, r4, LSL #4 /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */ 426 | vldmia r8, {s16-s19} /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */ 427 | 428 | vmla.f32 s8, s16, s0 /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + ({s16,s17,s18,s19} * {s0,s0,s0,s0}) */ 429 | vmla.f32 s12, s16, s1 /* {s12,s13,s14,s15} ← {s12,s13,s14,s15} + ({s16,s17,s18,s19} * {s1,s1,s1,s1}) */ 430 | 431 | vstmia r7, {s8-s15} /* Store {c[i][0], c[i][1], c[i][2], c[i][3], 432 | c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} 433 | ← {s8,s9,s10,s11,s12,s13,s14,s15} */ 434 | 435 | add r5, r5, #2 /* r5 ← r5 + 2. This is i = i + 2 */ 436 | b .L4_loop_i /* next iteration of loop i */ 437 | .L4_end_loop_i: /* Here ends loop i */ 438 | add r4, r4, #1 /* r4 ← r4 + 1. 
This is k = k + 1 */ 439 | b .L4_loop_k /* next iteration of loop k */ 440 | .L4_end_loop_k: /* Here ends loop k */ 441 | 442 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 443 | mov r5, #0b011 /* r5 ← 3 */ 444 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 445 | fmrx r4, fpscr /* r4 ← fpscr */ 446 | and r4, r4, r5 /* r4 ← r4 & r5 */ 447 | fmxr fpscr, r4 /* fpscr ← r4 */ 448 | 449 | vpop {s16-s19} /* Restore preserved floating registers */ 450 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 451 | bx lr /* Leave function */ 452 | 453 | .globl main 454 | main: 455 | push {r4, r5, r6, lr} /* Keep integer registers */ 456 | 457 | ldr r0, addr_mat_A /* r0 ← a */ 458 | ldr r1, addr_mat_B /* r1 ← b */ 459 | ldr r2, addr_mat_C /* r2 ← c */ 460 | mov r4, #1 /* r4 ← 1 */ 461 | mov r4, r4, LSL #21 /* r4 ← r4 << 21. Number of times the multiplication is repeated */ 462 | .Lmain_loop_test: 463 | bl best_vectorial_matmul_4x4 /* r0, r1, r2 are not reloaded: the routine called here does not write them */ 464 | subs r4, r4, #1 /* r4 ← r4 - 1 and update cpsr */ 465 | bne .Lmain_loop_test /* Should have been 'bge' */ 466 | 467 | mov r0, #0 /* r0 ← 0. Return value of main */ 468 | pop {r4, r5, r6, lr} 469 | bx lr 470 | 471 | addr_mat_A : .word mat_A 472 | addr_mat_B : .word mat_B 473 | addr_mat_C : .word mat_C 474 | addr_format_result : .word format_result 475 | -------------------------------------------------------------------------------- /chapter14/matmul.s: -------------------------------------------------------------------------------- 1 | /* -- matmul.s */ 2 | .data 3 | mat_A: .float 0.1, 0.2, 0.0, 0.1 4 | .float 0.2, 0.1, 0.3, 0.0 5 | .float 0.0, 0.3, 0.1, 0.5 6 | .float 0.0, 0.6, 0.4, 0.1 7 | mat_B: .float 4.92, 2.54, -0.63, -1.75 8 | .float 3.02, -1.51, -0.87, 1.35 9 | .float -4.29, 2.14, 0.71, 0.71 10 | .float -0.95, 0.48, 2.38, -0.95 11 | mat_C: .float 0.0, 0.0, 0.0, 0.0 12 | .float 0.0, 0.0, 0.0, 0.0 13 | .float 0.0, 0.0, 0.0, 0.0 14 | .float 0.0, 0.0, 0.0, 0.0 15 | .float 0.0, 0.0, 0.0, 0.0 16 | 17 | format_result : .asciz "Matrix result is:\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n" 18 | 19 | .text 20 | 21 | 
naive_matmul_4x4: 22 | /* r0 address of A 23 | r1 address of B 24 | r2 address of C 25 | */ 26 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 27 | /* First zero 16 single floating point */ 28 | /* In IEEE 754, all bits cleared means 0 */ 29 | mov r4, r2 30 | mov r5, #16 31 | mov r6, #0 32 | b .L0_loop_init_test 33 | .L0_loop_init : 34 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 35 | .L0_loop_init_test: 36 | subs r5, r5, #1 37 | bne .L0_loop_init 38 | 39 | /* We will use 40 | r4 as i 41 | r5 as j 42 | r6 as k 43 | */ 44 | mov r4, #0 /* r4 ← 0 */ 45 | .L0_loop_i: /* loop header of i */ 46 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 47 | beq .L0_end_loop_i 48 | mov r5, #0 /* r5 ← 0 */ 49 | .L0_loop_j: /* loop header of j */ 50 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 51 | beq .L0_end_loop_j 52 | /* Compute the address of C[i][j] and load it into s0 */ 53 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 54 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 55 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 56 | This is r7 ← j + i * 4. 57 | We multiply i by the row size (4 elements) */ 58 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 59 | This is r7 ← C + 4*(j + i * 4) 60 | We multiply (j + i * 4) by the size of the element. 61 | A single-precision floating point takes 4 bytes. 62 | */ 63 | vldr s0, [r7] /* s0 ← *r7 */ 64 | 65 | mov r6, #0 /* r6 ← 0 */ 66 | .L0_loop_k : /* loop header of k */ 67 | cmp r6, #4 /* if r6 == 4 goto end of the loop k */ 68 | beq .L0_end_loop_k 69 | 70 | /* Compute the address of a[i][k] and load it into s1 */ 71 | /* Address of a[i][k] is a + 4*(4 * i + k) */ 72 | mov r8, r6 /* r8 ← r6. This is r8 ← k */ 73 | adds r8, r8, r4, LSL #2 /* r8 ← r8 + (r4 << 2). This is r8 ← k + i * 4 */ 74 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). 
This is r8 ← a + 4*(k + i * 4) */ 75 | vldr s1, [r8] /* s1 ← *r8 */ 76 | 77 | /* Compute the address of b[k][j] and load it into s2 */ 78 | /* Address of b[k][j] is b + 4*(4 * k + j) */ 79 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 80 | adds r8, r8, r6, LSL #2 /* r8 ← r8 + (r6 << 2). This is r8 ← j + k * 4 */ 81 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + k * 4) */ 82 | vldr s2, [r8] /* s2 ← *r8 */ 83 | 84 | vmul.f32 s3, s1, s2 /* s3 ← s1 * s2 */ 85 | vadd.f32 s0, s0, s3 /* s0 ← s0 + s3 */ 86 | 87 | add r6, r6, #1 /* r6 ← r6 + 1 */ 88 | b .L0_loop_k /* next iteration of loop k */ 89 | .L0_end_loop_k: /* Here ends loop k */ 90 | vstr s0, [r7] /* Store s0 back to C[i][j] */ 91 | add r5, r5, #1 /* r5 ← r5 + 1 */ 92 | b .L0_loop_j /* next iteration of loop j */ 93 | .L0_end_loop_j: /* Here ends loop j */ 94 | add r4, r4, #1 /* r4 ← r4 + 1 */ 95 | b .L0_loop_i /* next iteration of loop i */ 96 | .L0_end_loop_i: /* Here ends loop i */ 97 | 98 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 99 | bx lr /* Leave function */ 100 | 101 | naive_vectorial_matmul_4x4: 102 | /* r0 address of A 103 | r1 address of B 104 | r2 address of C 105 | */ 106 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 107 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 108 | vpush {s24-s27} /* s24-s27 receive the vector product below */ 109 | /* First zero 16 single floating point */ 110 | /* In IEEE 754, all bits cleared means 0 */ 111 | mov r4, r2 /* r4 ← r2. Cursor over C */ 112 | mov r5, #16 /* r5 ← 16. Elements left to clear */ 113 | mov r6, #0 /* r6 ← 0. The bit pattern of 0.0 */ 114 | b .L1_loop_init_test 115 | .L1_loop_init : 116 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 117 | .L1_loop_init_test: 118 | subs r5, r5, #1 /* r5 ← r5 - 1 updating cpsr */ 119 | bpl .L1_loop_init /* if r5 >= 0 (N=0) store another zero. r5 runs from 15 down to 0 so all 16 elements are cleared ('bne' stopped after 15) */ 120 | 121 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 122 | mov r5, #0b011 /* r5 ← 3 */ 123 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 124 | fmrx r4, fpscr /* r4 ← fpscr */ 125 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 126 | fmxr fpscr, r4 /* fpscr ← r4 */ 127 | 128 | /* We will use 129 | r4 as i 
130 | r5 as j 131 | */ 132 | mov r4, #0 /* r4 ← 0 */ 133 | .L1_loop_i: /* loop header of i */ 134 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 135 | beq .L1_end_loop_i 136 | mov r5, #0 /* r5 ← 0 */ 137 | .L1_loop_j: /* loop header of j */ 138 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 139 | beq .L1_end_loop_j 140 | /* Compute the address of C[i][j]. It is only written, at the end of this iteration */ 141 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 142 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 143 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 144 | This is r7 ← j + i * 4. 145 | We multiply i by the row size (4 elements) */ 146 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 147 | This is r7 ← C + 4*(j + i * 4) 148 | We multiply (j + i * 4) by the size of the element. 149 | A single-precision floating point takes 4 bytes. 150 | */ 151 | /* Compute the address of a[i][0] */ 152 | mov r8, r4, LSL #2 /* r8 ← r4 << 2. This is r8 ← i * 4 */ 153 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(i * 4) */ 154 | vldmia r8, {s8-s11} /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */ 155 | 156 | /* Compute the address of b[0][j] */ 157 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 158 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */ 159 | vldr s16, [r8] /* s16 ← *r8. This is s16 ← b[0][j] */ 160 | vldr s17, [r8, #16] /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */ 161 | vldr s18, [r8, #32] /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */ 162 | vldr s19, [r8, #48] /* s19 ← *(r8 + 48). 
This is s19 ← b[3][j] */ 163 | 164 | vmul.f32 s24, s8, s16 /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */ 165 | vmov.f32 s0, s24 /* s0 ← s24 */ 166 | vadd.f32 s0, s0, s25 /* s0 ← s0 + s25 */ 167 | vadd.f32 s0, s0, s26 /* s0 ← s0 + s26 */ 168 | vadd.f32 s0, s0, s27 /* s0 ← s0 + s27 */ 169 | 170 | vstr s0, [r7] /* Store s0 into C[i][j] */ 171 | add r5, r5, #1 /* r5 ← r5 + 1 */ 172 | b .L1_loop_j /* next iteration of loop j */ 173 | .L1_end_loop_j: /* Here ends loop j */ 174 | add r4, r4, #1 /* r4 ← r4 + 1 */ 175 | b .L1_loop_i /* next iteration of loop i */ 176 | .L1_end_loop_i: /* Here ends loop i */ 177 | 178 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 179 | mov r5, #0b011 /* r5 ← 3 */ 180 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 181 | fmrx r4, fpscr /* r4 ← fpscr */ 182 | and r4, r4, r5 /* r4 ← r4 & r5 */ 183 | fmxr fpscr, r4 /* fpscr ← r4 */ 184 | 185 | vpop {s24-s27} /* Restore preserved floating registers */ 186 | vpop {s16-s19} 187 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 188 | bx lr /* Leave function */ 189 | 190 | naive_vectorial_matmul_2_4x4: 191 | /* r0 address of A 192 | r1 address of B 193 | r2 address of C 194 | */ 195 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 196 | vpush {s16-s31} /* Floating point registers starting from s16 must be preserved */ 197 | /* First zero 16 single floating point */ 198 | /* In IEEE 754, all bits cleared means 0 */ 199 | mov r4, r2 /* r4 ← r2. Cursor over C */ 200 | mov r5, #16 /* r5 ← 16. Elements left to clear */ 201 | mov r6, #0 /* r6 ← 0. The bit pattern of 0.0 */ 202 | b .L2_loop_init_test 203 | .L2_loop_init : 204 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 205 | .L2_loop_init_test: 206 | subs r5, r5, #1 /* r5 ← r5 - 1 updating cpsr */ 207 | bpl .L2_loop_init /* if r5 >= 0 (N=0) store another zero. r5 runs from 15 down to 0 so all 16 elements are cleared ('bne' stopped after 15) */ 208 | 209 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 210 | mov r5, #0b011 /* r5 ← 3 */ 211 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 212 | fmrx r4, fpscr /* r4 ← fpscr */ 213 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 214 | fmxr fpscr, r4 /* fpscr ← r4 */ 215 | 216 | /* We will use 217 | r4 as i 218 | r5 as j 219 | 
*/ 220 | mov r4, #0 /* r4 ← 0 */ 221 | .L2_loop_i: /* loop header of i */ 222 | cmp r4, #4 /* if r4 == 4 goto end of the loop i */ 223 | beq .L2_end_loop_i 224 | mov r5, #0 /* r5 ← 0 */ 225 | .L2_loop_j: /* loop header of j */ 226 | cmp r5, #4 /* if r5 == 4 goto end of the loop j */ 227 | beq .L2_end_loop_j 228 | /* Compute the address of C[i][j]. It is only written, at the end of this iteration */ 229 | /* Address of C[i][j] is C + 4*(4 * i + j) */ 230 | mov r7, r5 /* r7 ← r5. This is r7 ← j */ 231 | adds r7, r7, r4, LSL #2 /* r7 ← r7 + (r4 << 2). 232 | This is r7 ← j + i * 4. 233 | We multiply i by the row size (4 elements) */ 234 | adds r7, r2, r7, LSL #2 /* r7 ← r2 + (r7 << 2). 235 | This is r7 ← C + 4*(j + i * 4) 236 | We multiply (j + i * 4) by the size of the element. 237 | A single-precision floating point takes 4 bytes. 238 | */ 239 | /* Compute the address of a[i][0] */ 240 | mov r8, r4, LSL #2 /* r8 ← r4 << 2. This is r8 ← i * 4 */ 241 | adds r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(i * 4) */ 242 | vldmia r8, {s8-s11} /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */ 243 | 244 | /* Compute the address of b[0][j] */ 245 | mov r8, r5 /* r8 ← r5. This is r8 ← j */ 246 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */ 247 | vldr s16, [r8] /* s16 ← *r8. This is s16 ← b[0][j] */ 248 | vldr s17, [r8, #16] /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */ 249 | vldr s18, [r8, #32] /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */ 250 | vldr s19, [r8, #48] /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */ 251 | 252 | /* Compute the address of b[0][j+1] */ 253 | add r8, r5, #1 /* r8 ← r5 + 1. This is r8 ← j + 1 */ 254 | adds r8, r1, r8, LSL #2 /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + 1) */ 255 | vldr s20, [r8] /* s20 ← *r8. This is s20 ← b[0][j + 1] */ 256 | vldr s21, [r8, #16] /* s21 ← *(r8 + 16). This is s21 ← b[1][j + 1] */ 257 | vldr s22, [r8, #32] /* s22 ← *(r8 + 32). This is s22 ← b[2][j + 1] */ 258 | vldr s23, [r8, #48] /* s23 ← *(r8 + 48). 
This is s23 ← b[3][j + 1] */ 259 | 260 | vmul.f32 s24, s8, s16 /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */ 261 | vmov.f32 s0, s24 /* s0 ← s24 */ 262 | vadd.f32 s0, s0, s25 /* s0 ← s0 + s25 */ 263 | vadd.f32 s0, s0, s26 /* s0 ← s0 + s26 */ 264 | vadd.f32 s0, s0, s27 /* s0 ← s0 + s27 */ 265 | 266 | vmul.f32 s28, s8, s20 /* {s28,s29,s30,s31} ← {s8,s9,s10,s11} * {s20,s21,s22,s23} */ 267 | 268 | vmov.f32 s1, s28 /* s1 ← s28 */ 269 | vadd.f32 s1, s1, s29 /* s1 ← s1 + s29 */ 270 | vadd.f32 s1, s1, s30 /* s1 ← s1 + s30 */ 271 | vadd.f32 s1, s1, s31 /* s1 ← s1 + s31 */ 272 | 273 | vstmia r7, {s0-s1} /* {C[i][j], C[i][j+1]} ← {s0, s1} */ 274 | 275 | add r5, r5, #2 /* r5 ← r5 + 2 */ 276 | b .L2_loop_j /* next iteration of loop j */ 277 | .L2_end_loop_j: /* Here ends loop j */ 278 | add r4, r4, #1 /* r4 ← r4 + 1 */ 279 | b .L2_loop_i /* next iteration of loop i */ 280 | .L2_end_loop_i: /* Here ends loop i */ 281 | 282 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 283 | mov r5, #0b011 /* r5 ← 3 */ 284 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 285 | fmrx r4, fpscr /* r4 ← fpscr */ 286 | and r4, r4, r5 /* r4 ← r4 & r5 */ 287 | fmxr fpscr, r4 /* fpscr ← r4 */ 288 | 289 | vpop {s16-s31} /* Restore preserved floating registers */ 290 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 291 | bx lr /* Leave function */ 292 | 293 | better_vectorial_matmul_4x4: 294 | /* r0 address of A 295 | r1 address of B 296 | r2 address of C 297 | */ 298 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 299 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 300 | vpush {s24-s27} /* s24-s27 receive the vector product below */ 301 | /* First zero 16 single floating point */ 302 | /* In IEEE 754, all bits cleared means 0 */ 303 | mov r4, r2 /* r4 ← r2. Cursor over C */ 304 | mov r5, #16 /* r5 ← 16. Elements left to clear */ 305 | mov r6, #0 /* r6 ← 0. The bit pattern of 0.0 */ 306 | b .L3_loop_init_test 307 | .L3_loop_init : 308 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 309 | .L3_loop_init_test: 310 | subs r5, r5, #1 /* r5 ← r5 - 1 updating cpsr */ 311 | bpl .L3_loop_init /* if r5 >= 0 (N=0) store another zero. r5 runs from 15 down to 0 so all 16 elements are cleared. With 'bne' C[3][3] kept its old value and this function accumulates into C */ 312 | 313 | /* Set 
the LEN field of FPSCR to be 4 (value 3) */ 314 | mov r5, #0b011 /* r5 ← 3 */ 315 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 316 | fmrx r4, fpscr /* r4 ← fpscr */ 317 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 318 | fmxr fpscr, r4 /* fpscr ← r4 */ 319 | 320 | /* We will use 321 | r4 as k 322 | r5 as i 323 | */ 324 | mov r4, #0 /* r4 ← 0 */ 325 | .L3_loop_k: /* loop header of k */ 326 | cmp r4, #4 /* if r4 == 4 goto end of the loop k */ 327 | beq .L3_end_loop_k 328 | mov r5, #0 /* r5 ← 0 */ 329 | .L3_loop_i: /* loop header of i */ 330 | cmp r5, #4 /* if r5 == 4 goto end of the loop i */ 331 | beq .L3_end_loop_i 332 | /* Compute the address of C[i][0] */ 333 | /* Address of C[i][0] is C + 4*(4 * i) */ 334 | add r7, r2, r5, LSL #4 /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */ 335 | vldmia r7, {s8-s11} /* Load {s8,s9,s10,s11} ← {c[i][0], c[i][1], c[i][2], c[i][3]} */ 336 | /* Compute the address of A[i][k] */ 337 | /* Address of A[i][k] is A + 4*(4*i + k) */ 338 | add r8, r4, r5, LSL #2 /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */ 339 | add r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */ 340 | vldr s0, [r8] /* Load s0 ← a[i][k] */ 341 | 342 | /* Compute the address of B[k][0] */ 343 | /* Address of B[k][0] is B + 4*(4*k) */ 344 | add r8, r1, r4, LSL #4 /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */ 345 | vldmia r8, {s16-s19} /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */ 346 | 347 | vmul.f32 s24, s16, s0 /* {s24,s25,s26,s27} ← {s16,s17,s18,s19} * {s0,s0,s0,s0} */ 348 | vadd.f32 s8, s8, s24 /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + {s24,s25,s26,s27} */ 349 | 350 | vstmia r7, {s8-s11} /* Store {c[i][0],c[i][1],c[i][2],c[i][3]} ← {s8,s9,s10,s11} */ 351 | 352 | add r5, r5, #1 /* r5 ← r5 + 1. This is i = i + 1 */ 353 | b .L3_loop_i /* next iteration of loop i */ 354 | .L3_end_loop_i: /* Here ends loop i */ 355 | add r4, r4, #1 /* r4 ← r4 + 1. 
This is k = k + 1 */ 356 | b .L3_loop_k /* next iteration of loop k */ 357 | .L3_end_loop_k: /* Here ends loop k */ 358 | 359 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 360 | mov r5, #0b011 /* r5 ← 3 */ 361 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 362 | fmrx r4, fpscr /* r4 ← fpscr */ 363 | and r4, r4, r5 /* r4 ← r4 & r5 */ 364 | fmxr fpscr, r4 /* fpscr ← r4 */ 365 | 366 | vpop {s24-s27} /* Restore preserved floating registers */ 367 | vpop {s16-s19} 368 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 369 | bx lr /* Leave function */ 370 | 371 | best_vectorial_matmul_4x4: 372 | /* r0 address of A 373 | r1 address of B 374 | r2 address of C 375 | */ 376 | push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */ 377 | vpush {s16-s19} /* Floating point registers starting from s16 must be preserved */ 378 | 379 | /* First zero 16 single floating point */ 380 | /* In IEEE 754, all bits cleared means 0 */ 381 | mov r4, r2 /* r4 ← r2. Cursor over C */ 382 | mov r5, #16 /* r5 ← 16. Elements left to clear */ 383 | mov r6, #0 /* r6 ← 0. The bit pattern of 0.0 */ 384 | b .L4_loop_init_test 385 | .L4_loop_init : 386 | str r6, [r4], +#4 /* *r4 ← r6 then r4 ← r4 + 4 */ 387 | .L4_loop_init_test: 388 | subs r5, r5, #1 /* r5 ← r5 - 1 updating cpsr */ 389 | bpl .L4_loop_init /* if r5 >= 0 (N=0) store another zero. r5 runs from 15 down to 0 so all 16 elements are cleared. With 'bne' C[3][3] kept its old value and this function accumulates into C */ 390 | 391 | /* Set the LEN field of FPSCR to be 4 (value 3) */ 392 | mov r5, #0b011 /* r5 ← 3 */ 393 | mov r5, r5, LSL #16 /* r5 ← r5 << 16 */ 394 | fmrx r4, fpscr /* r4 ← fpscr */ 395 | orr r4, r4, r5 /* r4 ← r4 | r5 */ 396 | fmxr fpscr, r4 /* fpscr ← r4 */ 397 | 398 | /* We will use 399 | r4 as k 400 | r5 as i 401 | */ 402 | mov r4, #0 /* r4 ← 0 */ 403 | .L4_loop_k: /* loop header of k */ 404 | cmp r4, #4 /* if r4 == 4 goto end of the loop k */ 405 | beq .L4_end_loop_k 406 | mov r5, #0 /* r5 ← 0 */ 407 | .L4_loop_i: /* loop header of i */ 408 | cmp r5, #4 /* if r5 == 4 goto end of the loop i */ 409 | beq .L4_end_loop_i 410 | /* Compute the address of C[i][0] */ 411 | /* Address of C[i][0] is C + 4*(4 * i) */ 412 | add r7, r2, r5, LSL #4 /* r7 ← r2 + (r5 << 4). 
This is r7 ← c + 4*4*i */ 413 | vldmia r7, {s8-s15} /* Load {s8,s9,s10,s11,s12,s13,s14,s15} 414 | ← {c[i][0], c[i][1], c[i][2], c[i][3], 415 | c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} */ 416 | /* Compute the address of A[i][k] */ 417 | /* Address of A[i][k] is A + 4*(4*i + k) */ 418 | add r8, r4, r5, LSL #2 /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */ 419 | add r8, r0, r8, LSL #2 /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */ 420 | vldr s0, [r8] /* Load s0 ← a[i][k] */ 421 | vldr s1, [r8, #16] /* Load s1 ← a[i+1][k]. The next row is 16 bytes ahead */ 422 | 423 | /* Compute the address of B[k][0] */ 424 | /* Address of B[k][0] is B + 4*(4*k) */ 425 | add r8, r1, r4, LSL #4 /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */ 426 | vldmia r8, {s16-s19} /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */ 427 | 428 | vmla.f32 s8, s16, s0 /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + ({s16,s17,s18,s19} * {s0,s0,s0,s0}) */ 429 | vmla.f32 s12, s16, s1 /* {s12,s13,s14,s15} ← {s12,s13,s14,s15} + ({s16,s17,s18,s19} * {s1,s1,s1,s1}) */ 430 | 431 | vstmia r7, {s8-s15} /* Store {c[i][0], c[i][1], c[i][2], c[i][3], 432 | c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} 433 | ← {s8,s9,s10,s11,s12,s13,s14,s15} */ 434 | 435 | add r5, r5, #2 /* r5 ← r5 + 2. This is i = i + 2 */ 436 | b .L4_loop_i /* next iteration of loop i */ 437 | .L4_end_loop_i: /* Here ends loop i */ 438 | add r4, r4, #1 /* r4 ← r4 + 1. 
This is k = k + 1 */ 439 | b .L4_loop_k /* next iteration of loop k */ 440 | .L4_end_loop_k: /* Here ends loop k */ 441 | 442 | /* Set the LEN field of FPSCR back to 1 (value 0) */ 443 | mov r5, #0b011 /* r5 ← 3 */ 444 | mvn r5, r5, LSL #16 /* r5 ← ~(r5 << 16) */ 445 | fmrx r4, fpscr /* r4 ← fpscr */ 446 | and r4, r4, r5 /* r4 ← r4 & r5 */ 447 | fmxr fpscr, r4 /* fpscr ← r4 */ 448 | 449 | vpop {s16-s19} /* Restore preserved floating registers */ 450 | pop {r4, r5, r6, r7, r8, lr} /* Restore integer registers */ 451 | bx lr /* Leave function */ 452 | 453 | .globl main 454 | main: 455 | push {r4, r5, r6, lr} /* Keep integer registers */ 456 | 457 | /* Prepare call to best_vectorial_matmul_4x4 */ 458 | ldr r0, addr_mat_A /* r0 ← a */ 459 | ldr r1, addr_mat_B /* r1 ← b */ 460 | ldr r2, addr_mat_C /* r2 ← c */ 461 | bl best_vectorial_matmul_4x4 462 | 463 | /* Now print the result matrix */ 464 | ldr r4, addr_mat_C /* r4 ← c */ 465 | 466 | vldr s0, [r4] /* s0 ← *r4. This is s0 ← c[0][0] */ 467 | vcvt.f64.f32 d1, s0 /* Convert it into a double-precision 468 | d1 ← s0 469 | */ 470 | vmov r2, r3, d1 /* {r2,r3} ← d1 */ 471 | 472 | mov r6, sp /* Remember the stack pointer, we need it to restore it back later */ 473 | /* r6 ← sp */ 474 | 475 | mov r5, #1 /* We will iterate from 1 to 15 (because the 0th item has already been handled) */ 476 | add r4, r4, #60 /* Go to the last item of the matrix c, this is c[3][3] */ 477 | .Lloop: 478 | vldr s0, [r4] /* s0 ← *r4. 
Load the current item */ 479 | vcvt.f64.f32 d1, s0 /* Convert it into a double-precision 480 | d1 ← s0 481 | */ 482 | sub sp, sp, #8 /* Make room in the stack for the double-precision */ 483 | vstr d1, [sp] /* Store the double precision in the top of the stack */ 484 | sub r4, r4, #4 /* Move to the previous element in the matrix */ 485 | add r5, r5, #1 /* One more item has been handled */ 486 | cmp r5, #16 /* if r5 != 16 go to next iteration of the loop */ 487 | bne .Lloop 488 | 489 | ldr r0, addr_format_result /* r0 ← &format_result */ 490 | bl printf /* call printf */ 491 | mov sp, r6 /* Restore the stack after the call */ 492 | 493 | mov r0, #0 /* r0 ← 0. Return value of main */ 494 | pop {r4, r5, r6, lr} /* Restore integer registers */ 495 | bx lr /* Leave main */ 496 | 497 | addr_mat_A : .word mat_A 498 | addr_mat_B : .word mat_B 499 | addr_mat_C : .word mat_C 500 | addr_format_result : .word format_result 501 | -------------------------------------------------------------------------------- /chapter15/Makefile: -------------------------------------------------------------------------------- 1 | EXES=divideby14 division benchmark 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter15/benchmark.s: -------------------------------------------------------------------------------- 1 | /* benchmark.s */ 2 | 3 | .data 4 | 5 | .text 6 | 7 | .globl main 8 | 9 | unsigned_naive_longdiv: 10 | /* r0 contains N */ 11 | /* r1 contains D */ 12 | mov r2, r1 /* r2 ← r1. We keep D in r2 */ 13 | mov r1, r0 /* r1 ← r0. We keep N in r1 */ 14 | 15 | mov r0, #0 /* r0 ← 0. Set Q = 0 initially */ 16 | 17 | b .Lloop_check0 18 | .Lloop0: 19 | add r0, r0, #1 /* r0 ← r0 + 1. 
Q = Q + 1 */ 20 | sub r1, r1, r2 /* r1 ← r1 - r2 */ 21 | .Lloop_check0: 22 | cmp r1, r2 /* compute r1 - r2 and update cpsr */ 23 | bhs .Lloop0 /* branch if r1 >= r2 (C=1) */ 24 | 25 | /* r0 already contains Q */ 26 | /* r1 already contains R */ 27 | bx lr 28 | 29 | unsigned_longdiv: 30 | /* r0 contains N */ 31 | /* r1 contains D */ 32 | /* r2 contains Q */ 33 | /* r3 contains R */ 34 | push {r4, lr} 35 | mov r2, #0 /* r2 ← 0 */ 36 | mov r3, #0 /* r3 ← 0 */ 37 | 38 | mov r4, #32 /* r4 ← 32 */ 39 | b .Lloop_check1 40 | .Lloop1: 41 | movs r0, r0, LSL #1 /* r0 ← r0 << 1 updating cpsr (sets C if 31st bit of r0 was 1) */ 42 | adc r3, r3, r3 /* r3 ← r3 + r3 + C. This is equivalent to r3 ← (r3 << 1) + C */ 43 | 44 | cmp r3, r1 /* compute r3 - r1 and update cpsr */ 45 | subhs r3, r3, r1 /* if r3 >= r1 (C=1) then r3 ← r3 - r1 */ 46 | adc r2, r2, r2 /* r2 ← r2 + r2 + C. This is equivalent to r2 ← (r2 << 1) + C */ 47 | .Lloop_check1: 48 | subs r4, r4, #1 /* r4 ← r4 - 1 */ 49 | bpl .Lloop1 /* if r4 >= 0 (N=0) then branch to .Lloop1 */ 50 | 51 | mov r0, r2 /* r0 ← r2. Return Q in r0 */ 52 | 53 | pop {r4, lr} 54 | bx lr 55 | 56 | better_unsigned_division : 57 | /* r0 contains N */ 58 | /* r1 contains D */ 59 | /* r2 contains Q */ 60 | /* r3 tmp */ 61 | 62 | mov r3, r1 /* r3 ← r1 */ 63 | cmp r3, r0, LSR #1 /* update cpsr with r3 - (r0 >> 1). This is r3 - r0/2 */ 64 | .Lloop2: 65 | movls r3, r3, LSL #1 /* if r3 <= r0/2 (C=0 or Z=1) then r3 ← 2*r3 */ 66 | cmp r3, r0, LSR #1 /* update cpsr with r3 - (r0 >> 1). This is r3 - r0/2 */ 67 | bls .Lloop2 /* branch to .Lloop2 if r3 <= r0/2 (C=0 or Z=1) */ 68 | 69 | mov r2, #0 /* r2 ← 0 */ 70 | 71 | .Lloop3: 72 | cmp r0, r3 /* update cpsr with r0 - r3 */ 73 | subhs r0, r0, r3 /* if r0 >= r3 then r0 ← r0 - r3 */ 74 | adc r2, r2, r2 /* r2 ← r2 + r2 + C (if r0 >= r3 then C = 1 else C = 0) */ 75 | 76 | mov r3, r3, LSR #1 /* r3 ← r3 >> 1 */ 77 | cmp r3, r1 /* update cpsr with r3 - r1 */ 78 | bhs .Lloop3 /* if r3 >= r1 branch to .Lloop3 */ 79 | 80 | mov r0, r2 /* r0 ← r2. Return Q in r0 */ 81 | 82 | bx lr 83 | 84 | vfpv2_division: 85 | /* r0 
contains N */ 86 | /* r1 contains D */ 87 | vmov s0, r0 /* s0 ← r0 (bit copy) */ 88 | vmov s1, r1 /* s1 ← r1 (bit copy) */ 89 | vcvt.f32.s32 s0, s0 /* s0 ← (float)s0 */ 90 | vcvt.f32.s32 s1, s1 /* s1 ← (float)s1 */ 91 | vdiv.f32 s0, s0, s1 /* s0 ← s0 / s1 */ 92 | vcvt.s32.f32 s0, s0 /* s0 ← (int)s0 */ 93 | vmov r0, s0 /* r0 ← s0 (bit copy). This is Q */ 94 | bx lr 95 | 96 | 97 | clz_unsigned_division: 98 | /* This algorithm does not work if N == D */ 99 | /* cmp r0, r1 Compare r0 and r1 */ 100 | /* moveq r0, #1 If they are equal set the result to 1 */ 101 | /* bxeq lr If they are equal leave the function */ 102 | 103 | clz r3, r0 /* Count leading zeroes of N */ 104 | clz r2, r1 /* Count leading zeroes of D */ 105 | sub r3, r2, r3 /* r3 ← r2 - r3. 106 | This is the difference of zeroes 107 | between N and D 108 | Note: D should be smaller than N 109 | so this subtraction is ok */ 110 | add r3, r3, #1 /* r3 ← r3 + 1. The loop check subtracts 1 before each iteration */ 111 | 112 | mov r2, #0 /* r2 ← 0. Set Q = 0 initially */ 113 | b .Lloop_check4 114 | .Lloop4: 115 | cmp r0, r1, lsl r3 /* update cpsr with r0 - (r1 << r3) */ 116 | adc r2, r2, r2 /* r2 ← r2 + r2 + C. This is equivalent to r2 ← (r2 << 1) + C */ 117 | subcs r0, r0, r1, lsl r3 /* if C=1 (still set by the cmp) then r0 ← r0 - (r1 << r3) */ 118 | .Lloop_check4: 119 | subs r3, r3, #1 /* r3 ← r3 - 1 */ 120 | bpl .Lloop4 /* if r3 >= 0 (N=0) then branch to .Lloop4 */ 121 | 122 | mov r0, r2 /* r0 ← r2. Return Q in r0 */ 123 | 124 | bx lr 125 | 126 | .set MAX, 16384 127 | main: 128 | push {r4, r5, r6, lr} 129 | 130 | mov r4, #1 /* r4 ← 1 */ 131 | 132 | b .Lcheck_loop_i /* branch to .Lcheck_loop_i */ 133 | .Lloop_i: 134 | mov r5, r4 /* r5 ← r4 */ 135 | b .Lcheck_loop_j /* branch to .Lcheck_loop_j */ 136 | .Lloop_j: 137 | 138 | mov r0, r5 /* r0 ← r5. This is N */ 139 | mov r1, r4 /* r1 ← r4. 
This is D */ 140 | 141 | bl better_unsigned_division 142 | 143 | /* mov r3, r0 144 | mov r2, r4 145 | mov r1, r5 146 | ldr r0, addr_of_message 147 | bl printf */ 148 | 149 | 150 | add r5, r5, #1 /* r5 ← r5 + 1 */ 151 | .Lcheck_loop_j: 152 | cmp r5, #MAX /* compare r5 and MAX */ 153 | bne .Lloop_j /* if r5 != MAX branch to .Lloop_j */ 154 | add r4, r4, #1 /* r4 ← r4 + 1 */ 155 | .Lcheck_loop_i: 156 | cmp r4, #MAX /* compare r4 and MAX */ 157 | bne .Lloop_i /* if r4 != MAX branch to .Lloop_i */ 158 | 159 | mov r0, #0 /* r0 ← 0. Return value of main */ 160 | 161 | pop {r4, r5, r6, lr} 162 | bx lr 163 | 164 | message: .asciz "%u / %u = %u\n" 165 | addr_of_message: .word message 166 | -------------------------------------------------------------------------------- /chapter15/divideby14.s: -------------------------------------------------------------------------------- 1 | /* -- divideby14.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | read_number: .word 0 7 | 8 | .align 4 9 | message1 : .asciz "Enter an integer to divide it by 14: " 10 | 11 | .align 4 12 | message2 : .asciz "Number %d (signed-)divided by 14 is %d\n" 13 | 14 | .align 4 15 | scan_format : .asciz "%d" 16 | 17 | .text 18 | 19 | /* This function has been generated using "magic.py 14 code_for_signed" */ 20 | s_divide_by_14: 21 | /* r0 contains the argument to be divided by 14 */ 22 | ldr r1, .Ls_magic_number_14 /* r1 ← magic_number */ 23 | smull r1, r2, r1, r0 /* r1 ← Lower32Bits(r1*r0). 
r2 ← Upper32Bits(r1*r0) */ 24 | add r2, r2, r0 /* r2 ← r2 + r0 */ 25 | mov r2, r2, ASR #3 /* r2 ← r2 >> 3 (arithmetic shift) */ 26 | mov r1, r0, LSR #31 /* r1 ← r0 >> 31 (logical shift). This is the sign bit of r0 */ 27 | add r0, r2, r1 /* r0 ← r2 + r1 */ 28 | bx lr /* leave function */ 29 | .align 4 30 | .Ls_magic_number_14: .word 0x92492493 31 | 32 | .globl main 33 | 34 | main: 35 | /* Call printf */ 36 | push {r4, lr} 37 | ldr r0, addr_of_message1 /* r0 ← &message1 */ 38 | bl printf 39 | 40 | /* Call scanf */ 41 | ldr r0, addr_of_scan_format /* r0 ← &scan_format */ 42 | ldr r1, addr_of_read_number /* r1 ← &read_number */ 43 | bl scanf 44 | 45 | ldr r0, addr_of_read_number /* r0 ← &read_number */ 46 | ldr r0, [r0] /* r0 ← *r0 */ 47 | 48 | bl s_divide_by_14 49 | mov r2, r0 /* r2 ← r0. This is the result of s_divide_by_14 */ 50 | 51 | ldr r1, addr_of_read_number /* r1 ← &read_number */ 52 | ldr r1, [r1] /* r1 ← *r1 */ 53 | 54 | ldr r0, addr_of_message2 /* r0 ← &message2 */ 55 | bl printf /* Call printf, r1 and r2 already 56 | contain the desired values */ 57 | 58 | pop {r4, lr} 59 | mov r0, #0 /* r0 ← 0. Return value of main */ 60 | bx lr 61 | 62 | addr_of_message1: .word message1 63 | addr_of_scan_format: .word scan_format 64 | addr_of_message2: .word message2 65 | addr_of_read_number: .word read_number 66 | -------------------------------------------------------------------------------- /chapter15/division.s: -------------------------------------------------------------------------------- 1 | /* division.s */ 2 | 3 | .data 4 | 5 | .text 6 | 7 | .globl main 8 | 9 | unsigned_naive_longdiv: 10 | /* r0 contains N */ 11 | /* r1 contains D */ 12 | mov r2, r1 /* r2 ← r1. We keep D in r2 */ 13 | mov r1, r0 /* r1 ← r0. We keep N in r1 */ 14 | 15 | mov r0, #0 /* r0 ← 0. Set Q = 0 initially */ 16 | 17 | b .Lloop_check0 18 | .Lloop0: 19 | add r0, r0, #1 /* r0 ← r0 + 1. 
Q = Q + 1 */ 20 | sub r1, r1, r2 /* r1 ← r1 - r2 */ 21 | .Lloop_check0: 22 | cmp r1, r2 /* compute r1 - r2 and update cpsr */ 23 | bhs .Lloop0 /* branch if r1 >= r2 (C=1) */ 24 | 25 | /* r0 already contains Q */ 26 | /* r1 already contains R */ 27 | bx lr 28 | 29 | unsigned_longdiv: 30 | /* r0 contains N */ 31 | /* r1 contains D */ 32 | /* r2 contains Q */ 33 | /* r3 contains R */ 34 | push {r4, lr} 35 | mov r2, #0 /* r2 ← 0 */ 36 | mov r3, #0 /* r3 ← 0 */ 37 | 38 | mov r4, #32 /* r4 ← 32 */ 39 | b .Lloop_check1 40 | .Lloop1: 41 | movs r0, r0, LSL #1 /* r0 ← r0 << 1 updating cpsr (sets C if 31st bit of r0 was 1) */ 42 | adc r3, r3, r3 /* r3 ← r3 + r3 + C. This is equivalent to r3 ← (r3 << 1) + C */ 43 | 44 | cmp r3, r1 /* compute r3 - r1 and update cpsr */ 45 | subhs r3, r3, r1 /* if r3 >= r1 (C=1) then r3 ← r3 - r1 */ 46 | adc r2, r2, r2 /* r2 ← r2 + r2 + C. This is equivalent to r2 ← (r2 << 1) + C */ 47 | .Lloop_check1: 48 | subs r4, r4, #1 /* r4 ← r4 - 1 */ 49 | bpl .Lloop1 /* if r4 >= 0 (N=0) then branch to .Lloop1 */ 50 | 51 | pop {r4, lr} 52 | bx lr 53 | 54 | better_unsigned_division : 55 | /* r0 contains N */ 56 | /* r1 contains D */ 57 | /* r2 contains Q */ 58 | /* r3 tmp */ 59 | 60 | mov r3, r1 /* r3 ← r1 */ 61 | cmp r3, r0, LSR #1 /* update cpsr with r3 - (r0 >> 1). This is r3 - r0/2 */ 62 | .Lloop2: 63 | movls r3, r3, LSL #1 /* if r3 <= r0/2 (C=0 or Z=1) then r3 ← 2*r3 */ 64 | cmp r3, r0, LSR #1 /* update cpsr with r3 - (r0 >> 1). This is r3 - r0/2 */ 65 | bls .Lloop2 /* branch to .Lloop2 if r3 <= r0/2 (C=0 or Z=1) */ 66 | 67 | mov r2, #0 /* r2 ← 0 */ 68 | 69 | .Lloop3: 70 | cmp r0, r3 /* update cpsr with r0 - r3 */ 71 | subhs r0, r0, r3 /* if r0 >= r3 then r0 ← r0 - r3 */ 72 | adc r2, r2, r2 /* r2 ← r2 + r2 + C (if r0 >= r3 then C = 1 else C = 0) */ 73 | 74 | mov r3, r3, LSR #1 /* r3 ← r3 >> 1 */ 75 | cmp r3, r1 /* update cpsr with r3 - r1 */ 76 | bhs .Lloop3 /* if r3 >= r1 branch to .Lloop3 */ 77 | 78 | bx lr 79 | 80 | vfpv2_division: 81 | /* r0 contains N */ 82 | /* r1 contains D */ 83 | 
vmov s0, r0 /* s0 ← r0 (bit copy) */ 84 | vmov s1, r1 /* s1 ← r1 (bit copy) */ 85 | vcvt.f32.s32 s0, s0 /* s0 ← (float)s0 */ 86 | vcvt.f32.s32 s1, s1 /* s1 ← (float)s1 */ 87 | vdiv.f32 s0, s0, s1 /* s0 ← s0 / s1 */ 88 | vcvt.s32.f32 s0, s0 /* s0 ← (int)s0 */ 89 | vmov r0, s0 /* r0 ← s0 (bit copy). This is Q */ 90 | bx lr 91 | 92 | 93 | main: 94 | bx lr /* Leave main right away, none of the functions above is called */ 95 | -------------------------------------------------------------------------------- /chapter15/magic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf-8 3 | 4 | # Implemented very naively following the equations in Hacker's Delight 5 | 6 | # We assume 32-bit 7 | w = 32 8 | # Make sure you use Python 2.5+ because we may enter in the domain of bignums 9 | # (Python long) during the computations 10 | 11 | # We mimic a C99-style %-operator (remainder) 12 | # Python returns the sign of the divisor 13 | # while C99 uses the sign of the dividend 14 | def rem(x, y): 15 | t = x % y 16 | if (t == 0): 17 | return t 18 | # For nonzero results we may have to adjust the result 19 | # When both signs agree Python already matches C99: 2 % 3 = 2 20 | # -2 % -3 = -2 21 | if (x > 0) != (y > 0): 22 | t = t - y 23 | return t 24 | 25 | 26 | def magic_unsigned(d): 27 | p = w 28 | n_c = 2**w - rem(2**w, d) - 1 29 | while not (2**p > (n_c * (d - 1 - rem(2**p - 1, d)))): 30 | p = p + 1 31 | m = (2**p + d - 1 - rem(2**p - 1, d)) / d 32 | # Adjust the result to w bits 33 | magic = m & ~(~0 << w) 34 | add_flag = (m != magic) 35 | shift = p - w 36 | return (magic, shift, add_flag) 37 | 38 | def magic_signed_positive(d): 39 | p = w 40 | n_c = 2**(w-1) - rem(2**(w-1), d) - 1 41 | while not (2**p > (n_c*(d-rem(2**p, d)))): 42 | p = p + 1 43 | m = (2**p + d - rem(2**p, d)) / d 44 | # Adjust the result to w bits 45 | magic = m & ~(~0 << w) 46 | shift = p - w 47 | return (magic, shift) 48 | 49 | def magic_signed_negative(d): 50 | p = w 51 | n_c = -(2**(w-1)) + rem(2**(w-1) + 1, d) 52 | while not (2**p > (n_c*(d+rem(2**p, 
d)))): 53 | p = p + 1 54 | m = (2**p - d - rem(2**p, d)) / d 55 | # Adjust the result to w bits 56 | magic = m & ~(~0 << w) 57 | shift = p - w 58 | return (magic, shift) 59 | 60 | import sys 61 | import string 62 | 63 | operations = ["just_tell", "code_for_signed", "code_for_unsigned"] 64 | 65 | def usage_message(): 66 | print "usage: {0} divisor [{1}]".format(sys.argv[0], string.join(operations, "|")) 67 | sys.exit(1) 68 | 69 | if len(sys.argv) < 2: 70 | usage_message() 71 | 72 | # The divisor 73 | try: 74 | d = int(sys.argv[1]) 75 | except: 76 | usage_message() 77 | 78 | if (d == 0): 79 | print "dividend cannot be zero" 80 | usage_message() 81 | 82 | if len(sys.argv) >= 3: 83 | operation = sys.argv[2] 84 | else: 85 | operation = "just_tell" 86 | 87 | if operation not in operations: 88 | usage_message() 89 | 90 | if operation == "just_tell": 91 | if d > 0: 92 | (magic_signed, shift_signed) = magic_signed_positive(d) 93 | (magic_unsigned, shift_unsigned, add_flag) = magic_unsigned(d) 94 | print "Magic number for signed division by {0} is {1} (0x{1:X}) with shift {2}".format(d, magic_signed, shift_signed) 95 | print "Magic number for unsigned division by {0} is {1} (0x{1:X}) with shift {2}{3}".format(d, magic_unsigned, shift_unsigned, " and we need an extra addition" if add_flag else "") 96 | elif d < 0: 97 | (magic_signed, shift_signed) = magic_signed_negative(d) 98 | print "Magic number for signed division by {0} is {1} (0x{1:X}) with shift {2}".format(d, magic_signed, shift_signed) 99 | else: 100 | print "Can't divide by 0" 101 | elif operation == "code_for_signed": 102 | if (d > 0): 103 | (magic_signed, shift_signed) = magic_signed_positive(d) 104 | else: 105 | (magic_signed, shift_signed) = magic_signed_negative(d) 106 | 107 | tab = " " 108 | dividend_name = "{0}".format(d) if d > 0 else "minus_{0}".format(-d) 109 | magic_number_name = ".Ls_magic_number_{0}".format(dividend_name) 110 | function_name = "s_divide_by_{0}".format(dividend_name) 111 | code = 
"{0}:\n".format(function_name) 112 | code += tab + "/* r0 contains the argument to be divided by {0} */\n".format(d) 113 | code += tab + "ldr r1, {0} /* r1 ← magic_number */\n".format(magic_number_name) 114 | code += tab + "smull r1, r2, r1, r0 /* r1 ← Lower32Bits(r1*r0). r2 ← Upper32Bits(r1*r0) */\n" 115 | magic_number_is_negative = (magic_signed & (1 << (w-1))) 116 | if d > 0 and magic_number_is_negative: 117 | code += tab + "add r2, r2, r0 /* r2 ← r2 + r0 */\n" 118 | elif d < 0 and not magic_number_is_negative: 119 | code += tab + "sub r2, r2, r0 /* r2 ← r2 - r0 */\n" 120 | if shift_signed > 0: 121 | code += tab + "mov r2, r2, ASR #{0} /* r2 ← r2 >> {0} */\n".format(shift_signed) 122 | code += tab + "mov r1, r0, LSR #{0} /* r1 ← r0 >> {0} */\n".format(w-1) 123 | code += tab + "add r0, r2, r1 /* r0 ← r2 + r1 */\n" 124 | code += tab + "bx lr /* leave function */\n" 125 | code += tab + ".align 4\n" 126 | code += tab + "{0}: .word 0x{1:x}\n".format(magic_number_name, magic_signed) 127 | 128 | print code 129 | elif operation == "code_for_unsigned": 130 | if d < 0: 131 | print "You requested code for unsigned but the divisor is negative!" 132 | sys.exit(1) 133 | (magic_unsigned, shift_unsigned, add_flag) = magic_unsigned(d) 134 | tab = " " 135 | dividend_name = "{0}".format(d) 136 | magic_number_name = ".Lu_magic_number_{0}".format(dividend_name) 137 | function_name = "u_divide_by_{0}".format(dividend_name) 138 | code = "{0}:\n".format(function_name) 139 | code += tab + "/* r0 contains the argument to be divided by {0} */\n".format(d) 140 | code += tab + "ldr r1, {0} /* r1 ← magic_number */\n".format(magic_number_name) 141 | code += tab + "umull r1, r2, r1, r0 /* r1 ← Lower32Bits(r1*r0). 
r2 ← Upper32Bits(r1*r0) */\n" 142 | if add_flag: 143 | code += tab + "adds r2, r2, r0 /* r2 ← r2 + r0 updating cpsr */\n" 144 | code += tab + "mov r2, r2, ROR #0 /* r2 ← (carry_flag << 31) | (r2 >> 1) */\n".format(shift_unsigned) # NOTE(review): GNU as appears to encode "ROR #0" as "LSL #0" (no shift) rather than the RRX the emitted comment describes - verify; switching to "RRX" would also require revisiting the LSR amount emitted on the next line 145 | code += tab + "mov r0, r2, LSR #{0} /* r0 ← r2 >> {0} */\n".format(shift_unsigned) 146 | elif shift_unsigned > 0: 147 | code += tab + "mov r0, r2, LSR #{0} /* r0 ← r2 >> {0} */\n".format(shift_unsigned) 148 | code += tab + "bx lr /* leave function */\n" 149 | code += tab + ".align 4\n" 150 | code += tab + "{0}: .word 0x{1:x}\n".format(magic_number_name, magic_unsigned) 151 | 152 | print code 153 | else: 154 | print "Operation {} not implemented".format(operation) 155 | -------------------------------------------------------------------------------- /chapter16/Makefile: -------------------------------------------------------------------------------- 1 | EXES=jumptable calcjump ifstring binsearch hybrid 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter16/binsearch.s: -------------------------------------------------------------------------------- 1 | /* binsearch.s: a switch over r0 in 1..10 implemented as a hand-unrolled binary search of compare-and-branch steps */ 2 | .data 3 | 4 | .text 5 | 6 | .globl main 7 | 8 | main: 9 | 10 | cmp r0, #1 /* r0 - 1 and update cpsr */ 11 | blt case_default /* if r0 < 1 then branch to case_default */ 12 | cmp r0, #10 /* r0 - 10 and update cpsr */ 13 | bgt case_default /* if r0 > 10 then branch to case_default */ 14 | 15 | case_1_to_10: 16 | cmp r0, #5 /* r0 - 5 and update cpsr */ 17 | beq case_5 /* if r0 == 5 branch to case_5 */ 18 | blt case_1_to_4 /* if r0 < 5 branch to case_1_to_4 */ 19 | bgt case_6_to_10 /* if r0 > 5 branch to case_6_to_10 */ 20 | 21 | case_1_to_4: 22 | cmp r0, #2 /* r0 - 2 and update cpsr */ 23 | beq case_2 /* 
if r0 == 2 branch to case_2 */ 24 | blt case_1 /* if r0 < 2 branch to case_1 25 | (case_1_to_1 does not make sense) */ 26 | bgt case_3_to_4 /* if r0 > 2 branch to case_3_to_4 */ 27 | 28 | case_3_to_4: 29 | cmp r0, #3 /* r0 - 3 and update cpsr */ 30 | beq case_3 /* if r0 == 3 branch to case_3 */ 31 | b case_4 /* otherwise it must be r0 == 4, 32 | branch to case_4 */ 33 | 34 | case_6_to_10: 35 | cmp r0, #8 /* r0 - 8 and update cpsr */ 36 | beq case_8 /* if r0 == 8 branch to case_8 */ 37 | blt case_6_to_7 /* if r0 < 8 then branch to case_6_to_7 */ 38 | bgt case_9_to_10 /* if r0 > 8 then branch to case_9_to_10 */ 39 | 40 | case_6_to_7: 41 | cmp r0, #6 /* r0 - 6 and update cpsr */ 42 | beq case_6 /* if r0 == 6 branch to case_6 */ 43 | b case_7 /* otherwise it must be r0 == 7, 44 | branch to case 7 */ 45 | 46 | case_9_to_10: 47 | cmp r0, #9 /* r0 - 9 and update cpsr */ 48 | beq case_9 49 | b case_10 50 | 51 | case_1: 52 | mov r0, #1 53 | b after_switch 54 | case_2: 55 | mov r0, #2 56 | b after_switch 57 | case_3: 58 | mov r0, #3 59 | b after_switch 60 | case_4: 61 | mov r0, #4 62 | b after_switch 63 | case_5: 64 | mov r0, #5 65 | b after_switch 66 | case_6: 67 | mov r0, #6 68 | b after_switch 69 | case_7: 70 | mov r0, #7 71 | b after_switch 72 | case_8: 73 | mov r0, #8 74 | b after_switch 75 | case_9: 76 | mov r0, #9 77 | b after_switch 78 | case_10: 79 | mov r0, #10 80 | b after_switch 81 | 82 | case_default: 83 | mov r0, #42 /* r0 ← 42 */ 84 | b after_switch /* break (unnecessary) */ 85 | 86 | after_switch: 87 | 88 | bx lr /* Return from main */ 89 | -------------------------------------------------------------------------------- /chapter16/calcjump.s: -------------------------------------------------------------------------------- 1 | /* calcjump.s */ 2 | .data 3 | 4 | .text 5 | 6 | .globl main 7 | 8 | main: 9 | cmp r0, #1 /* r0 - 1 and update cpsr */ 10 | blt case_default /* branch to case_default if r0 < 1 */ 11 | cmp r0, #3 /* r0 - 3 and update cpsr */ 12 | bgt 
case_default /* branch to case_default if r0 > 3 */ 13 | 14 | sub r0, r0, #1 /* r0 ← r0 - 1. Required to index the table */ 15 | ldr r1, addr_of_case_1 /* r1 ← &case_1 */ 16 | add r1, r1, r0, LSL #3 /* r1 ← r1 + r0 * 8 17 | Each instruction is 4 bytes 18 | Each case takes 2 instructions 19 | Thus, each case is 8 bytes (4 * 2) 20 | */ 21 | 22 | mov pc, r1 /* pc ← r1 23 | This will cause a branch to the 24 | computed address */ 25 | 26 | case_1: 27 | mov r0, #1 /* r0 ← 1 */ 28 | b after_switch /* break */ 29 | 30 | case_2: 31 | mov r0, #2 /* r0 ← 2 */ 32 | b after_switch /* break */ 33 | 34 | case_3: 35 | mov r0, #3 /* r0 ← 3 */ 36 | b after_switch /* break */ 37 | 38 | case_default: 39 | mov r0, #42 /* r0 ← 42 */ 40 | b after_switch /* break (unnecessary) */ 41 | 42 | after_switch: 43 | 44 | bx lr /* Return from main */ 45 | 46 | .align 4 47 | addr_of_case_1: .word case_1 48 | -------------------------------------------------------------------------------- /chapter16/hybrid.s: -------------------------------------------------------------------------------- 1 | /* hybrid.s */ 2 | .data 3 | 4 | .text 5 | 6 | .globl main 7 | 8 | main: 9 | push {r4, r5, r6, lr} 10 | 11 | cmp r0, #1 /* r0 - 1 and update cpsr */ 12 | blt case_default /* if r0 < 1 then branch to case_default */ 13 | cmp r0, #300 /* r0 - 300 and update cpsr */ 14 | bgt case_default /* if r0 > 300 then branch to case default */ 15 | 16 | /* prepare the binary search. 17 | r1 will hold the lower index 18 | r2 will hold the upper index 19 | r3 the base address of the case_value_table 20 | */ 21 | mov r1, #0 22 | mov r2, #9 23 | ldr r3, addr_case_value_table /* r3 ← &case_value_table */ 24 | 25 | b check_binary_search 26 | binary_search: 27 | add r4, r1, r2 /* r4 ← r1 + r2 */ 28 | mov r4, r4, ASR #1 /* r4 ← r4 / 2 */ 29 | ldr r5, [r3, +r4, LSL #2] /* r5 ← *(r3 + r4 * 4). 
30 | This is r5 ← case_value_table[r4] */ 31 | cmp r0, r5 /* r0 - r5 and update cpsr */ 32 | sublt r2, r4, #1 /* if r0 < r5 then r2 ← r4 - 1 */ 33 | addgt r1, r4, #1 /* if r0 > r5 then r1 ← r4 + 1 */ 34 | bne check_binary_search /* if r0 != r5 branch to check_binary_search 35 | (sublt/addgt do not update cpsr, so the flags of the cmp above are still valid) */ 36 | /* if we reach here it means that r0 == r5 */ 37 | ldr r5, addr_case_addresses_table /* r5 ← &case_addresses_table */ 38 | ldr r5, [r5, +r4, LSL #2] /* r5 ← *(r5 + r4*4) 39 | This is r5 ← case_addresses_table[r4] */ 40 | mov pc, r5 /* branch to the proper case */ 41 | 42 | check_binary_search: 43 | cmp r1, r2 /* r1 - r2 and update cpsr */ 44 | ble binary_search /* if r1 <= r2 branch to binary_search */ 45 | 46 | /* if we reach here it means the case value 47 | was not found. branch to default case */ 48 | b case_default 49 | 50 | case_1: 51 | mov r0, #1 52 | b after_switch 53 | case_2: 54 | mov r0, #2 55 | b after_switch 56 | case_3: 57 | mov r0, #3 58 | b after_switch 59 | case_24: 60 | mov r0, #24 61 | b after_switch 62 | case_25: 63 | mov r0, #25 /* each case returns its own case value */ 64 | b after_switch 65 | case_26: 66 | mov r0, #26 67 | b after_switch 68 | case_97: 69 | mov r0, #97 70 | b after_switch 71 | case_98: 72 | mov r0, #98 73 | b after_switch 74 | case_99: 75 | mov r0, #99 76 | b after_switch 77 | case_300: 78 | mov r0, #300 /* The error code will be 44 */ 79 | b after_switch 80 | 81 | case_default: 82 | mov r0, #42 /* r0 ← 42 */ 83 | b after_switch /* break (unnecessary) */ 84 | 85 | after_switch: 86 | 87 | pop {r4,r5,r6,lr} 88 | bx lr /* Return from main */ 89 | 90 | case_value_table: .word 1, 2, 3, 24, 25, 26, 97, 98, 99, 300 91 | addr_case_value_table: .word case_value_table 92 | 93 | case_addresses_table: 94 | .word case_1 95 | .word case_2 96 | .word case_3 97 | .word case_24 98 | .word case_25 99 | .word case_26 100 | .word case_97 101 | .word case_98 102 | .word case_99 103 | .word case_300 104 | addr_case_addresses_table: .word case_addresses_table 105 | 
-------------------------------------------------------------------------------- /chapter16/ifstring.s: -------------------------------------------------------------------------------- 1 | /* ifstring.s */ 2 | .data 3 | 4 | .text 5 | 6 | .globl main 7 | 8 | main: 9 | cmp r0, #1 /* r0 - 1 and update cpsr */ 10 | beq case_1 /* if r0 == 1 branch to case_1 */ 11 | cmp r0, #2 /* r0 - 2 and update cpsr */ 12 | beq case_2 /* if r0 == 2 branch to case_2 */ 13 | cmp r0, #3 /* r0 - 3 and update cpsr */ 14 | beq case_3 /* if r0 == 3 branch to case_3 */ 15 | b case_default /* branch to case_default */ 16 | 17 | case_1: 18 | mov r0, #1 /* r0 ← 1 */ 19 | b after_switch /* break */ 20 | 21 | case_2: 22 | mov r0, #2 /* r0 ← 2 */ 23 | b after_switch /* break */ 24 | 25 | case_3: 26 | mov r0, #3 /* r0 ← 3 */ 27 | b after_switch /* break */ 28 | 29 | case_default: 30 | mov r0, #42 /* r0 ← 42 */ 31 | b after_switch /* break (unnecessary) */ 32 | 33 | after_switch: 34 | 35 | bx lr /* Return from main */ 36 | -------------------------------------------------------------------------------- /chapter16/jumptable.s: -------------------------------------------------------------------------------- 1 | /* jumptable.s */ 2 | .data 3 | 4 | .text 5 | 6 | .globl main 7 | 8 | main: 9 | cmp r0, #1 /* r0 - 1 and update cpsr */ 10 | blt case_default /* branch to case_default if r0 < 1 */ 11 | cmp r0, #3 /* r0 - 3 and update cpsr */ 12 | bgt case_default /* branch to case_default if r0 > 3 */ 13 | 14 | sub r0, r0, #1 /* r0 ← r0 - 1. Required to index the table */ 15 | ldr r1, addr_of_jump_table /* r1 ← &jump_table */ 16 | ldr r1, [r1, +r0, LSL #2] /* r1 ← *(r1 + r0*4). 
17 | This is r1 ← jump_table[r0] */ 18 | 19 | mov pc, r1 /* pc ← r1 20 | This will cause a branch to the 21 | computed address */ 22 | 23 | case_1: 24 | mov r0, #1 /* r0 ← 1 */ 25 | b after_switch /* break */ 26 | 27 | case_2: 28 | mov r0, #2 /* r0 ← 2 */ 29 | b after_switch /* break */ 30 | 31 | case_3: 32 | mov r0, #3 /* r0 ← 3 */ 33 | b after_switch /* break */ 34 | 35 | case_default: 36 | mov r0, #42 /* r0 ← 42 */ 37 | b after_switch /* break (unnecessary) */ 38 | 39 | after_switch: 40 | 41 | bx lr /* Return from main */ 42 | 43 | .align 4 44 | jump_table: 45 | .word case_1 46 | .word case_2 47 | .word case_3 48 | 49 | .align 4 50 | addr_of_jump_table: .word jump_table 51 | -------------------------------------------------------------------------------- /chapter17/Makefile: -------------------------------------------------------------------------------- 1 | EXES=first_pointer wrong_pointer good_pointer array_by_value array_by_ref double_array 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter17/array_by_ref.s: -------------------------------------------------------------------------------- 1 | /* array_by_ref.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | 7 | big_array : 8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61 11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81 12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116 14 | 
.word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132 15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148 16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164 17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180 18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196 19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212 20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228 21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244 22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 23 | 24 | .align 4 25 | 26 | message: .asciz "The sum of 0 to 255 is %d\n" 27 | 28 | .text 29 | .globl main 30 | 31 | sum_array_ref : 32 | /* Parameters: 33 | r0 Number of items 34 | r1 Address of the array 35 | Returns in r0 the sum of the r0 words starting at r1 */ 36 | push {r4, r5, r6, lr} 37 | 38 | /* The array is passed by reference: only its address (r1) is received */ 39 | 40 | /* r4 will hold the sum so far, r5 the index of the next item */ 41 | mov r4, #0 /* r4 ← 0 */ 42 | mov r5, #0 /* r5 ← 0 */ 43 | 44 | b .Lcheck_loop_array_sum 45 | .Lloop_array_sum: 46 | ldr r6, [r1, r5, LSL #2] /* r6 ← *(r1 + r5 * 4) */ 47 | add r4, r4, r6 /* r4 ← r4 + r6 */ 48 | add r5, r5, #1 /* r5 ← r5 + 1 */ 49 | .Lcheck_loop_array_sum: 50 | cmp r5, r0 /* r5 - r0 and update cpsr */ 51 | bne .Lloop_array_sum /* if r5 != r0 go to .Lloop_array_sum */ 52 | 53 | mov r0, r4 /* r0 ← r4, to return the value of the sum */ 54 | pop {r4, r5, r6, lr} 55 | 56 | bx lr 57 | 58 | 59 | main: 60 | push {r4, lr} 61 | /* we will not use r4 but we need to keep the stack 8-byte aligned */ 62 | 63 | mov r0, #256 64 | ldr r1, address_of_big_array 65 | 66 | bl sum_array_ref 67 | 68 | /* prepare the call to printf */ 69 | mov r1, r0 /* second parameter, the sum itself */ 70 | ldr r0, address_of_message /* first parameter, the message */ 71 | bl 
printf 72 | 73 | pop {r4, lr} 74 | bx lr 75 | 76 | address_of_big_array : .word big_array 77 | address_of_message : .word message 78 | -------------------------------------------------------------------------------- /chapter17/array_by_value.s: -------------------------------------------------------------------------------- 1 | /* array_by_value.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | 7 | big_array : 8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61 11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81 12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116 14 | .word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132 15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148 16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164 17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180 18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196 19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212 20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228 21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244 22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 23 | 24 | .align 4 25 | 26 | message: .asciz "The sum of 0 to 255 is %d\n" 27 | 28 | .text 29 | .globl main 30 | 31 | sum_array_value : 32 | push {r4, r5, r6, lr} 33 | 34 | /* We have passed all the data by value */ 35 | 36 | /* r4 will hold the sum so far */ 37 | mov r4, #0 
/* r4 ← 0 */ 38 | /* In r0 we have the number of items of the array */ 39 | 40 | cmp r0, #1 /* r0 - #1 and update cpsr */ 41 | blt .Lend_of_sum_array /* if r0 < 1 branch to end_of_sum_array */ 42 | add r4, r4, r1 /* add the first item */ 43 | 44 | cmp r0, #2 /* r0 - #2 and update cpsr */ 45 | blt .Lend_of_sum_array /* if r0 < 2 branch to end_of_sum_array */ 46 | add r4, r4, r2 /* add the second item */ 47 | 48 | cmp r0, #3 /* r0 - #3 and update cpsr */ 49 | blt .Lend_of_sum_array /* if r0 < 3 branch to end_of_sum_array */ 50 | add r4, r4, r3 /* add the third item */ 51 | 52 | /* 53 | The stack at this point looks like this 54 | | | (lower addresses) 55 | | | 56 | | lr | <- sp points here 57 | | r6 | <- this is sp + 4 58 | | r5 | <- this is sp + 8 59 | | r4 | <- this is sp + 12 60 | | big_array[3] | <- this is sp + 16 (we want r5 to point here) 61 | | big_array[4] | 62 | | ... | 63 | | big_array[255] | 64 | | | 65 | | | (higher addresses) 66 | 67 | keep in r5 the address where the stack-passed portion of the array starts */ 68 | add r5, sp, #16 /* r5 ← sp + 16 */ 69 | 70 | /* in register r3 we will count how many items we have read 71 | from the stack. 
*/ 72 | mov r3, #0 73 | 74 | /* in the stack there will always be 3 fewer items because 75 | the first 3 were already passed in registers 76 | (recall that r0 had how many items were in the array) */ 77 | sub r0, r0, #3 78 | 79 | b .Lcheck_loop_sum_array 80 | .Lloop_sum_array: 81 | ldr r6, [r5, r3, LSL #2] /* r6 ← *(r5 + r3 * 4) load 82 | the array item r3 from the stack */ 83 | add r4, r4, r6 /* r4 ← r4 + r6 84 | accumulate in r4 */ 85 | add r3, r3, #1 /* r3 ← r3 + 1 86 | move to the next item */ 87 | .Lcheck_loop_sum_array: 88 | cmp r3, r0 /* r3 - r0 and update cpsr */ 89 | blt .Lloop_sum_array /* if r3 < r0 branch to loop_sum_array */ 90 | 91 | .Lend_of_sum_array: 92 | mov r0, r4 /* r0 ← r4, to return the value of the sum */ 93 | pop {r4, r5, r6, lr} 94 | 95 | bx lr 96 | 97 | 98 | main: 99 | push {r4, r5, r6, r7, r8, lr} 100 | /* we will not use r8 but we need to keep the stack 8-byte aligned */ 101 | 102 | ldr r4, address_of_big_array 103 | 104 | /* Prepare call */ 105 | 106 | mov r0, #256 /* Load in the first parameter the number of items 107 | r0 ← 256 108 | */ 109 | 110 | ldr r1, [r4] /* load in the second parameter the first item of the array */ 111 | ldr r2, [r4, #4] /* load in the third parameter the second item of the array */ 112 | ldr r3, [r4, #8] /* load in the fourth parameter the third item of the array */ 113 | 114 | /* before pushing anything in the stack keep its position */ 115 | mov r7, sp 116 | 117 | /* We cannot use more registers, now we have to push them onto the stack 118 | (in reverse order) */ 119 | mov r5, #255 /* r5 ← 255 120 | This is the last item position 121 | (note that the first would be in position 0) */ 122 | 123 | 124 | b .Lcheck_pass_parameter_loop 125 | .Lpass_parameter_loop: 126 | 127 | ldr r6, [r4, r5, LSL #2] /* r6 ← *(r4 + r5 * 4). 128 | loads the item in position r5 into r6. 
Note that 129 | we have to multiply by 4 because this is the size 130 | of each item in the array */ 131 | push {r6} /* push the loaded value to the stack */ 132 | sub r5, r5, #1 /* we are done with the current item, 133 | go to the previous index of the array */ 134 | .Lcheck_pass_parameter_loop: 135 | cmp r5, #2 /* compute r5 - #2 and update cpsr */ 136 | bne .Lpass_parameter_loop /* if r5 != #2 branch to pass_parameter_loop */ 137 | 138 | /* We are done, we have passed all the values of the array, 139 | now call the function */ 140 | bl sum_array_value 141 | 142 | /* restore the stack position */ 143 | mov sp, r7 144 | 145 | /* prepare the call to printf */ 146 | mov r1, r0 /* second parameter, the sum itself */ 147 | ldr r0, address_of_message /* first parameter, the message */ 148 | bl printf 149 | 150 | pop {r4, r5, r6, r7, r8, lr} 151 | bx lr 152 | 153 | address_of_big_array : .word big_array 154 | address_of_message : .word message 155 | -------------------------------------------------------------------------------- /chapter17/double_array.s: -------------------------------------------------------------------------------- 1 | /* double_array.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | 7 | big_array : 8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41 10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61 11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81 12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100 13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116 14 | .word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132 15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148 16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 
159, 160, 161, 162, 163, 164 17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180 18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196 19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212 20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228 21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244 22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 23 | 24 | .align 4 25 | 26 | message: .asciz "Item at position %d has value %d\n" 27 | 28 | .text 29 | .globl main 30 | 31 | double_array : 32 | /* Parameters: 33 | r0 Number of items 34 | r1 Address of the array 35 | */ 36 | push {r4, r5, r6, lr} 37 | 38 | mov r4, #0 /* r4 ← 0 */ 39 | 40 | b .Lcheck_loop_array_double 41 | .Lloop_array_double: 42 | ldr r5, [r1, r4, LSL #2] /* r5 ← *(r1 + r4 * 4) */ 43 | mov r5, r5, LSL #1 /* r5 ← r5 * 2 */ 44 | str r5, [r1, r4, LSL #2] /* *(r1 + r4 * 4) ← r5 */ 45 | add r4, r4, #1 /* r4 ← r4 + 1 */ 46 | .Lcheck_loop_array_double: 47 | cmp r4, r0 /* r4 - r0 and update cpsr */ 48 | bne .Lloop_array_double /* if r4 != r0 go to .Lloop_array_double */ 49 | 50 | pop {r4, r5, r6, lr} 51 | 52 | bx lr 53 | 54 | print_each_item: 55 | push {r4, r5, r6, r7, r8, lr} /* r8 is unused */ 56 | 57 | mov r4, #0 /* r4 ← 0 */ 58 | mov r6, r0 /* r6 ← r0. Keep r0 because we will overwrite it */ 59 | mov r7, r1 /* r7 ← r1. 
Keep r1 because we will overwrite it */ 60 | 61 | 62 | b .Lcheck_loop_print_items 63 | .Lloop_print_items: 64 | ldr r5, [r7, r4, LSL #2] /* r5 ← *(r7 + r4 * 4) */ 65 | 66 | /* Prepare the call to printf */ 67 | ldr r0, address_of_message /* first parameter of the call to printf below */ 68 | mov r1, r4 /* second parameter: item position */ 69 | mov r2, r5 /* third parameter: item value */ 70 | bl printf /* call printf */ 71 | 72 | add r4, r4, #1 /* r4 ← r4 + 1 */ 73 | .Lcheck_loop_print_items: 74 | cmp r4, r6 /* r4 - r6 and update cpsr */ 75 | bne .Lloop_print_items /* if r4 != r6 goto .Lloop_print_items */ 76 | 77 | pop {r4, r5, r6, r7, r8, lr} 78 | bx lr 79 | 80 | main: 81 | push {r4, lr} 82 | /* we will not use r4 but we need to keep the function 8-byte aligned */ 83 | 84 | /* first call print_each_item */ 85 | mov r0, #256 /* first_parameter: number of items */ 86 | ldr r1, address_of_big_array /* second parameter: address of the array */ 87 | bl print_each_item /* call to print_each_item */ 88 | 89 | /* call to double_array */ 90 | mov r0, #256 /* first_parameter: number of items */ 91 | ldr r1, address_of_big_array /* second parameter: address of the array */ 92 | bl double_array /* call to double_array */ 93 | 94 | /* second call print_each_item */ 95 | mov r0, #256 /* first_parameter: number of items */ 96 | ldr r1, address_of_big_array /* second parameter: address of the array */ 97 | bl print_each_item /* call to print_each_item */ 98 | 99 | pop {r4, lr} 100 | bx lr 101 | 102 | address_of_big_array : .word big_array 103 | address_of_message : .word message 104 | -------------------------------------------------------------------------------- /chapter17/first_pointer.s: -------------------------------------------------------------------------------- 1 | /* first_pointer.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | number_1 : .word 3 7 | 8 | .text 9 | .globl main 10 | 11 | 12 | main: 13 | ldr r0, pointer_to_number 14 | ldr r0, [r0] 15 | 16 | bx lr 17 | 18 | 
pointer_to_number: .word number_1 19 | -------------------------------------------------------------------------------- /chapter17/good_pointer.s: -------------------------------------------------------------------------------- 1 | /* good_pointer.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | number_1 : .word 3 7 | number_2 : .word 4 8 | pointer_to_number: .word 0 9 | 10 | .text 11 | .globl main 12 | 13 | 14 | main: 15 | ldr r0, addr_of_pointer_to_number 16 | /* r0 ← &pointer_to_number */ 17 | 18 | ldr r1, addr_of_number_2 /* r1 ← &number_2 */ 19 | 20 | str r1, [r0] /* *r0 ← r1. 21 | This is actually 22 | pointer_to_number ← &number_2 */ 23 | 24 | ldr r1, [r0] /* r1 ← *r0. 25 | This is actually 26 | r1 ← pointer_to_number 27 | Since pointer_to_number has the value &number_2 28 | then this is like 29 | r1 ← &number_2 30 | */ 31 | 32 | 33 | ldr r0, [r1] /* r0 ← *r1 34 | Since r1 had as value &number_2 35 | then this is like 36 | r0 ← number_2 37 | */ 38 | 39 | bx lr 40 | 41 | addr_of_pointer_to_number: .word pointer_to_number 42 | addr_of_number_1: .word number_1 43 | addr_of_number_2: .word number_2 44 | -------------------------------------------------------------------------------- /chapter17/wrong_pointer.s: -------------------------------------------------------------------------------- 1 | /* wrong_pointer.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | number_1 : .word 3 7 | number_2 : .word 4 8 | 9 | .text 10 | .globl main 11 | 12 | main: 13 | ldr r1, address_of_number_2 /* r1 ← &number_2 */ 14 | str r1, pointer_to_number /* pointer_to_number ← r1, this is pointer_to_number ← &number_2 */ 15 | 16 | bx lr 17 | 18 | pointer_to_number: .word number_1 19 | address_of_number_2: .word number_2 20 | -------------------------------------------------------------------------------- /chapter18/Makefile: -------------------------------------------------------------------------------- 1 | EXES=square 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o 
$@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter18/square: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rofirrim/raspberry-pi-assembler/75685f80a35318777fad9dc33837698c19952e89/chapter18/square -------------------------------------------------------------------------------- /chapter18/square.s: -------------------------------------------------------------------------------- 1 | /* square.s */ 2 | 3 | .data 4 | 5 | .align 4 6 | message: .asciz "Sum of 1^2 + 2^2 + 3^2 + 4^2 + 5^2 is %d\n" 7 | 8 | .text 9 | 10 | /* sq: squares in place the integer whose address is in r0 */ 11 | sq: 12 | ldr r1, [r0] /* r1 ← (*r0) */ 13 | mul r1, r1, r1 /* r1 ← r1 * r1 */ 14 | str r1, [r0] /* (*r0) ← r1 */ 15 | bx lr 16 | 17 | sq_sum5: 18 | push {fp, lr} /* Keep fp and all callee-saved registers. */ 19 | mov fp, sp /* Set the dynamic link */ 20 | 21 | sub sp, sp, #16 /* sp ← sp - 16. 
Allocate space for 4 integers in the stack */ 22 | /* Keep parameters in the stack */ 23 | str r0, [fp, #-16] /* *(fp - 16) ← r0 */ 24 | str r1, [fp, #-12] /* *(fp - 12) ← r1 */ 25 | str r2, [fp, #-8] /* *(fp - 8) ← r2 */ 26 | str r3, [fp, #-4] /* *(fp - 4) ← r3 */ 27 | 28 | /* At this point the stack looks like this 29 | | Value | Address(es) 30 | +--------+----------------------- 31 | | r0 | [fp, #-16], [sp] 32 | | r1 | [fp, #-12], [sp, #4] 33 | | r2 | [fp, #-8], [sp, #8] 34 | | r3 | [fp, #-4], [sp, #12] 35 | | fp | [fp], [sp, #16] 36 | | lr | [fp, #4], [sp, #20] 37 | | e | [fp, #8], [sp, #24] 38 | v 39 | Higher 40 | addresses 41 | */ 42 | 43 | sub r0, fp, #16 /* r0 ← fp - 16 */ 44 | bl sq /* call sq(&a); */ 45 | sub r0, fp, #12 /* r0 ← fp - 12 */ 46 | bl sq /* call sq(&b); */ 47 | sub r0, fp, #8 /* r0 ← fp - 8 */ 48 | bl sq /* call sq(&c); */ 49 | sub r0, fp, #4 /* r0 ← fp - 4 */ 50 | bl sq /* call sq(&d) */ 51 | add r0, fp, #8 /* r0 ← fp + 8 */ 52 | bl sq /* call sq(&e) */ 53 | 54 | ldr r0, [fp, #-16] /* r0 ← *(fp - 16). Loads a into r0 */ 55 | ldr r1, [fp, #-12] /* r1 ← *(fp - 12). Loads b into r1 */ 56 | add r0, r0, r1 /* r0 ← r0 + r1 */ 57 | ldr r1, [fp, #-8] /* r1 ← *(fp - 8). Loads c into r1 */ 58 | add r0, r0, r1 /* r0 ← r0 + r1 */ 59 | ldr r1, [fp, #-4] /* r1 ← *(fp - 4). Loads d into r1 */ 60 | add r0, r0, r1 /* r0 ← r0 + r1 */ 61 | ldr r1, [fp, #8] /* r1 ← *(fp + 8). 
Loads e into r1 */ 62 | add r0, r0, r1 /* r0 ← r0 + r1 */ 63 | 64 | mov sp, fp /* Undo the dynamic link */ 65 | pop {fp, lr} /* Restore fp and callee-saved registers */ 66 | bx lr 67 | 68 | .globl main 69 | 70 | main: 71 | push {r4, lr} /* Keep callee-saved registers */ 72 | 73 | /* Prepare the call to sq_sum5 */ 74 | mov r0, #1 /* Parameter a ← 1 */ 75 | mov r1, #2 /* Parameter b ← 2 */ 76 | mov r2, #3 /* Parameter c ← 3 */ 77 | mov r3, #4 /* Parameter d ← 4 */ 78 | 79 | /* Parameter e goes through the stack, 80 | so it requires enlarging the stack */ 81 | mov r4, #5 /* r4 ← 5 */ 82 | sub sp, sp, #8 /* Enlarge the stack 8 bytes, 83 | we will use only the 84 | topmost 4 bytes */ 85 | str r4, [sp] /* Parameter e ← 5 */ 86 | bl sq_sum5 /* call sq_sum5(1, 2, 3, 4, 5) */ 87 | add sp, sp, #8 /* Shrink back the stack */ 88 | 89 | /* Prepare the call to printf */ 90 | mov r1, r0 /* The result of sq_sum5 */ 91 | ldr r0, address_of_message 92 | bl printf /* Call printf */ 93 | 94 | pop {r4, lr} /* Restore callee-saved registers */ 95 | bx lr 96 | 97 | 98 | address_of_message: .word message 99 | -------------------------------------------------------------------------------- /chapter19/Makefile: -------------------------------------------------------------------------------- 1 | EXES=write_c write_sys 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter19/write_c.s: -------------------------------------------------------------------------------- 1 | /* write_c.s */ 2 | 3 | .data 4 | 5 | 6 | greeting: .asciz "Hello world\n" 7 | after_greeting: 8 | 9 | .set size_of_greeting, after_greeting - greeting 10 | 11 | .text 12 | 13 | .globl main 14 | 15 | main: 16 | push {r4, lr} 17 | mov r0, #1 18 | ldr r1, 
addr_of_greeting 19 | mov r2, #size_of_greeting 20 | bl write 21 | 22 | mov r0, #0 23 | 24 | pop {r4, lr} 25 | bx lr 26 | 27 | addr_of_greeting : .word greeting 28 | -------------------------------------------------------------------------------- /chapter19/write_sys.s: -------------------------------------------------------------------------------- 1 | /* write_sys.s: prints the greeting by invoking the Linux 'write' system call directly */ 2 | 3 | .data 4 | 5 | 6 | greeting: .asciz "Hello world\n" 7 | after_greeting: 8 | 9 | .set size_of_greeting, after_greeting - greeting 10 | 11 | .text 12 | 13 | .globl main 14 | 15 | main: 16 | push {r7, lr} /* r7 is callee-saved and is overwritten below with the system call number, so it must be preserved; pushing two registers keeps the stack 8-byte aligned */ 17 | 18 | /* Prepare the system call */ 19 | mov r0, #1 /* r0 ← 1, the file descriptor (standard output) */ 20 | ldr r1, addr_of_greeting /* r1 ← &greeting */ 21 | mov r2, #size_of_greeting /* r2 ← sizeof(greeting); this count includes the NUL terminator added by .asciz */ 22 | 23 | mov r7, #4 /* select system call 'write' */ 24 | swi #0 /* perform the system call */ 25 | 26 | mov r0, #0 /* return 0 from main */ 27 | pop {r7, lr} /* restore r7 and lr */ 28 | bx lr 29 | 30 | addr_of_greeting : .word greeting 31 | -------------------------------------------------------------------------------- /chapter20/Makefile: -------------------------------------------------------------------------------- 1 | EXES=direct indirect greeter_01 greeter_02 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter20/direct.s: -------------------------------------------------------------------------------- 1 | .data /* data section */ 2 | .align 4 /* on ARM the operand of .align is a power of two: this pads to a 16-byte boundary, so the next label is (at least) 4-byte aligned */ 3 | message: .asciz "Hello world\n" 4 | 5 | .text /* text section (= code) */ 6 | 7 | .align 4 /* pads to a 16-byte boundary, so the next label is (at least) 4-byte aligned */ 8 | say_hello: 9 | push {r4, lr} /* keep lr because we call printf, 10 | we keep r4 to keep the stack 8-byte 11 | aligned, as per AAPCS requirements */ 12 | /* Prepare 
the call to printf */ 13 | ldr r0, addr_of_message /* r0 ← &message */ 14 | bl printf /* call printf */ 15 | pop {r4, lr} /* restore r4 and lr */ 16 | bx lr /* return to the caller */ 17 | 18 | .align 4 /* ensure the next label is 4-byte aligned */ 19 | addr_of_message: .word message 20 | 21 | .globl main /* state that 'main' label is global */ 22 | .align 4 /* ensure the next label is 4-byte aligned */ 23 | main: 24 | push {r4, lr} /* keep lr because we call say_hello, 25 | we keep r4 to keep the stack 8-byte 26 | aligned, as per AAPCS requirements */ 27 | bl say_hello /* call say_hello, directly, using the label */ 28 | 29 | mov r0, #0 /* return from the program, set error code */ 30 | pop {r4, lr} /* restore r4 and lr */ 31 | bx lr /* return to the caller (the system) */ 32 | 33 | -------------------------------------------------------------------------------- /chapter20/greeter_01.s: -------------------------------------------------------------------------------- 1 | .data /* data section */ 2 | .align 4 /* ensure the next label is 4-byte aligned */ 3 | message_1: .asciz "Hello\n" 4 | .align 4 /* ensure the next label is 4-byte aligned */ 5 | message_2: .asciz "Bonjour\n" 6 | 7 | .text /* text section (= code) */ 8 | 9 | .align 4 /* ensure the next label is 4-byte aligned */ 10 | say_hello: 11 | push {r4, lr} /* keep lr because we call printf, 12 | we keep r4 to keep the stack 8-byte 13 | aligned, as per AAPCS requirements */ 14 | /* Prepare the call to printf */ 15 | ldr r0, addr_of_message_1 /* r0 ← &message_1 */ 16 | bl printf /* call printf */ 17 | pop {r4, lr} /* restore r4 and lr */ 18 | bx lr /* return to the caller */ 19 | 20 | .align 4 /* ensure the next label is 4-byte aligned */ 21 | addr_of_message_1: .word message_1 22 | 23 | .align 4 /* ensure the next label is 4-byte aligned */ 24 | say_bonjour: 25 | push {r4, lr} /* keep lr because we call printf, 26 | we keep r4 to keep the stack 8-byte 27 | aligned, as per AAPCS requirements */ 28 | /* Prepare the
call to printf */ 29 | ldr r0, addr_of_message_2 /* r0 ← &message_2 */ 30 | bl printf /* call printf */ 31 | pop {r4, lr} /* restore r4 and lr */ 32 | bx lr /* return to the caller */ 33 | 34 | .align 4 /* ensure the next label is 4-byte aligned */ 35 | addr_of_message_2: .word message_2 36 | 37 | .align 4 38 | greeter: 39 | push {r4, lr} /* keep lr because we make an indirect call, 40 | we keep r4 to keep the stack 8-byte 41 | aligned, as per AAPCS requirements */ 42 | blx r0 /* indirect call to r0, the function address received as first parameter */ 43 | pop {r4, lr} /* restore r4 and lr */ 44 | bx lr /* return to the caller */ 45 | 46 | .globl main /* state that 'main' label is global */ 47 | .align 4 /* ensure the next label is 4-byte aligned */ 48 | main: 49 | push {r4, lr} /* keep lr because we call greeter, 50 | we keep r4 to keep the stack 8-byte 51 | aligned, as per AAPCS requirements */ 52 | 53 | ldr r0, addr_say_hello /* r0 ← &say_hello */ 54 | bl greeter /* call greeter */ 55 | 56 | ldr r0, addr_say_bonjour /* r0 ← &say_bonjour */ 57 | bl greeter /* call greeter */ 58 | 59 | mov r0, #0 /* return from the program, set error code */ 60 | pop {r4, lr} /* restore r4 and lr */ 61 | bx lr /* return to the caller (the system) */ 62 | 63 | addr_say_hello : .word say_hello 64 | addr_say_bonjour : .word say_bonjour 65 | -------------------------------------------------------------------------------- /chapter20/greeter_02.s: -------------------------------------------------------------------------------- 1 | .data /* data section */ 2 | 3 | .align 4 /* ensure the next label is 4-byte aligned */ 4 | message_hello: .asciz "Hello %s\n" 5 | .align 4 /* ensure the next label is 4-byte aligned */ 6 | message_bonjour: .asciz "Bonjour %s\n" 7 | 8 | /* tags of kind of people */ 9 | .align 4 /* ensure the next label is 4-byte aligned */ 10 | person_english : .word say_hello /* tag for people 11 | that will be greeted 12 | in English */ 13 | .align 4 /* ensure the next label is 4-byte aligned */ 14 | person_french : .word say_bonjour /*
tag for people 15 | that will be greeted 16 | in French */ 17 | 18 | /* several names to be used in the people definition */ 19 | .align 4 20 | name_pierre: .asciz "Pierre" 21 | .align 4 22 | name_john: .asciz "John" 23 | .align 4 24 | name_sally: .asciz "Sally" 25 | .align 4 26 | name_bernadette: .asciz "Bernadette" 27 | 28 | /* some people: each one is two words, the address of a name followed by the address of a tag */ 29 | .align 4 30 | person_john: .word name_john, person_english 31 | .align 4 32 | person_pierre: .word name_pierre, person_french 33 | .align 4 34 | person_sally: .word name_sally, person_english 35 | .align 4 36 | person_bernadette: .word name_bernadette, person_french 37 | 38 | /* array of people: four words, each the address of a person */ 39 | people : .word person_john, person_pierre, person_sally, person_bernadette 40 | 41 | .text /* text section (= code) */ 42 | 43 | .align 4 /* ensure the next label is 4-byte aligned */ 44 | say_hello: 45 | push {r4, lr} /* keep lr because we call printf, 46 | we keep r4 to keep the stack 8-byte 47 | aligned, as per AAPCS requirements */ 48 | /* Prepare the call to printf */ 49 | mov r1, r0 /* r1 ← r0, the value received in r0 becomes the second parameter of printf */ 50 | ldr r0, addr_of_message_hello 51 | /* r0 ← &message_hello */ 52 | bl printf /* call printf */ 53 | pop {r4, lr} /* restore r4 and lr */ 54 | bx lr /* return to the caller */ 55 | 56 | .align 4 /* ensure the next label is 4-byte aligned */ 57 | addr_of_message_hello: .word message_hello 58 | 59 | .align 4 /* ensure the next label is 4-byte aligned */ 60 | say_bonjour: 61 | push {r4, lr} /* keep lr because we call printf, 62 | we keep r4 to keep the stack 8-byte 63 | aligned, as per AAPCS requirements */ 64 | /* Prepare the call to printf */ 65 | mov r1, r0 /* r1 ← r0, the value received in r0 becomes the second parameter of printf */ 66 | ldr r0, addr_of_message_bonjour 67 | /* r0 ← &message_bonjour */ 68 | bl printf /* call printf */ 69 | pop {r4, lr} /* restore r4 and lr */ 70 | bx lr /* return to the caller */ 71 | 72 | .align 4 /* ensure the next label is 4-byte aligned */ 73 | addr_of_message_bonjour: .word message_bonjour 74 | 75 | /* This function receives an address to a
person */ 76 | .align 4 77 | greet_person: 78 | push {r4, lr} /* keep lr because we make an indirect call, 79 | we keep r4 because we overwrite it below; 80 | pushing two registers keeps the stack 8-byte aligned, as per AAPCS requirements */ 81 | 82 | /* prepare indirect function call */ 83 | mov r4, r0 /* r4 ← r0, keep the first parameter in r4 */ 84 | ldr r0, [r4] /* r0 ← *r4, this is the address to the name 85 | of the person and the first parameter 86 | of the indirect called function*/ 87 | 88 | ldr r1, [r4, #4] /* r1 ← *(r4 + 4) this is the address 89 | to the person tag */ 90 | ldr r1, [r1] /* r1 ← *r1, the address of the 91 | specific greeting function */ 92 | 93 | blx r1 /* indirect call to r1, this is 94 | the specific greeting function */ 95 | 96 | pop {r4, lr} /* restore r4 and lr */ 97 | bx lr /* return to the caller */ 98 | 99 | .globl main /* state that 'main' label is global */ 100 | .align 4 /* ensure the next label is 4-byte aligned */ 101 | main: 102 | push {r4, r5, r6, lr} /* keep callee saved registers that we will modify */ 103 | 104 | ldr r4, addr_of_people /* r4 ← &people */ 105 | /* recall that people is an array of addresses (pointers) to people */ 106 | 107 | /* now we loop r5 from 0 to 3 (the loop stops when r5 reaches 4) */ 108 | mov r5, #0 /* r5 ← 0 */ 109 | b check_loop /* branch to the loop check */ 110 | 111 | loop: 112 | /* prepare the call to greet_person */ 113 | ldr r0, [r4, r5, LSL #2] /* r0 ← *(r4 + r5 << 2) this is 114 | r0 ← *(r4 + r5 * 4) 115 | recall, people is an array of addresses, 116 | so this is 117 | r0 ← people[r5] 118 | */ 119 | bl greet_person /* call greet_person */ 120 | add r5, r5, #1 /* r5 ← r5 + 1 */ 121 | check_loop: 122 | cmp r5, #4 /* compute r5 - 4 and update cpsr */ 123 | bne loop /* if r5 != 4 branch to loop */ 124 | 125 | mov r0, #0 /* return from the program, set error code */ 126 | pop {r4, r5, r6, lr} /* restore callee saved registers */ 127 | bx lr /* return to the caller (the system) */ 128 | 129 | addr_of_people : .word people 130 |
-------------------------------------------------------------------------------- /chapter20/indirect.s: -------------------------------------------------------------------------------- 1 | .data /* data section */ 2 | .align 4 /* ensure the next label is 4-byte aligned */ 3 | message: .asciz "Hello world\n" 4 | .align 4 /* ensure the next label is 4-byte aligned */ 5 | ptr_of_fun: .word 0 /* we set its initial value zero */ 6 | 7 | .text /* text section (= code) */ 8 | 9 | .align 4 /* ensure the next label is 4-byte aligned */ 10 | say_hello: 11 | push {r4, lr} /* keep lr because we call printf, 12 | we keep r4 to keep the stack 8-byte 13 | aligned, as per AAPCS requirements */ 14 | /* Prepare the call to printf */ 15 | ldr r0, addr_of_message /* r0 ← &message */ 16 | bl printf /* call printf */ 17 | pop {r4, lr} /* restore r4 and lr */ 18 | bx lr /* return to the caller */ 19 | 20 | .align 4 /* ensure the next label is 4-byte aligned */ 21 | addr_of_message: .word message 22 | 23 | .align 4 24 | make_indirect_call: 25 | push {r4, lr} /* keep lr because we make an indirect call, 26 | we keep r4 to keep the stack 8-byte 27 | aligned, as per AAPCS requirements */ 28 | ldr r0, addr_ptr_of_fun /* r0 ← &ptr_of_fun */ 29 | ldr r0, [r0] /* r0 ← *r0, the address currently stored in ptr_of_fun */ 30 | blx r0 /* indirect call to r0 */ 31 | pop {r4, lr} /* restore r4 and lr */ 32 | bx lr /* return to the caller */ 33 | 34 | .globl main /* state that 'main' label is global */ 35 | .align 4 /* ensure the next label is 4-byte aligned */ 36 | main: 37 | push {r4, lr} /* keep lr because we call make_indirect_call, 38 | we keep r4 to keep the stack 8-byte 39 | aligned, as per AAPCS requirements */ 40 | 41 | ldr r1, addr_say_hello /* r1 ← &say_hello */ 42 | ldr r0, addr_ptr_of_fun /* r0 ← &ptr_of_fun */ 43 | str r1, [r0] /* *r0 ← r1 44 | this is 45 | ptr_of_fun ← &say_hello */ 46 | 47 | bl make_indirect_call /* call make_indirect_call */ 48 | 49 | mov r0, #0 /* return from the program, set error code */ 50 | pop {r4, lr} /* restore r4 and lr
*/ 51 | bx lr /* return to the caller (the system) */ 52 | 53 | addr_ptr_of_fun: .word ptr_of_fun 54 | addr_say_hello : .word say_hello 55 | -------------------------------------------------------------------------------- /chapter21/Makefile: -------------------------------------------------------------------------------- 1 | EXES=subword subword_signed reinterpret 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter21/reinterpret.s: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | .align 4 4 | a_word: .word 0x11223344 5 | 6 | .align 4 7 | message_bytes : .asciz "byte #%d is 0x%x\n" 8 | message_halfwords : .asciz "halfword #%d is 0x%x\n" 9 | message_words : .asciz "word #%d is 0x%x\n" 10 | 11 | .text 12 | 13 | .globl main 14 | main: 15 | push {r4, r5, r6, lr} /* keep callee saved registers */ 16 | 17 | ldr r4, addr_a_word /* r4 ← &a_word */ 18 | 19 | mov r5, #0 /* r5 ← 0, index of the byte to print */ 20 | b check_loop_bytes /* branch to check_loop_bytes */ 21 | 22 | loop_bytes: 23 | /* prepare call to printf */ 24 | ldr r0, addr_message_bytes 25 | /* r0 ← &message_bytes 26 | first parameter of printf */ 27 | mov r1, r5 /* r1 ← r5 28 | second parameter of printf */ 29 | ldrb r2, [r4, r5] /* r2 ← *{byte}(r4 + r5) 30 | third parameter of printf */ 31 | bl printf /* call printf */ 32 | add r5, r5, #1 /* r5 ← r5 + 1 */ 33 | check_loop_bytes: 34 | cmp r5, #4 /* compute r5 - 4 and update cpsr (a_word is 4 bytes long) */ 35 | bne loop_bytes /* if r5 != 4 branch to loop_bytes */ 36 | 37 | mov r5, #0 /* r5 ← 0, index of the halfword to print */ 38 | b check_loop_halfwords /* branch to check_loop_halfwords */ 39 | 40 | loop_halfwords: 41 | /* prepare call to printf */ 42 | ldr r0, addr_message_halfwords 43 | /* r0 ← &message_halfwords 44 |
first parameter of printf */ 45 | mov r1, r5 /* r1 ← r5 46 | second parameter of printf */ 47 | mov r6, r5, LSL #1 /* r6 ← r5 * 2 */ 48 | ldrh r2, [r4, r6] /* r2 ← *{half}(r4 + r6) 49 | this is r2 ← *{half}(r4 + r5 * 2) 50 | third parameter of printf */ 51 | bl printf /* call printf */ 52 | add r5, r5, #1 /* r5 ← r5 + 1 */ 53 | check_loop_halfwords: 54 | cmp r5, #2 /* compute r5 - 2 and update cpsr */ 55 | bne loop_halfwords /* if r5 != 2 branch to loop_halfwords */ 56 | 57 | /* prepare call to printf */ 58 | ldr r0, addr_message_words /* r0 ← &message_words 59 | first parameter of printf */ 60 | mov r1, #0 /* r1 ← 0 61 | second parameter of printf */ 62 | ldr r2, [r4] /* r2 ← *r4 63 | third parameter of printf */ 64 | bl printf /* call printf */ 65 | 66 | pop {r4, r5, r6, lr} /* restore callee saved registers */ 67 | mov r0, #0 /* set error code */ 68 | bx lr /* return to system */ 69 | 70 | addr_a_word : .word a_word 71 | addr_message_bytes : .word message_bytes 72 | addr_message_halfwords : .word message_halfwords 73 | addr_message_words : .word message_words 74 | -------------------------------------------------------------------------------- /chapter21/subword.s: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | .align 4 4 | one_byte: .byte 205 5 | 6 | .align 4 7 | one_halfword: .hword 42445 8 | 9 | .text 10 | 11 | .globl main 12 | main: 13 | push {r4, lr} /* keep r4 and lr in the stack */ 14 | 15 | ldr r0, addr_of_one_byte /* r0 ← &one_byte */ 16 | ldrb r0, [r0] /* r0 ← *{byte}r0, zero-extended to 32 bits */ 17 | 18 | ldr r1, addr_of_one_halfword /* r1 ← &one_halfword */ 19 | ldrh r1, [r1] /* r1 ← *{half}r1, zero-extended to 32 bits */ 20 | 21 | pop {r4, lr} /* restore r4 and lr */ 22 | mov r0, #0 /* set error code (this overwrites the byte loaded above) */ 23 | bx lr /* return */ 24 | 25 | addr_of_one_byte: .word one_byte 26 | addr_of_one_halfword: .word one_halfword 27 | -------------------------------------------------------------------------------- /chapter21/subword_signed.s: -------------------------------------------------------------------------------- 1 | .data 2 | 3 | .align 4 4 |
one_byte: .byte 205 5 | 6 | .align 4 7 | one_halfword: .hword 42445 8 | 9 | .text 10 | 11 | .globl main 12 | main: 13 | push {r4, lr} /* keep r4 and lr in the stack */ 14 | 15 | ldr r0, addr_of_one_byte /* r0 ← &one_byte */ 16 | ldrsb r0, [r0] /* r0 ← *{signed byte}r0, sign-extended to 32 bits */ 17 | 18 | ldr r1, addr_of_one_halfword /* r1 ← &one_halfword */ 19 | ldrsh r1, [r1] /* r1 ← *{signed half}r1, sign-extended to 32 bits */ 20 | 21 | pop {r4, lr} /* restore r4 and lr */ 22 | mov r0, #0 /* set error code (this overwrites the byte loaded above) */ 23 | bx lr /* return */ 24 | 25 | addr_of_one_byte: .word one_byte 26 | addr_of_one_halfword: .word one_halfword 27 | -------------------------------------------------------------------------------- /chapter22/Makefile: -------------------------------------------------------------------------------- 1 | EXES=thumb-first thumb-call back-to-arm 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter22/back-to-arm.s: -------------------------------------------------------------------------------- 1 | /* back-to-arm.s */ 2 | 3 | .data /* data section: only the format string lives here */ 4 | message: .asciz "Hello world %d\n" 5 | 6 | .text /* text section: everything below is code and must not be assembled into .data */ 7 | 8 | .code 16 /* Here we say we will use Thumb */ 9 | .align 2 /* Make sure instructions are aligned at 2-byte boundary */ 10 | thumb_function: 11 | push {r4, lr} /* keep r4 and lr in the stack */ 12 | mov r4, #0 /* r4 ← 0 */ 13 | b check_loop /* unconditional branch to check_loop */ 14 | loop: 15 | /* prepare the call to printf */ 16 | ldr r0, addr_of_message /* r0 ← &message */ 17 | mov r1, r4 /* r1 ← r4 */ 18 | blx printf /* From Thumb to ARM we use blx.
19 | printf is a function 20 | in the C library that is implemented 21 | using ARM instructions */ 22 | add r4, r4, #1 /* r4 ← r4 + 1 */ 23 | check_loop: 24 | cmp r4, #4 /* compute r4 - 4 and update the cpsr */ 25 | blt loop /* if the cpsr means that r4 < 4 branch to loop */ 26 | 27 | pop {r4, pc} /* restore r4 and return by popping the saved lr straight into pc */ 28 | .align 4 29 | addr_of_message: .word message 30 | 31 | .code 32 /* Here we say we will use ARM */ 32 | .align 4 /* Make sure instructions are aligned at 4-byte boundary */ 33 | .globl main 34 | main: 35 | push {r4, lr} /* keep r4 and lr in the stack */ 36 | 37 | blx thumb_function /* Switch from ARM to Thumb */ 38 | 39 | pop {r4, lr} /* restore r4 and lr */ 40 | bx lr /* return; note that r0 is not set here before returning */ 41 | -------------------------------------------------------------------------------- /chapter22/thumb-call.s: -------------------------------------------------------------------------------- 1 | /* thumb-call.s */ 2 | .text 3 | 4 | .code 16 /* Here we say we will use Thumb */ 5 | .align 2 /* Make sure instructions are aligned at 2-byte boundary */ 6 | 7 | thumb_function_2: 8 | mov r0, #2 /* r0 ← 2 */ 9 | bx lr /* A leaf Thumb function (i.e.
a function that does not call 10 | any other function) returns using "bx lr" */ 11 | 12 | thumb_function_1: 13 | push {r4, lr} /* Keep r4 and lr in the stack */ 14 | bl thumb_function_2 /* From Thumb to Thumb we use bl */ 15 | pop {r4, pc} /* This is how we return from a non-leaf Thumb function */ 16 | 17 | .code 32 /* Here we say we will use ARM */ 18 | .align 4 /* Make sure instructions are aligned at 4-byte boundary */ 19 | .globl main 20 | main: 21 | push {r4, lr} /* keep r4 and lr in the stack */ 22 | 23 | blx thumb_function_1 /* From ARM to Thumb we use blx */ 24 | 25 | pop {r4, lr} /* restore r4 and lr */ 26 | bx lr /* return; r0 still holds the 2 set by thumb_function_2 */ 27 | -------------------------------------------------------------------------------- /chapter22/thumb-first.s: -------------------------------------------------------------------------------- 1 | /* thumb-first.s */ 2 | .text 3 | 4 | .code 16 /* Here we say we will use Thumb */ 5 | .align 2 /* Make sure instructions are aligned at 2-byte boundary */ 6 | 7 | thumb_function: 8 | mov r0, #2 /* r0 ← 2 */ 9 | bx lr /* return */ 10 | 11 | .code 32 /* Here we say we will use ARM */ 12 | .align 4 /* Make sure instructions are aligned at 4-byte boundary */ 13 | 14 | .globl main 15 | main: 16 | push {r4, lr} /* keep r4 and lr in the stack */ 17 | 18 | blx thumb_function /* From ARM to Thumb we use blx */ 19 | 20 | pop {r4, lr} /* restore r4 and lr */ 21 | bx lr /* return; r0 still holds the 2 set by thumb_function */ 22 | -------------------------------------------------------------------------------- /chapter23/Makefile: -------------------------------------------------------------------------------- 1 | EXES=nested01 nested02 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter23/nested01.s: -------------------------------------------------------------------------------- 1 | /* nested01.s */ 2 | 3 | .text 4 | 5 | f: 6 | push {r4, r5, fp, lr} /* keep
registers */ 7 | mov fp, sp /* keep dynamic link */ 8 | 9 | sub sp, sp, #8 /* make room for x (4 bytes) 10 | plus 4 bytes to keep stack 11 | aligned */ 12 | /* x is in address "fp - 4" */ 13 | 14 | mov r4, #1 /* r4 ← 1 */ 15 | str r4, [fp, #-4] /* x ← r4 */ 16 | 17 | bl g /* call (nested function) g */ 18 | 19 | ldr r4, [fp, #-4] /* r4 ← x */ 20 | add r4, r4, #1 /* r4 ← r4 + 1 */ 21 | str r4, [fp, #-4] /* x ← r4 */ 22 | 23 | mov sp, fp /* restore the stack pointer (discards x and the padding) */ 24 | pop {r4, r5, fp, lr} /* restore registers */ 25 | bx lr /* return */ 26 | 27 | /* nested function g */ 28 | g: 29 | push {r4, r5, fp, lr} /* keep registers */ 30 | mov fp, sp /* keep dynamic link */ 31 | 32 | /* At this point our stack looks like this 33 | 34 | Data | Address | Notes 35 | ------+---------+-------------------- 36 | r4 | fp | 37 | r5 | fp + 4 | 38 | fp | fp + 8 | This is the old fp 39 | lr | fp + 12 | 40 | */ 41 | 42 | ldr r4, [fp, #+8] /* get the activation record 43 | of my caller 44 | (since only f can call me) 45 | */ 46 | 47 | /* now r4 acts like the fp we had inside 'f' */ 48 | ldr r5, [r4, #-4] /* r5 ← x */ 49 | add r5, r5, #1 /* r5 ← r5 + 1 */ 50 | str r5, [r4, #-4] /* x ← r5 */ 51 | 52 | mov sp, fp /* restore the stack pointer */ 53 | pop {r4, r5, fp, lr} /* restore registers */ 54 | bx lr /* return */ 55 | 56 | .globl main 57 | 58 | main : 59 | push {r4, lr} /* keep registers */ 60 | 61 | bl f /* call f */ 62 | 63 | mov r0, #0 /* r0 ← 0, set error code */ 64 | pop {r4, lr} /* restore registers */ 65 | bx lr /* return */ 66 | -------------------------------------------------------------------------------- /chapter23/nested02.s: -------------------------------------------------------------------------------- 1 | /* nested02.s */ 2 | 3 | .text 4 | 5 | # void f(void) // non nested (nesting depth = 0) 6 | # { 7 | # int x; 8 | # 9 | # void g() // nested (nesting depth = 1) 10 | # { 11 | # x = x + 1; 12 | # } 13 | # void h() // nested (nesting depth = 1) 14 | # { 15 | # void m() // nested (nesting depth = 2) 16 | # { 17 | # x = x + 2; 18 | # g(); 19 | # } 20 | #
21 | # g(); 22 | # m(); 23 | # x = x + 3; 24 | # } 25 | # 26 | # x = 1; 27 | # h(); 28 | # // here x will be 8 29 | # } 30 | 31 | f: 32 | push {r4, r10, fp, lr} /* keep registers */ 33 | mov fp, sp /* setup dynamic link */ 34 | 35 | sub sp, sp, #8 /* make room for x (4 + 4 bytes) */ 36 | /* x will be in address "fp - 4" */ 37 | 38 | /* At this point our stack looks like this 39 | 40 | Data | Address | Notes 41 | ------+---------+--------------------------- 42 | | fp - 8 | alignment (per AAPCS) 43 | x | fp - 4 | 44 | r4 | fp | 45 | r10 | fp + 4 | previous value of r10 46 | fp | fp + 8 | previous value of fp 47 | lr | fp + 12 | 48 | */ 49 | 50 | mov r4, #1 /* r4 ← 1 */ 51 | str r4, [fp, #-4] /* x ← r4 */ 52 | 53 | /* prepare the call to h */ 54 | mov r10, fp /* setup the static link, 55 | since we are calling an immediately nested function 56 | it is just the current frame */ 57 | bl h /* call h */ 58 | 59 | mov sp, fp /* restore stack */ 60 | pop {r4, r10, fp, lr} /* restore registers */ 61 | bx lr /* return */ 62 | 63 | /* ------ nested function ------------------ */ 64 | h : 65 | push {r4, r5, r10, fp, lr} /* keep registers */ 66 | mov fp, sp /* setup dynamic link */ 67 | 68 | sub sp, sp, #4 /* align stack */ 69 | 70 | /* At this point our stack looks like this 71 | 72 | Data | Address | Notes 73 | ------+---------+--------------------------- 74 | | fp - 4 | alignment (per AAPCS) 75 | r4 | fp | 76 | r5 | fp + 4 | 77 | r10 | fp + 8 | frame pointer of 'f' 78 | fp | fp + 12 | frame pointer of caller 79 | lr | fp + 16 | 80 | */ 81 | 82 | /* prepare call to g */ 83 | /* g is a sibling so the static link will be the same 84 | as the current one */ 85 | ldr r10, [fp, #8] /* r10 ← the static link saved in our frame (frame pointer of 'f') */ 86 | bl g /* call g */ 87 | 88 | /* prepare call to m */ 89 | /* m is an immediately nested function so the static 90 | link is the current frame */ 91 | mov r10, fp /* r10 ← fp */ 92 | bl m /* call m */ 93 | 94 | ldr r4, [fp, #8] /* load frame pointer of 'f' */ 95 | ldr r5, [r4, #-4] /* r5 ← x */ 96 | add r5, r5, #3 /* r5 ← r5 + 3 */ 97 | str r5, [r4,
#-4] /* x ← r5 */ 98 | 99 | mov sp, fp /* restore stack */ 100 | pop {r4, r5, r10, fp, lr} /* restore registers */ 101 | bx lr /* return */ 102 | 103 | 104 | /* ------ nested function m (nested in h) -- */ 105 | m: 106 | push {r4, r5, r10, fp, lr} /* keep registers */ 107 | mov fp, sp /* setup dynamic link */ 108 | 109 | sub sp, sp, #4 /* align stack */ 110 | /* At this point our stack looks like this 111 | 112 | Data | Address | Notes 113 | ------+---------+--------------------------- 114 | | fp - 4 | alignment (per AAPCS) 115 | r4 | fp | 116 | r5 | fp + 4 | 117 | r10 | fp + 8 | frame pointer of 'h' 118 | fp | fp + 12 | frame pointer of caller 119 | lr | fp + 16 | 120 | */ 121 | 122 | ldr r4, [fp, #8] /* r4 ← frame pointer of 'h' */ 123 | ldr r4, [r4, #8] /* r4 ← frame pointer of 'f' */ 124 | ldr r5, [r4, #-4] /* r5 ← x */ 125 | add r5, r5, #2 /* r5 ← r5 + 2 */ 126 | str r5, [r4, #-4] /* x ← r5 */ 127 | 128 | /* setup call to g */ 129 | ldr r10, [fp, #8] /* r10 ← frame pointer of 'h' */ 130 | ldr r10, [r10, #8] /* r10 ← frame pointer of 'f' */ 131 | bl g /* call g */ 132 | 133 | mov sp, fp /* restore stack */ 134 | pop {r4, r5, r10, fp, lr} /* restore registers */ 135 | bx lr /* return */ 136 | 137 | /* ------ nested function g (nested in f) -- */ 138 | g: 139 | push {r4, r5, r10, fp, lr} /* keep registers */ 140 | mov fp, sp /* setup dynamic link */ 141 | 142 | sub sp, sp, #4 /* align stack */ 143 | 144 | /* At this point our stack looks like this 145 | 146 | Data | Address | Notes 147 | ------+---------+--------------------------- 148 | | fp - 4 | alignment (per AAPCS) 149 | r4 | fp | 150 | r5 | fp + 4 | 151 | r10 | fp + 8 | frame pointer of 'f' 152 | fp | fp + 12 | frame pointer of caller 153 | lr | fp + 16 | 154 | */ 155 | 156 | ldr r4, [fp, #8] /* r4 ← frame pointer of 'f' */ 157 | ldr r5, [r4, #-4] /* r5 ← x */ 158 | add r5, r5, #1 /* r5 ← r5 + 1 */ 159 | str r5, [r4, #-4] /* x ← r5 */ 160 | 161 | mov sp, fp /* restore stack */ 162 | pop {r4, r5, r10, fp, lr} /* restore registers */ 163 |
bx lr 164 | 165 | .globl main 166 | 167 | main : 168 | push {r4, lr} /* keep registers */ 169 | 170 | bl f /* call f */ 171 | 172 | mov r0, #0 /* r0 ← 0, set error code */ 173 | pop {r4, lr} /* restore registers */ 174 | bx lr /* return */ 175 | -------------------------------------------------------------------------------- /chapter24/Makefile: -------------------------------------------------------------------------------- 1 | EXES=print-array sort-array trampoline-sort-array 2 | OBJS=$(addsuffix .o, $(EXES)) 3 | all: $(EXES) $(OBJS) 4 | 5 | %: %.o 6 | gcc -o $@ $+ 7 | 8 | % : %.s 9 | 10 | %.o : %.s 11 | as -march=armv6 -mfpu=vfpv2 -o $@ $< 12 | 13 | .PHONY: clean 14 | clean: 15 | rm -vf $(EXES) *.o 16 | -------------------------------------------------------------------------------- /chapter24/print-array.s: -------------------------------------------------------------------------------- 1 | /* print-array.s */ 2 | 3 | .data 4 | 5 | /* declare an array of 10 integers called my_array */ 6 | .align 4 7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64 8 | 9 | /* format strings for printf */ 10 | /* format string that prints an integer plus a space */ 11 | .align 4 12 | integer_printf: .asciz "%d " 13 | /* format string that simply prints a newline */ 14 | .align 4 15 | newline_printf: .asciz "\n" 16 | 17 | .text 18 | 19 | print_array: 20 | /* prints every item followed by a space, then a newline */ 21 | /* r0 will be the address of the integer array */ 22 | /* r1 will be the number of items in the array */ 22 | push {r4, r5, r6, lr} /* keep r4, r5, r6 and lr in the stack */ 23 | 24 | mov r4, r0 /* r4 ← r0. keep the address of the array */ 25 | mov r5, r1 /* r5 ← r1. keep the number of items */ 26 | mov r6, #0 /* r6 ← 0.
current item to print */ 27 | 28 | b .Lprint_array_check_loop /* go to the condition check of the loop */ 29 | 30 | .Lprint_array_loop: 31 | /* prepare the call to printf */ 32 | ldr r0, addr_of_integer_printf /* r0 ← &integer_printf */ 33 | ldr r1, [r4, +r6, LSL #2] /* r1 ← *(r4 + r6 * 4), this is item number r6 of the array */ 34 | bl printf /* call printf */ 35 | 36 | add r6, r6, #1 /* r6 ← r6 + 1 */ 37 | .Lprint_array_check_loop: 38 | cmp r6, r5 /* perform r6 - r5 and update cpsr */ 39 | bne .Lprint_array_loop /* if cpsr states that r6 is not equal to r5 40 | branch to the body of the loop */ 41 | 42 | /* prepare call to printf */ 43 | ldr r0, addr_of_newline_printf /* r0 ← &newline_printf */ 44 | bl printf /* call printf */ 45 | 46 | pop {r4, r5, r6, lr} /* restore r4, r5, r6 and lr from the stack */ 47 | bx lr /* return */ 48 | 49 | addr_of_integer_printf: .word integer_printf 50 | addr_of_newline_printf: .word newline_printf 51 | 52 | .globl main 53 | main: 54 | push {r4, lr} /* keep r4 and lr in the stack */ 55 | 56 | /* prepare call to print_array */ 57 | ldr r0, addr_of_my_array /* r0 ← &my_array */ 58 | mov r1, #10 /* r1 ← 10 59 | our array is of length 10 */ 60 | bl print_array /* call print_array */ 61 | 62 | mov r0, #0 /* r0 ← 0 set errorcode to 0 prior to returning from main */ 63 | pop {r4, lr} /* restore r4 and lr from the stack */ 64 | bx lr /* return */ 65 | 66 | addr_of_my_array: .word my_array 67 | -------------------------------------------------------------------------------- /chapter24/sort-array.s: -------------------------------------------------------------------------------- 1 | /* sort-array.s */ 2 | 3 | .data 4 | 5 | /* declare an array of 10 integers called my_array */ 6 | .align 4 7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64 8 | 9 | /* format strings for printf */ 10 | /* format string that prints an integer plus a space */ 11 | .align 4 12 | integer_printf: .asciz "%d " 13 | /* format string that simply prints a newline */ 14 | .align 4 15 | newline_printf: .asciz "\n" 16 | 17 |
.text 18 | 19 | integer_comparison: 20 | /* r0 will be the address to the first integer */ 21 | /* r1 will be the address to the second integer */ 22 | ldr r0, [r0] /* r0 ← *r0 23 | load the integer pointed by r0 in r0 */ 24 | ldr r1, [r1] /* r1 ← *r1 25 | load the integer pointed by r1 in r1 */ 26 | 27 | cmp r0, r1 /* compute r0 - r1 and update cpsr */ 28 | moveq r0, #0 /* if cpsr means that r0 == r1 then r0 ← 0 */ 29 | movlt r0, #-1 /* if cpsr means that r0 < r1 then r0 ← -1 */ 30 | movgt r0, #1 /* if cpsr means that r0 > r1 then r0 ← 1 */ 31 | bx lr /* return, r0 now holds 0, -1 or 1 */ 32 | 33 | print_array: 34 | /* r0 will be the address of the integer array */ 35 | /* r1 will be the number of items in the array */ 36 | push {r4, r5, r6, lr} /* keep r4, r5, r6 and lr in the stack */ 37 | 38 | mov r4, r0 /* r4 ← r0. keep the address of the array */ 39 | mov r5, r1 /* r5 ← r1. keep the number of items */ 40 | mov r6, #0 /* r6 ← 0. current item to print */ 41 | 42 | b .Lprint_array_check_loop /* go to the condition check of the loop */ 43 | 44 | .Lprint_array_loop: 45 | /* prepare the call to printf */ 46 | ldr r0, addr_of_integer_printf /* r0 ← &integer_printf */ 47 | ldr r1, [r4, +r6, LSL #2] /* r1 ← *(r4 + r6 * 4), this is item number r6 of the array */ 48 | bl printf /* call printf */ 49 | 50 | add r6, r6, #1 /* r6 ← r6 + 1 */ 51 | .Lprint_array_check_loop: 52 | cmp r6, r5 /* perform r6 - r5 and update cpsr */ 53 | bne .Lprint_array_loop /* if cpsr states that r6 is not equal to r5 54 | branch to the body of the loop */ 55 | 56 | /* prepare call to printf */ 57 | ldr r0, addr_of_newline_printf /* r0 ← &newline_printf */ 58 | bl printf /* call printf */ 59 | 60 | pop {r4, r5, r6, lr} /* restore r4, r5, r6 and lr from the stack */ 61 | bx lr /* return */ 62 | 63 | addr_of_integer_printf: .word integer_printf 64 | addr_of_newline_printf: .word newline_printf 65 | 66 | .globl main 67 | main: 68 | push {r4, lr} /* keep r4 and lr in the stack */ 69 | 70 | /* prepare call to print_array */ 71 | ldr r0, addr_of_my_array /* r0 ← &my_array */ 72 |
mov r1, #10 /* r1 ← 10 73 | our array is of length 10 */ 74 | bl print_array /* call print_array */ 75 | 76 | /* prepare call to qsort */ 77 | /* 78 | void qsort(void *base, 79 | size_t nmemb, 80 | size_t size, 81 | int (*compar)(const void *, const void *)); 82 | */ 83 | ldr r0, addr_of_my_array /* r0 ← &my_array 84 | base */ 85 | mov r1, #10 /* r1 ← 10 86 | nmemb = number of members 87 | our array is 10 elements long */ 88 | mov r2, #4 /* r2 ← 4 89 | size of each member is 4 bytes */ 90 | ldr r3, addr_of_integer_comparison 91 | /* r3 ← &integer_comparison 92 | compar */ 93 | bl qsort /* call qsort */ 94 | 95 | /* now print again to see if elements were sorted */ 96 | /* prepare call to print_array */ 97 | ldr r0, addr_of_my_array /* r0 ← &my_array */ 98 | mov r1, #10 /* r1 ← 10 99 | our array is of length 10 */ 100 | bl print_array /* call print_array */ 101 | 102 | mov r0, #0 /* r0 ← 0 set errorcode to 0 prior to returning from main */ 103 | pop {r4, lr} /* restore r4 and lr from the stack */ 104 | bx lr /* return */ 105 | 106 | addr_of_my_array: .word my_array 107 | addr_of_integer_comparison : .word integer_comparison 108 | -------------------------------------------------------------------------------- /chapter24/trampoline-sort-array.s: -------------------------------------------------------------------------------- 1 | /* trampoline-sort-array.s */ 2 | 3 | .data 4 | 5 | /* declare an array of 10 integers called my_array */ 6 | .align 4 7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64 8 | 9 | /* format strings for printf */ 10 | /* format string that prints an integer plus a space */ 11 | .align 4 12 | integer_printf: .asciz "%d " 13 | /* format string that simply prints a newline */ 14 | .align 4 15 | newline_printf: .asciz "\n" 16 | .align 4 /* the next string is the format used to print the number of comparisons */ 17 | comparison_message: .asciz "Num comparisons: %d\n" 18 | 19 | .text 20 | 21 | print_array: 22 | /* r0 will be the address of the integer array */ 23 | /* r1 will be the number of items in the array */ 24 | push
{r4, r5, r6, lr} /* keep r4, r5, r6 and lr in the stack */ 25 | 26 | mov r4, r0 /* r4 ← r0. keep the address of the array */ 27 | mov r5, r1 /* r5 ← r1. keep the number of items */ 28 | mov r6, #0 /* r6 ← 0. current item to print */ 29 | 30 | b .Lprint_array_check_loop /* go to the condition check of the loop */ 31 | 32 | .Lprint_array_loop: 33 | /* prepare the call to printf */ 34 | ldr r0, addr_of_integer_printf /* r0 ← &integer_printf */ 35 | ldr r1, [r4, +r6, LSL #2] /* r1 ← *(r4 + r6 * 4) */ 36 | bl printf /* call printf */ 37 | 38 | add r6, r6, #1 /* r6 ← r6 + 1 */ 39 | .Lprint_array_check_loop: 40 | cmp r6, r5 /* perform r6 - r5 and update cpsr */ 41 | bne .Lprint_array_loop /* if cpsr states that r6 is not equal to r5 42 | branch to the body of the loop */ 43 | 44 | /* prepare call to printf */ 45 | ldr r0, addr_of_newline_printf /* r0 ← &newline_printf */ 46 | bl printf 47 | 48 | pop {r4, r5, r6, lr} /* restore r4, r5, r6 and lr from the stack */ 49 | bx lr /* return */ 50 | 51 | addr_of_integer_printf: .word integer_printf 52 | addr_of_newline_printf: .word newline_printf 53 | 54 | .globl main 55 | main: 56 | push {r4, r5, r6, fp, lr} /* keep callee saved registers */ 57 | mov fp, sp /* setup dynamic link */ 58 | 59 | sub sp, sp, #4 /* counter will be in fp - 4 */ 60 | /* note that now the stack is 8-byte aligned */ 61 | 62 | /* set counter to zero */ 63 | mov r4, #0 /* r4 ← 0 */ 64 | str r4, [fp, #-4] /* counter ← r4 */ 65 | 66 | /* Make room for the trampoline */ 67 | sub sp, sp, #32 /* sp ← sp - 32 */ 68 | /* note that 32 is a multiple of 8, so the stack 69 | is still 8-byte aligned */ 70 | 71 | /* copy the trampoline into the stack */ 72 | mov r4, #32 /* r4 ← 32 */ 73 | ldr r5, .Laddr_trampoline_template /* r4 ← &trampoline_template */ 74 | mov r6, sp /* r6 ← sp */ 75 | b .Lcopy_trampoline_loop_check /* branch to copy_trampoline_loop_check */ 76 | 77 | .Lcopy_trampoline_loop: 78 | ldr r7, [r5] /* r7 ← *r5 */ 79 | str r7, [r6] /* *r6 ← r7 */ 80 | add 
r5, r5, #4 /* r5 ← r5 + 4 */ 81 | add r6, r6, #4 /* r6 ← r6 + 4 */ 82 | sub r4, r4, #4 /* r4 ← r4 - 4 */ 83 | .Lcopy_trampoline_loop_check: 84 | cmp r4, #0 /* compute r4 - 0 and update cpsr */ 85 | bgt .Lcopy_trampoline_loop /* if cpsr means that r4 > 0 86 | then branch to copy_trampoline_loop */ 87 | 88 | /* setup the trampoline */ 89 | ldr r4, addr_of_integer_comparison_count 90 | /* r4 ← &integer_comparison_count */ 91 | str r4, [fp, #-36] /* *(fp + 36) ← r4 */ 92 | /* set the function_called in the trampoline 93 | to be &integer_comparison_count */ 94 | str fp, [fp, #-32] /* *(fp + 32) ← fp */ 95 | /* set the lexical_scope in the trampoline 96 | to be fp */ 97 | 98 | /* prepare call to __clear_cache */ 99 | mov r0, sp /* r0 ← sp */ 100 | add r1, sp, #32 /* r1 ← sp + 32 */ 101 | bl __clear_cache /* call __clear_cache */ 102 | 103 | /* prepare call to print_array */ 104 | ldr r0, addr_of_my_array /* r0 ← &my_array */ 105 | mov r1, #10 /* r1 ← 10 106 | our array is of length 10 */ 107 | bl print_array /* call print_array */ 108 | 109 | /* prepare call to qsort */ 110 | /* 111 | void qsort(void *base, 112 | size_t nmemb, 113 | size_t size, 114 | int (*compar)(const void *, const void *)); 115 | */ 116 | ldr r0, addr_of_my_array /* r0 ← &my_array 117 | base */ 118 | mov r1, #10 /* r1 ← 10 119 | nmemb = number of members 120 | our array is 10 elements long */ 121 | mov r2, #4 /* r2 ← 4 122 | size of each member is 4 bytes */ 123 | sub r3, fp, #28 /* r3 ← fp + 28 */ 124 | bl qsort /* call qsort */ 125 | 126 | /* prepare call to printf */ 127 | ldr r1, [fp, #-4] /* r1 ← counter 128 | num comparisons */ 129 | ldr r0, addr_of_comparison_message /* r0 ← &comparison_message */ 130 | bl printf /* call printf */ 131 | 132 | /* now print again the array to see if elements were sorted */ 133 | /* prepare call to print_array */ 134 | ldr r0, addr_of_my_array /* r0 ← &my_array */ 135 | mov r1, #10 /* r1 ← 10 136 | our array is of length 10 */ 137 | bl print_array /* call 
print_array */ 138 | 139 | mov r0, #0 /* r0 ← 0 set errorcode to 0 prior returning from main */ 140 | 141 | mov sp, fp 142 | pop {r4, r5, r6, fp, lr} /* restore callee-saved registers */ 143 | bx lr /* return */ 144 | 145 | addr_of_my_array: .word my_array 146 | addr_of_comparison_message : .word comparison_message 147 | 148 | /* nested function integer comparison */ 149 | addr_of_integer_comparison_count : .word integer_comparison_count 150 | integer_comparison_count: 151 | /* r0 will be the address to the first integer */ 152 | /* r1 will be the address to the second integer */ 153 | push {r4, r5, r10, fp, lr} /* keep callee-saved registers */ 154 | mov fp, sp /* setup dynamic link */ 155 | 156 | ldr r0, [r0] /* r0 ← *r0 157 | load the integer pointed by r0 in r0 */ 158 | ldr r1, [r1] /* r1 ← *r1 159 | load the integer pointed by r1 in r1 */ 160 | 161 | cmp r0, r1 /* compute r0 - r1 and update cpsr */ 162 | moveq r0, #0 /* if cpsr means that r0 == r1 then r0 ← 0 */ 163 | movlt r0, #-1 /* if cpsr means that r0 < r1 then r0 ← -1 */ 164 | movgt r0, #1 /* if cpsr means that r0 > r1 then r0 ← 1 */ 165 | 166 | ldr r4, [fp, #8] /* r4 ← *(fp + 8) 167 | get static link in the stack */ 168 | ldr r5, [r4, #-4] /* r5 ← *(r4 - 4) 169 | get value of counter */ 170 | add r5, r5, #1 /* r5 ← r5 + 1 */ 171 | str r5, [r4, #-4] /* *(r4 - 4) ← r5 172 | update counter */ 173 | 174 | mov sp, fp /* restore stack */ 175 | pop {r4, r5, r10, fp, lr} /* restore callee-saved registers */ 176 | bx lr /* return */ 177 | 178 | .Laddr_trampoline_template : .word .Ltrampoline_template 179 | .Ltrampoline_template: 180 | .Lfunction_called: .word 0x0 181 | .Llexical_scope: .word 0x0 182 | push {r4, r5, r10, lr} /* keep callee-saved registers */ 183 | ldr r4, .Lfunction_called /* r4 ← function called */ 184 | ldr r10, .Llexical_scope /* r10 ← lexical scope */ 185 | blx r4 /* indirect call to r4 */ 186 | pop {r4, r5, r10, lr} /* restore callee-saved registers */ 187 | bx lr /* return */ 188 | 189 | 
--------------------------------------------------------------------------------
/chapter25/Makefile:
--------------------------------------------------------------------------------
EXES=motivation byte_array_add clipped_add
OBJS=$(addsuffix .o, $(EXES))
all: $(EXES) $(OBJS)

# link each executable from its object file
%: %.o
	gcc -o $@ $+

# cancel the built-in rule that would build an executable straight from a .s
% : %.s

# the SIMD instructions used in this chapter need ARMv6
%.o : %.s
	as -march=armv6 -mfpu=vfpv2 -o $@ $<

.PHONY: clean
clean:
	rm -vf $(EXES) *.o

--------------------------------------------------------------------------------
/chapter25/byte_array_add.s:
--------------------------------------------------------------------------------
# byte_array_add.s

/*
   naive_byte_array_addition: c[i] ← a[i] + b[i] for 0 ≤ i < N,
   one byte at a time.
     r0  base address of a
     r1  base address of b
     r2  base address of c
     r3  N (number of bytes)
*/
naive_byte_array_addition:
    /* r4 is the number of the current item
       so it holds that 0 ≤ r4 < r3 */
    push {r4, r5, r6, r7}       /* keep the callee-saved registers we use */

    mov r4, #0                  /* r4 ← 0 */
    b .Lcheck_loop0             /* branch to check_loop0 */

.Lloop0:
    ldrb r5, [r0, r4]           /* r5 ← *{unsigned byte}(r0 + r4) */
    ldrb r6, [r1, r4]           /* r6 ← *{unsigned byte}(r1 + r4) */
    add r7, r5, r6              /* r7 ← r5 + r6 */
    strb r7, [r2, r4]           /* *{unsigned byte}(r2 + r4) ← r7 */
    add r4, r4, #1              /* r4 ← r4 + 1 */
.Lcheck_loop0:
    cmp r4, r3                  /* perform r4 - r3 and update cpsr */
    blt .Lloop0                 /* if cpsr means that r4 < r3 jump to loop0 */

    pop {r4, r5, r6, r7}        /* restore callee-saved registers */
    bx lr                       /* return */

/*
   simd_byte_array_addition_0: same operation, four bytes per iteration.
     r0  base address of a
     r1  base address of b
     r2  base address of c
     r3  N (number of bytes)
   Note: whole words are loaded and stored, so when N is not a multiple
   of 4 the last iteration touches up to 3 bytes past the end of the
   arrays. simd_byte_array_addition_1 below avoids this.
*/
simd_byte_array_addition_0:
    /* r4 is the number of the current item
       so it holds that 0 ≤ r4 < r3 */
    push {r4, r5, r6, r7}       /* keep the callee-saved registers we use */

    mov r4, #0                  /* r4 ← 0 */
    b .Lcheck_loop1             /* branch to check_loop1 */

.Lloop1:
    ldr r5, [r0, r4]            /* r5 ← *(r0 + r4) */
    ldr r6, [r1, r4]            /* r6 ← *(r1 + r4) */
    sadd8 r7, r5, r6            /* r7[7:0] ← r5[7:0] + r6[7:0] */
                                /* r7[15:8] ← r5[15:8] + r6[15:8] */
                                /* r7[23:16] ← r5[23:16] + r6[23:16] */
                                /* r7[31:24] ← r5[31:24] + r6[31:24] */
                                /* r7[x:y] means bits x to y of the register r7 */
    str r7, [r2, r4]            /* *(r2 + r4) ← r7 */
    add r4, r4, #4              /* r4 ← r4 + 4 */
.Lcheck_loop1:
    cmp r4, r3                  /* perform r4 - r3 and update cpsr */
    blt .Lloop1                 /* if cpsr means that r4 < r3 jump to loop1 */

    pop {r4, r5, r6, r7}        /* restore callee-saved registers */
    bx lr                       /* return */

/*
   simd_byte_array_addition_1: four bytes per iteration while at least
   four bytes remain, then an epilog loop that handles the remaining
   (at most 3) bytes one at a time.
     r0  base address of a
     r1  base address of b
     r2  base address of c
     r3  N (number of bytes)
*/
simd_byte_array_addition_1:
    /* r4 is the number of the current item
       so it holds that 0 ≤ r4 < r3 */
    push {r4, r5, r6, r7, r8, lr}   /* keep registers */

    mov r4, #0                  /* r4 ← 0 */
    sub r8, r3, #3              /* r8 ← r3 - 3
                                   this is r8 ← N - 3 */
    b .Lcheck_loop2             /* branch to check_loop2 */

.Lloop2:
    ldr r5, [r0, r4]            /* r5 ← *(r0 + r4) */
    ldr r6, [r1, r4]            /* r6 ← *(r1 + r4) */
    sadd8 r7, r5, r6            /* r7[7:0] ← r5[7:0] + r6[7:0] */
                                /* r7[15:8] ← r5[15:8] + r6[15:8] */
                                /* r7[23:16] ← r5[23:16] + r6[23:16] */
                                /* r7[31:24] ← r5[31:24] + r6[31:24] */
    str r7, [r2, r4]            /* *(r2 + r4) ← r7 */
    add r4, r4, #4              /* r4 ← r4 + 4 */
.Lcheck_loop2:
    cmp r4, r8                  /* perform r4 - r8 and update cpsr */
    blt .Lloop2                 /* if cpsr means that r4 < r8 jump to loop2 */
                                /* i.e. if r4 < N - 3 jump to loop2 */

    /* epilog loop */
    b .Lcheck_loop3             /* branch to check_loop3 */

.Lloop3:
    ldrb r5, [r0, r4]           /* r5 ← *{unsigned byte}(r0 + r4) */
    ldrb r6, [r1, r4]           /* r6 ← *{unsigned byte}(r1 + r4) */
    add r7, r5, r6              /* r7 ← r5 + r6 */
    strb r7, [r2, r4]           /* *{unsigned byte}(r2 + r4) ← r7 */

    add r4, r4, #1              /* r4 ← r4 + 1 */
.Lcheck_loop3:
    cmp r4, r3                  /* perform r4 - r3 and update cpsr */
    blt .Lloop3                 /* if cpsr means that r4 < r3 jump to loop 3 */

    pop {r4, r5, r6, r7, r8, lr}    /* restore registers */
    bx lr                       /* return */

/* main does nothing: the routines above are only meant to be assembled */
.global main
main:
    mov r0, #0                  /* r0 ← 0. error code */
    bx lr                       /* return */

--------------------------------------------------------------------------------
/chapter25/clipped_add.s:
--------------------------------------------------------------------------------

.data
max16bit: .word 32767

.text

/*
   clipped_add16bit: adds two integers and clips the result to the range
   of a signed 16-bit integer, [-32768, 32767].
*/
clipped_add16bit:
    /* first operand is in r0 */
    /* second operand is in r1 */
    /* result is left in r0 */
    push {r4, lr}               /* keep registers */

    ldr r4, addr_of_max16bit    /* r4 ← &max16bit */
    ldr r4, [r4]                /* r4 ← *r4 */
                                /* now r4 == 32767 (i.e. 2^15 - 1) */

    add r0, r0, r1              /* r0 ← r0 + r1 */
    cmp r0, r4                  /* perform r0 - r4 and update cpsr */
    movgt r0, r4                /* if r0 > r4 then r0 ← r4 */
    bgt end                     /* if r0 > r4 then branch to end */

    mvn r4, r4                  /* r4 ← ~r4
                                   now r4 == -32768 (i.e. -2^15) */
    cmp r0, r4                  /* perform r0 - r4 and update cpsr */
    movlt r0, r4                /* if r0 < r4 then r0 ← r4 */

end:

    pop {r4, lr}                /* restore registers */
    bx lr                       /* return */
addr_of_max16bit: .word max16bit

.globl main

/* main does nothing: the routine above is only meant to be assembled */
main:
    mov r0, #0                  /* r0 ← 0. error code */
    bx lr                       /* return */

--------------------------------------------------------------------------------
/chapter25/motivation.s:
--------------------------------------------------------------------------------
# motivation.s

/*
   naive_channel_mixing: channel_out[i] ← (channel1[i] + channel2[i]) / 2
   for every signed 16-bit sample, one sample per iteration.
     r0  base address of channel1
     r1  base address of channel2
     r2  base address of channel_out
     r3  number of samples
*/
naive_channel_mixing:
    /* r4 is the number of the current sample
       so it holds that 0 ≤ r4 < r3 */
    push {r4, r5, r6, r7, r8, lr}   /* keep registers */

    mov r4, #0                  /* r4 ← 0 */
    b .Lcheck_loop              /* branch to check_loop */
.Lloop:
    mov r5, r4, LSL #1          /* r5 ← r4 << 1 (this is r5 ← r4 * 2) */
                                /* a halfword takes two bytes, so multiply
                                   the index by two. We do this here because
                                   ldrsh does not allow an addressing mode
                                   like [r0, r5, LSL #1] */
    ldrsh r6, [r0, r5]          /* r6 ← *{signed half}(r0 + r5) */
    ldrsh r7, [r1, r5]          /* r7 ← *{signed half}(r1 + r5) */
    add r8, r6, r7              /* r8 ← r6 + r7 */
    mov r8, r8, LSR #1          /* r8 ← r8 >> 1 (this is r8 ← r8 / 2) */
                                /* the shift is logical, but strh below only
                                   keeps bits 15 to 0 of r8, which are bits
                                   16 to 1 of the sum, the same halfword an
                                   arithmetic shift would leave */
    strh r8, [r2, r5]           /* *{half}(r2 + r5) ← r8 */
    add r4, r4, #1              /* r4 ← r4 + 1 */
.Lcheck_loop:
    cmp r4, r3                  /* compute r4 - r3 and update cpsr */
    blt .Lloop                  /* if r4 < r3 jump to the
                                   beginning of the loop */

    pop {r4, r5, r6, r7, r8, lr}    /* restore registers */
    bx lr                       /* return */


/*
   better_channel_mixing: same operation, two samples per iteration
   using shadd16.
     r0  base address of channel1
     r1  base address of channel2
     r2  base address of channel_out
     r3  number of samples
   Note: samples are processed in pairs, so when r3 is odd the last
   iteration also reads and writes one sample past the end of each
   channel.
*/
better_channel_mixing:
    /* r4 is the number of the current sample
       so it holds that 0 ≤ r4 < r3 */
    push {r4, r6, r7, r8}       /* keep the callee-saved registers we use */

    mov r4, #0                  /* r4 ← 0 */
    b .Lcheck_loop1             /* branch to check_loop1 */
.Lloop1:
    /* r4 counts samples and a sample takes two bytes, so the byte
       offset of the current pair of samples is r4 * 2 */
    ldr r6, [r0, r4, LSL #1]    /* r6 ← *(r0 + r4 * 2) */
    ldr r7, [r1, r4, LSL #1]    /* r7 ← *(r1 + r4 * 2) */
    shadd16 r8, r6, r7          /* r8[15:0] ← (r6[15:0] + r7[15:0]) >> 1 */
                                /* r8[31:16] ← (r6[31:16] + r7[31:16]) >> 1 */
    str r8, [r2, r4, LSL #1]    /* *(r2 + r4 * 2) ← r8 */
    add r4, r4, #2              /* r4 ← r4 + 2. two samples done */
.Lcheck_loop1:
    cmp r4, r3                  /* compute r4 - r3 and update cpsr */
    blt .Lloop1                 /* if r4 < r3 jump to the
                                   beginning of the loop */

    pop {r4, r6, r7, r8}        /* restore callee-saved registers */
    bx lr                       /* return */

/* main does nothing: the routines above are only meant to be assembled */
.global main
main:
    mov r0, #0                  /* r0 ← 0. error code */
    bx lr                       /* return */
--------------------------------------------------------------------------------