├── .gitignore
├── README.md
├── chapter01
    ├── Makefile
    └── first.s
├── chapter02
    ├── Makefile
    ├── sum01.s
    └── sum02.s
├── chapter03
    ├── Makefile
    ├── load01.s
    ├── load02.s
    ├── store01.s
    └── store02.s
├── chapter05
    ├── Makefile
    ├── branch01.s
    └── compare01.s
├── chapter06
    ├── Makefile
    ├── collatz.s
    ├── loop01.s
    ├── loop02.s
    └── test.py
├── chapter07
    ├── Makefile
    └── rol.s
├── chapter08
    ├── Makefile
    └── array01.s
├── chapter09
    ├── Makefile
    ├── hello01.s
    ├── printf01.s
    └── printf02.s
├── chapter10
    ├── Makefile
    ├── factorial01.s
    ├── factorial02.s
    ├── factorial03.s
    └── test.c
├── chapter11
    ├── Makefile
    ├── collatz02.s
    ├── collatz03.s
    ├── stats
    └── test
├── chapter12
    ├── Makefile
    ├── mult64.s
    └── mult64_2.s
├── chapter13
    ├── Makefile
    └── addf.s
├── chapter14
    ├── Makefile
    ├── benchmark.s
    └── matmul.s
├── chapter15
    ├── Makefile
    ├── benchmark.s
    ├── divideby14.s
    ├── division.s
    └── magic.py
├── chapter16
    ├── Makefile
    ├── binsearch.s
    ├── calcjump.s
    ├── hybrid.s
    ├── ifstring.s
    └── jumptable.s
├── chapter17
    ├── Makefile
    ├── array_by_ref.s
    ├── array_by_value.s
    ├── double_array.s
    ├── first_pointer.s
    ├── good_pointer.s
    └── wrong_pointer.s
├── chapter18
    ├── Makefile
    ├── square
    └── square.s
├── chapter19
    ├── Makefile
    ├── write_c.s
    └── write_sys.s
├── chapter20
    ├── Makefile
    ├── direct.s
    ├── greeter_01.s
    ├── greeter_02.s
    └── indirect.s
├── chapter21
    ├── Makefile
    ├── reinterpret.s
    ├── subword.s
    └── subword_signed.s
├── chapter22
    ├── Makefile
    ├── back-to-arm.s
    ├── thumb-call.s
    └── thumb-first.s
├── chapter23
    ├── Makefile
    ├── nested01.s
    └── nested02.s
├── chapter24
    ├── Makefile
    ├── print-array.s
    ├── sort-array.s
    └── trampoline-sort-array.s
└── chapter25
    ├── Makefile
    ├── byte_array_add.s
    ├── clipped_add.s
    └── motivation.s


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | raspberry-pi-assembler
2 | ======================
3 | 
4 | Support files for the blog posts on Raspberry Pi Assembler
5 | 
6 | http://thinkingeek.com/category/raspberry-pi/
7 | 


--------------------------------------------------------------------------------
/chapter01/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the chapter 1 example: assembles first.s and links it with gcc.
 2 | ASM_FILE=first.s
 3 | 
 4 | # 'all' and 'clean' are commands, not files: never let a stray file of the
 5 | # same name make them look up to date.
 6 | .PHONY: all clean
 7 | all: first
 8 | 
 9 | # Link the object file into the executable.
10 | first: first.o
11 | 	gcc -o $@ $+
12 | 
13 | # Assemble the source file into an object file.
14 | first.o : $(ASM_FILE)
15 | 	as -o $@ $<
16 | 
17 | clean:
18 | 	rm -vf first *.o
19 | 


--------------------------------------------------------------------------------
/chapter01/first.s:
--------------------------------------------------------------------------------
 1 | /* -- first.s */
 2 | /* This is a comment. Comments are enclosed in slash* and *slash */
 3 | .global main /* 'main' is our entry point and must be global */
 4 | .func main   /* 'main' is a function */
 5 | 
 6 | main:          /* This is main */
 7 |     mov r0, #2 /* Put a 2 inside the register r0: the value main returns */
 8 |     bx lr      /* Return from main */
 9 | .endfunc       /* Close the .func opened above; every .func needs one */
10 | 
11 | 


--------------------------------------------------------------------------------
/chapter02/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=sum01 sum02
 2 | all: $(EXES)
 3 | # Link NAME.o into the executable NAME.
 4 | %: %.o
 5 | 	gcc -o $@ $+
 6 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
 7 | % : %.s
 8 | # Assemble NAME.s into NAME.o.
 9 | %.o : %.s
10 | 	as -o $@ $<
11 | # Remove the executables and every object file.
12 | .PHONY: clean
13 | clean:
14 | 	rm -vf $(EXES) *.o
15 | 


--------------------------------------------------------------------------------
/chapter02/sum01.s:
--------------------------------------------------------------------------------
 1 | /* -- sum01.s */
 2 | .global main
 3 | 
 4 | main:
 5 |     mov r2, #4      /* r2 ← 4 */
 6 |     mov r1, #3      /* r1 ← 3 */
 7 |     add r0, r2, r1  /* r0 ← r2 + r1 */
 8 |     bx lr           /* return from main, the sum is in r0 */
 9 | 
10 | 


--------------------------------------------------------------------------------
/chapter02/sum02.s:
--------------------------------------------------------------------------------
 1 | /* -- sum02.s */
 2 | .global main
 3 | 
 4 | main:
 5 |     mov r0, #3      /* r0 ← 3 */
 6 |     mov r1, #4      /* r1 ← 4 */
 7 |     add r0, r0, r1  /* r0 ← r0 + r1 */
 8 |     bx lr           /* return from main, the sum is in r0 */
 9 | 
10 | 


--------------------------------------------------------------------------------
/chapter03/Makefile:
--------------------------------------------------------------------------------
 1 | # Every example of this chapter; each NAME is assembled from NAME.s.
 2 | # store02 was missing from this list, so it was never built nor cleaned.
 3 | EXES=load01 load02 store01 store02
 4 | OBJS=$(addsuffix .o, $(EXES))
 5 | all: $(EXES) $(OBJS)
 6 | 
 7 | # Link NAME.o into the executable NAME.
 8 | %: %.o
 9 | 	gcc -o $@ $+
10 | 
11 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
12 | % : %.s
13 | 
14 | # Assemble NAME.s into NAME.o.
15 | %.o : %.s
16 | 	as -o $@ $<
17 | 
18 | .PHONY: clean
19 | clean:
20 | 	rm -vf $(EXES) *.o
21 | 


--------------------------------------------------------------------------------
/chapter03/load01.s:
--------------------------------------------------------------------------------
 1 | /* -- load01.s */
 2 | 
 3 | /* -- Data section */
 4 | .data
 5 | 
 6 | /* Ensure variable is 4-byte aligned */
 7 | .balign 4
 8 | /* Define storage for myvar1 */
 9 | myvar1:
10 |     /* Contents of myvar1 is just '3' */
11 |     .word 3
12 | 
13 | /* Ensure variable is 4-byte aligned */
14 | .balign 4
15 | /* Define storage for myvar2 */
16 | myvar2:
17 |     /* Contents of myvar2 is just '4' */
18 |     .word 4
19 | 
20 | /* -- Code section */
21 | .text
22 | 
23 | /* Ensure function section starts 4 byte aligned */
24 | .balign 4
25 | .global main
26 | main:
27 |     ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */
28 |     ldr r1, [r1]           /* r1 ← *r1 */
29 |     ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */
30 |     ldr r2, [r2]           /* r2 ← *r2 */
31 |     add r0, r1, r2         /* r0 ← r1 + r2 */
32 |     bx lr                  /* return from main, r0 holds 3 + 4 */
33 | 
34 | /* Labels needed to access data: each word holds the address of a variable */
35 | addr_of_myvar1 : .word myvar1
36 | addr_of_myvar2 : .word myvar2
37 | 


--------------------------------------------------------------------------------
/chapter03/load02.s:
--------------------------------------------------------------------------------
 1 | /* -- load02.s */
 2 | 
 3 | /* -- Data section: the two numbers that main adds */
 4 | .data
 5 | 
 6 | /* Keep the word on a 4-byte boundary */
 7 | .balign 4
 8 | /* First number */
 9 | myvar1:
10 |     /* myvar1 holds 3 */
11 |     .word 3
12 | 
13 | /* Keep the word on a 4-byte boundary */
14 | .balign 4
15 | /* Second number */
16 | myvar2:
17 |     /* myvar2 holds 4 */
18 |     .word 4
19 | 
20 | /* -- Code section */
21 | .text
22 | 
23 | /* Keep the code on a 4-byte boundary */
24 | .balign 4
25 | .global main
26 | main:
27 |     ldr r1, .Lmyvar1_ptr   /* r1 ← &myvar1 */
28 |     ldr r2, .Lmyvar2_ptr   /* r2 ← &myvar2 */
29 |     ldr r1, [r1]           /* r1 ← myvar1 */
30 |     ldr r2, [r2]           /* r2 ← myvar2 */
31 |     add r0, r2, r1         /* r0 ← myvar2 + myvar1 */
32 |     bx lr                  /* return from main, the sum is in r0 */
33 | 
34 | /* Local (.L) labels: each word holds the address of one variable */
35 | .Lmyvar1_ptr: .word myvar1
36 | .Lmyvar2_ptr: .word myvar2
37 | 


--------------------------------------------------------------------------------
/chapter03/store01.s:
--------------------------------------------------------------------------------
 1 | /* -- store01.s */
 2 | 
 3 | /* -- Data section */
 4 | .data
 5 | 
 6 | /* Ensure variable is 4-byte aligned */
 7 | .balign 4
 8 | /* Define storage for myvar1 */
 9 | myvar1:
10 |     /* Contents of myvar1 is initially '0'; main stores 3 in it */
11 |     .word 0
12 | 
13 | /* Ensure variable is 4-byte aligned */
14 | .balign 4
15 | /* Define storage for myvar2 */
16 | myvar2:
17 |     /* Contents of myvar2 is initially '0'; main stores 4 in it */
18 |     .word 0
19 | 
20 | /* -- Code section */
21 | .text
22 | 
23 | /* Ensure function section starts 4 byte aligned */
24 | .balign 4
25 | .global main
26 | main:
27 |     ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */
28 |     mov r3, #3             /* r3 ← 3 */
29 |     str r3, [r1]           /* *r1 ← r3 */
30 |     ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */
31 |     mov r3, #4             /* r3 ← 4 */
32 |     str r3, [r2]           /* *r2 ← r3 */
33 |                            /* Same instructions as in load01.s */
34 |     ldr r1, addr_of_myvar1 /* r1 ← &myvar1 */
35 |     ldr r1, [r1]           /* r1 ← *r1 */
36 |     ldr r2, addr_of_myvar2 /* r2 ← &myvar2 */
37 |     ldr r2, [r2]           /* r2 ← *r2 */
38 |     add r0, r1, r2         /* r0 ← r1 + r2 */
39 |     bx lr                  /* return from main, r0 holds 3 + 4 */
40 | 
41 | /* Labels needed to access data */
42 | addr_of_myvar1 : .word myvar1
43 | addr_of_myvar2 : .word myvar2
44 | 


--------------------------------------------------------------------------------
/chapter03/store02.s:
--------------------------------------------------------------------------------
 1 | /* -- store02.s */
 2 | 
 3 | /* -- Data section */
 4 | .data
 5 | 
 6 | /* Ensure variable is 4-byte aligned */
 7 | .balign 4
 8 | /* Define storage for myvar1 */
 9 | myvar1:
10 |     /* Contents of myvar1 is just '3' */
11 |     .word 3
12 | 
13 | /* Ensure variable is 4-byte aligned */
14 | .balign 4
15 | /* Define storage for myvar2 */
16 | myvar2:
17 |     /* Contents of myvar2 is just '4' */
18 |     .word 4
19 | 
20 | /* Ensure variable is 4-byte aligned */
21 | .balign 4
22 | /* Define storage for myvar3 */
23 | myvar3:
24 |     /* Contents of myvar3 is just '0' */
25 |     .word 0
26 | 
27 | /* -- Code section */
28 | .text
29 | 
30 | /* Ensure function section starts 4 byte aligned */
31 | .balign 4
32 | .global main
33 | main:
34 |     ldr r1, addr_of_myvar1  /* r1 ← &myvar1 */
35 |     ldr r1, [r1]            /* r1 ← *r1 */
36 |     ldr r2, addr_of_myvar2  /* r2 ← &myvar2 */
37 |     ldr r2, [r2]            /* r2 ← *r2 */
38 |     add r3, r1, r2          /* r3 ← r1 + r2 */
39 |     ldr r12, addr_of_myvar3 /* r12 ← &myvar3. r12 is a scratch register; r4 */
40 |     str r3, [r12]           /* *r12 ← r3     must be preserved for our caller */
41 |     /* Clear registers to prove that
42 |        we are actually loading something
43 |        previously stored */
44 |     mov r0, #0              /* r0 ← 0 */
45 |     mov r1, #0              /* r1 ← 0 */
46 |     mov r2, #0              /* r2 ← 0 */
47 |     mov r3, #0              /* r3 ← 0 */
48 |     mov r12, #0             /* r12 ← 0 */
49 |     
50 |     ldr r0, addr_of_myvar3  /* r0 ← &myvar3 */
51 |     ldr r0, [r0]            /* r0 ← *r0, the sum stored above */
52 |     bx lr                   /* return from main */
53 | 
54 | /* Labels needed to access data */
55 | addr_of_myvar1 : .word myvar1
56 | addr_of_myvar2 : .word myvar2
57 | addr_of_myvar3 : .word myvar3
58 | 


--------------------------------------------------------------------------------
/chapter05/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=branch01 compare01
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link NAME.o into the executable NAME.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # Assemble NAME.s into NAME.o.
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # Remove the executables and every object file.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter05/branch01.s:
--------------------------------------------------------------------------------
 1 | /* -- branch01.s */
 2 | 
 3 | .text
 4 | .global main
 5 | main:
 6 | case_a:
 7 |     mov r0, #2    /* r0 ← 2 */
 8 |     b end         /* unconditional branch to end: skips the code at case_b */
 9 | case_b :
10 |     mov r0, #3    /* r0 ← 3; nothing in this file branches to case_b */
11 | end:
12 |     bx lr         /* return from main */
13 | 


--------------------------------------------------------------------------------
/chapter05/compare01.s:
--------------------------------------------------------------------------------
 1 | /* -- compare01.s */
 2 | 
 3 | .text
 4 | .global main
 5 | main:
 6 |     mov r1, #2          /* r1 ← 2 */
 7 |     mov r2, #2          /* r2 ← 2 */
 8 |     cmp r1, r2          /* set the flags according to r1 - r2 */
 9 |     bne case_different  /* branch to case_different if Z = 0 */
10 | case_equal:
11 |     mov r0, #1          /* r0 ← 1 */
12 |     b end               /* branch to end */
13 | case_different :
14 |     mov r0, #2          /* r0 ← 2 */
15 | end:
16 |     bx lr               /* return from main */
17 | 


--------------------------------------------------------------------------------
/chapter06/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=loop01 loop02 collatz
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Each executable is linked by gcc from the object file of the same name.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # No recipe here: this cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # Each object file is produced by the assembler from its .s source.
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # 'clean' deletes the executables and all object files.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter06/collatz.s:
--------------------------------------------------------------------------------
 1 | /* -- collatz.s */
 2 | /* Counts the Collatz steps needed to go from 123 down to 1 */
 3 | 
 4 | .text
 5 | .global main
 6 | main:
 7 |     mov r1, #123               /* r1 ← 123, the current number */
 8 |     mov r2, #0                 /* r2 ← 0, the number of steps so far */
 9 |     b check                    /* test the exit condition before the first step */
10 | step:
11 |     and r3, r1, #1             /* r3 ← r1 & 1, the lowest bit of r1 */
12 |     cmp r3, #0                 /* compare r3 and 0 */
13 |     moveq r1, r1, ASR #1       /* even: r1 ← (r1 >> 1) */
14 |     addne r1, r1, r1, LSL #1   /* odd:  r1 ← r1 + (r1 << 1) */
15 |     addne r1, r1, #1           /* odd:  r1 ← r1 + 1 */
16 |     add r2, r2, #1             /* r2 ← r2 + 1 */
17 | check:
18 |     cmp r1, #1                 /* compare r1 and 1 */
19 |     bne step                   /* another step unless r1 == 1 */
20 | 
21 |     mov r0, r2                 /* r0 ← r2, the number of steps */
22 |     bx lr                      /* return from main */
23 | 


--------------------------------------------------------------------------------
/chapter06/loop01.s:
--------------------------------------------------------------------------------
 1 | /* -- loop01.s */
 2 | /* Adds the numbers from 1 to 22, testing the condition at the top of the loop */
 3 | .text
 4 | .global main
 5 | main:
 6 |     mov r1, #0       /* r1 ← 0 */
 7 |     mov r2, #1       /* r2 ← 1 */
 8 | loop: 
 9 |     cmp r2, #22      /* compare r2 and 22 */
10 |     bgt end          /* branch if r2 > 22 to end */
11 |     add r1, r1, r2   /* r1 ← r1 + r2 */
12 |     add r2, r2, #1   /* r2 ← r2 + 1 */
13 |     b loop           /* back to the comparison */
14 | end:
15 |     mov r0, r1       /* r0 ← r1 */
16 |     bx lr            /* return from main */
17 | 


--------------------------------------------------------------------------------
/chapter06/loop02.s:
--------------------------------------------------------------------------------
 1 | /* -- loop02.s */
 2 | /* Adds the numbers from 1 to 22, testing the condition at the bottom of the loop */
 3 | .text
 4 | .global main
 5 | main:
 6 |     mov r1, #0       /* r1 ← 0 */
 7 |     mov r2, #1       /* r2 ← 1 */
 8 |     b check_loop     /* unconditionally jump at the end of the loop */
 9 | loop: 
10 |     add r1, r1, r2   /* r1 ← r1 + r2 */
11 |     add r2, r2, #1   /* r2 ← r2 + 1 */
12 | check_loop:
13 |     cmp r2, #22      /* compare r2 and 22 */
14 |     ble loop         /* branch if r2 <= 22 to the beginning of the loop */
15 | end:
16 |     mov r0, r1       /* r0 ← r1 */
17 |     bx lr            /* return from main */
18 | 


--------------------------------------------------------------------------------
/chapter06/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | 
 3 | import string
 4 | 
 5 | n = 123
 6 | step = 0
 7 | nums = [str(n)]
 8 | 
 9 | while n != 1:
10 |     print "Step: %d -> %d" % (step, n)
11 |     if n % 2 == 0:
12 |         n = n / 2
13 |     else:
14 |         n = 3 * n + 1
15 |     nums.append(str(n))
16 |     step = step + 1
17 | 
18 | print "Step: %d -> %d" % (step, n)
19 | 
20 | print string.join(nums, ", ")
21 | 


--------------------------------------------------------------------------------
/chapter07/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=rol
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link NAME.o into the executable NAME.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # Assemble NAME.s into NAME.o.
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # Remove the executables and every object file.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter07/rol.s:
--------------------------------------------------------------------------------
 1 | /* -- rol.s */
 2 | /* ARM has no rotate-left shift operator: a rotation to the left by n bits
 3 |    is written as a rotation to the right (ROR) by 32 - n bits */
 4 | 
 5 | .data
 6 | 
 7 | .balign 4
 8 | value: 
 9 | .int 0x12345678
10 | 
11 | .global main
12 | .text
13 | main:
14 |     ldr r1, .Lcvalue        /* r1 ← &value */
15 |     ldr r1, [r1]            /* r1 ← *r1, that is 0x12345678 */
16 |     mov r1, r1, ROR #31     /* rotate left by 1:  r1 ← 0x2468ACF0 */
17 |     mov r1, r1, ROR #1      /* rotate left by 31: r1 ← 0x12345678 again */
18 | 
19 |     eor r0, r0, r0          /* r0 ← 0, the value main returns */
20 |     bx lr                   /* return from main */
21 | .Lcvalue: .word value
22 | 


--------------------------------------------------------------------------------
/chapter08/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=array01
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Each executable is linked by gcc from the object file of the same name.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # No recipe here: this cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # Each object file is produced by the assembler from its .s source.
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # 'clean' deletes the executables and all object files.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter08/array01.s:
--------------------------------------------------------------------------------
 1 | /* -- array01.s */
 2 | /* Fills the 100-word array 'a' so that a[i] = i, then returns 0 */
 3 | .data
 4 | 
 5 | .balign 4
 6 | a: .skip 400                /* 100 words of 4 bytes each */
 7 | 
 8 | .balign 4
 9 | b: .skip 8                  /* not used by the code below */
10 | 
11 | .text
12 | 
13 | .global main
14 | main:
15 |     ldr r1, addr_of_a       /* r1 ← &a */
16 |     mov r2, #0              /* r2 ← 0 */
17 | loop:
18 |     cmp r2, #100            /* Have we reached 100 yet? */
19 |     beq end                 /* If so, leave the loop, otherwise continue */
20 |     add r3, r1, r2, LSL #2  /* r3 ← r1 + r2 * 4 */
21 |     str r2, [r3]            /* *r3 ← r2 */
22 |     add r2, r2, #1          /* r2 ← r2 + 1 */
23 |     b loop                  /* Go to the beginning of the loop */
24 | end:
25 |     mov r0, #0              /* r0 ← 0: return a defined value rather than */
26 |                             /* whatever r0 happened to hold on entry */
27 |     bx lr
28 | 
29 | addr_of_a: .word a
30 | addr_of_b: .word b
31 | 


--------------------------------------------------------------------------------
/chapter09/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=hello01 printf01 printf02
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link NAME.o into the executable NAME.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # Assemble NAME.s into NAME.o.
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # Remove the executables and every object file.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter09/hello01.s:
--------------------------------------------------------------------------------
 1 | /* -- hello01.s */
 2 | .data
 3 | 
 4 | greeting: 
 5 |  .asciz "Hello world"
 6 | 
 7 | .balign 4
 8 | return: .word 0                   /* main keeps its return address (lr) here */
 9 | 
10 | .text
11 | 
12 | .global main
13 | main:
14 |     ldr r1, address_of_return     /*   r1 ← &return */
15 |     str lr, [r1]                  /*   *r1 ← lr */
16 | 
17 |     ldr r0, address_of_greeting   /* r0 ← &greeting */
18 |                                   /* First parameter of puts */
19 | 
20 |     bl puts                       /* Call to puts */
21 |                                   /* lr ← address of next instruction */
22 |                                   /* That is why lr was saved before the call */
23 |     ldr r1, address_of_return     /* r1 ← &return */
24 |     ldr lr, [r1]                  /* lr ← *r1 */
25 |     bx lr                         /* return from main */
26 | 
27 | address_of_greeting: .word greeting
28 | address_of_return: .word return
29 | 
30 | /* External */
31 | .global puts
32 | 


--------------------------------------------------------------------------------
/chapter09/printf01.s:
--------------------------------------------------------------------------------
 1 | /* -- printf01.s */
 2 | .data
 3 | 
 4 | /* First message */
 5 | .balign 4
 6 | message1: .asciz "Hey, type a number: "
 7 | 
 8 | /* Second message */
 9 | .balign 4
10 | message2: .asciz "I read the number %d\n"
11 | 
12 | /* Format pattern for scanf */
13 | .balign 4
14 | scan_pattern : .asciz "%d"
15 | 
16 | /* Where scanf will store the number read */
17 | .balign 4
18 | number_read: .word 0
19 | 
20 | /* Where main keeps its return address (lr) during the calls */
21 | .balign 4
22 | return: .word 0
23 | 
24 | .text
25 | 
26 | .global main
27 | main:
28 |     ldr r1, address_of_return        /* r1 ← &return */
29 |     str lr, [r1]                     /* *r1 ← lr */
30 | 
31 |     ldr r0, address_of_message1      /* r0 ← &message1 */
32 |     bl printf                        /* call to printf */
33 | 
34 |     ldr r0, address_of_scan_pattern  /* r0 ← &scan_pattern */
35 |     ldr r1, address_of_number_read   /* r1 ← &number_read */
36 |     bl scanf                         /* call to scanf */
37 | 
38 |     ldr r0, address_of_message2      /* r0 ← &message2 */
39 |     ldr r1, address_of_number_read   /* r1 ← &number_read */
40 |     ldr r1, [r1]                     /* r1 ← *r1 */
41 |     bl printf                        /* call to printf */
42 | 
43 |     ldr r0, address_of_number_read   /* r0 ← &number_read */
44 |     ldr r0, [r0]                     /* r0 ← *r0, main returns the number read */
45 | 
46 |     ldr lr, address_of_return        /* lr ← &return */
47 |     ldr lr, [lr]                     /* lr ← *lr */
48 |     bx lr                            /* return from main using lr */
49 | 
50 | /* Each word below holds the address of one of the variables in .data */
51 | address_of_message1 : .word message1
52 | address_of_message2 : .word message2
53 | address_of_scan_pattern : .word scan_pattern
54 | address_of_number_read : .word number_read
55 | address_of_return : .word return
56 | 
57 | /* External */
58 | .global printf
59 | .global scanf
60 | 


--------------------------------------------------------------------------------
/chapter09/printf02.s:
--------------------------------------------------------------------------------
 1 | /* -- printf02.s */
 2 | .data
 3 | 
 4 | /* First message */
 5 | .balign 4
 6 | message1: .asciz "Hey, type a number: "
 7 | 
 8 | /* Second message */
 9 | .balign 4
10 | message2: .asciz "%d times 5 is %d\n"
11 | 
12 | /* Format pattern for scanf */
13 | .balign 4
14 | scan_pattern : .asciz "%d"
15 | 
16 | /* Where scanf will store the number read */
17 | .balign 4
18 | number_read: .word 0
19 | 
20 | /* Where main keeps its return address (lr) */
21 | .balign 4
22 | return: .word 0
23 | 
24 | /* Where mult_by_5 keeps its return address (lr) */
25 | .balign 4
26 | return2: .word 0
27 | 
28 | .text
29 | 
30 | /*
31 | mult_by_5 function: r0 ← 5 * r0. Also modifies r1.
32 | */
33 | mult_by_5: 
34 |     ldr r1, address_of_return2       /* r1 ← &return2 */
35 |     str lr, [r1]                     /* *r1 ← lr */
36 | 
37 |     add r0, r0, r0, LSL #2           /* r0 ← r0 + 4*r0 */
38 | 
39 |     ldr lr, address_of_return2       /* lr ← &return2 */
40 |     ldr lr, [lr]                     /* lr ← *lr */
41 |     bx lr                            /* return from mult_by_5 using lr */
42 | address_of_return2 : .word return2
43 | 
44 | .global main
45 | main:
46 |     ldr r1, address_of_return        /* r1 ← &return */
47 |     str lr, [r1]                     /* *r1 ← lr */
48 | 
49 |     ldr r0, address_of_message1      /* r0 ← &message1 */
50 |     bl printf                        /* call to printf */
51 | 
52 |     ldr r0, address_of_scan_pattern  /* r0 ← &scan_pattern */
53 |     ldr r1, address_of_number_read   /* r1 ← &number_read */
54 |     bl scanf                         /* call to scanf */
55 | 
56 |     ldr r0, address_of_number_read   /* r0 ← &number_read */
57 |     ldr r0, [r0]                     /* r0 ← *r0 */
58 |     bl mult_by_5                     /* r0 ← 5 * r0 */
59 | 
60 |     mov r2, r0                       /* r2 ← r0 */
61 |     ldr r1, address_of_number_read   /* r1 ← &number_read */
62 |     ldr r1, [r1]                     /* r1 ← *r1 */
63 |     ldr r0, address_of_message2      /* r0 ← &message2 */
64 |     bl printf                        /* call to printf */
65 | 
66 |     ldr lr, address_of_return        /* lr ← &return */
67 |     ldr lr, [lr]                     /* lr ← *lr */
68 |     bx lr                            /* return from main using lr */
69 | 
70 | /* Each word below holds the address of one of the variables in .data */
71 | address_of_message1 : .word message1
72 | address_of_message2 : .word message2
73 | address_of_scan_pattern : .word scan_pattern
74 | address_of_number_read : .word number_read
75 | address_of_return : .word return
76 | 
77 | /* External */
78 | .global printf
79 | .global scanf
80 | 


--------------------------------------------------------------------------------
/chapter10/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=factorial01 factorial02 factorial03
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link NAME.o into the executable NAME.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Rule without a recipe: cancels make's built-in "% : %.s" implicit rule.
 8 | % : %.s
 9 | # NOTE(review): -march=armv6 presumably because of 'mul r0, r0, r1' - confirm.
10 | %.o : %.s
11 | 	as -march=armv6 -o $@ $<
12 | # Remove the executables and every object file.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter10/factorial01.s:
--------------------------------------------------------------------------------
 1 | /* -- factorial01.s */
 2 | .data
 3 | 
 4 | message1: .asciz "Type a number: "
 5 | format:   .asciz "%d"
 6 | message2: .asciz "The factorial of %d is %d\n"
 7 | 
 8 | .text
 9 | /* factorial: r0 ← r0!  (1 for any r0 <= 0). Recursive. Modifies r1 */
10 | factorial:
11 |     str lr, [sp,#-4]!  /* Push lr onto the top of the stack */
12 |     str r0, [sp,#-4]!  /* Push r0 onto the top of the stack */
13 | 
14 |     cmp r0, #0         /* compare r0 and 0 */
15 |     bgt is_positive    /* if r0 > 0 (signed comparison) then branch */
16 |     mov r0, #1         /* r0 ← 1. This is the return for 0 and, so that */
17 |     b end              /* the recursion always stops, for negative input */
18 | is_positive:
19 |                        /* Prepare the call to factorial(n-1) */
20 |     sub r0, r0, #1     /* r0 ← r0 - 1 */
21 |     bl factorial
22 |                        /* After the call r0 contains factorial(n-1) */
23 |                        /* Load r0 (that we kept in the stack) into r1 */
24 |     ldr r1, [sp]       /* r1 ← *sp */
25 |     mul r0, r0, r1     /* r0 ← r0 * r1 */
26 |     
27 | end:
28 |     add sp, sp, #+4    /* Discard the r0 we kept in the stack */
29 |     ldr lr, [sp], #+4  /* Pop the top of the stack and put it in lr */
30 |     bx lr              /* Leave factorial */
31 | 
32 | .globl main
33 | main:
34 |     stmdb sp!, {lr}              /* Save lr on the stack: the calls below overwrite it */
35 |     sub sp, sp, #4               /* Reserve 4 more bytes on the stack */
36 |                                  /* scanf will write the number typed by the user */
37 |                                  /* into them */
38 | 
39 |     ldr r0, .Lprompt             /* First parameter of printf: &message1 */
40 |     bl printf                    /* printf(message1) */
41 | 
42 |     mov r1, sp                   /* Second parameter of scanf: the address of the */
43 |                                  /* 4 bytes reserved above */
44 |     ldr r0, .Lscan_format        /* First parameter of scanf: &format */
45 |     bl scanf                     /* scanf(format, sp) */
46 | 
47 |     ldr r0, [sp]                 /* First parameter of factorial: the number that */
48 |                                  /* scanf stored */
49 |     bl factorial                 /* r0 ← factorial(r0) */
50 | 
51 |     ldr r1, [sp]                 /* Second parameter of printf: the number that */
52 |                                  /* scanf stored */
53 |     mov r2, r0                   /* Third parameter of printf: the value returned */
54 |                                  /* by factorial */
55 |     ldr r0, .Lresult             /* First parameter of printf: &message2 */
56 |     bl printf                    /* printf(message2, r1, r2) */
57 | 
58 | 
59 |     add sp, sp, #4               /* Release the 4 bytes reserved for the number */
60 |     ldmia sp!, {lr}              /* Restore the lr saved on entry */
61 |     bx lr                        /* Leave main */
62 | 
63 | .Lprompt: .word message1
64 | .Lresult: .word message2
65 | .Lscan_format: .word format
66 | 


--------------------------------------------------------------------------------
/chapter10/factorial02.s:
--------------------------------------------------------------------------------
 1 | /* -- factorial02.s */
 2 | .data
 3 | 
 4 | message1: .asciz "Type a number: "
 5 | format:   .asciz "%d"
 6 | message2: .asciz "The factorial of %d is %d\n"
 7 | 
 8 | .text
 9 | /* factorial: r0 ← r0!  (1 for any r0 <= 0). Recursive. Modifies r1 */
10 | factorial:
11 |     str lr, [sp,#-4]!  /* Push lr onto the top of the stack */
12 |     str r4, [sp,#-4]!  /* Push r4 onto the top of the stack */
13 |     mov r4, r0         /* Keep a copy of the initial value of r0 in r4 */
14 |                        /* (r4 was pushed first so that it can be handed */
15 |                        /* back to the caller unchanged) */
16 |     cmp r0, #0         /* compare r0 and 0 */
17 |     bgt is_positive    /* if r0 > 0 (signed comparison) then branch */
18 |     mov r0, #1         /* r0 ← 1. This is the return for 0 and, so that */
19 |     b end              /* the recursion always stops, for negative input */
20 | is_positive:
21 |                        /* Prepare the call to factorial(n-1) */
22 |     sub r0, r0, #1     /* r0 ← r0 - 1 */
23 |     bl factorial
24 |                        /* After the call r0 contains factorial(n-1) */
25 |                        /* Load initial value of r0 (that we kept in r4) into r1 */
26 |     mov r1, r4         /* r1 ← r4 */
27 |     mul r0, r0, r1     /* r0 ← r0 * r1 */
28 |     
29 | end:
30 |     ldr r4, [sp], #+4  /* Restore r4 */
31 |     ldr lr, [sp], #+4  /* Pop the top of the stack and put it in lr */
32 |     bx lr              /* Leave factorial */
33 | 
34 | .globl main
35 | main:
36 |     str lr, [sp,#-4]!            /* Push lr onto the top of the stack */
37 |     sub sp, sp, #4               /* Make room for one 4 byte integer in the stack */
38 |                                  /* In these 4 bytes we will keep the number entered by */
39 |                                  /* the user */
40 | 
41 |     ldr r0, address_of_message1  /* Set &message1 as the first parameter of printf */
42 |     bl printf                    /* Call printf */
43 | 
44 |     ldr r0, address_of_format    /* Set &format as the first parameter of scanf */
45 |     mov r1, sp                   /* Set the top of the stack as the second parameter */
46 |                                  /* of scanf */
47 |     bl scanf                     /* Call scanf */
48 | 
49 |     ldr r0, [sp]                 /* Load the integer read by scanf into r0 */
50 |                                  /* So we set it as the first parameter of factorial */
51 |     bl factorial                 /* Call factorial */
52 | 
53 |     mov r2, r0                   /* Get the result of factorial and move it to r2 */
54 |                                  /* So we set it as the third parameter of printf */
55 |     ldr r1, [sp]                 /* Load the integer read by scanf into r1 */
56 |                                  /* So we set it as the second parameter of printf */
57 |     ldr r0, address_of_message2  /* Set &message2 as the first parameter of printf */
58 |     bl printf                    /* Call printf */
59 | 
60 |                                  /* Undo, in reverse order, what was done on entry */
61 |     add sp, sp, #+4              /* Discard the integer read by scanf */
62 |     ldr lr, [sp], #+4            /* Pop the top of the stack and put it in lr */
63 |     bx lr                        /* Leave main */
64 | 
65 | address_of_message1: .word message1
66 | address_of_message2: .word message2
67 | address_of_format: .word format
68 | 


--------------------------------------------------------------------------------
/chapter10/factorial03.s:
--------------------------------------------------------------------------------
 1 | /* -- factorial03.s */
 2 | .data
 3 | 
 4 | message1: .asciz "Type a number: "
 5 | format:   .asciz "%d"
 6 | message2: .asciz "The factorial of %d is %d\n"
 7 | 
 8 | .text
 9 | /* factorial: r0 ← r0!  (1 for any r0 <= 0). Recursive. Modifies r1 */
10 | factorial:
11 |     stmdb sp!, {r4, lr} /* Push r4 and lr onto the stack */
12 |     mov r4, r0         /* Keep a copy of the initial value of r0 in r4 */
13 | 
14 | 
15 |     cmp r0, #0         /* compare r0 and 0 */
16 |     bgt is_positive    /* if r0 > 0 (signed comparison) then branch */
17 |     mov r0, #1         /* r0 ← 1. This is the return for 0 and, so that */
18 |     b end              /* the recursion always stops, for negative input */
19 | is_positive:
20 |                        /* Prepare the call to factorial(n-1) */
21 |     sub r0, r0, #1     /* r0 ← r0 - 1 */
22 |     bl factorial
23 |                        /* After the call r0 contains factorial(n-1) */
24 |                        /* Load initial value of r0 (that we kept in r4) into r1 */
25 |     mov r1, r4         /* r1 ← r4 */
26 |     mul r0, r0, r1     /* r0 ← r0 * r1 */
27 |     
28 | end:
29 |     ldmia sp!, {r4, lr} /* Pop r4 and lr from the stack */
30 |     bx lr              /* Leave factorial */
31 | 
32 | .globl main
33 | main:                            /* Out: r0 = 0 on success, 1 when no integer could be read */
34 |     str lr, [sp,#-4]!            /* Push lr onto the top of the stack */
35 |     sub sp, sp, #4               /* Make room for one 4 byte integer in the stack */
36 |                                  /* It will hold the number entered by the user */
37 | 
38 |     ldr r0, address_of_message1  /* Set &message1 as the first parameter of printf */
39 |     bl printf                    /* Call printf */
40 | 
41 |     ldr r0, address_of_format    /* Set &format as the first parameter of scanf */
42 |     mov r1, sp                   /* Set the top of the stack as the second parameter */
43 |                                  /* of scanf */
44 |     bl scanf                     /* Call scanf */
45 |     cmp r0, #1                   /* scanf returns the number of items it converted */
46 |     movne r0, #1                 /* Nothing converted: the stack slot was never written, */
47 |     bne main_end                 /* so leave with status 1 instead of using it */
48 | 
49 |     ldr r0, [sp]                 /* First parameter of factorial: the integer read by scanf */
50 |     bl factorial                 /* Call factorial */
51 | 
52 |     mov r2, r0                   /* Third parameter of printf: the result of factorial */
53 |     ldr r1, [sp]                 /* Second parameter of printf: the integer read by scanf */
54 |     ldr r0, address_of_message2  /* Set &message2 as the first parameter of printf */
55 |     bl printf                    /* Call printf */
56 |     mov r0, #0                   /* Return 0 rather than whatever printf left in r0 */
57 | main_end:
58 |     add sp, sp, #+4              /* Discard the integer read by scanf */
59 |     ldr lr, [sp], #+4            /* Pop the top of the stack and put it in lr */
60 |     bx lr                        /* Leave main */
61 | 
62 | address_of_message1: .word message1
63 | address_of_message2: .word message2
64 | address_of_format: .word format
65 | 


--------------------------------------------------------------------------------
/chapter10/test.c:
--------------------------------------------------------------------------------
1 | void f(int n)
2 | {
3 |     /* m receives n + 1 and is never read: f has no visible effect */
4 |     int m = n + 1;
5 | }
6 | 


--------------------------------------------------------------------------------
/chapter11/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=collatz02 collatz03
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file with gcc
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # A pattern rule without a recipe cancels the built-in rule with the same pattern
 8 | % : %.s
 9 | # Assemble every source file into an object file
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # all and clean name actions, not files, so both are declared phony
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter11/collatz02.s:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rofirrim/raspberry-pi-assembler/75685f80a35318777fad9dc33837698c19952e89/chapter11/collatz02.s


--------------------------------------------------------------------------------
/chapter11/collatz03.s:
--------------------------------------------------------------------------------
 1 | /* -- collatz03.s */
 2 | .data
 3 | 
 4 | message: .asciz "Type a number: "                                          /* Prompt that main prints first */
 5 | scan_format : .asciz "%d"                                                  /* Format main hands to scanf: one decimal integer */
 6 | message2: .asciz "Length of the Hailstone sequence for %d is %d\n"         /* Result line: the number read, then the value returned by collatz2 */
 7 | 
 8 | 
 9 | .text
10 | 
11 | collatz2:
12 |     /* In: r0 = n. Out: r0 = number of steps n takes to reach 1 (0 when n < 1) */
13 |     cmp r0, #1                 /* n < 1 never reaches 1 and would loop forever, */
14 |     movlt r0, #0               /* so report 0 steps */
15 |     bxlt lr                    /* and return right away */
16 |     push {r4}
17 |     mov r4, r0                 /* r4 ← n, kept across the repetitions */
18 |     mov r3, #4194304           /* r3 ← times the whole computation is repeated */
19 |   collatz_repeat:
20 |     mov r1, r4                 /* r1 ← r4 */
21 |     mov r0, #0                 /* r0 ← 0 */
22 |   collatz2_loop:
23 |     cmp r1, #1                 /* compare r1 and 1 */
24 |     beq collatz2_end           /* if r1 == 1 branch to collatz2_end */
25 |     tst r1, #1                 /* update cpsr with r1 & 1: eq means r1 is even */
26 |     moveq r1, r1, ASR #1       /* if even, r1 ← r1 >> 1. This is r1 ← r1/2 */
27 |     addne r1, r1, r1, LSL #1   /* if odd, r1 ← r1 + (r1 << 1). This is r1 ← 3*r1 */
28 |     addne r1, r1, #1           /* if odd, r1 ← r1 + 1. */
29 |     add r0, r0, #1             /* r0 ← r0 + 1 */
30 |     b collatz2_loop            /* branch back to collatz2_loop */
31 |   collatz2_end:
32 |     subs r3, r3, #1            /* r3 ← r3 - 1 and update cpsr */
33 |     bne collatz_repeat         /* another repetition unless r3 reached 0 */
34 |     pop {r4}
35 |     bx lr
36 | 
37 | .global main
38 | main:                               /* Out: r0 = 0 on success, 1 when no integer could be read */
39 |     push {lr}                       /* keep lr */
40 |     sub sp, sp, #4                  /* make room for 4 bytes in the stack */
41 | 
42 |     ldr r0, address_of_message      /* first parameter of printf: &message */
43 |     bl printf                       /* call printf */
44 | 
45 |     ldr r0, address_of_scan_format  /* first parameter of scanf: &scan_format */
46 |     mov r1, sp                      /* second parameter of scanf: address of the top of the stack */
47 |     bl scanf                        /* call scanf */
48 |     cmp r0, #1                      /* scanf returns the number of items it converted */
49 |     movne r0, #1                    /* nothing converted: the stack slot was never written, */
50 |     bne main_end                    /* so leave with status 1 instead of using it */
51 | 
52 |     ldr r0, [sp]                    /* first parameter of collatz2: the value stored by scanf */
53 |     bl collatz2                     /* call collatz2 */
54 | 
55 |     mov r2, r0                      /* third parameter of printf: the result of collatz2 */
56 |     ldr r1, [sp]                    /* second parameter of printf: the value stored by scanf */
57 |     ldr r0, address_of_message2     /* first parameter of printf: &message2 */
58 |     bl printf                       /* call printf */
59 |     mov r0, #0                      /* return 0 rather than whatever printf left in r0 */
60 |   main_end:
61 |     add sp, sp, #4                  /* discard the 4 bytes reserved on entry */
62 |     pop {lr}                        /* restore lr */
63 |     bx lr                           /* leave main */
64 | 
65 | 
66 | address_of_message: .word message
67 | address_of_scan_format: .word scan_format
68 | address_of_message2: .word message2
69 | 


--------------------------------------------------------------------------------
/chapter11/stats:
--------------------------------------------------------------------------------
1 | 
2 |  Performance counter stats for './collatz03' (25 runs):
3 | 
4 |           4,179080 task-clock                #    0,766 CPUs utilized            ( +-  0,36% )
5 | 
6 |        0,005459041 seconds time elapsed                                          ( +-  0,55% )
7 | 
8 | 


--------------------------------------------------------------------------------
/chapter11/test:
--------------------------------------------------------------------------------
 1 | cpu-cycles 
 2 | cycles   
 3 | stalled-cycles-frontend 
 4 | idle-cycles-frontend 
 5 | stalled-cycles-backend 
 6 | idle-cycles-backend    
 7 | instructions                                       
 8 | cache-references                                   
 9 | cache-misses                                       
10 | branch-instructions
11 | branches                    
12 | branch-misses                                      
13 | bus-cycles                                         
14 | cpu-clock                                          
15 | task-clock                                         
16 | page-faults
17 | faults                              
18 | minor-faults                                       
19 | major-faults                                       
20 | context-switches
21 | cs                             
22 | cpu-migrations
23 | migrations                       
24 | alignment-faults                                   
25 | emulation-faults                                   
26 | L1-dcache-loads                                    
27 | L1-dcache-load-misses                              
28 | L1-dcache-stores                                   
29 | L1-dcache-store-misses                             
30 | L1-dcache-prefetches                               
31 | L1-dcache-prefetch-misses                          
32 | L1-icache-loads                                    
33 | L1-icache-load-misses                              
34 | L1-icache-prefetches                               
35 | L1-icache-prefetch-misses                          
36 | LLC-loads                                          
37 | LLC-load-misses                                    
38 | LLC-stores                                         
39 | LLC-store-misses                                   
40 | LLC-prefetches                                     
41 | LLC-prefetch-misses                                
42 | dTLB-loads                                         
43 | dTLB-load-misses                                   
44 | dTLB-stores                                        
45 | dTLB-store-misses                                  
46 | dTLB-prefetches                                    
47 | dTLB-prefetch-misses                               
48 | iTLB-loads                                         
49 | iTLB-load-misses                                   
50 | branch-loads                                       
51 | branch-load-misses                                 
52 | node-loads                                         
53 | node-load-misses                                   
54 | node-stores                                        
55 | node-store-misses                                  
56 | node-prefetches                                    
57 | node-prefetch-misses                               
58 | 


--------------------------------------------------------------------------------
/chapter12/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=mult64 mult64_2
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file with gcc
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # A pattern rule without a recipe cancels the built-in rule with the same pattern
 8 | % : %.s
 9 | # Assemble every source file into an object file
10 | %.o : %.s
11 | 	as -o $@ $<
12 | # all and clean name actions, not files, so both are declared phony
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter12/mult64.s:
--------------------------------------------------------------------------------
  1 | /* -- mult64.s */
  2 | 
  3 | .data
  4 | 
  5 | .balign 8                     /* 8-byte boundary. On ARM ".align 8" asks for 2^8 = 256 bytes */
  6 | message : .asciz "Multiplication of %d by %d is %lld\n"
  7 | 
  8 | .balign 4                     /* Word boundary for the two 32-bit numbers */
  9 | number_a: .word 987654321
 10 | number_b: .word 1234567890
 11 | 
 12 | .text
 13 | 
 14 | mult64:
 15 |    /* In: a in r0,r1 and b in r2,r3 (low word, high word). Out: r0,r1 ← low 64 bits of a * b */
 16 |    /* The low 64 bits of the product are the same for signed and unsigned operands */
 17 |    push {r4, r5, r6, lr}
 18 |    /* For convenience, copy a into r4,r5 so that r0,r1 can receive the result */
 19 |    mov r4, r0   /* r4 ← r0. Low word of a */
 20 |    mov r5, r1   /* r5 ← r1. High word of a */
 21 | 
 22 |    umull r0, r1, r2, r4    /* r0,r1 ← r2 * r4. Unsigned: the low words carry no sign */
 23 |    mul r6, r3, r4          /* r6 ← LO(r3 * r4). Its high part would land beyond bit 63 */
 24 |    add r1, r1, r6          /* r1 ← r1 + r6. A carry out of r1 is beyond bit 63 too */
 25 |    mul r6, r2, r5          /* r6 ← LO(r2 * r5) */
 26 |    add r1, r1, r6          /* r1 ← r1 + r6 */
 27 | 
 28 |    /* Restore registers */
 29 |    pop {r4, r5, r6, lr}
 30 |    bx lr                   /* Leave mult64 */
 31 | 
 32 | mult64_2:
 33 |    /* In: a in r0,r1 and b in r2,r3 (low word, high word). Out: r0,r1 ← low 64 bits of a * b */
 34 |    /* Same computation as mult64, folding the cross products in with multiply-accumulate */
 35 |    push {r4, r5}
 36 | 
 37 |    /* For convenience, copy a into r4,r5 so that r0,r1 can receive the result */
 38 |    mov r4, r0   /* r4 ← r0. Low word of a */
 39 |    mov r5, r1   /* r5 ← r1. High word of a */
 40 |    umull r0, r1, r2, r4    /* r0,r1 ← r2 * r4. Unsigned: the low words carry no sign */
 41 |    mla r1, r3, r4, r1      /* r1 ← r1 + LO(r3 * r4) */
 42 |    mla r1, r2, r5, r1      /* r1 ← r1 + LO(r2 * r5) */
 43 | 
 44 |    /* Restore registers */
 45 |    pop {r4, r5}
 46 |    bx lr
 47 | 
 48 | .global main
 49 | main:
 50 |     /* Multiply number_a by number_b with mult64 and print the two
 51 |        numbers and their 64-bit product. Returns 0 in r0. */
 52 |     push {r4, r5, r6, lr}       /* Keep the registers we are going to modify */
 53 | 
 54 |     /* The numbers are too large for an immediate operand, so each one
 55 |        is read from memory through the address words placed after main */
 56 |     ldr r4, addr_number_a       /* r4 ← &number_a */
 57 |     ldr r5, addr_number_b       /* r5 ← &number_b */
 58 |     ldr r4, [r4]                /* r4 ← number_a */
 59 |     ldr r5, [r5]                /* r5 ← number_b */
 60 | 
 61 |     /*
 62 |        Arguments of mult64: the first number goes in r0,r1 and the
 63 |        second one in r2,r3, low word first. The numbers are 32 bits
 64 |        wide, so both high words are zero.
 65 |     */
 66 |     mov r3, #0                  /* r3 ← 0.  High word of the second number */
 67 |     mov r2, r5                  /* r2 ← r5. Low word of the second number */
 68 | 
 69 |     mov r1, #0                  /* r1 ← 0.  High word of the first number */
 70 |     mov r0, r4                  /* r0 ← r4. Low word of the first number */
 71 | 
 72 |     bl mult64                   /* call mult64 function */
 73 |     /* The result of the multiplication is in r0,r1 */
 74 | 
 75 |     /*
 76 |        Arguments of printf: &message, the two numbers and the product.
 77 |        &message and the two numbers are passed in r0, r1 and r2.
 78 |        The product needs two registers and only r3 is left; by
 79 |        convention the low word of a 64-bit argument cannot live in an
 80 |        odd-numbered register, so the product is passed on the stack.
 81 |     */
 82 |     /* Stack arguments must end up with the lower word at the lower
 83 |        address. A single push of {r0, r1} stores r0 at sp and r1 at
 84 |        sp + 4, which is exactly that layout. */
 85 |     push {r0, r1}               /* 4th parameter: the product, low word first */
 86 |     mov r1, r4                  /* r1 ← r4.                2nd parameter */
 87 |     mov r2, r5                  /* r2 ← r5.                3rd parameter */
 88 |     ldr r0, addr_of_message     /* r0 ← &message           1st parameter */
 89 |     bl printf                   /* Call printf */
 90 |     add sp, sp, #8              /* sp ← sp + 8 */
 91 |                                 /* Discard the two words pushed above */
 92 | 
 93 |     mov r0, #0                  /* r0 ← 0. Value returned by main */
 94 |     pop {r4, r5, r6, lr}        /* Restore registers we kept */
 95 |     bx lr                       /* Leave main */
 96 | 
 97 | /* Address words: they let the code above reach the symbols of .data
 98 |    with a pc-relative ldr */
 99 | addr_of_message : .word message
100 | addr_number_a: .word number_a
101 | addr_number_b: .word number_b
102 | 


--------------------------------------------------------------------------------
/chapter12/mult64_2.s:
--------------------------------------------------------------------------------
  1 | /* -- mult64_2.s */
  2 | .data
  3 | 
  4 | .balign 8                     /* 8-byte boundary. On ARM ".align 8" asks for 2^8 = 256 bytes */
  5 | message : .asciz "Multiplication of %lld by %lld is %lld\n"
  6 | 
  7 | .balign 4                     /* Word boundary for the 32-bit halves below */
  8 | number_a_low: .word 3755744309    /* a = 2 * 2^32 + 3755744309 = 12345678901 */
  9 | number_a_high: .word 2
 10 | 
 11 | number_b_low: .word 12345678      /* b = 12345678 */
 12 | number_b_high: .word 0
 13 | 
 14 | .text
 15 | 
 16 | mult64:
 17 |    /* In: a in r0,r1 and b in r2,r3 (low word, high word). Out: r0,r1 ← low 64 bits of a * b */
 18 |    /* The low 64 bits of the product are the same for signed and unsigned operands */
 19 |    push {r4, r5, r6, lr}
 20 |    /* For convenience, copy a into r4,r5 so that r0,r1 can receive the result */
 21 |    mov r4, r0   /* r4 ← r0. Low word of a */
 22 |    mov r5, r1   /* r5 ← r1. High word of a */
 23 | 
 24 |    umull r0, r1, r2, r4    /* r0,r1 ← r2 * r4 */
 25 |    mul r6, r3, r4          /* r6 ← LO(r3 * r4). Its high part would land beyond bit 63 */
 26 |    add r1, r1, r6          /* r1 ← r1 + r6. A carry out of r1 is beyond bit 63 too */
 27 |    mul r6, r2, r5          /* r6 ← LO(r2 * r5) */
 28 |    add r1, r1, r6          /* r1 ← r1 + r6 */
 29 | 
 30 |    /* Restore registers */
 31 |    pop {r4, r5, r6, lr}
 32 |    bx lr                   /* Leave mult64 */
 33 | 
 34 | mult64_2:
 35 |    /* In: a in r0,r1 and b in r2,r3 (low word, high word). Out: r0,r1 ← low 64 bits of a * b */
 36 |    /* Same computation as mult64, folding the cross products in with multiply-accumulate */
 37 |    push {r4, r5}
 38 | 
 39 |    /* For convenience, copy a into r4,r5 so that r0,r1 can receive the result */
 40 |    mov r4, r0   /* r4 ← r0. Low word of a */
 41 |    mov r5, r1   /* r5 ← r1. High word of a */
 42 |    umull r0, r1, r2, r4    /* r0,r1 ← r2 * r4 */
 43 |    mla r1, r3, r4, r1      /* r1 ← r1 + LO(r3 * r4). No junk accumulator is needed */
 44 |    mla r1, r2, r5, r1      /* r1 ← r1 + LO(r2 * r5) */
 45 | 
 46 |    /* Restore registers */
 47 |    pop {r4, r5}
 48 |    bx lr
 49 | 
 50 | .global main
 51 | main:
 52 |     /* Multiply the 64-bit numbers a and b with mult64 and print both
 53 |        numbers and the 64-bit result. Returns 0 in r0. */
 54 |     push {r4, r5, r6, r7, r8, lr}       /* Keep the registers we are going to modify */
 55 | 
 56 |     /* Each half of each number is read from memory through its own
 57 |        address word */
 58 |     ldr r4, addr_number_a_low       /* r4 ← &a_low  */
 59 |     ldr r5, addr_number_a_high      /* r5 ← &a_high */
 60 |     ldr r6, addr_number_b_low       /* r6 ← &b_low  */
 61 |     ldr r7, addr_number_b_high      /* r7 ← &b_high */
 62 | 
 63 |     ldr r4, [r4]                    /* r4 ← a_low  */
 64 |     ldr r5, [r5]                    /* r5 ← a_high */
 65 |     ldr r6, [r6]                    /* r6 ← b_low  */
 66 |     ldr r7, [r7]                    /* r7 ← b_high */
 67 | 
 68 |     /*
 69 |        Arguments of mult64: a in r0,r1 and b in r2,r3, low word first
 70 |     */
 71 |     mov r3, r7                  /* r3 ← r7 */
 72 |     mov r2, r6                  /* r2 ← r6 */
 73 | 
 74 |     mov r1, r5                  /* r1 ← r5 */
 75 |     mov r0, r4                  /* r0 ← r4 */
 76 | 
 77 |     bl mult64                   /* call mult64 function */
 78 |     /* The result of the multiplication is in r0,r1 */
 79 | 
 80 |     /*
 81 |        Arguments of printf: &message, a, b and the result.
 82 |        &message goes in r0. a takes 64 bits, so r1 is skipped and a
 83 |        is passed in r2,r3, low word in the even-numbered register.
 84 |        b and the result are passed on the stack, b at the lower
 85 |        address because it comes first.
 86 |     */
 87 |     push {r0,r1}                /* 4th parameter: the result, low word first */
 88 |     push {r6,r7}                /* 3rd parameter: b, low word first */
 89 |     mov r2, r4                  /* r2 ← r4. 2nd parameter, low word */
 90 |     mov r3, r5                  /* r3 ← r5. 2nd parameter, high word */
 91 |     ldr r0, addr_of_message     /* r0 ← &message           1st parameter */
 92 |     bl printf                   /* Call printf */
 93 |     add sp, sp, #16             /* sp ← sp + 16 */
 94 |                                 /* Discard the four words pushed above */
 95 | 
 96 |     mov r0, #0                  /* r0 ← 0. Value returned by main */
 97 |     pop {r4, r5, r6, r7, r8, lr}        /* Restore registers we kept */
 98 |     bx lr                       /* Leave main */
 99 | 
100 | addr_of_message : .word message
101 | addr_number_a_low: .word number_a_low
102 | addr_number_a_high: .word number_a_high
103 | 
104 | addr_number_b_low: .word number_b_low
105 | addr_number_b_high: .word number_b_high
106 | 


--------------------------------------------------------------------------------
/chapter13/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=addf
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file with gcc
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # A pattern rule without a recipe cancels the built-in rule with the same pattern
 8 | % : %.s
 9 | # Assemble every source file, targeting the ARM1176JZF-S core and its VFPv2 unit
10 | %.o : %.s
11 | 	as -mcpu=arm1176jzf-s -mfpu=vfpv2 -o $@ $<
12 | # all and clean name actions, not files, so both are declared phony
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter13/addf.s:
--------------------------------------------------------------------------------
 1 | /* -- addf.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4                                /* NOTE(review): on ARM, .align n presumably means 2^n bytes, 16 here; confirm */
 6 | array_of_floats_1:                      /* First operand: main loads these 8 floats into s8-s15 */
 7 | .float 1.2, 3.4, 5.6, 7.8, 9.10, 10.11, 12.13, 14.15
 8 | 
 9 | .align 4
10 | array_of_floats_2:                      /* Second operand: main loads these 8 floats into s16-s23 */
11 | .float 0.1, 0.2, 0.3, 0.4, 0.5,   0.6,   0.7,  0.8
12 | 
13 | .text
14 | 
15 | .global main
16 | main:                                     /* Adds the two 8-float arrays with one vector addition. Returns 0 in r0 */
17 |     push {r4, r5, r6, lr}
18 |     vpush {s16-s31}                       /* s16-s31 must be preserved for the caller and are overwritten below */
19 |     ldr r4, addr_of_array_of_floats_1
20 |     fldmias r4, {s8-s15}                  /* Load 8 floats from [r4] to {s8-s15} */
21 | 
22 |     ldr r4, addr_of_array_of_floats_2
23 |     fldmias r4, {s16-s23}                 /* Load 8 floats from [r4] to {s16-s23} */
24 | 
25 |     /* Set the LEN field of FPSCR to be 8 (value 7) */
26 |     mov r5, #0b111                        /* r5 ← 7 */
27 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16 */
28 |     fmrx r4, fpscr                        /* r4 ← fpscr */
29 |     orr r4, r4, r5                        /* r4 ← r4 | r5 */
30 |     fmxr fpscr, r4                        /* fpscr ← r4 */
31 | 
32 |     fadds s24, s8, s16                    /* {s24-s31} ← {s8-s15} + {s16-s23}. The sums are not stored anywhere */
33 | 
34 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
35 |     mvn r5, r5                            /* r5 ← ~r5 */
36 |     fmrx r4, fpscr                        /* r4 ← fpscr */
37 |     and r4, r4, r5                        /* r4 ← r4 & r5 */
38 |     fmxr fpscr, r4                        /* fpscr ← r4 */
39 |     vpop {s16-s31}                        /* Restore the registers saved on entry */
40 |     pop {r4, r5, r6, lr}
41 |     mov r0, #0
42 |     bx lr
43 | 
44 | addr_of_array_of_floats_1 : .word array_of_floats_1
45 | addr_of_array_of_floats_2 : .word array_of_floats_2
46 | 


--------------------------------------------------------------------------------
/chapter14/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=matmul benchmark
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file with gcc
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # A pattern rule without a recipe cancels the built-in rule with the same pattern
 8 | % : %.s
 9 | # Assemble every source file with the VFPv2 instructions enabled
10 | %.o : %.s
11 | 	as -mfpu=vfpv2 -o $@ $<
12 | # all and clean name actions, not files, so both are declared phony
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter14/benchmark.s:
--------------------------------------------------------------------------------
  1 | /* -- benchmark.s */
  2 | .data
  3 | mat_A: .float 0.1, 0.2, 0.0, 0.1          /* mat_A: 16 floats laid out as 4 rows of 4 */
  4 |        .float 0.2, 0.1, 0.3, 0.0
  5 |        .float 0.0, 0.3, 0.1, 0.5 
  6 |        .float 0.0, 0.6, 0.4, 0.1
  7 | mat_B: .float  4.92,  2.54, -0.63, -1.75  /* mat_B: 16 floats laid out as 4 rows of 4 */
  8 |        .float  3.02, -1.51, -0.87,  1.35
  9 |        .float -4.29,  2.14,  0.71,  0.71
 10 |        .float -0.95,  0.48,  2.38, -0.95
 11 | mat_C: .float 0.0, 0.0, 0.0, 0.0          /* mat_C: zero-filled; five rows of 4 are reserved, */
 12 |        .float 0.0, 0.0, 0.0, 0.0          /* i.e. 20 floats, 4 more than a 4x4 matrix holds */
 13 |        .float 0.0, 0.0, 0.0, 0.0
 14 |        .float 0.0, 0.0, 0.0, 0.0
 15 |        .float 0.0, 0.0, 0.0, 0.0
 16 | 
 17 | format_result : .asciz "Matrix result is:\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n" /* A title line, then 16 %5.2f conversions, four per output line */
 18 | 
 19 | .text
 20 | 
 21 | naive_matmul_4x4:
 22 |     /* C ← A * B for 4x4 matrices of single-precision floats stored
 23 |        row by row. In: r0 address of A, r1 address of B, r2 address of C.
 24 |        Overwrites s0-s3; r4-r8 and lr are saved and restored.
 25 |        Equivalent C:
 26 |          for (i = 0; i < 4; i++)
 27 |            for (j = 0; j < 4; j++) {
 28 |              C[i][j] = 0.0f;              (done up front for all 16 elements)
 29 |              for (k = 0; k < 4; k++)
 30 |                C[i][j] += A[i][k] * B[k][j];
 31 |            }
 32 |     */
 33 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
 34 |     /* Clear the 16 floats of C. In IEEE 754, all bits cleared means 0.0 */
 35 |     mov r4, r2              /* r4 ← address of the next element to clear */
 36 |     mov r5, #16             /* r5 ← number of elements still to clear */
 37 |     mov r6, #0              /* r6 ← the bit pattern of 0.0 */
 38 |     .Lnaive_clear:
 39 |       str r6, [r4], #4      /* *r4 ← r6 then r4 ← r4 + 4 */
 40 |       subs r5, r5, #1       /* r5 ← r5 - 1 and update cpsr */
 41 |       bne .Lnaive_clear     /* 16 stores in total */
 42 | 
 43 |     /* We will use
 44 |            r4 as i (row of C)
 45 |            r5 as j (column of C)
 46 |            r6 as k (position inside the dot product)
 47 |     */
 48 |     mov r4, #0 /* r4 ← 0 */
 49 |     /* For every row i of C */
 50 |     .Lnaive_rows:
 51 |       cmp r4, #4  /* if r4 == 4 the rows are finished */
 52 |       beq .Lnaive_rows_done
 53 |       mov r5, #0  /* r5 ← 0 */
 54 |       /* For every column j of C */
 55 |       .Lnaive_cols:
 56 |         cmp r5, #4 /* if r5 == 4 the columns are finished */
 57 |         beq .Lnaive_cols_done
 58 |         /* r7 ← address of C[i][j], which is C + 4*(4 * i + j):
 59 |            a row holds 4 elements and an element takes 4 bytes */
 60 |         add r7, r5, r4, LSL #2   /* r7 ← r5 + (r4 << 2). This is r7 ← j + i * 4 */
 61 |         add r7, r2, r7, LSL #2   /* r7 ← r2 + (r7 << 2). This is r7 ← C + 4*(j + i * 4) */
 62 |         vldr s0, [r7] /* s0 ← *r7. The running sum for C[i][j] */
 63 | 
 64 |         mov r6, #0 /* r6 ← 0 */
 65 |         /* For every k, add a[i][k] * b[k][j] to the running sum */
 66 |         .Lnaive_dot:
 67 |           cmp r6, #4 /* if r6 == 4 the dot product is finished */
 68 |           beq .Lnaive_dot_done
 69 | 
 70 |           /* s1 ← a[i][k].
 71 |              Address of a[i][k] is a + 4*(4 * i + k) */
 72 |           add r8, r6, r4, LSL #2   /* r8 ← r6 + (r4 << 2). This is r8 ← k + i * 4 */
 73 |           add r8, r0, r8, LSL #2   /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + i * 4) */
 74 |           vldr s1, [r8]            /* s1 ← *r8 */
 75 | 
 76 |           /* s2 ← b[k][j].
 77 |              Address of b[k][j] is b + 4*(4 * k + j) */
 78 |           add r8, r5, r6, LSL #2   /* r8 ← r5 + (r6 << 2). This is r8 ← j + k * 4 */
 79 |           add r8, r1, r8, LSL #2   /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + k * 4) */
 80 |           vldr s2, [r8]            /* s2 ← *r8 */
 81 | 
 82 |           /* Accumulate the product into the running sum */
 83 |           vmul.f32 s3, s1, s2      /* s3 ← s1 * s2 */
 84 |           vadd.f32 s0, s0, s3      /* s0 ← s0 + s3 */
 85 | 
 86 |           add r6, r6, #1           /* r6 ← r6 + 1 */
 87 |           b .Lnaive_dot            /* next value of k */
 88 |         .Lnaive_dot_done:
 89 |         vstr s0, [r7]            /* Store s0 back to C[i][j] */
 90 |         add r5, r5, #1  /* r5 ← r5 + 1 */
 91 |         b .Lnaive_cols  /* next column */
 92 |       .Lnaive_cols_done:
 93 |       add r4, r4, #1 /* r4 ← r4 + 1 */
 94 |       b .Lnaive_rows /* next row */
 95 |     .Lnaive_rows_done:
 96 | 
 97 |     /* Everything is computed: give the saved registers back */
 98 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
 99 |     bx lr /* Leave function */
100 | 
101 | naive_vectorial_matmul_4x4:
102 |     /* C ← A * B for 4x4 matrices of single-precision floats stored
103 |        row by row. Each element of C is computed with one 4-element
104 |        vector multiply followed by three scalar additions.
105 |          r0 address of A
106 |          r1 address of B
107 |          r2 address of C
108 |        Equivalent C:
109 |          for (i = 0; i < 4; i++)
110 |            for (j = 0; j < 4; j++)
111 |              C[i][j] = a[i][0] * b[0][j] + a[i][1] * b[1][j]
112 |                      + a[i][2] * b[2][j] + a[i][3] * b[3][j];
113 |        Overwrites s0 and s8-s11. r4-r8, lr, s16-s19 and s24-s27 are
114 |        saved on entry and restored on exit.
115 |     */
116 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
117 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
118 |     vpush {s24-s27}
119 |     /* Clear the 16 floats of C */
120 |     /* In IEEE 754, all bits cleared means 0 */
121 |     mov r4, r2              /* r4 ← address of the next element to clear */
122 |     mov r5, #16             /* r5 ← number of elements still to clear */
123 |     mov r6, #0              /* r6 ← the bit pattern of 0.0 */
124 |     .Lvec_clear:
125 |       str r6, [r4], #4      /* *r4 ← r6 then r4 ← r4 + 4 */
126 |       subs r5, r5, #1       /* r5 ← r5 - 1 and update cpsr */
127 |       bne .Lvec_clear       /* 16 stores in total */
128 | 
129 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
130 |     /* The value 3 is placed 16 bits to the left, which is 0x30000 */
131 |     fmrx r4, fpscr                        /* r4 ← fpscr */
132 |     orr r4, r4, #0x30000                  /* r4 ← r4 | (3 << 16) */
133 |     fmxr fpscr, r4                        /* fpscr ← r4 */
134 | 
135 |     /* We will use
136 |            r4 as i
137 |            r5 as j
138 |     */
139 |     mov r4, #0 /* r4 ← 0 */
140 |     .Lvec_rows:  /* For every row i of C */
141 |       cmp r4, #4  /* if r4 == 4 the rows are finished */
142 |       beq .Lvec_rows_done
143 |       mov r5, #0  /* r5 ← 0 */
144 |       .Lvec_cols: /* For every column j of C */
145 |         cmp r5, #4 /* if r5 == 4 the columns are finished */
146 |         beq .Lvec_cols_done
147 |         /* r7 ← address of C[i][j], which is C + 4*(4 * i + j):
148 |            a row holds 4 elements and an element takes 4 bytes */
149 |         add r7, r5, r4, LSL #2   /* r7 ← r5 + (r4 << 2). This is r7 ← j + i * 4 */
150 |         add r7, r2, r7, LSL #2   /* r7 ← r2 + (r7 << 2). This is r7 ← C + 4*(j + i * 4) */
151 | 
152 |         /* r8 ← address of a[i][0], which is a + 16 * i: a row takes 16 bytes */
153 |         add r8, r0, r4, LSL #4   /* r8 ← r0 + (r4 << 4) */
154 |         vldmia r8, {s8-s11}  /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */
155 | 
156 |         /* r8 ← address of b[0][j], which is b + 4 * j.
157 |            The elements of column j are 16 bytes (one row) apart */
158 |         add r8, r1, r5, LSL #2     /* r8 ← r1 + (r5 << 2). This is r8 ← b + 4*(j) */
159 |         vldr s16, [r8]             /* s16 ← *r8. This is s16 ← b[0][j] */
160 |         vldr s17, [r8, #16]        /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */
161 |         vldr s18, [r8, #32]        /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */
162 |         vldr s19, [r8, #48]        /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */
163 | 
164 |         /* Four products at once, then their sum is built up in s0 */
165 |         vmul.f32 s24, s8, s16      /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */
166 |         vmov.f32 s0, s24           /* s0 ← s24 */
167 |         vadd.f32 s0, s0, s25       /* s0 ← s0 + s25 */
168 |         vadd.f32 s0, s0, s26       /* s0 ← s0 + s26 */
169 |         vadd.f32 s0, s0, s27       /* s0 ← s0 + s27 */
170 | 
171 |         vstr s0, [r7]            /* Store s0 into C[i][j] */
172 |         add r5, r5, #1  /* r5 ← r5 + 1 */
173 |         b .Lvec_cols    /* next column */
174 |       .Lvec_cols_done:
175 |       add r4, r4, #1 /* r4 ← r4 + 1 */
176 |       b .Lvec_rows   /* next row */
177 |     .Lvec_rows_done:
178 | 
179 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
180 |     fmrx r4, fpscr                        /* r4 ← fpscr */
181 |     bic r4, r4, #0x30000                  /* r4 ← r4 & ~(3 << 16) */
182 |     fmxr fpscr, r4                        /* fpscr ← r4 */
183 | 
184 |     /* Registers come back in the reverse order they were saved */
185 |     vpop {s24-s27}                /* Restore preserved floating registers */
186 |     vpop {s16-s19}
187 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
188 |     bx lr /* Leave function */
189 |     
190 | naive_vectorial_matmul_2_4x4:
191 |     /* Computes C ← A * B for 4x4 single-precision matrices, producing two elements of a row of C per inner iteration.
192 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
193 |        r4-r8 and s16-s31 are saved on entry and restored on exit; s0, s1 and s8-s11 are overwritten.
194 |     */
195 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
196 |     vpush {s16-s31}               /* Floating point registers starting from s16 must be preserved */
197 |     /* First zero the 16 single-precision elements of C */
198 |     /* In IEEE 754, all bits cleared means 0 */
199 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
200 |     mov r5, #16           /* r5 ← 16 */
201 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
202 |     b .L2_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
203 |     .L2_loop_init :
204 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
205 |     .L2_loop_init_test:
206 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
207 |       bge .L2_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored */
208 | 
209 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
210 |     mov r5, #0b011                        /* r5 ← 3 */
211 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
212 |     fmrx r4, fpscr                        /* r4 ← fpscr */
213 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
214 |     fmxr fpscr, r4                        /* fpscr ← r4 */
215 | 
216 |     /* We will use 
217 |            r4 as i
218 |            r5 as j
219 |     */
220 |     mov r4, #0 /* r4 ← 0 */
221 |     .L2_loop_i:  /* loop header of i */
222 |       cmp r4, #4  /* if r4 == 4 goto end of the loop i */
223 |       beq .L2_end_loop_i
224 |       mov r5, #0  /* r5 ← 0 */
225 |       .L2_loop_j: /* loop header of j */
226 |        cmp r5, #4 /* if r5 == 4 goto end of the loop j */
227 |         beq .L2_end_loop_j
228 |         /* Compute the address of C[i][j] into r7 (it is only used by the store at the end) */
229 |         /* Address of C[i][j] is C + 4*(4 * i + j) */
230 |         mov r7, r5               /* r7 ← r5. This is r7 ← j */
231 |         adds r7, r7, r4, LSL #2  /* r7 ← r7 + (r4 << 2). 
232 |                                     This is r7 ← j + i * 4.
233 |                                     We multiply i by the row size (4 elements) */
234 |         adds r7, r2, r7, LSL #2  /* r7 ← r2 + (r7 << 2).
235 |                                     This is r7 ← C + 4*(j + i * 4)
236 |                                     We multiply (j + i * 4) by the size of the element.
237 |                                     A single-precision floating point takes 4 bytes.
238 |                                     */
239 |         /* Compute the address of a[i][0] */
240 |         mov r8, r4, LSL #2         /* r8 ← r4 << 2. This is r8 ← 4 * i */
241 |         adds r8, r0, r8, LSL #2    /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(4 * i) */
242 |         vldmia r8, {s8-s11}  /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */
243 | 
244 |         /* Compute the address of b[0][j] */
245 |         mov r8, r5               /* r8 ← r5. This is r8 ← j */
246 |         adds r8, r1, r8, LSL #2  /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */
247 |         vldr s16, [r8]             /* s16 ← *r8. This is s16 ← b[0][j] */
248 |         vldr s17, [r8, #16]        /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */
249 |         vldr s18, [r8, #32]        /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */
250 |         vldr s19, [r8, #48]        /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */
251 | 
252 |         /* Compute the address of b[0][j+1] */
253 |         add r8, r5, #1             /* r8 ← r5 + 1. This is r8 ← j + 1*/
254 |         adds r8, r1, r8, LSL #2    /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + 1) */
255 |         vldr s20, [r8]             /* s20 ← *r8. This is s20 ← b[0][j + 1] */
256 |         vldr s21, [r8, #16]        /* s21 ← *(r8 + 16). This is s21 ← b[1][j + 1] */
257 |         vldr s22, [r8, #32]        /* s22 ← *(r8 + 32). This is s22 ← b[2][j + 1] */
258 |         vldr s23, [r8, #48]        /* s23 ← *(r8 + 48). This is s23 ← b[3][j + 1] */
259 | 
260 |         vmul.f32 s24, s8, s16      /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */
261 |         vmov.f32 s0, s24           /* s0 ← s24. Scalar operation: the destination lies in s0-s7 */
262 |         vadd.f32 s0, s0, s25       /* s0 ← s0 + s25 */
263 |         vadd.f32 s0, s0, s26       /* s0 ← s0 + s26 */
264 |         vadd.f32 s0, s0, s27       /* s0 ← s0 + s27. Now s0 is the dot product of row i of a and column j of b */
265 | 
266 |         vmul.f32 s28, s8, s20      /* {s28,s29,s30,s31} ← {s8,s9,s10,s11} * {s20,s21,s22,s23} */
267 | 
268 |         vmov.f32 s1, s28           /* s1 ← s28 */
269 |         vadd.f32 s1, s1, s29       /* s1 ← s1 + s29 */
270 |         vadd.f32 s1, s1, s30       /* s1 ← s1 + s30 */
271 |         vadd.f32 s1, s1, s31       /* s1 ← s1 + s31. Now s1 is the dot product of row i of a and column j + 1 of b */
272 | 
273 |         vstmia r7, {s0-s1}         /* {C[i][j], C[i][j+1]} ← {s0, s1} */
274 | 
275 |         add r5, r5, #2  /* r5 ← r5 + 2. Two columns were computed in this iteration */
276 |         b .L2_loop_j /* next iteration of loop j */
277 |        .L2_end_loop_j: /* Here ends loop j */
278 |        add r4, r4, #1 /* r4 ← r4 + 1 */
279 |        b .L2_loop_i     /* next iteration of loop i */
280 |     .L2_end_loop_i: /* Here ends loop i */
281 | 
282 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
283 |     mov r5, #0b011                        /* r5 ← 3 */
284 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
285 |     fmrx r4, fpscr                        /* r4 ← fpscr */
286 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
287 |     fmxr fpscr, r4                        /* fpscr ← r4 */
288 | 
289 |     vpop {s16-s31}                /* Restore preserved floating registers */
290 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
291 |     bx lr /* Leave function */
292 | 
293 | better_vectorial_matmul_4x4:
294 |     /* Computes C ← A * B for 4x4 single-precision matrices: for every k and i, row i of C ← row i of C + a[i][k] * (row k of B).
295 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
296 |        r4-r8, s16-s19 and s24-s27 are saved on entry and restored on exit; s0 and s8-s11 are overwritten.
297 |     */
298 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
299 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
300 |     vpush {s24-s27}               /* s24-s27 hold the intermediate products below */
301 |     /* First zero the 16 single-precision elements of C. C is the accumulator of the loops below */
302 |     /* In IEEE 754, all bits cleared means 0 */
303 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
304 |     mov r5, #16           /* r5 ← 16 */
305 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
306 |     b .L3_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
307 |     .L3_loop_init :
308 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
309 |     .L3_loop_init_test:
310 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
311 |       bge .L3_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored */
312 | 
313 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
314 |     mov r5, #0b011                        /* r5 ← 3 */
315 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
316 |     fmrx r4, fpscr                        /* r4 ← fpscr */
317 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
318 |     fmxr fpscr, r4                        /* fpscr ← r4 */
319 | 
320 |     /* We will use 
321 |            r4 as k
322 |            r5 as i
323 |     */
324 |     mov r4, #0 /* r4 ← 0 */
325 |     .L3_loop_k:  /* loop header of k */
326 |       cmp r4, #4  /* if r4 == 4 goto end of the loop k */
327 |       beq .L3_end_loop_k
328 |       mov r5, #0  /* r5 ← 0 */
329 |       .L3_loop_i: /* loop header of i */
330 |        cmp r5, #4 /* if r5 == 4 goto end of the loop i */
331 |         beq .L3_end_loop_i
332 |         /* Compute the address of C[i][0] */
333 |         /* Address of C[i][0] is C + 4*(4 * i) */
334 |         add r7, r2, r5, LSL #4         /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */
335 |         vldmia r7, {s8-s11}            /* Load {s8,s9,s10,s11} ← {c[i][0], c[i][1], c[i][2], c[i][3]} */
336 |         /* Compute the address of A[i][k] */
337 |         /* Address of A[i][k] is A + 4*(4*i + k) */
338 |         add r8, r4, r5, LSL #2         /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */
339 |         add r8, r0, r8, LSL #2         /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */
340 |         vldr s0, [r8]                  /* Load s0 ← a[i][k] */
341 | 
342 |         /* Compute the address of B[k][0] */
343 |         /* Address of B[k][0] is B + 4*(4*k) */
344 |         add r8, r1, r4, LSL #4         /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */
345 |         vldmia r8, {s16-s19}           /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */
346 | 
347 |         vmul.f32 s24, s16, s0          /* {s24,s25,s26,s27} ← {s16,s17,s18,s19} * {s0,s0,s0,s0} */
348 |         vadd.f32 s8, s8, s24           /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + {s24,s25,s26,s27} */
349 | 
350 |         vstmia r7, {s8-s11}            /* Store {c[i][0],c[i][1],c[i][2],c[i][3]} ← {s8,s9,s10,s11} */
351 | 
352 |         add r5, r5, #1  /* r5 ← r5 + 1. This is i = i + 1 */
353 |         b .L3_loop_i /* next iteration of loop i */
354 |        .L3_end_loop_i: /* Here ends loop i */
355 |        add r4, r4, #1 /* r4 ← r4 + 1. This is k = k + 1 */
356 |        b .L3_loop_k     /* next iteration of loop k */
357 |     .L3_end_loop_k: /* Here ends loop k */
358 | 
359 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
360 |     mov r5, #0b011                        /* r5 ← 3 */
361 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
362 |     fmrx r4, fpscr                        /* r4 ← fpscr */
363 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
364 |     fmxr fpscr, r4                        /* fpscr ← r4 */
365 | 
366 |     vpop {s24-s27}                /* Restore preserved floating registers */
367 |     vpop {s16-s19}                /* Popped in the reverse order of the two vpush above */
368 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
369 |     bx lr /* Leave function */
370 | 
371 | best_vectorial_matmul_4x4:
372 |     /* Computes C ← A * B for 4x4 single-precision matrices, updating two rows of C (i and i + 1) per inner iteration.
373 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
374 |        r4-r8 and s16-s19 are saved on entry and restored on exit; s0, s1 and s8-s15 are overwritten.
375 |     */
376 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
377 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
378 | 
379 |     /* First zero the 16 single-precision elements of C. C is the accumulator of the loops below */
380 |     /* In IEEE 754, all bits cleared means 0 */
381 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
382 |     mov r5, #16           /* r5 ← 16 */
383 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
384 |     b .L4_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
385 |     .L4_loop_init :
386 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
387 |     .L4_loop_init_test:
388 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
389 |       bge .L4_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored */
390 | 
391 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
392 |     mov r5, #0b011                        /* r5 ← 3 */
393 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
394 |     fmrx r4, fpscr                        /* r4 ← fpscr */
395 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
396 |     fmxr fpscr, r4                        /* fpscr ← r4 */
397 | 
398 |     /* We will use 
399 |            r4 as k
400 |            r5 as i
401 |     */
402 |     mov r4, #0 /* r4 ← 0 */
403 |     .L4_loop_k:  /* loop header of k */
404 |       cmp r4, #4  /* if r4 == 4 goto end of the loop k */
405 |       beq .L4_end_loop_k
406 |       mov r5, #0  /* r5 ← 0 */
407 |       .L4_loop_i: /* loop header of i */
408 |        cmp r5, #4 /* if r5 == 4 goto end of the loop i */
409 |         beq .L4_end_loop_i
410 |         /* Compute the address of C[i][0] */
411 |         /* Address of C[i][0] is C + 4*(4 * i) */
412 |         add r7, r2, r5, LSL #4         /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */
413 |         vldmia r7, {s8-s15}            /* Load {s8,s9,s10,s11,s12,s13,s14,s15} 
414 |                                             ← {c[i][0],   c[i][1],   c[i][2],   c[i][3],
415 |                                                c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} */
416 |         /* Compute the address of A[i][k] */
417 |         /* Address of A[i][k] is A + 4*(4*i + k) */
418 |         add r8, r4, r5, LSL #2         /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */
419 |         add r8, r0, r8, LSL #2         /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */
420 |         vldr s0, [r8]                  /* Load s0 ← a[i][k] */
421 |         vldr s1, [r8, #16]             /* Load s1 ← a[i+1][k]. One row is 4 elements of 4 bytes, hence the offset 16 */
422 | 
423 |         /* Compute the address of B[k][0] */
424 |         /* Address of B[k][0] is B + 4*(4*k) */
425 |         add r8, r1, r4, LSL #4         /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */
426 |         vldmia r8, {s16-s19}           /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */
427 | 
428 |         vmla.f32 s8, s16, s0           /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + ({s16,s17,s18,s19} * {s0,s0,s0,s0}) */
429 |         vmla.f32 s12, s16, s1          /* {s12,s13,s14,s15} ← {s12,s13,s14,s15} + ({s16,s17,s18,s19} * {s1,s1,s1,s1}) */
430 | 
431 |         vstmia r7, {s8-s15}            /* Store {c[i][0],   c[i][1],   c[i][2],   c[i][3],
432 |                                                  c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]}
433 |                                                 ← {s8,s9,s10,s11,s12,s13,s14,s15} */
434 | 
435 |         add r5, r5, #2  /* r5 ← r5 + 2. This is i = i + 2 */
436 |         b .L4_loop_i /* next iteration of loop i */
437 |        .L4_end_loop_i: /* Here ends loop i */
438 |        add r4, r4, #1 /* r4 ← r4 + 1. This is k = k + 1 */
439 |        b .L4_loop_k     /* next iteration of loop k */
440 |     .L4_end_loop_k: /* Here ends loop k */
441 | 
442 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
443 |     mov r5, #0b011                        /* r5 ← 3 */
444 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
445 |     fmrx r4, fpscr                        /* r4 ← fpscr */
446 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
447 |     fmxr fpscr, r4                        /* fpscr ← r4 */
448 | 
449 |     vpop {s16-s19}                /* Restore preserved floating registers */
450 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
451 |     bx lr /* Leave function */
452 | 
453 | .globl main
454 | main:
455 |     push {r4, r5, r6, lr}  /* Keep integer registers */
456 |     /* Benchmark driver: calls best_vectorial_matmul_4x4 on (mat_A, mat_B, mat_C) 2^21 times and returns 0 */
457 |     ldr r0, addr_mat_A  /* r0 ← a */
458 |     ldr r1, addr_mat_B  /* r1 ← b */
459 |     ldr r2, addr_mat_C  /* r2 ← c */
460 |     mov r4, #1              /* r4 ← 1 */
461 |     mov r4, r4, LSL #21     /* r4 ← 1 << 21. This is the number of calls still to make */
462 |     .Lmain_loop_test: 
463 |       bl best_vectorial_matmul_4x4 /* r0-r2 are not reloaded: this relies on the callee leaving them untouched, as best_vectorial_matmul_4x4 in this file does */
464 |       subs r4, r4, #1              /* r4 ← r4 - 1, updating the flags */
465 |       bne .Lmain_loop_test /* Repeat until r4 is 0, 2^21 calls in total. Original note: "Should have been 'bge'". NOTE(review): bge would make one more call; confirm the intent */
466 | 
467 |     mov r0, #0              /* r0 ← 0. This is the return value of main */
468 |     pop {r4, r5, r6, lr}    /* Restore integer registers */
469 |     bx lr                   /* Leave main */
470 | 
471 | addr_mat_A : .word mat_A   /* Addresses of the data symbols, loaded by the ldr instructions in main */
472 | addr_mat_B : .word mat_B
473 | addr_mat_C : .word mat_C
474 | addr_format_result : .word format_result /* NOTE(review): not referenced by the main above; possibly a leftover */
474 | addr_format_result : .word format_result
475 | 


--------------------------------------------------------------------------------
/chapter14/matmul.s:
--------------------------------------------------------------------------------
  1 | /* -- matmul.s: 4x4 single-precision matrix multiplication routines and the matrices they work on */
  2 | .data
  3 | mat_A: .float 0.1, 0.2, 0.0, 0.1     /* mat_A: first input matrix, 4 rows of 4 floats, one row per line */
  4 |        .float 0.2, 0.1, 0.3, 0.0
  5 |        .float 0.0, 0.3, 0.1, 0.5 
  6 |        .float 0.0, 0.6, 0.4, 0.1
  7 | mat_B: .float  4.92,  2.54, -0.63, -1.75     /* mat_B: second input matrix, same layout */
  8 |        .float  3.02, -1.51, -0.87,  1.35
  9 |        .float -4.29,  2.14,  0.71,  0.71
 10 |        .float -0.95,  0.48,  2.38, -0.95
 11 | mat_C: .float 0.0, 0.0, 0.0, 0.0     /* mat_C: result matrix, all zeros initially */
 12 |        .float 0.0, 0.0, 0.0, 0.0
 13 |        .float 0.0, 0.0, 0.0, 0.0
 14 |        .float 0.0, 0.0, 0.0, 0.0
 15 |        .float 0.0, 0.0, 0.0, 0.0     /* NOTE(review): a fifth row; the 4x4 routines below only address 16 elements, presumably spare space. Confirm */
 16 | 
 17 | format_result : .asciz "Matrix result is:\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n%5.2f %5.2f %5.2f %5.2f\n" /* Format with 16 %5.2f fields, four per output line */
 18 | 
 19 | .text
 20 | 
 21 | naive_matmul_4x4:
 22 |     /* Computes C ← A * B for 4x4 single-precision matrices with the classic i, j, k triple loop.
 23 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
 24 |        r4-r8 are saved on entry and restored on exit; s0-s3 are overwritten.
 25 |     */
 26 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
 27 |     /* First zero the 16 single-precision elements of C. Each C[i][j] is the initial value of its own sum below */
 28 |     /* In IEEE 754, all bits cleared means 0 */
 29 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
 30 |     mov r5, #16           /* r5 ← 16 */
 31 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
 32 |     b .L0_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
 33 |     .L0_loop_init :
 34 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
 35 |     .L0_loop_init_test:
 36 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
 37 |       bge .L0_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored. With 'bne' only 15 were stored and C[3][3] was never cleared */
 38 | 
 39 |     /* We will use 
 40 |            r4 as i
 41 |            r5 as j
 42 |            r6 as k
 43 |     */
 44 |     mov r4, #0 /* r4 ← 0 */
 45 |     .L0_loop_i:  /* loop header of i */
 46 |       cmp r4, #4  /* if r4 == 4 goto end of the loop i */
 47 |       beq .L0_end_loop_i
 48 |       mov r5, #0  /* r5 ← 0 */
 49 |       .L0_loop_j: /* loop header of j */
 50 |        cmp r5, #4 /* if r5 == 4 goto end of the loop j */
 51 |         beq .L0_end_loop_j
 52 |         /* Compute the address of C[i][j] and load it into s0 */
 53 |         /* Address of C[i][j] is C + 4*(4 * i + j) */
 54 |         mov r7, r5               /* r7 ← r5. This is r7 ← j */
 55 |         adds r7, r7, r4, LSL #2  /* r7 ← r7 + (r4 << 2). 
 56 |                                     This is r7 ← j + i * 4.
 57 |                                     We multiply i by the row size (4 elements) */
 58 |         adds r7, r2, r7, LSL #2  /* r7 ← r2 + (r7 << 2).
 59 |                                     This is r7 ← C + 4*(j + i * 4)
 60 |                                     We multiply (j + i * 4) by the size of the element.
 61 |                                     A single-precision floating point takes 4 bytes.
 62 |                                     */
 63 |         vldr s0, [r7] /* s0 ← *r7. The sum starts from the current C[i][j], hence the zeroing above */
 64 | 
 65 |         mov r6, #0 /* r6 ← 0 */
 66 |         .L0_loop_k :  /* loop header of k */
 67 |           cmp r6, #4 /* if r6 == 4 goto end of the loop k */
 68 |           beq .L0_end_loop_k
 69 | 
 70 |           /* Compute the address of a[i][k] and load it into s1 */
 71 |           /* Address of a[i][k] is a + 4*(4 * i + k) */
 72 |           mov r8, r6               /* r8 ← r6. This is r8 ← k */
 73 |           adds r8, r8, r4, LSL #2  /* r8 ← r8 + (r4 << 2). This is r8 ← k + i * 4 */
 74 |           adds r8, r0, r8, LSL #2  /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + i * 4) */
 75 |           vldr s1, [r8]            /* s1 ← *r8 */
 76 | 
 77 |           /* Compute the address of b[k][j] and load it into s2 */
 78 |           /* Address of b[k][j] is b + 4*(4 * k + j) */
 79 |           mov r8, r5               /* r8 ← r5. This is r8 ← j */
 80 |           adds r8, r8, r6, LSL #2  /* r8 ← r8 + (r6 << 2). This is r8 ← j + k * 4 */
 81 |           adds r8, r1, r8, LSL #2  /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + k * 4) */
 82 |           vldr s2, [r8]            /* s2 ← *r8 */
 83 | 
 84 |           vmul.f32 s3, s1, s2      /* s3 ← s1 * s2 */
 85 |           vadd.f32 s0, s0, s3      /* s0 ← s0 + s3 */
 86 | 
 87 |           add r6, r6, #1           /* r6 ← r6 + 1 */
 88 |           b .L0_loop_k               /* next iteration of loop k */
 89 |         .L0_end_loop_k: /* Here ends loop k */
 90 |         vstr s0, [r7]            /* Store s0 back to C[i][j] */
 91 |         add r5, r5, #1  /* r5 ← r5 + 1 */
 92 |         b .L0_loop_j /* next iteration of loop j */
 93 |        .L0_end_loop_j: /* Here ends loop j */
 94 |        add r4, r4, #1 /* r4 ← r4 + 1 */
 95 |        b .L0_loop_i     /* next iteration of loop i */
 96 |     .L0_end_loop_i: /* Here ends loop i */
 97 | 
 98 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
 99 |     bx lr /* Leave function */
100 | 
101 | naive_vectorial_matmul_4x4:
102 |     /* Computes C ← A * B for 4x4 single-precision matrices; the four products of each dot product are done by one vector multiply.
103 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
104 |        r4-r8, s16-s19 and s24-s27 are saved on entry and restored on exit; s0 and s8-s11 are overwritten.
105 |     */
106 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
107 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
108 |     vpush {s24-s27}               /* s24-s27 hold the four products below */
109 |     /* First zero the 16 single-precision elements of C */
110 |     /* In IEEE 754, all bits cleared means 0 */
111 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
112 |     mov r5, #16           /* r5 ← 16 */
113 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
114 |     b .L1_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
115 |     .L1_loop_init :
116 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
117 |     .L1_loop_init_test:
118 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
119 |       bge .L1_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored. With 'bne' only 15 were stored */
120 | 
121 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
122 |     mov r5, #0b011                        /* r5 ← 3 */
123 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
124 |     fmrx r4, fpscr                        /* r4 ← fpscr */
125 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
126 |     fmxr fpscr, r4                        /* fpscr ← r4 */
127 | 
128 |     /* We will use 
129 |            r4 as i
130 |            r5 as j
131 |     */
132 |     mov r4, #0 /* r4 ← 0 */
133 |     .L1_loop_i:  /* loop header of i */
134 |       cmp r4, #4  /* if r4 == 4 goto end of the loop i */
135 |       beq .L1_end_loop_i
136 |       mov r5, #0  /* r5 ← 0 */
137 |       .L1_loop_j: /* loop header of j */
138 |        cmp r5, #4 /* if r5 == 4 goto end of the loop j */
139 |         beq .L1_end_loop_j
140 |         /* Compute the address of C[i][j] into r7 (it is only used by the store at the end) */
141 |         /* Address of C[i][j] is C + 4*(4 * i + j) */
142 |         mov r7, r5               /* r7 ← r5. This is r7 ← j */
143 |         adds r7, r7, r4, LSL #2  /* r7 ← r7 + (r4 << 2). 
144 |                                     This is r7 ← j + i * 4.
145 |                                     We multiply i by the row size (4 elements) */
146 |         adds r7, r2, r7, LSL #2  /* r7 ← r2 + (r7 << 2).
147 |                                     This is r7 ← C + 4*(j + i * 4)
148 |                                     We multiply (j + i * 4) by the size of the element.
149 |                                     A single-precision floating point takes 4 bytes.
150 |                                     */
151 |         /* Compute the address of a[i][0] */
152 |         mov r8, r4, LSL #2         /* r8 ← r4 << 2. This is r8 ← 4 * i */
153 |         adds r8, r0, r8, LSL #2    /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(4 * i) */
154 |         vldmia r8, {s8-s11}  /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */
155 | 
156 |         /* Compute the address of b[0][j] */
157 |         mov r8, r5               /* r8 ← r5. This is r8 ← j */
158 |         adds r8, r1, r8, LSL #2  /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */
159 |         vldr s16, [r8]             /* s16 ← *r8. This is s16 ← b[0][j] */
160 |         vldr s17, [r8, #16]        /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */
161 |         vldr s18, [r8, #32]        /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */
162 |         vldr s19, [r8, #48]        /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */
163 | 
164 |         vmul.f32 s24, s8, s16      /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */
165 |         vmov.f32 s0, s24           /* s0 ← s24. Scalar operation: the destination lies in s0-s7 */
166 |         vadd.f32 s0, s0, s25       /* s0 ← s0 + s25 */
167 |         vadd.f32 s0, s0, s26       /* s0 ← s0 + s26 */
168 |         vadd.f32 s0, s0, s27       /* s0 ← s0 + s27. Now s0 is the dot product of row i of a and column j of b */
169 | 
170 |         vstr s0, [r7]            /* Store s0 to C[i][j] */
171 |         add r5, r5, #1  /* r5 ← r5 + 1 */
172 |         b .L1_loop_j /* next iteration of loop j */
173 |        .L1_end_loop_j: /* Here ends loop j */
174 |        add r4, r4, #1 /* r4 ← r4 + 1 */
175 |        b .L1_loop_i     /* next iteration of loop i */
176 |     .L1_end_loop_i: /* Here ends loop i */
177 | 
178 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
179 |     mov r5, #0b011                        /* r5 ← 3 */
180 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
181 |     fmrx r4, fpscr                        /* r4 ← fpscr */
182 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
183 |     fmxr fpscr, r4                        /* fpscr ← r4 */
184 | 
185 |     vpop {s24-s27}                /* Restore preserved floating registers */
186 |     vpop {s16-s19}                /* Popped in the reverse order of the two vpush above */
187 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
188 |     bx lr /* Leave function */
189 |     
190 | naive_vectorial_matmul_2_4x4:
191 |     /* Computes C ← A * B for 4x4 single-precision matrices, producing two elements of a row of C per inner iteration.
192 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
193 |        r4-r8 and s16-s31 are saved on entry and restored on exit; s0, s1 and s8-s11 are overwritten.
194 |     */
195 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
196 |     vpush {s16-s31}               /* Floating point registers starting from s16 must be preserved */
197 |     /* First zero the 16 single-precision elements of C */
198 |     /* In IEEE 754, all bits cleared means 0 */
199 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
200 |     mov r5, #16           /* r5 ← 16 */
201 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
202 |     b .L2_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
203 |     .L2_loop_init :
204 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
205 |     .L2_loop_init_test:
206 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
207 |       bge .L2_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored. With 'bne' only 15 were stored */
208 | 
209 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
210 |     mov r5, #0b011                        /* r5 ← 3 */
211 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
212 |     fmrx r4, fpscr                        /* r4 ← fpscr */
213 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
214 |     fmxr fpscr, r4                        /* fpscr ← r4 */
215 | 
216 |     /* We will use 
217 |            r4 as i
218 |            r5 as j
219 |     */
220 |     mov r4, #0 /* r4 ← 0 */
221 |     .L2_loop_i:  /* loop header of i */
222 |       cmp r4, #4  /* if r4 == 4 goto end of the loop i */
223 |       beq .L2_end_loop_i
224 |       mov r5, #0  /* r5 ← 0 */
225 |       .L2_loop_j: /* loop header of j */
226 |        cmp r5, #4 /* if r5 == 4 goto end of the loop j */
227 |         beq .L2_end_loop_j
228 |         /* Compute the address of C[i][j] into r7 (it is only used by the store at the end) */
229 |         /* Address of C[i][j] is C + 4*(4 * i + j) */
230 |         mov r7, r5               /* r7 ← r5. This is r7 ← j */
231 |         adds r7, r7, r4, LSL #2  /* r7 ← r7 + (r4 << 2). 
232 |                                     This is r7 ← j + i * 4.
233 |                                     We multiply i by the row size (4 elements) */
234 |         adds r7, r2, r7, LSL #2  /* r7 ← r2 + (r7 << 2).
235 |                                     This is r7 ← C + 4*(j + i * 4)
236 |                                     We multiply (j + i * 4) by the size of the element.
237 |                                     A single-precision floating point takes 4 bytes.
238 |                                     */
239 |         /* Compute the address of a[i][0] */
240 |         mov r8, r4, LSL #2         /* r8 ← r4 << 2. This is r8 ← 4 * i */
241 |         adds r8, r0, r8, LSL #2    /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(4 * i) */
242 |         vldmia r8, {s8-s11}  /* Load {s8,s9,s10,s11} ← {a[i][0], a[i][1], a[i][2], a[i][3]} */
243 | 
244 |         /* Compute the address of b[0][j] */
245 |         mov r8, r5               /* r8 ← r5. This is r8 ← j */
246 |         adds r8, r1, r8, LSL #2  /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j) */
247 |         vldr s16, [r8]             /* s16 ← *r8. This is s16 ← b[0][j] */
248 |         vldr s17, [r8, #16]        /* s17 ← *(r8 + 16). This is s17 ← b[1][j] */
249 |         vldr s18, [r8, #32]        /* s18 ← *(r8 + 32). This is s18 ← b[2][j] */
250 |         vldr s19, [r8, #48]        /* s19 ← *(r8 + 48). This is s19 ← b[3][j] */
251 | 
252 |         /* Compute the address of b[0][j+1] */
253 |         add r8, r5, #1             /* r8 ← r5 + 1. This is r8 ← j + 1*/
254 |         adds r8, r1, r8, LSL #2    /* r8 ← r1 + (r8 << 2). This is r8 ← b + 4*(j + 1) */
255 |         vldr s20, [r8]             /* s20 ← *r8. This is s20 ← b[0][j + 1] */
256 |         vldr s21, [r8, #16]        /* s21 ← *(r8 + 16). This is s21 ← b[1][j + 1] */
257 |         vldr s22, [r8, #32]        /* s22 ← *(r8 + 32). This is s22 ← b[2][j + 1] */
258 |         vldr s23, [r8, #48]        /* s23 ← *(r8 + 48). This is s23 ← b[3][j + 1] */
259 | 
260 |         vmul.f32 s24, s8, s16      /* {s24,s25,s26,s27} ← {s8,s9,s10,s11} * {s16,s17,s18,s19} */
261 |         vmov.f32 s0, s24           /* s0 ← s24. Scalar operation: the destination lies in s0-s7 */
262 |         vadd.f32 s0, s0, s25       /* s0 ← s0 + s25 */
263 |         vadd.f32 s0, s0, s26       /* s0 ← s0 + s26 */
264 |         vadd.f32 s0, s0, s27       /* s0 ← s0 + s27. Now s0 is the dot product of row i of a and column j of b */
265 | 
266 |         vmul.f32 s28, s8, s20      /* {s28,s29,s30,s31} ← {s8,s9,s10,s11} * {s20,s21,s22,s23} */
267 | 
268 |         vmov.f32 s1, s28           /* s1 ← s28 */
269 |         vadd.f32 s1, s1, s29       /* s1 ← s1 + s29 */
270 |         vadd.f32 s1, s1, s30       /* s1 ← s1 + s30 */
271 |         vadd.f32 s1, s1, s31       /* s1 ← s1 + s31. Now s1 is the dot product of row i of a and column j + 1 of b */
272 | 
273 |         vstmia r7, {s0-s1}         /* {C[i][j], C[i][j+1]} ← {s0, s1} */
274 | 
275 |         add r5, r5, #2  /* r5 ← r5 + 2. Two columns were computed in this iteration */
276 |         b .L2_loop_j /* next iteration of loop j */
277 |        .L2_end_loop_j: /* Here ends loop j */
278 |        add r4, r4, #1 /* r4 ← r4 + 1 */
279 |        b .L2_loop_i     /* next iteration of loop i */
280 |     .L2_end_loop_i: /* Here ends loop i */
281 | 
282 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
283 |     mov r5, #0b011                        /* r5 ← 3 */
284 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
285 |     fmrx r4, fpscr                        /* r4 ← fpscr */
286 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
287 |     fmxr fpscr, r4                        /* fpscr ← r4 */
288 | 
289 |     vpop {s16-s31}                /* Restore preserved floating registers */
290 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
291 |     bx lr /* Leave function */
292 | 
293 | better_vectorial_matmul_4x4:
294 |     /* Computes C ← A * B for 4x4 single-precision matrices: for every k and i, row i of C ← row i of C + a[i][k] * (row k of B).
295 |        In: r0 = address of A, r1 = address of B, r2 = address of C (row-major, 4 bytes per element).
296 |        r4-r8, s16-s19 and s24-s27 are saved on entry and restored on exit; s0 and s8-s11 are overwritten.
297 |     */
298 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
299 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
300 |     vpush {s24-s27}               /* s24-s27 hold the intermediate products below */
301 |     /* First zero the 16 single-precision elements of C. C is the accumulator of the loops below */
302 |     /* In IEEE 754, all bits cleared means 0 */
303 |     mov r4, r2            /* r4 ← C. r4 is the cursor of the zeroing loop */
304 |     mov r5, #16           /* r5 ← 16 */
305 |     mov r6, #0            /* r6 ← 0. This is the word stored into every element */
306 |     b .L3_loop_init_test  /* Enter the loop at its test, so r5 is decremented before the first store */
307 |     .L3_loop_init :
308 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
309 |     .L3_loop_init_test:
310 |       subs r5, r5, #1     /* r5 ← r5 - 1, updating the flags */
311 |       bge .L3_loop_init   /* Repeat while r5 ≥ 0: r5 goes 15..0, so 16 words are stored. With 'bne' only 15 were stored and C[3][3] kept its old value */
312 | 
313 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
314 |     mov r5, #0b011                        /* r5 ← 3 */
315 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16. The value now sits at bit 16 */
316 |     fmrx r4, fpscr                        /* r4 ← fpscr */
317 |     orr r4, r4, r5                        /* r4 ← r4 | r5. NOTE(review): gives LEN = 3 only if those bits were clear on entry */
318 |     fmxr fpscr, r4                        /* fpscr ← r4 */
319 | 
320 |     /* We will use 
321 |            r4 as k
322 |            r5 as i
323 |     */
324 |     mov r4, #0 /* r4 ← 0 */
325 |     .L3_loop_k:  /* loop header of k */
326 |       cmp r4, #4  /* if r4 == 4 goto end of the loop k */
327 |       beq .L3_end_loop_k
328 |       mov r5, #0  /* r5 ← 0 */
329 |       .L3_loop_i: /* loop header of i */
330 |        cmp r5, #4 /* if r5 == 4 goto end of the loop i */
331 |         beq .L3_end_loop_i
332 |         /* Compute the address of C[i][0] */
333 |         /* Address of C[i][0] is C + 4*(4 * i) */
334 |         add r7, r2, r5, LSL #4         /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */
335 |         vldmia r7, {s8-s11}            /* Load {s8,s9,s10,s11} ← {c[i][0], c[i][1], c[i][2], c[i][3]} */
336 |         /* Compute the address of A[i][k] */
337 |         /* Address of A[i][k] is A + 4*(4*i + k) */
338 |         add r8, r4, r5, LSL #2         /* r8 ← r4 + (r5 << 2). This is r8 ← k + 4*i */
339 |         add r8, r0, r8, LSL #2         /* r8 ← r0 + (r8 << 2). This is r8 ← a + 4*(k + 4*i) */
340 |         vldr s0, [r8]                  /* Load s0 ← a[i][k] */
341 | 
342 |         /* Compute the address of B[k][0] */
343 |         /* Address of B[k][0] is B + 4*(4*k) */
344 |         add r8, r1, r4, LSL #4         /* r8 ← r1 + (r4 << 4). This is r8 ← b + 4*(4*k) */
345 |         vldmia r8, {s16-s19}           /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */
346 | 
347 |         vmul.f32 s24, s16, s0          /* {s24,s25,s26,s27} ← {s16,s17,s18,s19} * {s0,s0,s0,s0} */
348 |         vadd.f32 s8, s8, s24           /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + {s24,s25,s26,s27} */
349 | 
350 |         vstmia r7, {s8-s11}            /* Store {c[i][0],c[i][1],c[i][2],c[i][3]} ← {s8,s9,s10,s11} */
351 | 
352 |         add r5, r5, #1  /* r5 ← r5 + 1. This is i = i + 1 */
353 |         b .L3_loop_i /* next iteration of loop i */
354 |        .L3_end_loop_i: /* Here ends loop i */
355 |        add r4, r4, #1 /* r4 ← r4 + 1. This is k = k + 1 */
356 |        b .L3_loop_k     /* next iteration of loop k */
357 |     .L3_end_loop_k: /* Here ends loop k */
358 | 
359 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
360 |     mov r5, #0b011                        /* r5 ← 3 */
361 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). All bits set except bits 16 and 17 */
362 |     fmrx r4, fpscr                        /* r4 ← fpscr */
363 |     and r4, r4, r5                        /* r4 ← r4 & r5. This clears the two bits set at the beginning */
364 |     fmxr fpscr, r4                        /* fpscr ← r4 */
365 | 
366 |     vpop {s24-s27}                /* Restore preserved floating registers */
367 |     vpop {s16-s19}                /* Popped in the reverse order of the two vpush above */
368 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
369 |     bx lr /* Leave function */
370 | 
371 | best_vectorial_matmul_4x4:
372 |     /* Computes C ← A * B for 4x4 row-major matrices of single-precision floats
373 |        r0 address of A, r1 address of B
374 |        r2 address of C (its 16 elements are cleared first and then accumulated into)
375 |     */
376 |     push {r4, r5, r6, r7, r8, lr} /* Keep integer registers */
377 |     vpush {s16-s19}               /* Floating point registers starting from s16 must be preserved */
378 | 
379 |     /* First zero 16 single floating point */
380 |     /* In IEEE 754, all bits cleared means 0 */
381 |     mov r4, r2            /* r4 ← r2. Write cursor over c */
382 |     mov r5, #16           /* r5 ← 16. Elements still to be cleared */
383 |     mov r6, #0            /* r6 ← 0. Bit pattern of 0.0 */
384 |     /* r5 starts above 0, so store first and test afterwards: exactly 16 stores */
385 |     .L4_loop_init :
386 |       str r6, [r4], +#4   /* *r4 ← r6 then r4 ← r4 + 4 */
387 |       /* (decrementing before the first store would leave c[3][3] uncleared) */
388 |       subs r5, r5, #1     /* r5 ← r5 - 1 updating cpsr */
389 |       bne .L4_loop_init   /* if r5 != 0 clear the next element */
390 | 
391 |     /* Set the LEN field of FPSCR to be 4 (value 3) */
392 |     mov r5, #0b011                        /* r5 ← 3 */
393 |     mov r5, r5, LSL #16                   /* r5 ← r5 << 16 */
394 |     fmrx r4, fpscr                        /* r4 ← fpscr */
395 |     orr r4, r4, r5                        /* r4 ← r4 | r5 */
396 |     fmxr fpscr, r4                        /* fpscr ← r4 */
397 | 
398 |     /* We will use
399 |            r4 as k
400 |            r5 as i (advances by 2: every iteration updates rows i and i+1 of C)
401 |     */
402 |     mov r4, #0 /* r4 ← 0 */
403 |     .L4_loop_k:  /* loop header of k */
404 |       cmp r4, #4  /* if r4 == 4 goto end of the loop k */
405 |       beq .L4_end_loop_k
406 |       mov r5, #0  /* r5 ← 0 */
407 |       .L4_loop_i: /* loop header of i */
408 |         cmp r5, #4 /* if r5 == 4 goto end of the loop i */
409 |         beq .L4_end_loop_i
410 |         /* Compute the address of C[i][0] */
411 |         /* Address of C[i][0] is C + 4*(4 * i) */
412 |         add r7, r2, r5, LSL #4         /* r7 ← r2 + (r5 << 4). This is r7 ← c + 4*4*i */
413 |         vldmia r7, {s8-s15}            /* Load {s8,s9,s10,s11,s12,s13,s14,s15} 
414 |                                             ← {c[i][0],   c[i][1],   c[i][2],   c[i][3]
415 |                                                c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]} */
416 |         /* Compute the address of A[i][k] */
417 |         /* Address of A[i][k] is A + 4*(4*i + k) */
418 |         add r8, r4, r5, LSL #2         /* r8 ← r4 + r5 << 2. This is r8 ← k + 4*i */
419 |         add r8, r0, r8, LSL #2         /* r8 ← r0 + r8 << 2. This is r8 ← a + 4*(k + 4*i) */
420 |         vldr s0, [r8]                  /* Load s0 ← a[i][k] */
421 |         vldr s1, [r8, #16]             /* Load s1 ← a[i+1][k]. One row of A is 16 bytes */
422 | 
423 |         /* Compute the address of B[k][0] */
424 |         /* Address of B[k][0] is B + 4*(4*k) */
425 |         add r8, r1, r4, LSL #4         /* r8 ← r1 + r4 << 4. This is r8 ← b + 4*(4*k) */
426 |         vldmia r8, {s16-s19}           /* Load {s16,s17,s18,s19} ← {b[k][0], b[k][1], b[k][2], b[k][3]} */
427 | 
428 |         vmla.f32 s8, s16, s0           /* {s8,s9,s10,s11} ← {s8,s9,s10,s11} + ({s16,s17,s18,s19} * {s0,s0,s0,s0}) */
429 |         vmla.f32 s12, s16, s1          /* {s12,s13,s14,s15} ← {s12,s13,s14,s15} + ({s16,s17,s18,s19} * {s1,s1,s1,s1}) */
430 | 
431 |         vstmia r7, {s8-s15}            /* Store {c[i][0],   c[i][1],   c[i][2],   c[i][3],
432 |                                                  c[i+1][0], c[i+1][1], c[i+1][2], c[i+1][3]}
433 |                                                 ← {s8,s9,s10,s11,s12,s13,s14,s15} */
434 | 
435 |         add r5, r5, #2  /* r5 ← r5 + 2. This is i = i + 2 */
436 |         b .L4_loop_i /* next iteration of loop i */
437 |        .L4_end_loop_i: /* Here ends loop i */
438 |        add r4, r4, #1 /* r4 ← r4 + 1. This is k = k + 1 */
439 |        b .L4_loop_k     /* next iteration of loop k */
440 |     .L4_end_loop_k: /* Here ends loop k */
441 | 
442 |     /* Set the LEN field of FPSCR back to 1 (value 0) */
443 |     mov r5, #0b011                        /* r5 ← 3 */
444 |     mvn r5, r5, LSL #16                   /* r5 ← ~(r5 << 16). Mask with the two LEN bits set above cleared */
445 |     fmrx r4, fpscr                        /* r4 ← fpscr */
446 |     and r4, r4, r5                        /* r4 ← r4 & r5 */
447 |     fmxr fpscr, r4                        /* fpscr ← r4 */
448 | 
449 |     vpop {s16-s19}                /* Restore preserved floating registers */
450 |     pop {r4, r5, r6, r7, r8, lr}  /* Restore integer registers */
451 |     bx lr /* Leave function */
452 | 
453 | .globl main
454 | main:
455 |     push {r4, r5, r6, lr}  /* Keep integer registers */
456 | 
457 |     /* Prepare call to best_vectorial_matmul_4x4 */
458 |     ldr r0, addr_mat_A  /* r0 ← a */
459 |     ldr r1, addr_mat_B  /* r1 ← b */
460 |     ldr r2, addr_mat_C  /* r2 ← c */
461 |     bl best_vectorial_matmul_4x4
462 | 
463 |     /* Now print the result matrix: c[0][0] travels in {r2,r3}, c[0][1]..c[3][3] go on the stack as doubles */
464 |     ldr r4, addr_mat_C  /* r4 ← c */
465 | 
466 |     vldr s0, [r4] /* s0 ← *r4. This is s0 ← c[0][0] */
467 |     vcvt.f64.f32 d1, s0 /* Convert it into a double-precision
468 |                            d1 ← s0
469 |                          */
470 |     vmov r2, r3, d1      /* {r2,r3} ← d1 */
471 | 
472 |     mov r6, sp     /* Remember the stack pointer, we need it to restore it back later */
473 |                    /* r6 ← sp */
474 | 
475 |     mov r5, #1  /* We will iterate from 1 to 15 (because the 0th item has already been handled) */
476 |     add r4, r4, #60 /* Go to the last item of the matrix c, this is c[3][3] */
477 |     .Lloop:     /* Items are pushed last to first, so c[0][1] ends up at the top of the stack */
478 |         vldr s0, [r4] /* s0 ← *r4. Load the current item */
479 |         vcvt.f64.f32 d1, s0 /* Convert it into a double-precision
480 |                                d1 ← s0
481 |                              */
482 |         sub sp, sp, #8      /* Make room in the stack for the double-precision */
483 |         vstr d1, [sp]       /* Store the double precision in the top of the stack */
484 |         sub r4, r4, #4      /* Move to the previous element in the matrix */
485 |         add r5, r5, #1      /* One more item has been handled */
486 |         cmp r5, #16         /* if r5 != 16 go to next iteration of the loop */
487 |         bne .Lloop
488 | 
489 |     ldr r0, addr_format_result /* r0 ← &format_result */
490 |     bl printf /* call printf */
491 |     mov sp, r6  /* Restore the stack after the call. This drops the 15 doubles (120 bytes) pushed above */
492 | 
493 |     mov r0, #0  /* r0 ← 0. Value returned by main */
494 |     pop {r4, r5, r6, lr}
495 |     bx lr
496 | 
497 | addr_mat_A : .word mat_A
498 | addr_mat_B : .word mat_B
499 | addr_mat_C : .word mat_C
500 | addr_format_result : .word format_result
501 | 


--------------------------------------------------------------------------------
/chapter15/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=divideby14 division benchmark
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file through the gcc driver
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Cancel the built-in rule that builds an executable straight from a .s file
 8 | % : %.s
 9 | # Assemble every source for ARMv6 with the VFPv2 floating point unit
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # all and clean name actions, not files, so they must always run
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter15/benchmark.s:
--------------------------------------------------------------------------------
  1 | /* benchmark.s */
  2 | 
  3 | .data
  4 | 
  5 | .text
  6 | 
  7 | .globl main
  8 | 
  9 | unsigned_naive_longdiv:
 10 |     /* r0 contains N. On return r0 contains Q, the number of times D was subtracted from N */
 11 |     /* r1 contains D. On return r1 contains R. The loop never ends when D is 0 */
 12 |     mov r2, r1             /* r2 ← r1. We keep D in r2 */
 13 |     mov r1, r0             /* r1 ← r0. We keep N in r1 */
 14 | 
 15 |     mov r0, #0             /* r0 ← 0. Set Q = 0 initially */
 16 | 
 17 |     b .Lloop_check0
 18 |     .Lloop0:
 19 |        add r0, r0, #1      /* r0 ← r0 + 1. Q = Q + 1 */
 20 |        sub r1, r1, r2      /* r1 ← r1 - r2 */
 21 |     .Lloop_check0:
 22 |        cmp r1, r2          /* compute r1 - r2 and update cpsr */
 23 |        bhs .Lloop0         /* branch if r1 >= r2 as unsigned numbers (C=1) */
 24 | 
 25 |     /* r0 already contains Q */
 26 |     /* r1 already contains R */
 27 |     bx lr
 28 | 
 29 | unsigned_longdiv:
 30 |     /* r0 contains N. On return r0 contains Q */
 31 |     /* r1 contains D */
 32 |     /* r2 contains Q */
 33 |     /* r3 contains R */
 34 |     push {r4, lr}
 35 |     mov r2, #0                 /* r2 ← 0 */
 36 |     mov r3, #0                 /* r3 ← 0 */
 37 | 
 38 |     mov r4, #32                /* r4 ← 32. One iteration per bit of N */
 39 |     b .Lloop_check1
 40 |     .Lloop1:
 41 |         movs r0, r0, LSL #1    /* r0 ← r0 << 1 updating cpsr (sets C if 31st bit of r0 was 1) */
 42 |         adc r3, r3, r3         /* r3 ← r3 + r3 + C. This is equivalent to r3 ← (r3 << 1) + C */
 43 | 
 44 |         cmp r3, r1             /* compute r3 - r1 and update cpsr */
 45 |         subhs r3, r3, r1       /* if r3 >= r1 (C=1) then r3 ← r3 - r1. C is left untouched for the adc below */
 46 |         adc r2, r2, r2         /* r2 ← r2 + r2 + C. This is equivalent to r2 ← (r2 << 1) + C */
 47 |     .Lloop_check1:
 48 |         subs r4, r4, #1        /* r4 ← r4 - 1 */
 49 |         bpl .Lloop1            /* if r4 >= 0 (N=0) then branch to .Lloop1 */
 50 | 
 51 |     mov r0, r2                 /* r0 ← r2. Return Q */
 52 | 
 53 |     pop {r4, lr}
 54 |     bx lr
 55 | 
 56 | better_unsigned_division :
 57 |     /* r0 contains N. On return r0 contains Q */
 58 |     /* r1 contains D. The first loop never ends when D is 0 */
 59 |     /* r2 contains Q */
 60 |     /* r3 tmp. D doubled until it is above N/2, then halved back down to D */
 61 | 
 62 |     mov r3, r1                 /* r3 ← r1 */
 63 |     cmp r3, r0, LSR #1         /* update cpsr with r3 - r0/2 */
 64 |     .Lloop2:
 65 |     movls r3, r3, LSL #1       /* if r3 <= r0/2 (C=0 or Z=1) then r3 ← 2*r3 */
 66 |     cmp r3, r0, LSR #1         /* update cpsr with r3 - r0/2 */
 67 |     bls .Lloop2                /* branch to .Lloop2 if r3 <= r0/2 (C=0 or Z=1) */
 68 | 
 69 |     mov r2, #0                 /* r2 ← 0 */
 70 | 
 71 |     .Lloop3:
 72 |     cmp r0, r3                 /* update cpsr with r0 - r3 */
 73 |     subhs r0, r0, r3           /* if r0 >= r3 then r0 ← r0 - r3 */
 74 |     adc r2, r2, r2             /* r2 ← r2 + r2 + C (if r0 >= r3 then C = 1 else C = 0) */
 75 | 
 76 |     mov r3, r3, LSR #1         /* r3 ← r3 >> 1 */
 77 |     cmp r3, r1                 /* update cpsr with r3 - r1 */
 78 |     bhs .Lloop3                /* if r3 >= r1 branch to .Lloop3 */
 79 | 
 80 |     mov r0, r2                 /* r0 ← r2. Return Q (this overwrites what was left of N) */
 81 |    
 82 |     bx lr
 83 | 
 84 | vfpv2_division:
 85 |     /* r0 contains N (converted as a signed integer). On return r0 contains Q */
 86 |     /* r1 contains D. NOTE(review): a single-precision float has a 24-bit significand, so operands above 2^24 in magnitude get rounded and Q may be inexact. Confirm the intended range */
 87 |     vmov s0, r0             /* s0 ← r0 (bit copy) */
 88 |     vmov s1, r1             /* s1 ← r1 (bit copy) */
 89 |     vcvt.f32.s32 s0, s0     /* s0 ← (float)s0 */
 90 |     vcvt.f32.s32 s1, s1     /* s1 ← (float)s1 */
 91 |     vdiv.f32 s0, s0, s1     /* s0 ← s0 / s1 */
 92 |     vcvt.s32.f32 s0, s0     /* s0 ← (int)s0 */
 93 |     vmov r0, s0             /* r0 ← s0 (bit copy). This is Q */
 94 |     bx lr
 95 | 
 96 | 
 97 | clz_unsigned_division:
 98 |     /* r0 contains N, r1 contains D. On return r0 contains Q. Disabled guard for N == D: */
 99 |     /* cmp r0, r1               Compare r0 and r1 */
100 |     /* moveq r0, #1             If they are equal set the result to 1 */
101 |     /* bxeq lr                  If they are equal leave the function. NOTE(review): tracing N == D gives one iteration with shift 0 and Q = 1, so the guard looks unnecessary. Confirm */
102 | 
103 |     clz  r3, r0               /* Count leading zeroes of N */
104 |     clz  r2, r1               /* Count leading zeroes of D */
105 |     sub  r3, r2, r3           /* r3 ← r2 - r3. 
106 |                                  This is the difference of zeroes
107 |                                  between N and D
108 |                                  Note: when D is so much smaller than N that
109 |                                  this is negative the loop below is skipped */
110 |     add r3, r3, #1            /* r3 ← r3 + 1. Number of iterations. The check below decrements before use */
111 | 
112 |     mov r2, #0                /* r2 ← 0. Set Q = 0 initially */
113 |     b .Lloop_check4
114 |     .Lloop4:
115 |       cmp r0, r1, lsl r3      /* update cpsr with r0 - (r1 << r3) */
116 |       adc r2, r2, r2          /* r2 ← r2 + r2 + C. Shift into Q the result of the comparison */
117 |       subcs r0, r0, r1, lsl r3  /* if r0 >= (r1 << r3) (C=1, adc did not change it) then r0 ← r0 - (r1 << r3) */
118 |     .Lloop_check4:
119 |         subs r3, r3, #1        /* r3 ← r3 - 1 */
120 |         bpl .Lloop4            /* if r3 >= 0 (N=0) then branch to .Lloop4 */
121 | 
122 |     mov r0, r2                /* r0 ← r2. Return Q */
123 | 
124 |     bx lr
125 | 
126 | .set MAX, 16384
127 | main:
128 |     push {r4, r5, r6, lr}
129 | 
130 |     mov r4, #1                         /* r4 ← 1. r4 is the divisor D, it goes from 1 to MAX - 1 */
131 | 
132 |     b .Lcheck_loop_i                   /* branch to .Lcheck_loop_i */
133 |     .Lloop_i:
134 |        mov r5, r4                      /* r5 ← r4. r5 is the dividend N, it goes from D to MAX - 1 */
135 |        b .Lcheck_loop_j                /* branch to .Lcheck_loop_j */
136 |        .Lloop_j:
137 | 
138 |          mov r0, r5                    /* r0 ← r5. This is N */
139 |          mov r1, r4                    /* r1 ← r4. This is D */
140 | 
141 |          bl  better_unsigned_division  /* the routine being timed. The result in r0 is discarded */
142 | 
143 |        /* mov r3, r0
144 |          mov r2, r4
145 |          mov r1, r5
146 |          ldr r0, addr_of_message
147 |          bl printf */
148 | 
149 | 
150 |          add r5, r5, #1
151 |        .Lcheck_loop_j:
152 |          cmp r5, #MAX                   /* compare r5 and MAX */
153 |          bne .Lloop_j                  /* if r5 != MAX branch to .Lloop_j */
154 |        add r4, r4, #1
155 |     .Lcheck_loop_i:
156 |       cmp r4, #MAX                     /* compare r4 and MAX */
157 |       bne .Lloop_i                     /* if r4 != MAX branch to .Lloop_i */
158 | 
159 |     mov r0, #0
160 | 
161 |     pop {r4, r5, r6, lr}
162 |     bx lr
163 | 
164 | message: .asciz "%u / %u = %u\n"
165 | addr_of_message: .word message      /* NOTE(review): message takes 14 bytes, so this word is not 4-byte aligned. Only the commented-out printf call above reads it */
166 | 


--------------------------------------------------------------------------------
/chapter15/divideby14.s:
--------------------------------------------------------------------------------
 1 | /* -- divideby14.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4
 6 | read_number: .word 0
 7 | 
 8 | .align 4
 9 | message1 : .asciz "Enter an integer to divide it by 14: "
10 | 
11 | .align 4
12 | message2 : .asciz "Number %d (signed-)divided by 14 is %d\n"
13 | 
14 | .align 4
15 | scan_format : .asciz "%d"
16 | 
17 | .text
18 | 
19 | /* This function has been generated using "magic.py 14 code_for_signed" */
20 | s_divide_by_14:
21 |    /* r0 contains the argument to be divided by 14. On return r0 contains the quotient */
22 |    ldr r1, .Ls_magic_number_14 /* r1 ← magic_number */
23 |    smull r1, r2, r1, r0   /* r1 ← Lower32Bits(r1*r0). r2 ← Upper32Bits(r1*r0) */
24 |    add r2, r2, r0         /* r2 ← r2 + r0. magic.py emits this when the magic number has its top bit set, as 0x92492493 does */
25 |    mov r2, r2, ASR #3     /* r2 ← r2 >> 3 (arithmetic shift, keeps the sign) */
26 |    mov r1, r0, LSR #31    /* r1 ← r0 >> 31. This is 1 if the argument is negative, 0 otherwise */
27 |    add r0, r2, r1         /* r0 ← r2 + r1 */
28 |    bx lr                  /* leave function */
29 |    .align 4
30 |    .Ls_magic_number_14: .word 0x92492493
31 | 
32 | .globl main
33 | 
34 | main:
35 |     /* Call printf */
36 |     push {r4, lr}                  /* presumably r4 is saved only to keep sp 8-byte aligned, it is not used below */
37 |     ldr r0, addr_of_message1       /* r0 ← &message1 */
38 |     bl printf
39 | 
40 |     /* Call scanf */
41 |     ldr r0, addr_of_scan_format   /* r0 ← &scan_format */
42 |     ldr r1, addr_of_read_number   /* r1 ← &read_number */
43 |     bl scanf                      /* the value returned by scanf is not checked */
44 | 
45 |     ldr r0, addr_of_read_number   /* r0 ← &read_number */
46 |     ldr r0, [r0]                  /* r0 ← *r0 */
47 | 
48 |     bl s_divide_by_14
49 |     mov r2, r0                    /* r2 ← r0. The quotient is the third argument of printf */
50 | 
51 |     ldr r1, addr_of_read_number   /* r1 ← &read_number */
52 |     ldr r1, [r1]                  /* r1 ← *r1 */
53 |     
54 |     ldr r0, addr_of_message2      /* r0 ← &message2 */
55 |     bl printf                     /* Call printf, r1 and r2 already
56 |                                      contain the desired values */
57 | 
58 |     pop {r4, lr}
59 |     mov r0, #0
60 |     bx lr
61 | 
62 | addr_of_message1: .word message1
63 | addr_of_scan_format: .word scan_format
64 | addr_of_message2: .word message2
65 | addr_of_read_number: .word read_number
66 | 


--------------------------------------------------------------------------------
/chapter15/division.s:
--------------------------------------------------------------------------------
 1 | /* division.s */
 2 | 
 3 | .data
 4 | 
 5 | .text
 6 | 
 7 | .globl main
 8 | 
 9 | unsigned_naive_longdiv:
10 |     /* r0 contains N. On return r0 contains Q, the number of times D was subtracted from N */
11 |     /* r1 contains D. On return r1 contains R. The loop never ends when D is 0 */
12 |     mov r2, r1             /* r2 ← r1. We keep D in r2 */
13 |     mov r1, r0             /* r1 ← r0. We keep N in r1 */
14 | 
15 |     mov r0, #0             /* r0 ← 0. Set Q = 0 initially */
16 | 
17 |     b .Lloop_check0
18 |     .Lloop0:
19 |        add r0, r0, #1      /* r0 ← r0 + 1. Q = Q + 1 */
20 |        sub r1, r1, r2      /* r1 ← r1 - r2 */
21 |     .Lloop_check0:
22 |        cmp r1, r2          /* compute r1 - r2 and update cpsr */
23 |        bhs .Lloop0         /* branch if r1 >= r2 as unsigned numbers (C=1) */
24 | 
25 |     /* r0 already contains Q */
26 |     /* r1 already contains R */
27 |     bx lr
28 | 
29 | unsigned_longdiv:
30 |     /* r0 contains N (shifted out completely by the 32 iterations) */
31 |     /* r1 contains D */
32 |     /* r2 contains Q. It is returned in r2, not in r0 */
33 |     /* r3 contains R */
34 |     push {r4, lr}
35 |     mov r2, #0                 /* r2 ← 0 */
36 |     mov r3, #0                 /* r3 ← 0 */
37 | 
38 |     mov r4, #32                /* r4 ← 32. One iteration per bit of N */
39 |     b .Lloop_check1
40 |     .Lloop1:
41 |         movs r0, r0, LSL #1    /* r0 ← r0 << 1 updating cpsr (sets C if 31st bit of r0 was 1) */
42 |         adc r3, r3, r3         /* r3 ← r3 + r3 + C. This is equivalent to r3 ← (r3 << 1) + C */
43 | 
44 |         cmp r3, r1             /* compute r3 - r1 and update cpsr */
45 |         subhs r3, r3, r1       /* if r3 >= r1 (C=1) then r3 ← r3 - r1. C is left untouched for the adc below */
46 |         adc r2, r2, r2         /* r2 ← r2 + r2 + C. This is equivalent to r2 ← (r2 << 1) + C */
47 |     .Lloop_check1:
48 |         subs r4, r4, #1        /* r4 ← r4 - 1 */
49 |         bpl .Lloop1            /* if r4 >= 0 (N=0) then branch to .Lloop1 */
50 | 
51 |     pop {r4, lr}
52 |     bx lr
53 | 
54 | better_unsigned_division :
55 |     /* r0 contains N. On return it holds what is left of N after the subtractions */
56 |     /* r1 contains D. The first loop never ends when D is 0 */
57 |     /* r2 contains Q. It is returned in r2, not in r0 */
58 |     /* r3 tmp. D doubled until it is above N/2, then halved back down to D */
59 | 
60 |     mov r3, r1                 /* r3 ← r1 */
61 |     cmp r3, r0, LSR #1         /* update cpsr with r3 - r0/2 */
62 |     .Lloop2:
63 |     movls r3, r3, LSL #1       /* if r3 <= r0/2 (C=0 or Z=1) then r3 ← 2*r3 */
64 |     cmp r3, r0, LSR #1         /* update cpsr with r3 - r0/2 */
65 |     bls .Lloop2                /* branch to .Lloop2 if r3 <= r0/2 (C=0 or Z=1) */
66 | 
67 |     mov r2, #0                 /* r2 ← 0 */
68 | 
69 |     .Lloop3:
70 |     cmp r0, r3                 /* update cpsr with r0 - r3 */
71 |     subhs r0, r0, r3           /* if r0 >= r3 then r0 ← r0 - r3 */
72 |     adc r2, r2, r2             /* r2 ← r2 + r2 + C (if r0 >= r3 then C = 1 else C = 0) */
73 | 
74 |     mov r3, r3, LSR #1         /* r3 ← r3 >> 1 */
75 |     cmp r3, r1                 /* update cpsr with r3 - r1 */
76 |     bhs .Lloop3                /* if r3 >= r1 branch to .Lloop3 */
77 |    
78 |     bx lr
79 | 
80 | vfpv2_division:
81 |     /* r0 contains N (converted as a signed integer). On return r0 contains Q */
82 |     /* r1 contains D. NOTE(review): a single-precision float has a 24-bit significand, so operands above 2^24 in magnitude get rounded and Q may be inexact. Confirm the intended range */
83 |     vmov s0, r0             /* s0 ← r0 (bit copy) */
84 |     vmov s1, r1             /* s1 ← r1 (bit copy) */
85 |     vcvt.f32.s32 s0, s0     /* s0 ← (float)s0 */
86 |     vcvt.f32.s32 s1, s1     /* s1 ← (float)s1 */
87 |     vdiv.f32 s0, s0, s1     /* s0 ← s0 / s1 */
88 |     vcvt.s32.f32 s0, s0     /* s0 ← (int)s0 */
89 |     vmov r0, s0             /* r0 ← s0 (bit copy). This is Q */
90 |     bx lr
91 | 
92 |     
93 | main:
94 |     mov r0, #0   /* r0 ← 0. Without this main would return whatever it received in r0 as its exit code */
95 |     bx lr        /* Return from main. The routines above are only assembled, never called */
95 | 


--------------------------------------------------------------------------------
/chapter15/magic.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=utf-8
  3 | 
  4 | # Implemented very naively following the equations in Hacker's Delight
  5 | 
  6 | # We assume 32-bit
  7 | w = 32
  8 | # Make sure you use Python 2.5+ because we may enter in the domain of bignums
  9 | # (Python long) during the computations
 10 | 
 11 | # We mimick a C99-style %-operator (remainder)
 12 | # Python returns the sign of the divisor
 13 | # while C99 uses the sign of the dividend
 14 | def rem(x, y):
 15 |     t = x % y
 16 |     if (t == 0):
 17 |         return t
 18 |     # For nonzero results we may have to adjust the result
 19 |     #  2 %  3 = 2
 20 |     # -2 % -3 = -2
 21 |     if (x > 0) != (y > 0):
 22 |         t = t - y
 23 |     return t
 24 | 
 25 | 
 26 | def magic_unsigned(d):
 27 |     p = w
 28 |     n_c = 2**w - rem(2**w, d) - 1
 29 |     while not (2**p > (n_c * (d - 1 - rem(2**p - 1, d)))):
 30 |         p = p + 1
 31 |     m = (2**p + d - 1 - rem(2**p - 1, d)) / d
 32 |     # Adjust the result to w bits
 33 |     magic = m & ~(~0 << w)
 34 |     add_flag = (m != magic)
 35 |     shift = p - w
 36 |     return (magic, shift, add_flag)
 37 | 
 38 | def magic_signed_positive(d):
 39 |     p = w
 40 |     n_c = 2**(w-1) - rem(2**(w-1), d) - 1
 41 |     while not (2**p > (n_c*(d-rem(2**p, d)))):
 42 |         p = p + 1
 43 |     m = (2**p + d - rem(2**p, d)) / d
 44 |     # Adjust the result to w bits
 45 |     magic = m & ~(~0 << w)
 46 |     shift = p - w
 47 |     return (magic, shift)
 48 | 
 49 | def magic_signed_negative(d):
 50 |     p = w
 51 |     n_c = -(2**(w-1)) + rem(2**(w-1) + 1, d)
 52 |     while not (2**p > (n_c*(d+rem(2**p, d)))):
 53 |         p = p + 1
 54 |     m = (2**p - d - rem(2**p, d)) / d
 55 |     # Adjust the result to w bits
 56 |     magic = m & ~(~0 << w)
 57 |     shift = p - w
 58 |     return (magic, shift)
 59 | 
 60 | import sys
 61 | import string
 62 | 
 63 | operations = ["just_tell", "code_for_signed", "code_for_unsigned"]
 64 | 
 65 | def usage_message():
 66 |     print "usage: {0} divisor [{1}]".format(sys.argv[0], string.join(operations, "|"))
 67 |     sys.exit(1)
 68 | 
 69 | if len(sys.argv) < 2:
 70 |     usage_message()
 71 | 
 72 | # The divisor
 73 | try:
 74 |     d = int(sys.argv[1])
 75 | except:
 76 |     usage_message()
 77 | 
 78 | if (d == 0):
 79 |     print "dividend cannot be zero"
 80 |     usage_message()
 81 | 
 82 | if len(sys.argv) >= 3:
 83 |     operation = sys.argv[2]
 84 | else:
 85 |     operation = "just_tell"
 86 | 
 87 | if operation not in operations:
 88 |     usage_message()
 89 | 
 90 | if operation == "just_tell":
 91 |     if d > 0:
 92 |         (magic_signed, shift_signed) = magic_signed_positive(d)
 93 |         (magic_unsigned, shift_unsigned, add_flag) = magic_unsigned(d)
 94 |         print "Magic number for signed division by {0} is {1} (0x{1:X}) with shift {2}".format(d, magic_signed, shift_signed)
 95 |         print "Magic number for unsigned division by {0} is {1} (0x{1:X}) with shift {2}{3}".format(d, magic_unsigned, shift_unsigned, " and we need an extra addition" if add_flag else "")
 96 |     elif d < 0:
 97 |         (magic_signed, shift_signed) = magic_signed_negative(d)
 98 |         print "Magic number for signed division by {0} is {1} (0x{1:X}) with shift {2}".format(d, magic_signed, shift_signed)
 99 |     else:
100 |         print "Can't divide by 0"
101 | elif operation == "code_for_signed":
102 |     if (d > 0):
103 |         (magic_signed, shift_signed) = magic_signed_positive(d)
104 |     else:
105 |         (magic_signed, shift_signed) = magic_signed_negative(d)
106 | 
107 |     tab = "   "
108 |     dividend_name = "{0}".format(d) if d > 0 else "minus_{0}".format(-d)
109 |     magic_number_name = ".Ls_magic_number_{0}".format(dividend_name)
110 |     function_name = "s_divide_by_{0}".format(dividend_name)
111 |     code = "{0}:\n".format(function_name)
112 |     code += tab + "/* r0 contains the argument to be divided by {0} */\n".format(d)
113 |     code += tab + "ldr r1, {0} /* r1 ← magic_number */\n".format(magic_number_name)
114 |     code += tab + "smull r1, r2, r1, r0   /* r1 ← Lower32Bits(r1*r0). r2 ← Upper32Bits(r1*r0) */\n"
115 |     magic_number_is_negative = (magic_signed & (1 << (w-1)))
116 |     if d > 0 and magic_number_is_negative:
117 |         code += tab + "add r2, r2, r0         /* r2 ← r2 + r0 */\n"
118 |     elif d < 0 and not magic_number_is_negative:
119 |         code += tab + "sub r2, r2, r0         /* r2 ← r2 - r0 */\n"
120 |     if shift_signed > 0:
121 |         code += tab + "mov r2, r2, ASR #{0}     /* r2 ← r2 >> {0} */\n".format(shift_signed)
122 |     code += tab + "mov r1, r0, LSR #{0}    /* r1 ← r0 >> {0} */\n".format(w-1)
123 |     code += tab + "add r0, r2, r1         /* r0 ← r2 + r1 */\n"
124 |     code += tab + "bx lr                  /* leave function */\n"
125 |     code += tab + ".align 4\n"
126 |     code += tab + "{0}: .word 0x{1:x}\n".format(magic_number_name, magic_signed)
127 | 
128 |     print code
129 | elif operation == "code_for_unsigned":
130 |     if d < 0:
131 |         print "You requested code for unsigned but the divisor is negative!"
132 |         sys.exit(1)
133 |     (magic_unsigned, shift_unsigned, add_flag) = magic_unsigned(d)
134 |     tab = "   "
135 |     dividend_name = "{0}".format(d)
136 |     magic_number_name = ".Lu_magic_number_{0}".format(dividend_name)
137 |     function_name = "u_divide_by_{0}".format(dividend_name)
138 |     code = "{0}:\n".format(function_name)
139 |     code += tab + "/* r0 contains the argument to be divided by {0} */\n".format(d)
140 |     code += tab + "ldr r1, {0} /* r1 ← magic_number */\n".format(magic_number_name)
141 |     code += tab + "umull r1, r2, r1, r0   /* r1 ← Lower32Bits(r1*r0). r2 ← Upper32Bits(r1*r0) */\n"
142 |     if add_flag:
143 |         code += tab + "adds r2, r2, r0        /* r2 ← r2 + r0 updating cpsr */\n"
144 |         code += tab + "mov r2, r2, ROR #0     /* r2 ← (carry_flag << 31) | (r2 >> 1) */\n".format(shift_unsigned)
145 |         code += tab + "mov r0, r2, LSR #{0}     /* r0 ← r2 >> {0} */\n".format(shift_unsigned)
146 |     elif shift_unsigned > 0:
147 |         code += tab + "mov r0, r2, LSR #{0}     /* r0 ← r2 >> {0} */\n".format(shift_unsigned)
148 |     code += tab + "bx lr                  /* leave function */\n"
149 |     code += tab + ".align 4\n"
150 |     code += tab + "{0}: .word 0x{1:x}\n".format(magic_number_name, magic_unsigned)
151 | 
152 |     print code
153 | else:
154 |     print "Operation {} not implemented".format(operation)
155 | 


--------------------------------------------------------------------------------
/chapter16/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=jumptable calcjump ifstring binsearch hybrid
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link every executable from its object file through the gcc driver
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Cancel the built-in rule that builds an executable straight from a .s file
 8 | % : %.s
 9 | # Assemble every source for ARMv6 with the VFPv2 floating point unit
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # all and clean name actions, not files, so they must always run
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter16/binsearch.s:
--------------------------------------------------------------------------------
 1 | /* binsearch.s */
 2 | .data
 3 | 
 4 | .text
 5 | 
 6 | .globl main
 7 | 
 8 | main:
 9 |   /* switch (r0) over the cases 1..10, found by binary search. Any other value gives 42 */
10 |   cmp r0, #10             /* r0 - 10 and update cpsr */
11 |   bgt case_default        /* values above 10 have no case */
12 |   cmp r0, #1              /* r0 - 1 and update cpsr */
13 |   blt case_default        /* values below 1 have no case either */
14 | 
15 |   case_1_to_10:
16 |     cmp r0, #5            /* split 1..10 around 5 */
17 |     bgt case_6_to_10      /* the upper half is handled further down */
18 |     beq case_5            /* exact hit */
19 |     /* r0 < 5: fall through to case_1_to_4 */
20 | 
21 |   case_1_to_4:
22 |     cmp r0, #2            /* split 1..4 around 2 */
23 |     blt case_1            /* only 1 is below 2 */
24 |     beq case_2            /* exact hit */
25 |     /* r0 > 2: fall through to case_3_to_4,
26 |        which is the next label */
27 | 
28 |   case_3_to_4:            
29 |     cmp r0, #4            /* only 3 and 4 can reach this point */
30 |     beq case_4            /* if r0 == 4 branch to case_4 */
31 |     b case_3              /* otherwise it must be r0 == 3,
32 |                              branch to case_3 */
33 | 
34 |   case_6_to_10:
35 |     cmp r0, #8            /* split 6..10 around 8 */
36 |     bgt case_9_to_10      /* 9 and 10 are handled further down */
37 |     beq case_8            /* exact hit */
38 |     /* r0 < 8: fall through to case_6_to_7 */
39 | 
40 |   case_6_to_7:
41 |     cmp r0, #7            /* only 6 and 7 can reach this point */
42 |     beq case_7            /* if r0 == 7 branch to case_7 */
43 |     b case_6              /* otherwise it must be r0 == 6,
44 |                              branch to case_6 */
45 | 
46 |   case_9_to_10:
47 |     cmp r0, #10           /* only 9 and 10 can reach this point */
48 |     beq case_10
49 |     b case_9
50 | 
51 |   case_1:                 /* every case body returns its own case value */
52 |      mov r0, #1
53 |      b after_switch
54 |   case_2:
55 |      mov r0, #2
56 |      b after_switch
57 |   case_3:
58 |      mov r0, #3
59 |      b after_switch
60 |   case_4:
61 |      mov r0, #4
62 |      b after_switch
63 |   case_5:
64 |      mov r0, #5
65 |      b after_switch
66 |   case_6:
67 |      mov r0, #6
68 |      b after_switch
69 |   case_7:
70 |      mov r0, #7
71 |      b after_switch
72 |   case_8:
73 |      mov r0, #8
74 |      b after_switch
75 |   case_9:
76 |      mov r0, #9
77 |      b after_switch
78 |   case_10:
79 |      mov r0, #10
80 |      b after_switch
81 | 
82 |   case_default:
83 |    mov r0, #42                /* r0 ← 42 */
84 |    /* no break needed: after_switch is the next label */
85 | 
86 |   after_switch:
87 | 
88 |   bx lr                       /* Return from main */
89 | 


--------------------------------------------------------------------------------
/chapter16/calcjump.s:
--------------------------------------------------------------------------------
 1 | /* calcjump.s */
 2 | .data
 3 | 
 4 | .text
 5 | 
 6 | .globl main
 7 | 
 8 | main:
 9 |   cmp r0, #1                  /* r0 - 1 and update cpsr */
10 |   blt case_default            /* branch to case_default if r0 < 1 */
11 |   cmp r0, #3                  /* r0 - 3 and update cpsr */
12 |   bgt case_default            /* branch to case_default if r0 > 3 */
13 | 
14 |   sub r0, r0, #1              /* r0 ← r0 - 1. Required to index the cases from 0 */
15 |   ldr r1, addr_of_case_1      /* r1 ← &case_1 */
16 |   add r1, r1, r0, LSL #3      /* r1 ← r1 + r0 * 8
17 |                                  Each instruction is 4 bytes
18 |                                  Each case takes 2 instructions
19 |                                  Thus, each case is 8 bytes (4 * 2)
20 |                                  */
21 | 
22 |   mov pc, r1                  /* pc ← r1
23 |                                  This will cause a branch to the
24 |                                  computed address */
25 | 
26 |   case_1:                     /* Every case below must stay exactly 2 instructions long */
27 |    mov r0, #1                 /* r0 ← 1 */ 
28 |    b after_switch             /* break */
29 |  
30 |   case_2:
31 |    mov r0, #2                 /* r0 ← 2 */
32 |    b after_switch             /* break */
33 | 
34 |   case_3:
35 |    mov r0, #3                 /* r0 ← 3 */
36 |    b after_switch             /* break */
37 | 
38 |   case_default:
39 |    mov r0, #42                /* r0 ← 42 */
40 |    b after_switch             /* break (unnecessary) */  
41 | 
42 |   after_switch:
43 | 
44 |   bx lr                       /* Return from main */
45 | 
46 | .align 4
47 | addr_of_case_1: .word case_1
48 | 


--------------------------------------------------------------------------------
/chapter16/hybrid.s:
--------------------------------------------------------------------------------
  1 | /* hybrid.s */
  2 | .data
  3 | 
  4 | .text
  5 | 
  6 | .globl main
  7 | 
  8 | main:
  9 |   push {r4, r5, r6, lr}
 10 | 
 11 |   cmp r0, #1                /* r0 - 1 and update cpsr */
 12 |   blt case_default          /* if r0 < 1 then branch to case_default */
 13 |   cmp r0, #300              /* r0 - 300 and update cpsr */
 14 |   bgt case_default          /* if r0 > 300 then branch to case default */
 15 | 
 16 |   /* prepare the binary search. 
 17 |      r1 will hold the lower index
 18 |      r2 will hold the upper index
 19 |      r3 the base address of the case_value_table
 20 |   */
 21 |   mov r1, #0
 22 |   mov r2, #9
 23 |   ldr r3, addr_case_value_table /* r3 ← &case_value_table */
 24 | 
 25 |   b check_binary_search
 26 |   binary_search:
 27 |     add r4, r1, r2          /* r4 ← r1 + r2 */
 28 |     mov r4, r4, ASR #1      /* r4 ← r4 / 2 */
 29 |     ldr r5, [r3, +r4, LSL #2]   /* r5 ← *(r3 + r4 * 4). 
 30 |                                This is r5 ← case_value_table[r4] */
 31 |     cmp r0, r5              /* r0 - r5 and update cpsr */
 32 |     sublt r2, r4, #1        /* if r0 < r5 then r2 ← r4 - 1 */
 33 |     addgt r1, r4, #1        /* if r0 > r5 then r1 ← r4 + 1 */
 34 |     bne check_binary_search /* if r0 != r5 branch to check_binary_search */
 35 | 
 36 |     /* if we reach here it means that r0 == r5 */
 37 |     ldr r5, addr_case_addresses_table /* r5 ← &case_addresses_table */
 38 |     ldr r5, [r5, +r4, LSL #2]   /* r5 ← *(r5 + r4*4) 
 39 |                                This is r5 ← case_addresses_table[r4] */
 40 |     mov pc, r5              /* branch to the proper case */
 41 |     
 42 |   check_binary_search:
 43 |     cmp r1, r2              /* r1 - r2 and update cpsr */
 44 |     ble binary_search       /* if r1 <= r2 branch to binary_search */
 45 | 
 46 |   /* if we reach here it means the case value
 47 |      was not found. branch to default case */
 48 |   b case_default
 49 | 
 50 |   case_1:            /* every case returns the case value that was matched */
 51 |      mov r0, #1
 52 |      b after_switch
 53 |   case_2:
 54 |      mov r0, #2
 55 |      b after_switch
 56 |   case_3:
 57 |      mov r0, #3
 58 |      b after_switch
 59 |   case_24:
 60 |      mov r0, #24
 61 |      b after_switch
 62 |   case_25:
 63 |      mov r0, #25     /* this used to return 95, which is not a case value */
 64 |      b after_switch
 65 |   case_26:
 66 |      mov r0, #26     /* this used to return 96, which is not a case value */
 67 |      b after_switch
 68 |   case_97:
 69 |      mov r0, #97
 70 |      b after_switch
 71 |   case_98:
 72 |      mov r0, #98
 73 |      b after_switch
 74 |   case_99:
 75 |      mov r0, #99
 76 |      b after_switch
 77 |   case_300:
 78 |      mov r0, #300    /* The error code will be 44 (only the lowest 8 bits of 300 are kept) */
 79 |      b after_switch
 80 | 
 81 |   case_default:
 82 |    mov r0, #42       /* r0 ← 42 */
 83 |    b after_switch    /* break (unnecessary) */  
 84 | 
 85 |   after_switch:
 86 | 
 87 |   pop {r4,r5,r6,lr}
 88 |   bx lr              /* Return from main */
 90 | case_value_table: .word 1, 2, 3, 24, 25, 26, 97, 98, 99, 300   /* ascending case values; main binary-searches indices 0..9 */
 91 | addr_case_value_table: .word case_value_table   /* literal holding &case_value_table, loaded by main */
 92 | /* case_addresses_table[i] is the label that handles case_value_table[i]: both tables share one index */
 93 | case_addresses_table:
 94 |     .word case_1
 95 |     .word case_2
 96 |     .word case_3
 97 |     .word case_24
 98 |     .word case_25
 99 |     .word case_26
100 |     .word case_97
101 |     .word case_98
102 |     .word case_99
103 |     .word case_300
104 | addr_case_addresses_table: .word case_addresses_table   /* literal holding &case_addresses_table, loaded by main */
105 | 


--------------------------------------------------------------------------------
/chapter16/ifstring.s:
--------------------------------------------------------------------------------
 1 | /* ifstring.s */
 2 | .data
 3 | 
 4 | .text
 5 | 
 6 | .globl main
 7 | 
 8 | main:
 9 |   /* switch (r0) written as a chain of compare-and-branch tests */
10 |   cmp r0, #3                  /* is r0 == 3? */
11 |   beq case_3
12 |   cmp r0, #2                  /* is r0 == 2? */
13 |   beq case_2
14 |   cmp r0, #1                  /* is r0 == 1? */
15 |   beq case_1
16 | 
17 |   case_default:               /* no test matched: execution falls through to here */
18 |    mov r0, #42                /* default result */
19 |    b after_switch             /* skip the case bodies */
20 |  
21 |   case_1:
22 |    mov r0, #1                 /* result for r0 == 1 */
23 |    b after_switch             /* break */
24 | 
25 |   case_2:
26 |    mov r0, #2                 /* result for r0 == 2 */
27 |    b after_switch             /* break */
28 | 
29 |   case_3:
30 |    mov r0, #3                 /* result for r0 == 3 */
31 |    b after_switch             /* break (redundant: after_switch is next) */
32 | 
33 |   after_switch:
34 | 
35 |   bx lr                       /* return with the selected value in r0 */
36 | 


--------------------------------------------------------------------------------
/chapter16/jumptable.s:
--------------------------------------------------------------------------------
 1 | /* jumptable.s */
 2 | .data
 3 | 
 4 | .text
 5 | 
 6 | .globl main
 7 | 
 8 | main:
 9 |   cmp r0, #3                  /* anything above 3 has no table entry */
10 |   bgt case_default
11 |   cmp r0, #1                  /* anything below 1 has no table entry */
12 |   blt case_default
13 | 
14 |   ldr r1, addr_of_jump_table  /* r1 ← &jump_table */
15 |   sub r0, r0, #1              /* r0 ← r0 - 1, the zero-based table index */
16 |   ldr r1, [r1, +r0, LSL #2]   /* r1 ← jump_table[r0],
17 |                                  each entry being 4 bytes wide */
18 | 
19 |   mov pc, r1                  /* writing pc transfers control
20 |                                  to the handler address that was
21 |                                  just read from the table */
22 | 
23 |   case_3:
24 |    mov r0, #3                 /* result for the value 3 */
25 |    b after_switch             /* break */
26 |  
27 |   case_2:
28 |    mov r0, #2                 /* result for the value 2 */
29 |    b after_switch             /* break */
30 | 
31 |   case_1:
32 |    mov r0, #1                 /* result for the value 1 */
33 |    b after_switch             /* break */
34 | 
35 |   case_default:
36 |    mov r0, #42                /* result for every other value */
37 |    b after_switch             /* break (redundant: after_switch is next) */  
38 | 
39 |   after_switch:
40 | 
41 |   bx lr                       /* return with the selected value in r0 */
42 | 
43 | .align 4                      /* on ARM, .align n pads to a 2^n-byte boundary: 16 bytes here */
44 | jump_table:                   /* entry i holds the handler address for the value i + 1 */
45 |    .word case_1
46 |    .word case_2
47 |    .word case_3
48 | 
49 | .align 4
50 | addr_of_jump_table: .word jump_table   /* literal holding &jump_table, loaded by main */
51 | 


--------------------------------------------------------------------------------
/chapter17/Makefile:
--------------------------------------------------------------------------------
 1 | EXES := first_pointer wrong_pointer good_pointer array_by_value array_by_ref double_array
 2 | OBJS := $(EXES:%=%.o)
 3 | all: $(EXES) $(OBJS)
 4 | # Link step: one executable per object file, with gcc driving the linker.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in rule that builds a program straight from a .s file.
 8 | % : %.s
 9 | # Assemble step: one object file per source file.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | 
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter17/array_by_ref.s:
--------------------------------------------------------------------------------
 1 | /* array_by_ref.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4
 6 | /* big_array: 256 consecutive words holding the values 0 to 255 */
 7 | big_array :
 8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
 9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41
10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61
11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81
12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116
14 | .word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132
15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148
16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164
17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180
18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196
19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212
20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228
21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244
22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
23 | 
24 | .align 4
25 | /* printf format used by main; %d receives the sum returned by sum_array_ref */
26 | message: .asciz "The sum of 0 to 255 is %d\n"
27 | 
28 | .text
29 | .globl main
30 | 
31 | sum_array_ref : 
32 |     /* Parameters: 
33 |            r0  Number of items
34 |            r1  Address of the array
35 |        Returns in r0 the sum of the first r0 words found at r1 */
36 |     push {r4, r5, r6, lr}
37 | 
38 |     /* Only the address of the array was passed: items are read through r1 */
39 | 
40 |     /* r4 will hold the sum so far, r5 the index of the next item */
41 |     mov r4, #0      /* r4 ← 0 */
42 |     mov r5, #0      /* r5 ← 0 */
43 | 
44 |     b .Lcheck_loop_array_sum
45 |     .Lloop_array_sum:
46 |       ldr r6, [r1, r5, LSL #2]   /* r6 ← *(r1 + r5 * 4) */
47 |       add r4, r4, r6             /* r4 ← r4 + r6 */
48 |       add r5, r5, #1             /* r5 ← r5 + 1 */
49 |     .Lcheck_loop_array_sum:
50 |       cmp r5, r0                 /* r5 - r0 and update cpsr */
51 |       blt .Lloop_array_sum       /* if r5 < r0 go to .Lloop_array_sum; a count <= 0 reads nothing */
52 | 
53 |     mov r0, r4  /* r0 ← r4, to return the value of the sum */
54 |     pop {r4, r5, r6, lr}
55 | 
56 |     bx lr
57 |     
58 | 
59 | main:
60 |     push {r4, lr}
61 |     /* we will not use r4 but we need to keep the stack 8-byte aligned */
62 | 
63 |     mov r0, #256                /* first parameter, number of items */
64 |     ldr r1, address_of_big_array /* second parameter, &big_array */
65 | 
66 |     bl sum_array_ref            /* r0 ← sum of the 256 items */
67 | 
68 |     /* prepare the call to printf */
69 |     mov r1, r0                  /* second parameter, the sum itself */
70 |     ldr r0, address_of_message  /* first parameter, the message */
71 |     bl printf
72 |     mov r0, #0                  /* exit status 0 instead of whatever printf returned */
73 |     pop {r4, lr}
74 |     bx lr
75 | 
76 | address_of_big_array : .word big_array   /* literal holding &big_array */
77 | address_of_message : .word message       /* literal holding &message */
78 | 


--------------------------------------------------------------------------------
/chapter17/array_by_value.s:
--------------------------------------------------------------------------------
  1 | /* array_by_value.s */
  2 | 
  3 | .data
  4 | 
  5 | .align 4
  6 | /* big_array: 256 consecutive words holding the values 0 to 255 */
  7 | big_array :
  8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
  9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41
 10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61
 11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81
 12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
 13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116
 14 | .word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132
 15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148
 16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164
 17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180
 18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196
 19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212
 20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228
 21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244
 22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
 23 | 
 24 | .align 4
 25 | /* printf format used by main; %d receives the sum returned by sum_array_value */
 26 | message: .asciz "The sum of 0 to 255 is %d\n"
 27 | 
 28 | .text
 29 | .globl main
 30 | 
 31 | sum_array_value : 
 32 |     push {r4, r5, r6, lr}
 33 |     /* Parameters: r0 number of items; r1, r2, r3 the first three items;
 34 |        the remaining items are read from the caller's stack. Returns the sum in r0 */
 35 | 
 36 |     /* r4 will hold the sum so far */
 37 |     mov r4, #0      /* r4 ← 0 */
 38 |     /* In r0 we have the number of items of the array */
 39 | 
 40 |     cmp r0, #1            /* r0 - #1 and update cpsr */
 41 |     blt .Lend_of_sum_array  /* if r0 < 1 branch to end_of_sum_array */
 42 |     add r4, r4, r1        /* add the first item */
 43 | 
 44 |     cmp r0, #2            /* r0 - #2 and update cpsr */
 45 |     blt .Lend_of_sum_array  /* if r0 < 2 branch to end_of_sum_array */
 46 |     add r4, r4, r2        /* add the second item */
 47 | 
 48 |     cmp r0, #3            /* r0 - #3 and update cpsr */
 49 |     blt .Lend_of_sum_array  /* if r0 < 3 branch to end_of_sum_array */
 50 |     add r4, r4, r3        /* add the third item */
 51 | 
 52 |     /* 
 53 |      The stack at this point looks like this
 54 |        |                | (lower addresses)
 55 |        |                |
 56 |        | r4             |  <- sp points here (push stores the lowest-numbered register lowest)
 57 |        | r5             |  <- this is sp + 4
 58 |        | r6             |  <- this is sp + 8
 59 |        | lr             |  <- this is sp + 12
 60 |        | big_array[3]   |  <- this is sp + 16 (we want r5 to point here)
 61 |        | big_array[4]   |
 62 |        |     ...        |
 63 |        | big_array[255] |
 64 |        |                | 
 65 |        |                | (higher addresses)
 66 |     
 67 |     keep in r5 the address where the stack-passed portion of the array starts */
 68 |     add r5, sp, #16 /* r5 ← sp + 16 */
 69 | 
 70 |     /* in register r3 we will count how many items we have read
 71 |        from the stack (its parameter value was already added above). */
 72 |     mov r3, #0
 73 | 
 74 |     /* in the stack there will always be 3 less items because
 75 |        the first 3 were already passed in registers
 76 |        (recall that r0 had how many items were in the array) */
 77 |     sub r0, r0, #3
 78 | 
 79 |     b .Lcheck_loop_sum_array
 80 |     .Lloop_sum_array:
 81 |       ldr r6, [r5, r3, LSL #2]       /* r6 ← *(r5 + r3 * 4) load
 82 |                                         the array item r3 from the stack */
 83 |       add r4, r4, r6                 /* r4 ← r4 + r6
 84 |                                         accumulate in r4 */
 85 |       add r3, r3, #1                 /* r3 ← r3 + 1 
 86 |                                         move to the next item */
 87 |     .Lcheck_loop_sum_array:
 88 |       cmp r3, r0           /* r3 - r0 and update cpsr */
 89 |       blt .Lloop_sum_array   /* if r3 < r0 branch to loop_sum_array */
 90 | 
 91 |   .Lend_of_sum_array:
 92 |     mov r0, r4  /* r0 ← r4, to return the value of the sum */
 93 |     pop {r4, r5, r6, lr}
 94 | 
 95 |     bx lr
 96 |     
 97 | 
 98 | main:
 99 |     push {r4, r5, r6, r7, r8, lr}
100 |     /* we will not use r8 but we need to keep the stack 8-byte aligned */
101 | 
102 |     ldr r4, address_of_big_array   /* r4 ← &big_array */
103 | 
104 |     /* Prepare call */
105 | 
106 |     mov r0, #256  /* Load in the first parameter the number of items 
107 |                      r0 ← 256
108 |                      */
109 | 
110 |     ldr r1, [r4]     /* load in the second parameter the first item of the array */
111 |     ldr r2, [r4, #4] /* load in the third parameter the second item of the array */
112 |     ldr r3, [r4, #8] /* load in the fourth parameter the third item of the array */
113 | 
114 |     /* before pushing anything in the stack keep its position */
115 |     mov r7, sp
116 | 
117 |     /* We cannot use more registers, now we have to push them onto the stack
118 |        (in reverse order) */
119 |     mov r5, #255   /* r5 ← 255
120 |                       This is the last item position
121 |                       (note that the first would be in position 0) */
122 |     /* NOTE(review): items 255 down to 3 are pushed, 253 words, so sp is only 4-byte
123 |        aligned at the call below; sum_array_value is a local routine that calls nothing */
124 |     b .Lcheck_pass_parameter_loop
125 |     .Lpass_parameter_loop:
126 | 
127 |       ldr r6, [r4, r5, LSL #2]  /* r6 ← *(r4 + r5 * 4).
128 |                                    loads the item in position r5 into r6. Note that
129 |                                    we have to multiply by 4 because this is the size
130 |                                    of each item in the array */
131 |       push {r6}                 /* push the loaded value to the stack */
132 |       sub r5, r5, #1            /* we are done with the current item,
133 |                                    go to the previous index of the array */
134 |     .Lcheck_pass_parameter_loop:
135 |       cmp r5, #2                /* compute r5 - #2 and update cpsr */
136 |       bne .Lpass_parameter_loop   /* if r5 != #2 branch to pass_parameter_loop */
137 | 
138 |     /* We are done, we have passed all the values of the array,
139 |        now call the function */
140 |     bl sum_array_value
141 | 
142 |     /* restore the stack position */
143 |     mov sp, r7
144 | 
145 |     /* prepare the call to printf */
146 |     mov r1, r0                  /* second parameter, the sum itself */
147 |     ldr r0, address_of_message  /* first parameter, the message */
148 |     bl printf
149 |     mov r0, #0                  /* exit status 0 instead of whatever printf returned */
150 |     pop {r4, r5, r6, r7, r8, lr}
151 |     bx lr
152 | 
153 | address_of_big_array : .word big_array   /* literal holding &big_array */
154 | address_of_message : .word message       /* literal holding &message */
155 | 


--------------------------------------------------------------------------------
/chapter17/double_array.s:
--------------------------------------------------------------------------------
  1 | /* double_array.s */
  2 | 
  3 | .data
  4 | 
  5 | .align 4
  6 | /* big_array: 256 consecutive words, initially 0 to 255; double_array rewrites them in place */
  7 | big_array :
  8 | .word 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
  9 | .word 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41
 10 | .word 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61
 11 | .word 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81
 12 | .word 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
 13 | .word 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116
 14 | .word 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132
 15 | .word 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148
 16 | .word 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164
 17 | .word 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180
 18 | .word 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196
 19 | .word 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212
 20 | .word 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228
 21 | .word 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244
 22 | .word 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255
 23 | 
 24 | .align 4
 25 | /* printf format used by print_each_item: first %d is the position, second %d the value */
 26 | message: .asciz "Item at position %d has value %d\n"
 27 | 
 28 | .text
 29 | .globl main
 30 | 
 31 | double_array : 
 32 |     /* Doubles, in place, every word of an array.
 33 |            r0  how many items the array has
 34 |            r1  where the array starts
 35 |     */
 36 |     stmfd sp!, {r4, r5, r6, lr}  /* same store as push {r4, r5, r6, lr} */
 37 | 
 38 |     mov r4, #0                   /* r4 is the index of the item being processed */
 39 | 
 40 |     b .Ldouble_test
 41 |     .Ldouble_next:
 42 |       ldr r5, [r1, r4, LSL #2]   /* read item r4 */
 43 |       add r5, r5, r5             /* item + item, that is item * 2 */
 44 |       str r5, [r1, r4, LSL #2]   /* write it back to the same slot */
 45 |       add r4, r4, #1             /* advance to the following item */
 46 |     .Ldouble_test:
 47 |       cmp r4, r0                 /* has the index reached the item count? */
 48 |       bne .Ldouble_next          /* not yet: process another item */
 49 | 
 50 |     ldmfd sp!, {r4, r5, r6, lr}  /* same load as pop {r4, r5, r6, lr} */
 51 | 
 52 |     bx lr
 53 |     
 54 | print_each_item:
 55 |     stmfd sp!, {r4, r5, r6, r7, r8, lr} /* six registers, 24 bytes; r5 and r8 are not used */
 56 |     /* Prints "position, value" for each item. r0: item count, r1: array address */
 57 |     mov r7, r1      /* r7 ← array address, r1 is needed for the printf arguments */
 58 |     mov r6, r0      /* r6 ← item count, r0 is needed for the printf arguments */
 59 |     mov r4, #0      /* r4 ← 0, position of the item to print */
 60 | 
 61 | 
 62 |     b .Lprint_test
 63 |     .Lprint_next:
 64 |       /* Set up printf(message, position, value) */
 65 | 
 66 | 
 67 |       mov r1, r4                 /* second argument: the position */
 68 |       ldr r2, [r7, r4, LSL #2]   /* third argument: *(r7 + r4 * 4), the value */
 69 |       ldr r0, address_of_message /* first argument: the format string */
 70 |       bl printf
 71 | 
 72 |       add r4, r4, #1             /* following position */
 73 |     .Lprint_test:
 74 |       cmp r4, r6                 /* has the position reached the item count? */
 75 |       bne .Lprint_next           /* not yet: print another item */
 76 | 
 77 |     ldmfd sp!, {r4, r5, r6, r7, r8, lr}
 78 |     bx lr
 79 | 
 80 | main:
 81 |     push {r4, lr}
 82 |     /* we will not use r4 but we need to keep the stack 8-byte aligned */
 83 | 
 84 |     /* first call print_each_item: shows the original values */
 85 |     mov r0, #256                   /* first_parameter: number of items */
 86 |     ldr r1, address_of_big_array   /* second parameter: address of the array */
 87 |     bl print_each_item             /* call to print_each_item */
 88 | 
 89 |     /* call to double_array: it modifies big_array through the address it receives */
 90 |     mov r0, #256                   /* first_parameter: number of items */
 91 |     ldr r1, address_of_big_array   /* second parameter: address of the array */
 92 |     bl double_array               /* call to double_array */
 93 | 
 94 |     /* second call print_each_item: shows the doubled values */
 95 |     mov r0, #256                   /* first_parameter: number of items */
 96 |     ldr r1, address_of_big_array   /* second parameter: address of the array */
 97 |     bl print_each_item             /* call to print_each_item */
 98 |     mov r0, #0                     /* exit status 0 instead of whatever the last call left in r0 */
 99 |     pop {r4, lr}
100 |     bx lr
101 | 
102 | address_of_big_array : .word big_array   /* literal holding &big_array */
103 | address_of_message : .word message       /* literal holding &message, used by print_each_item */
104 | 


--------------------------------------------------------------------------------
/chapter17/first_pointer.s:
--------------------------------------------------------------------------------
 1 | /* first_pointer.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4
 6 | number_1  : .word 3          /* the integer that will be read through a pointer */
 7 | 
 8 | .text
 9 | .globl main
10 | 
11 | /* main: returns number_1, reached through the pointer stored at pointer_to_number */
12 | main:
13 |     ldr r0, pointer_to_number   /* r0 ← pointer_to_number, that is r0 ← &number_1 */
14 |     ldr r0, [r0]                /* r0 ← *r0, that is r0 ← number_1 */
15 | 
16 |     bx lr                       /* return with number_1 in r0 */
17 | 
18 | pointer_to_number: .word number_1   /* a word in .text whose value is the address of number_1 */
19 | 


--------------------------------------------------------------------------------
/chapter17/good_pointer.s:
--------------------------------------------------------------------------------
 1 | /* good_pointer.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4
 6 | number_1  : .word 3
 7 | number_2  : .word 4
 8 | pointer_to_number: .word 0   /* the pointer variable is in .data; main stores into it */
 9 | 
10 | .text
11 | .globl main
12 | 
13 | /* main: stores &number_2 into pointer_to_number, then returns the number it points to */
14 | main:
15 |     ldr r0, addr_of_pointer_to_number
16 |                              /* r0 ← &pointer_to_number */
17 | 
18 |     ldr r1, addr_of_number_2 /* r1 ← &number_2 */
19 | 
20 |     str r1, [r0]             /* *r0 ← r1.
21 |                                 This is actually
22 |                                   pointer_to_number ← &number_2 */
23 | 
24 |     ldr r1, [r0]             /* r1 ← *r0.
25 |                                 This is actually
26 |                                   r1 ← pointer_to_number
27 |                                 Since pointer_to_number has the value &number_2
28 |                                 then this is like
29 |                                   r1 ← &number_2
30 |                              */
31 |                                
32 | 
33 |     ldr r0, [r1]             /* r0 ← *r1
34 |                                 Since r1 had as value &number_2
35 |                                 then this is like
36 |                                    r0 ← number_2
37 |                              */
38 | 
39 |     bx lr                    /* return with number_2 in r0 */
40 | 
41 | addr_of_pointer_to_number: .word pointer_to_number   /* literal holding &pointer_to_number */
42 | addr_of_number_1: .word number_1                     /* literal holding &number_1; not referenced in this file */
43 | addr_of_number_2: .word number_2                     /* literal holding &number_2 */
44 | 


--------------------------------------------------------------------------------
/chapter17/wrong_pointer.s:
--------------------------------------------------------------------------------
 1 | /* wrong_pointer.s */
 2 | 
 3 | .data
 4 | 
 5 | .align 4
 6 | number_1  : .word 3
 7 | number_2  : .word 4
 8 | 
 9 | .text
10 | .globl main
11 | /* NOTE(review): pointer_to_number is defined after main, inside .text, so the str below writes into the code section */
12 | main:
13 |     ldr r1, address_of_number_2  /* r1 ← &number_2 */
14 |     str r1, pointer_to_number    /* pointer_to_number ← r1, this is pointer_to_number ← &number_2 */
15 |     /* NOTE(review): presumably the intentionally faulty example (see file name); the store faults if .text is mapped read-only */
16 |     bx lr
17 | 
18 | pointer_to_number: .word number_1     /* initially holds &number_1 */
19 | address_of_number_2: .word number_2   /* literal holding &number_2 */
20 | 


--------------------------------------------------------------------------------
/chapter18/Makefile:
--------------------------------------------------------------------------------
 1 | EXES := square
 2 | OBJS := $(EXES:%=%.o)
 3 | all: $(EXES) $(OBJS)
 4 | # Link step: one executable per object file, with gcc driving the linker.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in rule that builds a program straight from a .s file.
 8 | % : %.s
 9 | # Assemble step: one object file per source file.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | 
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter18/square:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rofirrim/raspberry-pi-assembler/75685f80a35318777fad9dc33837698c19952e89/chapter18/square


--------------------------------------------------------------------------------
/chapter18/square.s:
--------------------------------------------------------------------------------
 1 | /* square.s */
 2 | 
 3 | .data
 4 | /* printf format used by main; %d receives the value returned by sq_sum5 */
 5 | .align 4
 6 | message: .asciz "Sum of 1^2 + 2^2 + 3^2 + 4^2 + 5^2 is %d\n"
 7 | 
 8 | .text
 9 | 
10 |     
11 | sq:              /* sq: squares, in place, the word whose address is in r0; overwrites r1 */
12 |   ldr r1, [r0]   /* r1 ← (*r0) */
13 |   mul r1, r1, r1 /* r1 ← r1 * r1 */
14 |   str r1, [r0]   /* (*r0) ← r1 */
15 |   bx lr          /* return; nothing was pushed, so there is nothing to restore */
16 | 
17 | sq_sum5:
18 |   /* sq_sum5(a, b, c, d, e): returns a^2 + b^2 + c^2 + d^2 + e^2 in r0.
19 |      a..d arrive in r0-r3; e is read from [fp, #8], where the caller stored it. */
20 |   push {fp, lr}         /* save the caller's fp and our return address */
21 |   mov fp, sp            /* fp ← sp: base of this frame (the dynamic link) */
22 | 
23 |   push {r0, r1, r2, r3} /* spill a..d just below fp: sp drops 16 bytes, r0 lands lowest */
24 | 
25 |   /* Frame while sq is being called, lowest address first:
26 |        [fp, #-16]  a          (also [sp])
27 |        [fp, #-12]  b
28 |        [fp, #-8]   c
29 |        [fp, #-4]   d
30 |        [fp]        saved fp
31 |        [fp, #4]    saved lr
32 |        [fp, #8]    e          (slot filled by the caller)
33 |   */
34 | 
35 |   /* Square every value in place. Each call touches a different slot,
36 |      so the order of the five calls does not matter. */
37 |   add r0, fp, #8     /* r0 ← &e, above the saved fp and lr */
38 |   bl sq              /* sq(&e) */
39 | 
40 |   sub r0, fp, #4     /* r0 ← &d */
41 |   bl sq              /* sq(&d) */
42 |   sub r0, fp, #8     /* r0 ← &c */
43 |   bl sq              /* sq(&c) */
44 |   sub r0, fp, #12    /* r0 ← &b */
45 |   bl sq              /* sq(&b) */
46 |   sub r0, fp, #16    /* r0 ← &a */
47 |   bl sq              /* sq(&a) */
48 | 
49 |   /* Add the five squares: r0 accumulates, r1 carries each term.
50 |      e is added last, as before, so r1 ends up holding e^2. */
51 |   ldr r0, [fp, #-4]  /* r0 ← d^2 */
52 |   ldr r1, [fp, #-8]  /* r1 ← c^2 */
53 |   add r0, r0, r1
54 |   ldr r1, [fp, #-12] /* r1 ← b^2 */
55 |   add r0, r0, r1
56 |   ldr r1, [fp, #-16] /* r1 ← a^2 */
57 |   add r0, r0, r1
58 |   ldr r1, [fp, #8]   /* r1 ← e^2 */
59 |   add r0, r0, r1     /* r0 now holds the complete sum */
60 | 
61 |   /* Tear the frame down */
62 |   mov sp, fp         /* drop the four spilled words */
63 |   pop {fp, lr}       /* restore the caller's fp and the return address */
64 | 
65 | 
66 |   bx lr
67 | 
68 | .globl main
69 | 
70 | main:
71 |     push {r4, lr}          /* Keep callee-saved registers (r4 is used below) */
72 | 
73 |     /* Prepare the call to sq_sum5 */
74 |     mov r0, #1             /* Parameter a ← 1 */
75 |     mov r1, #2             /* Parameter b ← 2 */
76 |     mov r2, #3             /* Parameter c ← 3 */
77 |     mov r3, #4             /* Parameter d ← 4 */
78 | 
79 |     /* Parameter e goes through the stack,
80 |        so it requires enlarging the stack */
81 |     mov r4, #5             /* r4 ← 5 */
82 |     sub sp, sp, #8         /* Enlarge the stack 8 bytes,
83 |                               we will use only the
84 |                               topmost 4 bytes */
85 |     str r4, [sp]           /* Parameter e ← 5 */
86 |     bl sq_sum5             /* call sq_sum5(1, 2, 3, 4, 5) */
87 |     add sp, sp, #8         /* Shrink back the stack */
88 | 
89 |     /* Prepare the call to printf */
90 |     mov r1, r0             /* The result of sq_sum5 */
91 |     ldr r0, address_of_message
92 |     bl printf              /* Call printf */
93 |     mov r0, #0             /* exit status 0 instead of whatever printf returned */
94 |     pop {r4, lr}           /* Restore callee-saved registers */
95 |     bx lr
96 | 
97 | 
98 | address_of_message: .word message   /* literal holding &message */
99 | 


--------------------------------------------------------------------------------
/chapter19/Makefile:
--------------------------------------------------------------------------------
 1 | EXES := write_c write_sys
 2 | OBJS := $(EXES:%=%.o)
 3 | all: $(EXES) $(OBJS)
 4 | # Link step: one executable per object file, with gcc driving the linker.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in rule that builds a program straight from a .s file.
 8 | % : %.s
 9 | # Assemble step: one object file per source file.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | 
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter19/write_c.s:
--------------------------------------------------------------------------------
 1 | /* write_c.s */
 2 | 
 3 | .data
 4 | 
 5 | /* .ascii emits no trailing NUL, so size_of_greeting is exactly the 12 bytes to print */
 6 | greeting: .ascii "Hello world\n"
 7 | after_greeting:
 8 | 
 9 | .set size_of_greeting, after_greeting - greeting
10 | 
11 | .text
12 | 
13 | .globl main
14 | /* main: prints the greeting with the C library's write(1, greeting, size) and returns 0 */
15 | main:
16 |     push {r4, lr}               /* lr is needed after the call; r4 keeps the stack 8-byte aligned */
17 |     mov r0, #1                  /* first parameter: file descriptor 1 */
18 |     ldr r1, addr_of_greeting    /* second parameter: &greeting */
19 |     mov r2, #size_of_greeting   /* third parameter: number of bytes to write */
20 |     bl write
21 | 
22 |     mov r0, #0                  /* exit status 0 */
23 | 
24 |     pop {r4, lr}
25 |     bx lr
26 | 
27 | addr_of_greeting : .word greeting   /* literal holding &greeting */
28 | 


--------------------------------------------------------------------------------
/chapter19/write_sys.s:
--------------------------------------------------------------------------------
 1 | /* write_sys.s */
 2 | 
 3 | .data
 4 | 
 5 | /* .ascii emits no trailing NUL, so size_of_greeting is exactly the 12 bytes to print */
 6 | greeting: .ascii "Hello world\n"
 7 | after_greeting:
 8 | 
 9 | .set size_of_greeting, after_greeting - greeting
10 | 
11 | .text
12 | 
13 | .globl main
14 | /* main: prints the greeting by invoking the 'write' system call directly, then returns 0 */
15 | main:
16 |     push {r7, lr}               /* r7 is callee-saved and is overwritten below with the system call number */
17 | 
18 |     /* Prepare the system call */
19 |     mov r0, #1                  /* r0 ← 1 */
20 |     ldr r1, addr_of_greeting    /* r1 ← &greeting */
21 |     mov r2, #size_of_greeting   /* r2 ← number of bytes in greeting */
22 | 
23 |     mov r7, #4                  /* select system call 'write' */
24 |     swi #0                      /* perform the system call */
25 | 
26 |     mov r0, #0                  /* exit status 0 */
27 |     pop {r7, lr}                /* give the caller back its r7 */
28 |     bx lr
29 | 
30 | addr_of_greeting : .word greeting   /* literal holding &greeting */
31 | 


--------------------------------------------------------------------------------
/chapter20/Makefile:
--------------------------------------------------------------------------------
 1 | EXES := direct indirect greeter_01 greeter_02
 2 | OBJS := $(EXES:%=%.o)
 3 | all: $(EXES) $(OBJS)
 4 | # Link step: one executable per object file, with gcc driving the linker.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in rule that builds a program straight from a .s file.
 8 | % : %.s
 9 | # Assemble step: one object file per source file.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | 
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter20/direct.s:
--------------------------------------------------------------------------------
 1 | .data     /* data section */
 2 | .align 4  /* on ARM, .align 4 pads to a 2^4 = 16-byte boundary, which also satisfies 4-byte alignment */
 3 | message: .asciz "Hello world\n"
 4 | 
 5 | .text     /* text section (= code) */
 6 | 
 7 | .p2align 4  /* same padding as .align 4 on ARM: a 2^4 = 16-byte boundary */
 8 | say_hello:
 9 |     stmfd sp!, {r4, lr}      /* same store as push {r4, lr}: lr must survive
10 |                                 the call to printf, and r4 rounds the frame
11 |                                 up to 8 bytes as AAPCS asks at a call */
12 |     /* printf(message) */
13 |     ldr r0, addr_of_message  /* only argument: &message */
14 |     bl printf
15 |     ldmfd sp!, {r4, lr}      /* same load as pop {r4, lr} */
16 |     bx lr                    /* back to whoever called say_hello */
17 | 
18 | .p2align 4
19 | addr_of_message: .word message  /* literal holding &message */
20 | 
21 | .global main  /* export 'main' so the C runtime can find it */
22 | .p2align 4    /* same padding as .align 4 on ARM: a 2^4 = 16-byte boundary */
23 | main:
24 |     stmfd sp!, {r4, lr}      /* same store as push {r4, lr}: lr must survive
25 |                                 the call to say_hello, and r4 rounds the frame
26 |                                 up to 8 bytes as AAPCS asks at a call */
27 |     bl say_hello             /* direct call: the target is named by its label */
28 | 
29 |     mov r0, #0               /* exit status 0 */
30 |     ldmfd sp!, {r4, lr}      /* same load as pop {r4, lr} */
31 |     bx lr                    /* back to the C runtime */
32 | 
33 | 


--------------------------------------------------------------------------------
/chapter20/greeter_01.s:
--------------------------------------------------------------------------------
 1 | .data     /* initialized, writable data */
 2 | .align 4  /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 3 | message_1: .asciz "Hello\n"     /* string printed by say_hello */
 4 | .align 4  /* 16-byte boundary again */
 5 | message_2: .asciz "Bonjour\n"   /* string printed by say_bonjour */
 6 | 
 7 | .text     /* text section (= code) */
 8 | 
 9 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
10 | say_hello:
11 |     /* say_hello: print the string at 'message_1' through printf.
12 |        Takes no arguments. */
13 |     push {r4, lr}             /* bl below overwrites lr, so save it; r4 is
14 |                                  pushed only to keep sp 8-byte aligned (AAPCS) */
15 |     ldr r0, addr_of_message_1 /* r0 ← &message_1, the first argument of printf */
16 |     bl printf                 /* printf(message_1) */
17 |     pop {r4, pc}              /* restore r4 and return: the saved lr
18 |                                  goes straight into pc */
19 | 
20 | .align 4  /* pads to 16 bytes; the literal below only needs 4 */
21 | addr_of_message_1: .word message_1
22 | 
23 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
24 | say_bonjour:
25 |     /* say_bonjour: print the string at 'message_2' through printf.
26 |        Takes no arguments. */
27 |     push {r4, lr}             /* bl below overwrites lr, so save it; r4 is
28 |                                  pushed only to keep sp 8-byte aligned (AAPCS) */
29 |     ldr r0, addr_of_message_2 /* r0 ← &message_2, the first argument of printf */
30 |     bl printf                 /* printf(message_2) */
31 |     pop {r4, pc}              /* restore r4 and return: the saved lr
32 |                                  goes straight into pc */
33 | 
34 | .align 4  /* pads to 16 bytes; the literal below only needs 4 */
35 | addr_of_message_2: .word message_2
36 | 
37 | .align 4
38 | greeter:
39 |     /* greeter: r0 holds the address of a function; call it and
40 |        return to our own caller. */
41 |     push {r4, lr}            /* blx overwrites lr; r4 keeps sp 8-byte aligned */
42 |     blx r0                   /* indirect call: branch-and-link to the address in r0 */
43 |     pop {r4, pc}             /* restore r4 and return by loading the
44 |                                 saved lr into pc */
45 | 
46 | .globl main /* make the 'main' label visible to the linker */
47 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
48 | main:
49 |     /* Call greeter twice, handing it first the address of say_hello
50 |        and then the address of say_bonjour; return 0. */
51 |     push {r4, lr}            /* bl overwrites lr; r4 keeps sp 8-byte aligned */
52 | 
53 |     ldr r0, addr_say_hello   /* r0 ← &say_hello, the function greeter will call */
54 |     bl greeter               /* greeter(&say_hello) */
55 | 
56 |     ldr r0, addr_say_bonjour /* r0 ← &say_bonjour */
57 |     bl greeter               /* greeter(&say_bonjour) */
58 | 
59 |     mov r0, #0               /* r0 ← 0, the value main returns */
60 |     pop {r4, pc}             /* restore r4 and return via the saved lr */
61 | 
62 | /* literal words: addresses of the two greeting functions */
63 | addr_say_hello : .word say_hello
64 | addr_say_bonjour : .word say_bonjour
65 | 


--------------------------------------------------------------------------------
/chapter20/greeter_02.s:
--------------------------------------------------------------------------------
  1 | .data     /* data section */
  2 | 
  3 | .align 4  /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
  4 | message_hello: .asciz "Hello %s\n"
  5 | .align 4  /* 16-byte boundary, as above */
  6 | message_bonjour: .asciz "Bonjour %s\n"
  7 | 
  8 | /* tags for kinds of people: each tag is one word holding the address of a greeting function */
  9 | .align 4  /* 16-byte boundary */
 10 | person_english : .word say_hello /* tag for people
 11 |                                      that will be greeted
 12 |                                      in English */
 13 | .align 4  /* 16-byte boundary */
 14 | person_french : .word say_bonjour /* tag for people
 15 |                                      that will be greeted
 16 |                                      in French */
 17 | 
 18 | /* several names to be used in the people definition */
 19 | .align 4
 20 | name_pierre: .asciz "Pierre"
 21 | .align 4
 22 | name_john: .asciz "John"
 23 | .align 4
 24 | name_sally: .asciz "Sally"
 25 | .align 4
 26 | name_bernadette: .asciz "Bernadette"
 27 | 
 28 | /* some people: each record is two words, { address of the name, address of the tag } */
 29 | .align 4
 30 | person_john: .word name_john, person_english
 31 | .align 4
 32 | person_pierre: .word name_pierre, person_french
 33 | .align 4
 34 | person_sally: .word name_sally, person_english
 35 | .align 4
 36 | person_bernadette: .word name_bernadette, person_french
 37 | 
 38 | /* array of people: four words, each the address of one person record */
 39 | people : .word person_john, person_pierre, person_sally, person_bernadette
 40 | 
 41 | .text     /* text section (= code) */
 42 | 
 43 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
 44 | say_hello:
 45 |     /* say_hello: r0 holds the address of a name string; print it
 46 |        through printf using the format at message_hello. */
 47 |     push {r4, lr}            /* bl below overwrites lr, so save it; r4 is
 48 |                                 pushed only to keep sp 8-byte aligned (AAPCS) */
 49 |     mov r1, r0               /* r1 ← name, the second argument of printf */
 50 |     ldr r0, addr_of_message_hello
 51 |                              /* r0 ← &message_hello, the format string */
 52 |     bl printf                /* printf(message_hello, name) */
 53 |     pop {r4, pc}             /* restore r4 and return: the saved lr
 54 |                                 goes straight into pc */
 55 | 
 56 | .align 4  /* pads to 16 bytes; the literal below only needs 4 */
 57 | addr_of_message_hello: .word message_hello
 58 | 
 59 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
 60 | say_bonjour:
 61 |     /* say_bonjour: r0 holds the address of a name string; print it
 62 |        through printf using the format at message_bonjour. */
 63 |     push {r4, lr}            /* bl below overwrites lr, so save it; r4 is
 64 |                                 pushed only to keep sp 8-byte aligned (AAPCS) */
 65 |     mov r1, r0               /* r1 ← name, the second argument of printf */
 66 |     ldr r0, addr_of_message_bonjour
 67 |                              /* r0 ← &message_bonjour, the format string */
 68 |     bl printf                /* printf(message_bonjour, name) */
 69 |     pop {r4, pc}             /* restore r4 and return: the saved lr
 70 |                                 goes straight into pc */
 71 | 
 72 | .align 4  /* pads to 16 bytes; the literal below only needs 4 */
 73 | addr_of_message_bonjour: .word message_bonjour
 74 | 
 75 | /* greet_person: r0 holds the address of a person record { name address, tag address } */
 76 | .align 4
 77 | greet_person:
 78 |     push {r4, lr}            /* keep lr because blx below overwrites it;
 79 |                                 r4 is saved because we use it here, and the
 80 |                                 two words keep the stack 8-byte aligned (AAPCS) */
 81 | 
 82 |     /* prepare indirect function call */
 83 |     mov r4, r0               /* r4 ← r0, keep the first parameter in r4 */
 84 |     ldr r0, [r4]             /* r0 ← *r4, this is the address to the name
 85 |                                 of the person and the first parameter
 86 |                                 of the indirect called function*/
 87 | 
 88 |     ldr r1, [r4, #4]         /* r1 ← *(r4 + 4) this is the address
 89 |                                 to the person tag */
 90 |     ldr r1, [r1]             /* r1 ← *r1, the address of the
 91 |                                 specific greeting function */
 92 | 
 93 |     blx r1                   /* indirect call to r1, this is
 94 |                                 the specific greeting function */
 95 | 
 96 |     pop {r4, lr}             /* restore r4 and lr */
 97 |     bx lr                    /* return to the caller */
 98 | 
 99 | .globl main /* make the 'main' label visible to the linker */
100 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
101 | main:
102 |     /* Greet every entry of the 'people' array in order, then return 0.
103 |        Register use:
104 |          r4 = address of people (an array of addresses of person records)
105 |          r5 = index of the next person to greet
106 |          r6 = saved and restored but not otherwise used here */
107 |     push {r4, r5, r6, lr}    /* r4 and r5 must survive the calls; bl
108 |                                 overwrites lr; four words keep sp 8-byte aligned */
109 | 
110 |     mov r5, #0               /* index ← 0 */
111 |     ldr r4, addr_of_people   /* r4 ← &people */
112 |     b .Lgreet_check          /* test the index before the first pass */
113 | 
114 |     .Lgreet_next:
115 |       ldr r0, [r4, r5, LSL #2]  /* r0 ← people[r5], i.e. *(r4 + r5 * 4) */
116 |       bl greet_person           /* greet_person(people[r5]) */
117 |       add r5, r5, #1            /* next index */
118 |     .Lgreet_check:
119 |       cmp r5, #4                /* the array holds 4 entries (indices 0..3) */
120 |       bne .Lgreet_next          /* keep going until r5 reaches 4 */
121 | 
122 |     mov r0, #0               /* r0 ← 0, the value main returns */
123 |     pop {r4, r5, r6, pc}     /* restore the saved registers and return:
124 |                                 the saved lr is loaded into pc */
125 | 
126 | /* Literal word holding the address of 'people', placed in .text so the
127 |    'ldr r4, addr_of_people' above can reach it pc-relatively. */
128 | 
129 | addr_of_people : .word people
130 | 


--------------------------------------------------------------------------------
/chapter20/indirect.s:
--------------------------------------------------------------------------------
 1 | .data     /* initialized, writable data */
 2 | .align 4  /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 3 | message: .asciz "Hello world\n"   /* string printed by say_hello */
 4 | .align 4  /* 16-byte boundary again */
 5 | ptr_of_fun: .word 0   /* one word, initially zero; main stores &say_hello here */
 6 | 
 7 | .text     /* text section (= code) */
 8 | 
 9 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
10 | say_hello:
11 |     /* say_hello: print the string at 'message' through printf.
12 |        Takes no arguments. */
13 |     push {r4, lr}            /* bl below overwrites lr, so save it; r4 is
14 |                                 pushed only to keep sp 8-byte aligned (AAPCS) */
15 |     ldr r0, addr_of_message  /* r0 ← &message, the first argument of printf */
16 |     bl printf                /* printf(message) */
17 |     pop {r4, pc}             /* restore r4 and return: the saved lr
18 |                                 goes straight into pc */
19 | 
20 | .align 4  /* pads to 16 bytes; the literal below only needs 4 */
21 | addr_of_message: .word message
22 | 
23 | .align 4
24 | make_indirect_call:
25 |     /* make_indirect_call: call the function whose address is currently
26 |        stored in the word ptr_of_fun. Takes no arguments. */
27 |     push {r4, lr}            /* blx overwrites lr; r4 keeps sp 8-byte aligned */
28 |     ldr r0, addr_ptr_of_fun  /* r0 ← &ptr_of_fun */
29 |     ldr r0, [r0]             /* r0 ← ptr_of_fun, the stored function address */
30 |     blx r0                   /* indirect call through r0 */
31 |     pop {r4, pc}             /* restore r4 and return: the saved lr
32 |                                 is loaded into pc */
33 | 
34 | .globl main /* make the 'main' label visible to the linker */
35 | .align 4  /* gas on ARM: pads to a 2^4 = 16-byte boundary */
36 | main:
37 |     /* Store the address of say_hello into ptr_of_fun, then let
38 |        make_indirect_call call whatever ptr_of_fun holds; return 0. */
39 |     push {r4, lr}            /* bl overwrites lr; r4 keeps sp 8-byte aligned */
40 | 
41 |     ldr r0, addr_ptr_of_fun  /* r0 ← &ptr_of_fun */
42 |     ldr r1, addr_say_hello   /* r1 ← &say_hello */
43 |     str r1, [r0]             /* *r0 ← r1, that is ptr_of_fun ← &say_hello */
44 | 
45 |     bl make_indirect_call    /* ends up calling say_hello through ptr_of_fun */
46 | 
47 |     mov r0, #0               /* r0 ← 0, the value main returns */
48 |     pop {r4, pc}             /* restore r4 and return via the saved lr */
49 | 
50 | /* Literal words kept in .text so the pc-relative ldr instructions in this
51 |    file can reach them. addr_ptr_of_fun is also read by
52 |    make_indirect_call. */
53 | addr_ptr_of_fun: .word ptr_of_fun
54 | addr_say_hello : .word say_hello
55 | 


--------------------------------------------------------------------------------
/chapter21/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=subword subword_signed reinterpret
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link each program from its object file using gcc.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in "program from .s" rule.
 8 | % : %.s
 9 | # Assemble each source file for ARMv6 with VFPv2.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # 'all' and 'clean' name actions, not files, so both are declared phony.
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter21/reinterpret.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | .align 4   /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 4 | a_word: .word 0x11223344   /* the word that main re-reads as bytes, halfwords and a word */
 5 | 
 6 | .align 4   /* 16-byte boundary for the first format string; the next two follow unaligned */
 7 | message_bytes : .asciz "byte #%d is 0x%x\n"
 8 | message_halfwords : .asciz "halfword #%d is 0x%x\n"
 9 | message_words : .asciz "word #%d is 0x%x\n"
10 | 
11 | .text
12 | 
13 | .globl main
14 | main:
15 |     push {r4, r5, r6, lr}  /* keep callee saved registers */
16 |     /* register use: r4 = &a_word, r5 = loop index, r6 = byte offset of a halfword */
17 |     ldr r4, addr_a_word    /* r4 ← &a_word */
18 | 
19 |     mov r5, #0             /* r5 ← 0 */
20 |     b check_loop_bytes     /* branch to check_loop_bytes */
21 | 
22 |     loop_bytes:
23 |         /* prepare call to printf */
24 |         ldr r0, addr_message_bytes
25 |                            /* r0 ← &message_bytes
26 |                               first parameter of printf */
27 |         mov r1, r5         /* r1 ← r5
28 |                               second parameter of printf */
29 |         ldrb r2, [r4, r5]  /* r2 ← *{byte}(r4 + r5), zero-extended
30 |                               third parameter of printf */
31 |         bl printf          /* call printf */
32 |         add r5, r5, #1     /* r5 ← r5 + 1 */
33 |     check_loop_bytes:
34 |         cmp r5, #4         /* compute r5 - 4 and update cpsr */
35 |         bne loop_bytes     /* if r5 != 4 branch to loop_bytes */
36 | 
37 |     mov r5, #0             /* r5 ← 0 */
38 |     b check_loop_halfwords /* branch to check_loop_halfwords */
39 | 
40 |     loop_halfwords:
41 |         /* prepare call to printf */
42 |         ldr r0, addr_message_halfwords
43 |                            /* r0 ← &message_halfwords
44 |                               first parameter of printf */
45 |         mov r1, r5         /* r1 ← r5
46 |                               second parameter of printf */
47 |         mov r6, r5, LSL #1 /* r6 ← r5 * 2 */
48 |         ldrh r2, [r4, r6]  /* r2 ← *{half}(r4 + r6), zero-extended
49 |                               this is r2 ← *{half}(r4 + r5 * 2)
50 |                               third parameter of printf */
51 |         bl printf          /* call printf */
52 |         add r5, r5, #1     /* r5 ← r5 + 1 */
53 |     check_loop_halfwords:
54 |         cmp r5, #2         /* compute r5 - 2 and update cpsr */
55 |         bne loop_halfwords /* if r5 != 2 branch to loop_halfwords */
56 | 
57 |     /* prepare call to printf */
58 |     ldr r0, addr_message_words /* r0 ← &message_words
59 |                                   first parameter of printf */
60 |     mov r1, #0                 /* r1 ← 0
61 |                                   second parameter of printf */
62 |     ldr r2, [r4]               /* r2 ← *r4, the whole word
63 |                                   third parameter of printf */
64 |     bl printf                  /* call printf */
65 | 
66 |     pop {r4, r5, r6, lr}   /* restore callee saved registers */
67 |     mov r0, #0             /* set error code */
68 |     bx lr                  /* return to system */
69 | 
70 | addr_a_word : .word a_word
71 | addr_message_bytes : .word message_bytes
72 | addr_message_halfwords : .word message_halfwords
73 | addr_message_words : .word message_words
74 | 


--------------------------------------------------------------------------------
/chapter21/subword.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | .align 4   /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 4 | one_byte: .byte 205        /* 0xCD */
 5 | 
 6 | .align 4   /* 16-byte boundary again */
 7 | one_halfword: .hword 42445 /* 0xA5CD */
 8 | 
 9 | .text
10 | 
11 | .globl main
12 | main:
13 |     push {r4, lr}                /* save lr; r4 keeps sp 8-byte aligned */
14 |     /* the two loaded values are not used afterwards: r0 is overwritten below */
15 |     ldr r0, addr_of_one_byte     /* r0 ← &one_byte */
16 |     ldrb r0, [r0]                /* r0 ← *{byte}r0, zero-extended: 205 */
17 | 
18 |     ldr r1, addr_of_one_halfword /* r1 ← &one_halfword */
19 |     ldrh r1, [r1]                /* r1 ← *{half}r1, zero-extended: 42445 */
20 | 
21 |     pop {r4, lr}                 /* restore r4 and lr */
22 |     mov r0, #0                   /* r0 ← 0, the value main returns */
23 |     bx lr                        /* return */
24 | 
25 | addr_of_one_byte: .word one_byte
26 | addr_of_one_halfword: .word one_halfword
27 | 


--------------------------------------------------------------------------------
/chapter21/subword_signed.s:
--------------------------------------------------------------------------------
 1 | .data
 2 | 
 3 | .align 4   /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 4 | one_byte: .byte 205        /* 0xCD, i.e. -51 when read as a signed byte */
 5 | 
 6 | .align 4   /* 16-byte boundary again */
 7 | one_halfword: .hword 42445 /* 0xA5CD, i.e. -23091 when read as a signed halfword */
 8 | 
 9 | .text
10 | 
11 | .globl main
12 | main:
13 |     push {r4, lr}                /* save lr; r4 keeps sp 8-byte aligned */
14 |     /* the two loaded values are not used afterwards: r0 is overwritten below */
15 |     ldr r0, addr_of_one_byte     /* r0 ← &one_byte */
16 |     ldrsb r0, [r0]                /* r0 ← *{signed byte}r0, sign-extended: -51 */
17 | 
18 |     ldr r1, addr_of_one_halfword /* r1 ← &one_halfword */
19 |     ldrsh r1, [r1]                /* r1 ← *{signed half}r1, sign-extended: -23091 */
20 | 
21 |     pop {r4, lr}                 /* restore r4 and lr */
22 |     mov r0, #0                   /* r0 ← 0, the value main returns */
23 |     bx lr                        /* return */
24 | 
25 | addr_of_one_byte: .word one_byte
26 | addr_of_one_halfword: .word one_halfword
27 | 


--------------------------------------------------------------------------------
/chapter22/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=thumb-first thumb-call back-to-arm
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link each program from its object file using gcc.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in "program from .s" rule.
 8 | % : %.s
 9 | # Assemble each source file for ARMv6 with VFPv2.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # 'all' and 'clean' name actions, not files, so both are declared phony.
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter22/back-to-arm.s:
--------------------------------------------------------------------------------
 1 | /* back-to-arm.s: a Thumb function that calls printf in a loop,
 2 |    itself called from an ARM-mode 'main'. */
 3 | .data
 4 | message: .asciz "Hello world %d\n"   /* printf format; %d receives the loop counter */
 5 | 
 6 | .text        /* everything below is code, so assemble it into .text rather than .data */
 7 | 
 8 | .code 16     /* Here we say we will use Thumb */
 9 | .align 2     /* gas on ARM: pads to 2^2 = 4 bytes (Thumb code only needs 2) */
10 | thumb_function:
11 |     push {r4, lr}         /* keep r4 and lr in the stack */
12 |     mov r4, #0            /* r4 ← 0, the loop counter */
13 |     b check_loop          /* unconditional branch to check_loop */
14 |     loop:                 /* loop body: runs for r4 = 0, 1, 2, 3 */
15 |        /* prepare the call to printf */
16 |        ldr r0, addr_of_message  /* r0 ← &message */
17 |        mov r1, r4               /* r1 ← r4 */
18 |        blx printf               /* From Thumb to ARM we use blx.
19 |                                    printf is a function
20 |                                    in the C library that is implemented
21 |                                    using ARM instructions */
22 |        add r4, r4, #1           /* r4 ← r4 + 1 */
23 |     check_loop:
24 |        cmp r4, #4               /* compute r4 - 4 and update the cpsr */
25 |        blt loop                 /* if the cpsr means that r4 < 4 branch to loop */
26 | 
27 |     pop {r4, pc}          /* restore registers and return from function */
28 | .align 4                  /* pads to 2^4 = 16 bytes; the literal word needs 4 */
29 | addr_of_message: .word message
30 | 
31 | .code 32     /* back to the 32-bit ARM instruction set */
32 | .align 4     /* gas on ARM: pads to 2^4 = 16 bytes (ARM code needs 4) */
33 | .globl main
34 | main:
35 |     /* ARM-mode entry point: run thumb_function once, then return. */
36 |     push {r4, lr}      /* blx overwrites lr; r4 keeps sp 8-byte aligned */
37 | 
38 |     blx thumb_function /* call it, switching the processor to Thumb state */
39 | 
40 |     pop {r4, pc}       /* restore r4, return via the saved lr; r0 is left as returned */
41 | 


--------------------------------------------------------------------------------
/chapter22/thumb-call.s:
--------------------------------------------------------------------------------
 1 | /* thumb-call.s */
 2 | .text
 3 | 
 4 | .code 16     /* Here we say we will use Thumb */
 5 | .align 2     /* gas on ARM: pads to 2^2 = 4 bytes (Thumb code only needs 2) */
 6 | 
 7 | thumb_function_2:
 8 |     mov r0, #2   /* r0 ← 2, the value this function returns */
 9 |     bx lr   /* A leaf Thumb function (i.e. a function that does not call
10 |                any other function) returns using "bx lr" */
11 | 
12 | thumb_function_1:   /* non-leaf Thumb function: calls thumb_function_2 and passes its r0 back */
13 |     push {r4, lr}       /* Keep r4 and lr in the stack */
14 |     bl thumb_function_2 /* From Thumb to Thumb we use bl */
15 |     pop {r4, pc}  /* This is how we return from a non-leaf Thumb function */
16 | 
17 | .code 32     /* back to the 32-bit ARM instruction set */
18 | .align 4     /* gas on ARM: pads to 2^4 = 16 bytes (ARM code needs 4) */
19 | .globl main
20 | main:
21 |     /* ARM-mode entry point: call thumb_function_1 and return its r0. */
22 |     push {r4, lr}        /* blx overwrites lr; r4 keeps sp 8-byte aligned */
23 | 
24 |     blx thumb_function_1 /* call it, switching the processor to Thumb state */
25 | 
26 |     pop {r4, pc}         /* restore r4 and return via the saved lr */
27 | 


--------------------------------------------------------------------------------
/chapter22/thumb-first.s:
--------------------------------------------------------------------------------
 1 | /* thumb-first.s */
 2 | .text
 3 | 
 4 | .code 16     /* Here we say we will use Thumb */
 5 | .align 2     /* gas on ARM: pads to 2^2 = 4 bytes (Thumb code only needs 2) */
 6 | 
 7 | thumb_function:
 8 |     mov r0, #2   /* r0 ← 2, the value this function returns */
 9 |     bx lr        /* return; bx switches back to the caller's instruction set */
10 | 
11 | .code 32     /* back to the 32-bit ARM instruction set */
12 | .align 4     /* gas on ARM: pads to 2^4 = 16 bytes (ARM code needs 4) */
13 | 
14 | .globl main
15 | main:
16 |     /* ARM-mode entry point: call thumb_function and return its r0. */
17 |     push {r4, lr}      /* blx overwrites lr; r4 keeps sp 8-byte aligned */
18 | 
19 |     blx thumb_function /* call it, switching the processor to Thumb state */
20 | 
21 |     pop {r4, pc}       /* restore r4 and return via the saved lr */
22 | 


--------------------------------------------------------------------------------
/chapter23/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=nested01 nested02
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link each program from its object file using gcc.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in "program from .s" rule.
 8 | % : %.s
 9 | # Assemble each source file for ARMv6 with VFPv2.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # 'all' and 'clean' name actions, not files, so both are declared phony.
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter23/nested01.s:
--------------------------------------------------------------------------------
 1 | /* nested01.s */
 2 | 
 3 | .text
 4 | 
 5 | f:
 6 |     push {r4, r5, fp, lr} /* keep registers; the saved fp is the dynamic link */
 7 |     mov fp, sp /* fp ← sp: frame pointer of this activation of f */
 8 | 
 9 |     sub sp, sp, #8      /* make room for x (4 bytes)
10 |                            plus 4 bytes to keep stack
11 |                            aligned */
12 |     /* x is in address "fp - 4" */
13 | 
14 |     mov r4, #1          /* r4 ← 1 */
15 |     str r4, [fp, #-4]   /* x ← r4 */
16 | 
17 |     bl g                /* call (nested function) g, which adds 1 to x */
18 | 
19 |     ldr r4, [fp, #-4]   /* r4 ← x */
20 |     add r4, r4, #1      /* r4 ← r4 + 1 */
21 |     str r4, [fp, #-4]   /* x ← r4 */
22 | 
23 |     mov sp, fp /* restore the stack pointer, discarding x */
24 |     pop {r4, r5, fp, lr} /* restore registers */
25 |     bx lr /* return */
26 | 
27 |     /* nested function g: adds 1 to the x that lives in f's frame */
28 |     g:
29 |         push {r4, r5, fp, lr} /* keep registers; the saved fp is f's frame pointer */
30 |         mov fp, sp /* fp ← sp: frame pointer of this activation of g */
31 | 
32 |         /* At this point our stack looks like this
33 | 
34 |           Data | Address | Notes
35 |          ------+---------+--------------------
36 |            r4  | fp      |  
37 |            r5  | fp + 4  |
38 |            fp  | fp + 8  | This is the old fp
39 |            lr  | fp + 12 |
40 |         */
41 | 
42 |         ldr r4, [fp, #+8] /* get the activation record
43 |                              of my caller
44 |                              (since only f can call me)
45 |                            */
46 | 
47 |         /* now r4 acts like the fp we had inside 'f' */
48 |         ldr r5, [r4, #-4] /* r5 ← x */
49 |         add r5, r5, #1    /* r5 ← r5 + 1 */
50 |         str r5, [r4, #-4] /* x ← r5 */
51 | 
52 |         mov sp, fp /* restore the stack pointer */
53 |         pop {r4, r5, fp, lr} /* restore registers */
54 |         bx lr /* return */
55 | 
56 | .globl main
57 | 
58 | main :
59 |     /* Call f once and return 0. */
60 |     push {r4, lr} /* bl overwrites lr; r4 keeps sp 8-byte aligned */
61 | 
62 |     bl f          /* f() */
63 | 
64 |     mov r0, #0    /* r0 ← 0, the value main returns */
65 |     pop {r4, pc}  /* restore r4 and return via the saved lr */
66 | 


--------------------------------------------------------------------------------
/chapter23/nested02.s:
--------------------------------------------------------------------------------
  1 | /* nested02.s */
  2 | 
  3 | .text
  4 | 
  5 | # void f(void) // non nested (nesting depth = 0)
  6 | # {
  7 | #    int x;
  8 | # 
  9 | #    void g() // nested (nesting depth = 1)
 10 | #    {
 11 | #       x = x + 1;
 12 | #    }
 13 | #    void h() // nested (nesting depth = 1)
 14 | #    {
 15 | #       void m() // nested (nesting depth = 2)
 16 | #       {
 17 | #          x = x + 2;
 18 | #          g();
 19 | #       }
 20 | # 
 21 | #       g();
 22 | #       m();
 23 | #       x = x + 3;
 24 | #    }
 25 | # 
 26 | #    x = 1;
 27 | #    h();
 28 | #    // here x will be 8
 29 | # }
 30 | 
 31 | f:
 32 |     push {r4, r10, fp, lr} /* keep registers */
 33 |     mov fp, sp             /* setup dynamic link */
 34 | 
 35 |     sub sp, sp, #8      /* make room for x (4 bytes) plus 4 bytes of padding */
 36 |     /* x will be in address "fp - 4" */
 37 | 
 38 |     /* At this point our stack looks like this
 39 | 
 40 |      Data | Address | Notes
 41 |     ------+---------+---------------------------
 42 |           | fp - 8  | alignment (per AAPCS)
 43 |       x   | fp - 4  | 
 44 |       r4  | fp      |  
 45 |       r10 | fp + 4  | previous value of r10
 46 |       fp  | fp + 8  | previous value of fp
 47 |       lr  | fp + 12 |
 48 |    */
 49 | 
 50 |     mov r4, #1          /* r4 ← 1 */
 51 |     str r4, [fp, #-4]   /* x ← r4 */
 52 | 
 53 |     /* prepare the call to h */
 54 |     mov r10, fp /* setup the static link,
 55 |                    since we are calling an immediately nested function
 56 |                    it is just the current frame */
 57 |     bl h
 58 | 
 59 |     mov sp, fp             /* restore stack */
 60 |     pop {r4, r10, fp, lr}  /* restore registers */
 61 |     bx lr /* return */
 62 | 
 63 | /* ------ nested function h: receives its static link (frame pointer of 'f') in r10 ------ */
 64 | h :
 65 |     push {r4, r5, r10, fp, lr} /* keep registers; the pushed r10 is the static link */
 66 |     mov fp, sp /* setup dynamic link */
 67 | 
 68 |     sub sp, sp, #4 /* align stack: 5 pushed words + 4 bytes = 24 bytes */
 69 | 
 70 |     /* At this point our stack looks like this
 71 | 
 72 |       Data | Address | Notes
 73 |      ------+---------+---------------------------
 74 |            | fp - 4  | alignment (per AAPCS)
 75 |        r4  | fp      |  
 76 |        r5  | fp + 4  | 
 77 |        r10 | fp + 8  | frame pointer of 'f'
 78 |        fp  | fp + 12 | frame pointer of caller
 79 |        lr  | fp + 16 |
 80 |     */
 81 | 
 82 |     /* prepare call to g */
 83 |     /* g is a sibling so the static link will be the same
 84 |        as the current one */
 85 |     ldr r10, [fp, #8]  /* r10 ← frame pointer of 'f' */
 86 |     bl g
 87 | 
 88 |     /* prepare call to m */
 89 |     /* m is an immediately nested function so the static
 90 |        link is the current frame */
 91 |     mov r10, fp        /* r10 ← frame pointer of 'h' */
 92 |     bl m
 93 | 
 94 |     ldr r4, [fp, #8]  /* load frame pointer of 'f' */
 95 |     ldr r5, [r4, #-4]  /* r5 ← x */
 96 |     add r5, r5, #3     /* r5 ← r5 + 3 */
 97 |     str r5, [r4, #-4]  /* x ← r5 */
 98 | 
 99 |     mov sp, fp            /* restore stack */
100 |     pop {r4, r5, r10, fp, lr} /* restore registers */
101 |     bx lr                 /* return */
102 | 
103 | 
104 | /* ------ nested function m: receives its static link (frame pointer of 'h') in r10 ------ */
105 | m:
106 |     push {r4, r5, r10, fp, lr} /* keep registers; the pushed r10 is the static link */
107 |     mov fp, sp /* setup dynamic link */
108 | 
109 |     sub sp, sp, #4 /* align stack: 5 pushed words + 4 bytes = 24 bytes */
110 |     /* At this point our stack looks like this
111 | 
112 |       Data | Address | Notes
113 |      ------+---------+---------------------------
114 |            | fp - 4  | alignment (per AAPCS)
115 |        r4  | fp      |  
116 |        r5  | fp + 4  |
117 |        r10 | fp + 8  | frame pointer of 'h'
118 |        fp  | fp + 12 | frame pointer of caller
119 |        lr  | fp + 16 |
120 |     */
121 | 
122 |     ldr r4, [fp, #8]  /* r4 ← frame pointer of 'h' */
123 |     ldr r4, [r4, #8]  /* r4 ← frame pointer of 'f' */
124 |     ldr r5, [r4, #-4] /* r5 ← x */
125 |     add r5, r5, #2    /* r5 ← r5 + 2 */
126 |     str r5, [r4, #-4] /* x ← r5 */
127 | 
128 |     /* setup call to g: g is nested in 'f', so its static link is f's frame */
129 |     ldr r10, [fp, #8]   /* r10 ← frame pointer of 'h' */
130 |     ldr r10, [r10, #8]  /* r10 ← frame pointer of 'f' */
131 |     bl g
132 | 
133 |     mov sp, fp                /* restore stack */
134 |     pop {r4, r5, r10, fp, lr} /* restore registers */
135 |     bx lr                     /* return */
136 | 
137 | /* ------ nested function g: receives its static link (frame pointer of 'f') in r10 ------ */
138 | g:
139 |     push {r4, r5, r10, fp, lr} /* keep registers; the pushed r10 is the static link */
140 |     mov fp, sp /* setup dynamic link */
141 | 
142 |     sub sp, sp, #4 /* align stack: 5 pushed words + 4 bytes = 24 bytes */
143 | 
144 |     /* At this point our stack looks like this
145 | 
146 |       Data | Address | Notes
147 |      ------+---------+---------------------------
148 |            | fp - 4  | alignment (per AAPCS)
149 |        r4  | fp      |  
150 |        r5  | fp + 4  |  
151 |        r10 | fp + 8  | frame pointer of 'f'
152 |        fp  | fp + 12 | frame pointer of caller
153 |        lr  | fp + 16 |
154 |     */
155 | 
156 |     ldr r4, [fp, #8]  /* r4 ← frame pointer of 'f' */
157 |     ldr r5, [r4, #-4] /* r5 ← x */
158 |     add r5, r5, #1    /* r5 ← r5 + 1 */
159 |     str r5, [r4, #-4] /* x ← r5 */
160 | 
161 |     mov sp, fp /* restore stack */
162 |     pop {r4, r5, r10, fp, lr} /* restore registers */
163 |     bx lr      /* return */
164 | 
165 | .globl main
166 | 
167 | main :
168 |     /* Call f once and return 0. */
169 |     push {r4, lr} /* bl overwrites lr; r4 keeps sp 8-byte aligned */
170 | 
171 |     bl f          /* f() */
172 | 
173 |     mov r0, #0    /* r0 ← 0, the value main returns */
174 |     pop {r4, pc}  /* restore r4 and return via the saved lr */
175 | 


--------------------------------------------------------------------------------
/chapter24/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=print-array sort-array trampoline-sort-array
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link each program from its object file using gcc.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # Recipe-less pattern rule: cancels make's built-in "program from .s" rule.
 8 | % : %.s
 9 | # Assemble each source file for ARMv6 with VFPv2.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # 'all' and 'clean' name actions, not files, so both are declared phony.
13 | .PHONY: all clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter24/print-array.s:
--------------------------------------------------------------------------------
 1 | /* print-array.s */
 2 | 
 3 | .data
 4 | 
 5 | /* declare an array of 10 integers called my_array */
 6 | .align 4   /* gas on ARM takes a power of two: this pads to 2^4 = 16 bytes */
 7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64
 8 | 
 9 | /* format strings for printf */
10 | /* format string that prints an integer plus a space */
11 | .align 4   /* 16-byte boundary */
12 | integer_printf: .asciz "%d "
13 | /* format string that simply prints a newline */
14 | .align 4   /* 16-byte boundary */
15 | newline_printf: .asciz "\n"
16 | 
17 | .text
18 | 
19 | print_array:
20 |     /* print_array: print r1 integers starting at address r0, each followed
21 |        by a space, then a newline.
22 |        Register use: r4 = array address, r5 = item count, r6 = current index. */
23 |     push {r4, r5, r6, lr}  /* r4-r6 must survive the printf calls; bl overwrites lr */
24 | 
25 |     mov r6, #0             /* index ← 0 */
26 |     mov r5, r1             /* r5 ← number of items */
27 |     mov r4, r0             /* r4 ← address of the array */
28 |     b .Lprint_array_test   /* test before the first pass, so 0 items prints none */
29 | 
30 |     .Lprint_array_next:
31 |       ldr r1, [r4, r6, LSL #2]        /* r1 ← *(r4 + r6 * 4), the item to print */
32 |       ldr r0, addr_of_integer_printf  /* r0 ← &integer_printf, the format string */
33 |       bl printf                       /* printf(integer_printf, item) */
34 | 
35 |       add r6, r6, #1                  /* next index */
36 |     .Lprint_array_test:
37 |       cmp r6, r5                      /* reached the item count yet? */
38 |       bne .Lprint_array_next          /* no: print another item */
39 | 
40 |     /* every item is printed: finish the line */
41 |     ldr r0, addr_of_newline_printf    /* r0 ← &newline_printf */
42 |     bl printf                         /* printf(newline_printf) */
43 | 
44 |     pop {r4, r5, r6, pc}   /* restore r4-r6 and return: the saved lr
45 |                               is loaded into pc */
46 | 
47 | /* Literal words holding the addresses of the two format strings, kept in
48 |    .text so the pc-relative ldr instructions above can reach them. */
49 | addr_of_integer_printf: .word integer_printf
50 | addr_of_newline_printf: .word newline_printf
51 | 
52 | .global main
53 | main:
54 |     /* Print my_array once and return 0. */
55 |     push {r4, lr}             /* save lr; r4 is pushed along with it */
56 | 
57 |     /* print_array(my_array, 10) */
58 |     mov r1, #10               /* r1 ← 10, the number of items in my_array */
59 |     ldr r0, addr_of_my_array  /* r0 ← &my_array */
60 |     bl print_array
61 | 
62 |     mov r0, #0                /* r0 ← 0, the value returned by main */
63 |     pop {r4, pc}              /* restore r4 and return: the saved lr is
64 |                                  popped into pc */
65 | 
66 | addr_of_my_array: .word my_array
67 | 


--------------------------------------------------------------------------------
/chapter24/sort-array.s:
--------------------------------------------------------------------------------
  1 | /* sort-array.s */
  2 | 
  3 | .data
  4 | 
  5 | /* declare an array of 10 integers called my_array */
  6 | .align 4   /* GNU as on ARM takes a power of two here: 16-byte boundary */
  7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64
  8 | 
  9 | /* format strings for printf */
 10 | /* format string that prints an integer plus a space */
 11 | .align 4   /* 16-byte boundary, see the note on my_array */
 12 | integer_printf: .asciz "%d "
 13 | /* format string that simply prints a newline */
 14 | .align 4   /* 16-byte boundary, see the note on my_array */
 15 | newline_printf: .asciz "\n"
 16 | 
 17 | .text
 18 | 
 19 | integer_comparison:
 20 |     /* Comparison function handed to qsort by main.
 21 |        In:  r0 = address of the first integer
 22 |             r1 = address of the second integer
 23 |        Out: r0 = -1, 0 or 1 when first is <, == or > second (signed) */
 24 |     ldr r2, [r0]    /* r2 ← *r0, the first integer */
 25 |     ldr r3, [r1]    /* r3 ← *r1, the second integer */
 26 | 
 27 |     cmp r2, r3      /* compute r2 - r3 and update cpsr */
 28 |     mov r0, #0      /* r0 ←  0, kept when both integers are equal */
 29 |     mvnlt r0, #0    /* if r2 < r3 then r0 ← ~0, that is -1 */
 30 |     movgt r0, #1    /* if r2 > r3 then r0 ←  1 */
 31 |     bx lr           /* return */
 32 | 
 33 | print_array:
 34 |     /* Print the integers of an array separated by spaces, then a newline.
 35 |        In: r0 = address of the integer array, r1 = number of items */
 36 |     push {r4, r5, r6, lr}             /* keep r4, r5, r6 and lr in the stack */
 37 | 
 38 |     mov r6, #0                        /* r6 ← 0.  index of the next item */
 39 |     mov r5, r1                        /* r5 ← r1. number of items */
 40 |     mov r4, r0                        /* r4 ← r0. address of the array */
 41 | 
 42 |     b .Lprint_array_test              /* check the condition before the body */
 43 | 
 44 |     .Lprint_array_body:
 45 |       /* printf(integer_printf, array[r6]) */
 46 |       ldr r1, [r4, r6, LSL #2]        /* r1 ← *(r4 + r6 * 4) */
 47 |       ldr r0, addr_of_integer_printf  /* r0 ← &integer_printf */
 48 |       bl printf
 49 | 
 50 |       add r6, r6, #1                  /* move on to the next item */
 51 |     .Lprint_array_test:
 52 |       cmp r6, r5                      /* compare the index with the count */
 53 |       bne .Lprint_array_body          /* keep printing until the index
 54 |                                          equals the number of items */
 55 | 
 56 |     /* printf(newline_printf) */
 57 |     ldr r0, addr_of_newline_printf    /* r0 ← &newline_printf */
 58 |     bl printf
 59 | 
 60 |     pop {r4, r5, r6, pc}              /* restore r4, r5, r6 and return: the
 61 |                                          saved lr is popped into pc */
 62 | 
 63 | addr_of_integer_printf: .word integer_printf
 64 | addr_of_newline_printf: .word newline_printf
 65 | 
 66 | .global main
 67 | main:
 68 |     /* Print my_array, sort it in place with qsort using
 69 |        integer_comparison, then print it again. Returns 0. */
 70 |     push {r4, lr}             /* save lr; r4 is pushed along with it */
 71 | 
 72 |     /* print_array(my_array, 10): the array before sorting */
 73 |     mov r1, #10               /* r1 ← 10, the number of items */
 74 |     ldr r0, addr_of_my_array  /* r0 ← &my_array */
 75 |     bl print_array
 76 | 
 77 |     /* qsort(my_array, 10, 4, integer_comparison), whose prototype is */
 78 |     /*
 79 |     void qsort(void *base,
 80 |          size_t nmemb,
 81 |          size_t size,
 82 |          int (*compar)(const void *, const void *));
 83 |     */
 84 |     ldr r3, addr_of_integer_comparison
 85 |                               /* r3 ← &integer_comparison
 86 |                                  compar */
 87 |     mov r2, #4                /* r2 ← 4
 88 |                                  size: each member takes 4 bytes */
 89 |     mov r1, #10               /* r1 ← 10
 90 |                                  nmemb: the array has 10 members */
 91 |     ldr r0, addr_of_my_array  /* r0 ← &my_array
 92 |                                  base */
 93 |     bl qsort
 94 | 
 95 |     /* print_array(my_array, 10): the array after sorting */
 96 |     mov r1, #10               /* r1 ← 10, the number of items */
 97 |     ldr r0, addr_of_my_array  /* r0 ← &my_array */
 98 |     bl print_array
 99 | 
100 |     mov r0, #0                /* r0 ← 0, the value returned by main */
101 |     pop {r4, pc}              /* restore r4 and return: the saved lr is
102 |                                  popped into pc */
103 | 
104 | /* addresses of the array and of the comparison function, loaded by the
105 |    ldr instructions of main */
106 | addr_of_my_array: .word my_array
107 | addr_of_integer_comparison : .word integer_comparison
108 | 


--------------------------------------------------------------------------------
/chapter24/trampoline-sort-array.s:
--------------------------------------------------------------------------------
  1 | /* trampoline-sort-array.s */
  2 | 
  3 | .data
  4 | 
  5 | /* declare an array of 10 integers called my_array */
  6 | .align 4   /* GNU as on ARM takes a power of two here: 16-byte boundary */
  7 | my_array: .word 82, 70, 93, 77, 91, 30, 42, 6, 92, 64
  8 | 
  9 | /* format strings for printf */
 10 | /* format string that prints an integer plus a space */
 11 | .align 4   /* 16-byte boundary, see the note on my_array */
 12 | integer_printf: .asciz "%d "
 13 | /* format string that simply prints a newline */
 14 | .align 4   /* 16-byte boundary, see the note on my_array */
 15 | newline_printf: .asciz "\n"
 16 | .align 4   /* 16-byte boundary, see the note on my_array */
 17 | comparison_message: .asciz "Num comparisons: %d\n"
 18 | 
 19 | .text
 20 | 
 21 | print_array:
 22 |     /* Print the integers of an array separated by spaces, then a newline.
 23 |        In: r0 = address of the integer array, r1 = number of items */
 24 |     push {r4, r5, r6, lr}             /* keep r4, r5, r6 and lr in the stack */
 25 | 
 26 |     mov r6, #0                        /* r6 ← 0.  index of the next item */
 27 |     mov r5, r1                        /* r5 ← r1. number of items */
 28 |     mov r4, r0                        /* r4 ← r0. address of the array */
 29 | 
 30 |     b .Lprint_array_test              /* check the condition before the body */
 31 | 
 32 |     .Lprint_array_body:
 33 |       /* printf(integer_printf, array[r6]) */
 34 |       ldr r1, [r4, r6, LSL #2]        /* r1 ← *(r4 + r6 * 4) */
 35 |       ldr r0, addr_of_integer_printf  /* r0 ← &integer_printf */
 36 |       bl printf
 37 | 
 38 |       add r6, r6, #1                  /* move on to the next item */
 39 |     .Lprint_array_test:
 40 |       cmp r6, r5                      /* compare the index with the count */
 41 |       bne .Lprint_array_body          /* keep printing until the index
 42 |                                          equals the number of items */
 43 | 
 44 |     /* printf(newline_printf) */
 45 |     ldr r0, addr_of_newline_printf    /* r0 ← &newline_printf */
 46 |     bl printf
 47 | 
 48 |     pop {r4, r5, r6, pc}              /* restore r4, r5, r6 and return: the
 49 |                                          saved lr is popped into pc */
 50 | 
 51 | addr_of_integer_printf: .word integer_printf
 52 | addr_of_newline_printf: .word newline_printf
 53 | 
 54 | .globl main
 55 | main:
 56 |     /* Sort my_array with qsort, counting the comparisons through a nested
 57 |        function reached via a trampoline copied into main's stack frame. */
 58 |     push {r4, r5, r6, fp, lr} /* keep callee saved registers */
 59 |     mov fp, sp                /* setup dynamic link */
 60 |     sub sp, sp, #4            /* counter will be in fp - 4; 20 + 4 bytes
 61 |                                  are reserved, a multiple of 8 */
 62 |     /* set counter to zero */
 63 |     mov r4, #0        /* r4 ← 0 */
 64 |     str r4, [fp, #-4] /* counter ← r4 */
 65 | 
 66 |     /* Make room for the trampoline */
 67 |     sub sp, sp, #32 /* sp ← sp - 32, so now sp == fp - 36 */
 68 |     /* note that 32 is a multiple of 8, so the stack
 69 |        is still 8-byte aligned */
 70 | 
 71 |     /* copy the trampoline into the stack */
 72 |     mov r4, #32                        /* r4 ← 32, the bytes left to copy */
 73 |     ldr r5, .Laddr_trampoline_template /* r5 ← &trampoline_template */
 74 |     mov r6, sp                         /* r6 ← sp */
 75 |     b .Lcopy_trampoline_loop_check     /* branch to copy_trampoline_loop_check */
 76 | 
 77 |     .Lcopy_trampoline_loop:
 78 |         ldr r0, [r5]     /* r0 ← *r5 (r0 is a scratch register here) */
 79 |         str r0, [r6]     /* *r6 ← r0 */
 80 |         add r5, r5, #4   /* r5 ← r5 + 4 */
 81 |         add r6, r6, #4   /* r6 ← r6 + 4 */
 82 |         sub r4, r4, #4   /* r4 ← r4 - 4 */
 83 |     .Lcopy_trampoline_loop_check:
 84 |         cmp r4, #0                  /* compute r4 - 0 and update cpsr */
 85 |         bgt .Lcopy_trampoline_loop  /* if cpsr means that r4 > 0
 86 |                                        then branch to copy_trampoline_loop */
 87 | 
 88 |     /* setup the trampoline */
 89 |     ldr r4, addr_of_integer_comparison_count
 90 |                        /* r4 ← &integer_comparison_count */
 91 |     str r4, [fp, #-36] /* *(fp - 36) ← r4 */
 92 |                        /* set the function_called in the trampoline
 93 |                           to be &integer_comparison_count */
 94 |     str fp, [fp, #-32]  /* *(fp - 32) ← fp */
 95 |                         /* set the lexical_scope in the trampoline
 96 |                            to be fp */
 97 | 
 98 |     /* prepare call to __clear_cache */
 99 |     mov r0, sp       /* r0 ← sp, the start of the trampoline copy */
100 |     add r1, sp, #32  /* r1 ← sp + 32, the end of the trampoline copy */
101 |     bl __clear_cache /* call __clear_cache */
102 | 
103 |     /* prepare call to print_array */
104 |     ldr r0, addr_of_my_array /* r0 ← &my_array */
105 |     mov r1, #10              /* r1 ← 10
106 |                                 our array is of length 10 */
107 |     bl print_array           /* call print_array */
108 | 
109 |     /* prepare call to qsort */
110 |     /*
111 |     void qsort(void *base,
112 |          size_t nmemb,
113 |          size_t size,
114 |          int (*compar)(const void *, const void *));
115 |     */
116 |     ldr r0, addr_of_my_array /* r0 ← &my_array
117 |                                 base */
118 |     mov r1, #10              /* r1 ← 10
119 |                                 nmemb = number of members
120 |                                 our array is 10 elements long */
121 |     mov r2, #4               /* r2 ← 4
122 |                                 size of each member is 4 bytes */
123 |     sub r3, fp, #28          /* r3 ← fp - 28, first trampoline instruction */
124 |     bl qsort                 /* call qsort */
125 | 
126 |     /* prepare call to printf */
127 |     ldr r1, [fp, #-4]                    /* r1 ← counter
128 |                                             num comparisons */
129 |     ldr r0, addr_of_comparison_message   /* r0 ← &comparison_message */
130 |     bl printf                            /* call printf */
131 | 
132 |     /* now print again the array to see if elements were sorted */
133 |     /* prepare call to print_array */
134 |     ldr r0, addr_of_my_array  /* r0 ← &my_array */
135 |     mov r1, #10               /* r1 ← 10
136 |                                  our array is of length 10 */
137 |     bl print_array            /* call print_array */
138 | 
139 |     mov r0, #0                /* r0 ← 0 set errorcode to 0 prior returning from main */
140 | 
141 |     mov sp, fp                /* drop the counter and the trampoline */
142 |     pop {r4, r5, r6, fp, lr}      /* restore callee-saved registers */
143 |     bx lr                     /* return */
144 | 
145 | addr_of_my_array: .word my_array
146 | addr_of_comparison_message : .word comparison_message
147 | 
148 |     /* nested function of main: compares two integers and counts its calls */
149 |     addr_of_integer_comparison_count : .word integer_comparison_count
150 |     integer_comparison_count:
151 |         /* r0: address of the first integer, r1: address of the second one */
152 |         /* r10: static link, the fp of main as loaded by the trampoline */
153 |         push {r4, r5, r10, fp, lr} /* keep callee-saved registers; r10 ends
                                         up at sp + 8 (r4 at sp, r5 at sp + 4) */
154 |         mov fp, sp                 /* setup dynamic link */
155 | 
156 |         ldr r0, [r0]    /* r0 ← *r0
157 |                            load the integer pointed by r0 in r0 */
158 |         ldr r1, [r1]    /* r1 ← *r1
159 |                            load the integer pointed by r1 in r1 */
160 |      
161 |         cmp r0, r1      /* compute r0 - r1 and update cpsr */
162 |         moveq r0, #0    /* if cpsr means that r0 == r1 then r0 ←  0 */
163 |         movlt r0, #-1   /* if cpsr means that r0 <  r1 then r0 ← -1 */
164 |         movgt r0, #1    /* if cpsr means that r0 >  r1 then r0 ←  1 */
165 | 
166 |         ldr r4, [fp, #8]  /* r4 ← *(fp + 8)
167 |                              the saved r10: the static link, fp of main */
168 |         ldr r5, [r4, #-4] /* r5 ← *(r4 - 4)
169 |                              get value of counter, kept by main at fp - 4 */
170 |         add r5, r5, #1    /* r5 ← r5 + 1 */
171 |         str r5, [r4, #-4] /* *(r4 - 4) ← r5
172 |                              update counter; r0 still holds the result */
173 | 
174 |         mov sp, fp        /* restore stack */
175 |         pop {r4, r5, r10, fp, lr} /* restore callee-saved registers */
176 |         bx lr           /* return */
177 | 
178 | .Laddr_trampoline_template : .word .Ltrampoline_template /* read by main to find the template */
179 | .Ltrampoline_template: /* 2 words + 6 instructions = 32 bytes, the size main copies */
180 |     .Lfunction_called: .word 0x0 /* main stores the function to call in its stack copy */
181 |     .Llexical_scope: .word 0x0   /* main stores its fp in its stack copy */
182 |     push {r4, r5, r10, lr}           /* keep callee-saved registers */
183 |     ldr r4, .Lfunction_called        /* r4 ← function called */
184 |     ldr r10, .Llexical_scope         /* r10 ← lexical scope */
185 |     blx r4                           /* indirect call to r4; r0 and r1
                                           are passed through untouched */
186 |     pop {r4, r5, r10, lr}            /* restore callee-saved registers */
187 |     bx lr                            /* return */
188 | 
189 | 
190 | 


--------------------------------------------------------------------------------
/chapter25/Makefile:
--------------------------------------------------------------------------------
 1 | EXES=motivation byte_array_add clipped_add
 2 | OBJS=$(addsuffix .o, $(EXES))
 3 | all: $(EXES) $(OBJS)
 4 | # Link each executable from its object file, using gcc as the link driver.
 5 | %: %.o
 6 | 	gcc -o $@ $+
 7 | # "% : %.s" has no recipe: presumably it cancels make's built-in .s rule.
 8 | % : %.s
 9 | # Assemble each .s file into an object file for ARMv6 with VFPv2.
10 | %.o : %.s
11 | 	as -march=armv6 -mfpu=vfpv2 -o $@ $<
12 | # Remove the executables and all object files.
13 | .PHONY: clean
14 | clean:
15 | 	rm -vf $(EXES) *.o
16 | 


--------------------------------------------------------------------------------
/chapter25/byte_array_add.s:
--------------------------------------------------------------------------------
 1 | # byte_array_add.s
 2 | 
 3 | naive_byte_array_addition:
 4 |     /* c[i] ← a[i] + b[i] for 0 ≤ i < N, one byte at a time.
 5 |        In: r0 = base address of a, r1 = base address of b,
 6 |        r2 = base address of c, r3 = N. r4 is the current item: 0 ≤ r4 < r3 */
 7 |     push {r4, r5, r6, r7}  /* r4 to r7 are callee-saved: keep them */
 8 |     mov r4, #0             /* r4 ← 0 */
 9 |     b .Lcheck_loop0        /* branch to check_loop0 */
10 | 
11 |     .Lloop0:
12 |       ldrb r5, [r0, r4]    /* r5 ← *{unsigned byte}(r0 + r4) */
13 |       ldrb r6, [r1, r4]    /* r6 ← *{unsigned byte}(r1 + r4) */
14 |       add r7, r5, r6       /* r7 ← r5 + r6 */
15 |       strb r7, [r2, r4]    /* *{unsigned byte}(r2 + r4) ← r7 */
16 |       add r4, r4, #1       /* r4 ← r4 + 1 */
17 |     .Lcheck_loop0:
18 |        cmp r4, r3          /* perform r4 - r3 and update cpsr */
19 |        blt .Lloop0         /* if cpsr means that r4 < r3 jump to loop0 */
20 | 
21 |     pop {r4, r5, r6, r7}   /* restore the callee-saved registers */
22 |     bx lr                  /* return: do not fall into the next function */
23 | 
24 | simd_byte_array_addition_0:
25 |     /* c[i] ← a[i] + b[i] for 0 ≤ i < N, four bytes per iteration.
26 |        In: r0 = base address of a, r1 = base address of b,
27 |        r2 = base address of c, r3 = N. r4 is the current item. If N is not a
28 |        multiple of 4 the last word reaches up to 3 bytes past each array */
29 |     push {r4, r5, r6, r7}  /* r4 to r7 are callee-saved: keep them */
30 |     mov r4, #0             /* r4 ← 0 */
31 |     b .Lcheck_loop1        /* branch to check_loop1 */
32 | 
33 |     .Lloop1:
34 |       ldr r5, [r0, r4]     /* r5 ← *(r0 + r4) */
35 |       ldr r6, [r1, r4]     /* r6 ← *(r1 + r4) */
36 |       sadd8 r7, r5, r6     /* r7[7:0] ← r5[7:0] + r6[7:0] */
37 |                            /* r7[15:8] ← r5[15:8] + r6[15:8] */
38 |                            /* r7[23:16] ← r5[23:16] + r6[23:16] */
39 |                            /* r7[31:24] ← r5[31:24] + r6[31:24] */
40 |                            /* r7[x:y] means bits x to y of the register r7 */
41 |       str r7, [r2, r4]     /* *(r2 + r4) ← r7 */
42 |       add r4, r4, #4       /* r4 ← r4 + 4 */
43 |     .Lcheck_loop1:
44 |        cmp r4, r3          /* perform r4 - r3 and update cpsr */
45 |        blt .Lloop1         /* if cpsr means that r4 < r3 jump to loop1 */
46 |     pop {r4, r5, r6, r7}   /* restore the callee-saved registers */
47 |     bx lr                  /* return: do not fall into the next function */
48 | 
49 | simd_byte_array_addition_1:
50 |     /* c[i] ← a[i] + b[i] for 0 ≤ i < N: four bytes per iteration while at
51 |        least four items remain, then the leftover items one byte at a time.
52 |        In: r0 = base address of a, r1 = base address of b,
53 |        r2 = base address of c, r3 = N. r4 is the current item: 0 ≤ r4 < r3 */
54 |     push {r4, r5, r6, r7, r8, lr} /* keep callee-saved registers and lr */
55 | 
56 |     mov r4, #0             /* r4 ← 0 */
57 |     sub r8, r3, #3         /* r8 ← r3 - 3
58 |                               this is r8 ← N - 3 */
59 |     b .Lcheck_loop2        /* branch to check_loop2 */
60 | 
61 |     .Lloop2:
62 |       ldr r5, [r0, r4]     /* r5 ← *(r0 + r4) */
63 |       ldr r6, [r1, r4]     /* r6 ← *(r1 + r4) */
64 |       sadd8 r7, r5, r6     /* r7[7:0] ← r5[7:0] + r6[7:0] */
65 |                            /* r7[15:8] ← r5[15:8] + r6[15:8] */
66 |                            /* r7[23:16] ← r5[23:16] + r6[23:16] */
67 |                            /* r7[31:24] ← r5[31:24] + r6[31:24] */
68 |       str r7, [r2, r4]     /* *(r2 + r4) ← r7 */
69 |       add r4, r4, #4       /* r4 ← r4 + 4 */
70 |     .Lcheck_loop2:
71 |        cmp r4, r8          /* perform r4 - r8 and update cpsr */
72 |        blt .Lloop2         /* if cpsr means that r4 < r8 jump to loop2 */
73 |                            /* i.e. if r4 < N - 3 jump to loop2 */
74 | 
75 |      /* epilog loop: at most 3 items are left */
76 |      b .Lcheck_loop3       /* branch to check_loop3 */
77 | 
78 |      .Lloop3:
79 |         ldrb r5, [r0, r4]  /* r5 ← *{unsigned byte}(r0 + r4) */
80 |         ldrb r6, [r1, r4]  /* r6 ← *{unsigned byte}(r1 + r4) */
81 |         add r7, r5, r6     /* r7 ← r5 + r6 */
82 |         strb r7, [r2, r4]  /* *{unsigned byte}(r2 + r4) ← r7 */
83 |         add r4, r4, #1     /* r4 ← r4 + 1 */
84 |      .Lcheck_loop3:
85 |         cmp r4, r3         /* perform r4 - r3 and update cpsr */
86 |         blt .Lloop3        /* if cpsr means that r4 < r3 jump to loop 3 */
87 |     pop {r4, r5, r6, r7, r8, lr}  /* restore callee-saved registers and lr */
88 |     bx lr                         /* return: do not fall into main */
89 | 
90 | .globl main
91 | main:
92 |     mov r0, #0     /* r0 ← 0, the value returned by main */
93 |     bx lr          /* return; the functions of this file are never called */
94 | 


--------------------------------------------------------------------------------
/chapter25/clipped_add.s:
--------------------------------------------------------------------------------
 1 | 
 2 | .data
 3 | max16bit: .word 32767   /* upper clipping bound: 2^15 - 1 */
 4 | 
 5 | .text
 6 | 
 7 | clipped_add16bit:
 8 |     /* first operand is in r0 */
 9 |     /* second operand is in r1 */
10 |     /* result is left in r0: r0 + r1 clipped to the range -32768..32767 */
11 |     push {r4, lr}             /* keep registers */
12 | 
13 |     ldr r4, addr_of_max16bit  /* r4 ← &max16bit */
14 |     ldr r4, [r4]              /* r4 ← *r4 */
15 |                               /* now r4 == 32767 (i.e. 2^15 - 1) */
16 | 
17 |     add r0, r0, r1            /* r0 ← r0 + r1 */
18 |     cmp r0, r4                /* perform r0 - r4 and update cpsr */
19 |     movgt r0, r4              /* if r0 > r4 then r0 ← r4 */
20 |     bgt .Lclipped_add16bit_end /* if r0 > r4 then skip the lower bound */
21 | 
22 |     mvn r4, r4                /* r4 ← ~r4
23 |                                  now r4 == -32768 (i.e. -2^15) */
24 |     cmp r0, r4                /* perform r0 - r4 and update cpsr */
25 |     movlt r0, r4              /* if r0 < r4 then r0 ← r4 */
26 | 
27 |     .Lclipped_add16bit_end:   /* local label: stays out of the symbol table */
28 | 
29 |     pop {r4, lr}              /* restore registers */
30 |     bx lr                     /* return */
31 | addr_of_max16bit: .word max16bit
32 | 
33 | .global main
34 | 
35 | main:
36 |     mov r0, #0     /* r0 ← 0, the value returned by main */
37 |     bx lr          /* return; clipped_add16bit is never called */
38 | 


--------------------------------------------------------------------------------
/chapter25/motivation.s:
--------------------------------------------------------------------------------
 1 | # motivation.s
 2 | 
 3 | naive_channel_mixing:
 4 |     /* channel_out[i] ← (channel1[i] + channel2[i]) / 2, one signed 16-bit
 5 |        sample at a time. In: r0 = base address of channel1, r1 = base
 6 |        address of channel2, r2 = base address of channel_out, r3 = number
 7 |        of samples. r4 is the number of the current sample: 0 ≤ r4 < r3 */
 8 |     push {r4, r5, r6, r7, r8, lr} /* keep callee-saved registers and lr */
 9 |     mov r4, #0              /* r4 ← 0 */
10 |     b .Lcheck_loop          /* branch to check_loop */
11 |     .Lloop:
12 |       mov r5, r4, LSL #1    /* r5 ← r4 << 1 (this is r5 ← r4 * 2) */
13 |                             /* a halfword takes two bytes, so multiply
14 |                                the index by two. We do this here because
15 |                                ldrsh does not allow an addressing mode
16 |                                like [r0, r4, LSL #1] */
17 |       ldrsh r6, [r0, r5]    /* r6 ← *{signed half}(r0 + r5) */
18 |       ldrsh r7, [r1, r5]    /* r7 ← *{signed half}(r1 + r5) */
19 |       add r8, r6, r7        /* r8 ← r6 + r7 */
20 |       mov r8, r8, ASR #1    /* r8 ← r8 >> 1, arithmetic: signed halving */
21 |       strh r8, [r2, r5]     /* *{half}(r2 + r5) ← r8 */
22 |       add r4, r4, #1        /* r4 ← r4 + 1 */
23 |     .Lcheck_loop:
24 |       cmp r4, r3            /* compute r4 - r3 and update cpsr */
25 |       blt .Lloop            /* if r4 < r3 jump to the
26 |                                beginning of the loop */
27 |     pop {r4, r5, r6, r7, r8, lr}  /* restore callee-saved registers and lr */
28 |     bx lr                         /* return: do not fall into the next function */
29 | 
30 | 
31 | better_channel_mixing:
32 |     /* channel_out[i] ← (channel1[i] + channel2[i]) / 2, two signed 16-bit
33 |        samples per iteration. In: r0, r1 = base addresses of channel1 and
34 |        channel2, r2 = base address of channel_out, r3 = number of samples.
35 |        r4 is the current sample; an odd r3 makes the last word overrun */
36 |     push {r4, r6, r7, r8}   /* r4, r6, r7, r8 are callee-saved: keep them */
37 |     mov r4, #0              /* r4 ← 0 */
38 |     b .Lcheck_loop1         /* branch to check_loop1 */
39 |     .Lloop1:
40 |       ldr r6, [r0, r4, LSL #1] /* r6 ← *(r0 + r4 * 2), a sample is 2 bytes */
41 |       ldr r7, [r1, r4, LSL #1] /* r7 ← *(r1 + r4 * 2) */
42 |       shadd16 r8, r6, r7    /* r8[15:0] ← (r6[15:0] + r7[15:0]) >> 1*/
43 |                             /* r8[31:16] ← (r6[31:16] + r7[31:16]) >> 1*/
44 |       str r8, [r2, r4, LSL #1] /* *(r2 + r4 * 2) ← r8 */
45 |       add r4, r4, #2        /* r4 ← r4 + 2, two samples were mixed */
46 |     .Lcheck_loop1:
47 |       cmp r4, r3            /* compute r4 - r3 and update cpsr */
48 |       blt .Lloop1           /* if r4 < r3 jump to the
49 |                                beginning of the loop */
50 |     pop {r4, r6, r7, r8}    /* restore the callee-saved registers */
51 |     bx lr                   /* return: do not fall into main */
52 | 
53 | .globl main
54 | main:
55 |     mov r0, #0     /* r0 ← 0, the value returned by main */
56 |     bx lr          /* return; the mixing functions are never called */
57 | 


--------------------------------------------------------------------------------