├── .gitignore ├── ser_print.h ├── chacha.h ├── LICENSE ├── chacha20.c ├── chacha_core_c.c ├── Makefile ├── ser_print.c ├── speed.c ├── README.md └── chacha_core_avr.S /.gitignore: -------------------------------------------------------------------------------- 1 | obj-* 2 | *.axf 3 | *.hex 4 | *.s 5 | simavr/simavr 6 | simavr/run_avr 7 | *.a 8 | .tags* 9 | ._* 10 | *.vcd 11 | TAGS 12 | callgrind.out.* 13 | *~ 14 | *.o 15 | *.elf 16 | *.objdump 17 | perf.data 18 | .DS_Store 19 | .make.options* 20 | -------------------------------------------------------------------------------- /ser_print.h: -------------------------------------------------------------------------------- 1 | // ser_print.h 2 | // 2015-09-04 Markku-Juhani O. Saarinen 3 | // bits and pieces originally from various public domain sources 4 | 5 | #ifndef _SER_PRINT_H_ 6 | #define _SER_PRINT_H_ 7 | 8 | #include 9 | #include 10 | 11 | void ser_init(void); 12 | void ser_write(uint8_t c); 13 | void ser_print(const char *s); 14 | void ser_dec64(uint64_t x); 15 | void ser_hex8(uint8_t x); 16 | void ser_hex16(uint16_t x); 17 | void ser_end(); 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /chacha.h: -------------------------------------------------------------------------------- 1 | // chacha.h 2 | // 2018-06-09 Markku-Juhani O. Saarinen 3 | 4 | #ifndef _CHACHA_H_ 5 | #define _CHACHA_H_ 6 | 7 | #include 8 | 9 | // perform the permutation for "dr" doublerounds 10 | void chacha_perm(uint8_t st[64], uint8_t dr); 11 | 12 | // generate a block of ChaCha20 keystream as per RFC7539 13 | void chacha20_block(void *block, // 64 bytes written here 14 | const uint8_t key[32], // 256-bit secret key 15 | const uint8_t nonce[12], // 96-bit nonce 16 | uint32_t cnt); // 32-bit block counter 1, 2.. 
/*
 * MIT License
 *
 * Copyright (c) 2018 Markku-Juhani O. Saarinen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// ===========================================================================
// chacha20.c
// 2018-06-09  Markku-Juhani O. Saarinen
// ===========================================================================

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Same declaration as in chacha.h. The definition comes from whichever core
// is linked in: the C one below or the AVR assembly one.
void chacha_perm(uint8_t st[64], uint8_t dr);

// Read a 32-bit little-endian word from four bytes at any alignment.
static uint32_t load32_le(const uint8_t *p)
{
    return (uint32_t) p[0] |
           ((uint32_t) p[1] << 8) |
           ((uint32_t) p[2] << 16) |
           ((uint32_t) p[3] << 24);
}

// Write a 32-bit word as four little-endian bytes at any alignment.
static void store32_le(uint8_t *p, uint32_t w)
{
    p[0] = (uint8_t) w;
    p[1] = (uint8_t) (w >> 8);
    p[2] = (uint8_t) (w >> 16);
    p[3] = (uint8_t) (w >> 24);
}

// Add w (mod 2^32) to the little-endian word stored at p.
static void add32_le(uint8_t *p, uint32_t w)
{
    store32_le(p, load32_le(p) + w);
}

// Generate one 64-byte block of ChaCha20 keystream as per RFC7539.
//
//   block  64 bytes written here (no alignment needed by this C code)
//   key    256-bit secret key
//   nonce  96-bit nonce
//   cnt    32-bit block counter 1, 2..
//
// The state is laid out as 16 little-endian words: 4 constants, 8 key words,
// the counter, 3 nonce words. It is permuted with 10 double rounds and the
// initial state is then added back word by word. All word accesses go
// through byte-wise helpers, so the result does not depend on host byte
// order or on the alignment of block, key or nonce.
//
// NOTE: the AVR assembly core forms addresses by adding offsets to the low
// pointer byte only, so with that core the block must not straddle a
// 256-byte boundary (speed.c uses an aligned(64) buffer for that reason).

void chacha20_block(void *block,                // 64 bytes written here
                    const uint8_t key[32],      // 256-bit secret key
                    const uint8_t nonce[12],    // 96-bit nonce
                    uint32_t cnt)               // 32-bit block counter 1, 2..
{
    static const uint32_t fixed[4] =
        { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
    uint8_t *out = block;
    size_t i;

    // initial state; key and nonce are already little-endian byte strings
    for (i = 0; i < 4; i++)
        store32_le(out + 4 * i, fixed[i]);
    memcpy(out + 16, key, 32);
    store32_le(out + 48, cnt);
    memcpy(out + 52, nonce, 12);

    chacha_perm(out, 10);                       // 10 double-rounds

    // feed-forward: add the initial state back in
    for (i = 0; i < 4; i++)
        add32_le(out + 4 * i, fixed[i]);
    for (i = 0; i < 8; i++)
        add32_le(out + 16 + 4 * i, load32_le(key + 4 * i));
    add32_le(out + 48, cnt);
    for (i = 0; i < 3; i++)
        add32_le(out + 52 + 4 * i, load32_le(nonce + 4 * i));
}

// ===========================================================================
// chacha_core_c.c
// 2018-06-09  Markku-Juhani O. Saarinen
//
// C version of the ChaCha core (for benchmarking comparison)
// ===========================================================================

// Rotate 32-bit words left (y must be in 1..31)

#ifndef ROTL32
#define ROTL32(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
#endif

// ChaCha Quarter Round unrolled as a macro

#define CHACHA_QR(A, B, C, D) do {          \
    A += B; D ^= A; D = ROTL32(D, 16);      \
    C += D; B ^= C; B = ROTL32(B, 12);      \
    A += B; D ^= A; D = ROTL32(D, 8);       \
    C += D; B ^= C; B = ROTL32(B, 7);       \
} while (0)

// ChaCha permutation -- dr is the number of double rounds.
//
// st holds 16 little-endian 32-bit words and is updated in place. The words
// are copied into a local array first, so st may have any alignment and the
// byte-level result is the same on any host. dr == 0 leaves st unchanged.

void chacha_perm(uint8_t st[64], uint8_t dr)
{
    uint32_t v[16];
    uint8_t i;

    for (i = 0; i < 16; i++) {
        const uint8_t *p = st + 4 * i;
        v[i] = (uint32_t) p[0] |
               ((uint32_t) p[1] << 8) |
               ((uint32_t) p[2] << 16) |
               ((uint32_t) p[3] << 24);
    }

    for (i = 0; i < dr; i++) {
        // column round
        CHACHA_QR(v[ 0], v[ 4], v[ 8], v[12]);
        CHACHA_QR(v[ 1], v[ 5], v[ 9], v[13]);
        CHACHA_QR(v[ 2], v[ 6], v[10], v[14]);
        CHACHA_QR(v[ 3], v[ 7], v[11], v[15]);
        // diagonal round
        CHACHA_QR(v[ 0], v[ 5], v[10], v[15]);
        CHACHA_QR(v[ 1], v[ 6], v[11], v[12]);
        CHACHA_QR(v[ 2], v[ 7], v[ 8], v[13]);
        CHACHA_QR(v[ 3], v[ 4], v[ 9], v[14]);
    }

    for (i = 0; i < 16; i++) {
        uint8_t *p = st + 4 * i;
        p[0] = (uint8_t) v[i];
        p[1] = (uint8_t) (v[i] >> 8);
        p[2] = (uint8_t) (v[i] >> 16);
        p[3] = (uint8_t) (v[i] >> 24);
    }
}
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | # 2018-06-09 Markku-Juhani O. Saarinen 3 | 4 | PROJ = chacha-avr 5 | TARGET = atmega2560 6 | CPUFREQ = 16000000 7 | WIRE = wiring 8 | DEVICE = /dev/ttyACM0 9 | CC = avr-gcc 10 | OBJCOPY = avr-objcopy 11 | AR = avr-ar 12 | STRIP = avr-strip 13 | AVRDUDE = avrdude 14 | 15 | CFLAGS = -Wall -Ofast -mmcu=$(TARGET) -DF_CPU=$(CPUFREQ) 16 | 17 | # Uncomment this line instead to use the slower C version 18 | # OBJS = obj/chacha_core_c.o obj/chacha20.o 19 | OBJS = obj/chacha_core_avr.o obj/chacha20.o 20 | 21 | $(PROJ): speed.c ser_print.c obj/$(PROJ).a 22 | $(CC) $(CFLAGS) $^ -o $@ 23 | 24 | %.hex: % 25 | $(OBJCOPY) -O ihex -R .eeprom $^ $@ 26 | 27 | obj/$(PROJ).a: $(OBJS) 28 | $(AR) -ar cr $@ $^ 29 | 30 | obj/%.o: %.[cS] 31 | mkdir -p obj/ 32 | $(CC) $(CFLAGS) -c $^ -o $@ 33 | 34 | obj/%.S: %.c 35 | $(CC) $(CFLAGS) -S $^ -o $@ 36 | 37 | clean: 38 | rm -rf obj $(PROJ) $(PROJ).hex $(PROJ)-*.tgz 39 | 40 | dist: clean 41 | cd ..; \ 42 | tar cfvz $(PROJ)/$(PROJ)-`date "+%Y%m%d%H%M"`.tgz $(PROJ)/* 43 | 44 | # simulate with simavr 45 | sim: $(PROJ) 46 | simavr -v -v -v -m $(TARGET) $(PROJ) 47 | 48 | # flash on device, dump serial output 49 | flash: $(PROJ).hex 50 | $(AVRDUDE) -v -c $(WIRE) -p m2560 -P $(DEVICE) \ 51 | -U flash:w:$(PROJ).hex -D 52 | stty -F $(DEVICE) raw icanon eof \^d 38400 53 | cat < $(DEVICE) 54 | -------------------------------------------------------------------------------- /ser_print.c: -------------------------------------------------------------------------------- 1 | // ser_print.c 2 | // 04-Sep-15 Markku-Juhani O. Saarinen 3 | // bits and pieces originally from various public domain sources 4 | 5 | #include 6 | #include 7 | #include "ser_print.h" 8 | 9 | #ifndef F_CPU 10 | #warning "F_CPU is not defined, set to 16MHz per default." 
11 | #define F_CPU 16000000 12 | #endif 13 | 14 | //#define BAUD 57600 15 | #define BAUD 38400 16 | #include 17 | 18 | #ifndef UCSRB 19 | # ifndef UDRE 20 | # define UDRE UDRE0 21 | # define RXEN RXEN0 22 | # define TXEN TXEN0 23 | # endif 24 | # ifdef UCSR0A /* ATmega128 */ 25 | # define UCSRA UCSR0A 26 | # define UCSRB UCSR0B 27 | # define UBRRL UBRR0L 28 | # define UBRRH UBRR0H 29 | # define UDR UDR0 30 | # else /* ATmega8 */ 31 | # define UCSRA USR 32 | # define UCSRB UCR 33 | # endif 34 | #endif 35 | 36 | #ifndef UBRR 37 | # define UBRR UBRRL 38 | #endif 39 | 40 | static char ser_initialized = 0; 41 | 42 | void ser_init(void) 43 | { 44 | UBRRH = UBRRH_VALUE; 45 | UBRRL = UBRRL_VALUE; 46 | /* Enable */ 47 | UCSRB = (1 << RXEN) | (1 << TXEN); 48 | } 49 | 50 | void ser_write(unsigned char c) 51 | { 52 | if (!ser_initialized) { 53 | ser_init(); 54 | ser_initialized = 1; 55 | } 56 | while (!(UCSRA & (1 << UDRE))) {}; 57 | UDR = c; 58 | } 59 | 60 | void ser_print(const char *s) 61 | { 62 | while (*s != 0) { 63 | ser_write(*s); 64 | s++; 65 | } 66 | } 67 | 68 | void ser_dec64(uint64_t x) 69 | { 70 | char buf[21]; 71 | int i; 72 | 73 | if (x == 0) { 74 | ser_print("0"); 75 | } else { 76 | i = 20; 77 | buf[i] = 0; 78 | while (x > 0 && i > 0) { 79 | buf[--i] = (char) ((x % 10) + '0'); 80 | x = x / 10; 81 | } 82 | ser_print(&buf[i]); 83 | } 84 | } 85 | 86 | void ser_hex8(uint8_t x) 87 | { 88 | char y; 89 | 90 | y = x >> 4; 91 | if (y < 10) 92 | y += '0'; 93 | else 94 | y += 'A' - 10; 95 | ser_write(y); 96 | y = x & 0xF; 97 | if (y < 10) 98 | y += '0'; 99 | else 100 | y += 'A' - 10; 101 | ser_write(y); 102 | } 103 | 104 | void ser_hex16(uint16_t x) 105 | { 106 | ser_hex8(x >> 8); 107 | ser_hex8(x & 0xFF); 108 | } 109 | 110 | void ser_end() 111 | { 112 | ser_write(4); 113 | 114 | while (1) 115 | {;} 116 | } 117 | 118 | -------------------------------------------------------------------------------- /speed.c: 
-------------------------------------------------------------------------------- 1 | // speed.c 2 | // 2018-06-09 Markku-Juhani O. Saarinen 3 | 4 | // Test / benchmark code 5 | 6 | #include 7 | #include 8 | #include "ser_print.h" 9 | 10 | #include "chacha.h" 11 | 12 | /* === speed-testing code === */ 13 | 14 | uint64_t tick_ticks; 15 | static uint8_t tick_init_done = 0; 16 | 17 | // intialize timers 18 | 19 | static void tick_init(void) 20 | { 21 | tick_ticks = 0; 22 | #ifdef __AVR_ATmega128__ 23 | TCCR1B = (1 << CS12); 24 | TIMSK |= (1 << TOIE1); 25 | #else 26 | TCCR0B = (1 << CS00); 27 | TCCR1B = (1 << CS12); 28 | TIMSK1 |= (1 << TOIE1); 29 | #endif 30 | TCNT0 = 0; 31 | TCNT1 = 0; 32 | sei(); // Enable global interrupts 33 | tick_init_done = 1; 34 | } 35 | 36 | // interrupt handler on TIMER1 overflow 37 | 38 | ISR(TIMER1_OVF_vect) 39 | { 40 | tick_ticks += (1UL << 24); 41 | } 42 | 43 | unsigned long long tick_cycles(void) 44 | { 45 | if (!tick_init_done) 46 | tick_init(); 47 | 48 | return tick_ticks | (((uint64_t) TCNT1) << 8) | ((uint64_t) TCNT0); 49 | } 50 | 51 | /* 52 | Test keystream output from section 2.4.2 of RFC 7539 53 | 54 | 224F51F3401BD9E12FDE276FB8631DED8C131F823D2C06E27E4FCAEC9EF3CF78 55 | 8A3B0AA372600A92B57974CDED2B9334794CBA40C63E34CDEA212C4CF07D41B7 56 | 69A6749F3F630F4122CAFE28EC4DC47E26D4346D70B98C73F3E9C53AC40C5945 57 | 398B6EDA1A832C89C167EACD901D7E2BF363 58 | */ 59 | 60 | int main(void) 61 | { 62 | // ChaCha20 test vectors from RFC 7539 63 | 64 | const uint8_t key[32] = { 65 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 66 | 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 67 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 68 | 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F 69 | }; 70 | const uint8_t nonce[12] = { 71 | 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4a, 72 | 0x00, 0x00, 0x00, 0x00 73 | }; 74 | 75 | size_t i, run; 76 | uint64_t t; 77 | 78 | // has to be aligned 79 | uint8_t st[64] __attribute__((aligned(64))); 80 | 81 | // 
Output keystream 82 | 83 | chacha20_block(st, key, nonce, 1); 84 | for (i = 0; i < 64; i++) 85 | ser_hex8(st[i]); 86 | chacha20_block(st, key, nonce, 2); 87 | for (i = 0; i < 114 - 64; i++) 88 | ser_hex8(st[i]); 89 | ser_write('\n'); 90 | 91 | // time ChaCha8 92 | 93 | tick_init(); 94 | 95 | for (run = 0; run < 10; run++) { 96 | 97 | ser_print("Run #"); 98 | ser_hex8(run); 99 | ser_print(" "); 100 | 101 | t = tick_cycles(); 102 | for (i = 1 << run; i > 0; i--) { 103 | chacha_perm(st, 4); 104 | chacha_perm(st, 4); 105 | chacha_perm(st, 4); 106 | chacha_perm(st, 4); 107 | } 108 | t = tick_cycles() - t; 109 | t >>= run + 2; 110 | 111 | ser_dec64(t); 112 | ser_print(" ticks / block\n"); 113 | } 114 | 115 | ser_end(); 116 | 117 | return 0; 118 | } 119 | 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | chacha-avr 2 | ========== 3 | 4 | 2018-06-10 Markku-Juhani O. Saarinen 5 | 6 | A public domain implementation of **ChaCha20** on 8-bit 7 | AVR microcontroller, just because *you never know when you might need one* ! 8 | We implement the raw ChaCha permutation and include ChaCha20 keystream block 9 | generation per [RFC 7539](https://tools.ietf.org/html/rfc7539) as a higher 10 | level use case. 11 | 12 | The actual assembly language implementation is in `chacha_core_avr.S`, 13 | the rest is basically for serial output, testing, and benchmarking. 14 | 15 | ## Comparison with a fast C implementation 16 | 17 | An unrolled C implementation of the same permutation is included in 18 | `chacha_core_c.c` and the `Makefile` contains instructions for swapping it in. 19 | 20 | As can be seen, the C language ChaCha8 block operation requires 18163 ticks, 21 | i.e. 3.6 times more time. Furthermore, the code size is 2594 bytes larger. 22 | My handwritten assembly language permutation is only 324 bytes. 
This means 23 | that you can get high security cryptography running with less than half a 24 | kilobyte of code altogether. 25 | 26 | 27 | ## Running on Arduino 28 | 29 | The flashing code is incorporated into Makefile. You need to install 30 | (just `apt install` on Debian/Ubuntu) at least `avr-gcc` and `avrdude` 31 | packages. 32 | 33 | Plug Arduino into a USB port and do a `make flash`. Now the thing will compile 34 | code, flash it, and dump output via UART with stty magic to your standard 35 | output. Most relevant settings are in `Makefile` -- you should edit that 36 | first if something fails. The default settings work with my Arduino Mega 2560 37 | with Ubuntu 18.04. 38 | 39 | ``` 40 | $ make flash 41 | mkdir -p obj/ 42 | avr-gcc -Wall -Ofast -mmcu=atmega2560 -DF_CPU=16000000 -c chacha_core_avr.S -o obj/chacha_core_avr.o 43 | mkdir -p obj/ 44 | ``` 45 | .. more stuff.. and then the interesting bit: 46 | ``` 47 | stty -F /dev/ttyACM0 raw icanon eof \^d 38400 48 | cat < /dev/ttyACM0 49 | 224F51F3401BD9E12FDE276FB8631DED8C131F823D2C06E27E4FCAEC9EF3CF788A3B0AA372600A92B57974CDED2B9334794CBA40C63E34CDEA212C4CF07D41B769A6749F3F630F4122CAFE28EC4DC47E26D4346D70B98C73F3E9C53AC40C5945398B6EDA1A832C89C167EACD901D7E2BF363 50 | Run #00 5073 ticks / block 51 | Run #01 5063 ticks / block 52 | Run #02 5057 ticks / block 53 | Run #03 5055 ticks / block 54 | Run #04 5053 ticks / block 55 | Run #05 5052 ticks / block 56 | Run #06 5052 ticks / block 57 | Run #07 5051 ticks / block 58 | Run #08 5052 ticks / block 59 | Run #09 5052 ticks / block 60 | ``` 61 | That's a success. The long hex string there should match with keystream from 62 | section 2.4.2 of RFC 7539 and can be used to verify correctness. 63 | 64 | The performance numbers are for a ChaCha8 block operation, not ChaCha20. 65 | 66 | ## Running on SIMAVR 67 | 68 | If you don't have an Arduino lying about, you can use 69 | `simavr` (https://github.com/buserror/simavr) and test the code with 70 | `make sim`. 
71 | 72 | This cycle-perfect simulator simulates even the UART so the output will be 73 | basically equivalent to above: 74 | ``` 75 | simavr -v -v -v -m atmega2560 chacha-avr 76 | Loaded 3822 .text at address 0x0 77 | Loaded 72 .data 78 | UART: 0 configured to 0019 = 2403.8462 bps (x1), 8 data 1 stop 79 | UART: Roughly 4576 usec per byte 80 | 224F51F3401BD9E12FDE276FB8631DED8C131F823D2C06E27E4FCAEC9EF3CF788A3B0AA372600A92B57974CDED2B9334794CBA40C63E34CDEA212C4CF07D41B769A6749F3F630F4122CAFE28EC4DC47E26D4346D70B98C73F3E9C53AC40C5945398B6EDA1A832C89C167EACD901D7E2BF363. 81 | Run #00 5073 ticks / block. 82 | Run #01 5063 ticks / block. 83 | ^Csignal caught, simavr terminating 84 | ``` 85 | ### Have fun 86 | 87 | Cheers, -markku 88 | 89 | **ABSOLUTELY NO WARRANTY WHATSOEVER** 90 | 91 | -------------------------------------------------------------------------------- /chacha_core_avr.S: -------------------------------------------------------------------------------- 1 | // chacha_core_avr.S 2 | // 2018-06-09 Markku-Juhani O. 
// Saarinen

__tmp_reg__ = 0
__zero_reg__ = 1

//  R2  - R17, R28, R29 are call-saved
//  R18 - R27, R30, R31 are call-clobbered

        .text

//  This is the quarter round function, on four 32-bit state words
//  A, B, C, D stored least significant byte first.
//
//  In:   r25:r24   pointer to the state
//        r20       byte offset of A from the start of the state
//        r21       byte offset of B relative to A
//        r26       byte offset of C relative to B
//        r27       byte offset of D relative to C
//
//  Uses r0, r4 - r19, r23 and Z; r20 - r22, r24 - r27 are left untouched.
//
//  The offsets are added to ZL only, with no carry into ZH, so the 64-byte
//  state must not straddle a 256-byte address boundary.
//
//  Rotations by multiples of 8 cost nothing: the word is simply relabelled,
//  and the "X = ( ... )" notes give the new register order, low byte first.

.qr:
        movw    z, r24              // input pointer r25:r24

        add     zl, r20             // r20: A offset to start
        ld      r4, z               // load A = ( r4, r5, r6, r7 )
        ldd     r5, z + 1
        ldd     r6, z + 2
        ldd     r7, z + 3

        add     zl, r21             // r21: B offset to A
        ld      r8, z               // load B = ( r8, r9, r10, r11 )
        ldd     r9, z + 1
        ldd     r10, z + 2
        ldd     r11, z + 3

        add     zl, r26             // r26: C offset to B
        ld      r12, z              // load C = ( r12, r13, r14, r15 )
        ldd     r13, z + 1
        ldd     r14, z + 2
        ldd     r15, z + 3

        add     zl, r27             // r27: D offset to C
        ld      r16, z              // load D = ( r16, r17, r18, r19 )
        ldd     r17, z + 1
        ldd     r18, z + 2
        ldd     r19, z + 3

        add     r4, r8              // A += B
        adc     r5, r9
        adc     r6, r10
        adc     r7, r11

        eor     r16, r4             // D ^= A
        eor     r17, r5
        eor     r18, r6
        eor     r19, r7

        //  D = ( r18, r19, r16, r17 )  --  D <<< 16

        add     r12, r18            // C += D
        adc     r13, r19
        adc     r14, r16
        adc     r15, r17

        eor     r8, r12             // B ^= C
        eor     r9, r13
        eor     r10, r14
        eor     r11, r15

        ldi     r23, 4              // B <<< 4, one bit per pass
.rol1:
        lsl     r8
        rol     r9
        rol     r10
        rol     r11
        adc     r8, __zero_reg__    // bit shifted out of the top re-enters at bit 0
        dec     r23
        brne    .rol1

        //  B = ( r11, r8, r9, r10 )  --  B <<< 8, i.e. B <<< 12 in total

        add     r4, r11             // A += B
        adc     r5, r8
        adc     r6, r9
        adc     r7, r10

        eor     r18, r4             // D ^= A
        eor     r19, r5
        eor     r16, r6
        eor     r17, r7

        //  D = ( r17, r18, r19, r16 )  --  D <<< 8

        add     r12, r17            // C += D
        adc     r13, r18
        adc     r14, r19
        adc     r15, r16

        eor     r11, r12            // B ^= C
        eor     r8, r13
        eor     r9, r14
        eor     r10, r15

        mov     __tmp_reg__, r11    // B >>> 1
        lsr     __tmp_reg__         // set carry-in from lsb
        ror     r10
        ror     r9
        ror     r8
        ror     r11

        //  B = ( r10, r11, r8, r9 )  --  B <<< 8, i.e. B <<< 7 in total

        movw    z, r24              // input pointer r25:r24

        add     zl, r20             // r20: A offset from start
        st      z, r4               // store A
        std     z + 1, r5
        std     z + 2, r6
        std     z + 3, r7

        add     zl, r21             // r21: B offset to A
        st      z, r10              // store B
        std     z + 1, r11
        std     z + 2, r8
        std     z + 3, r9

        add     zl, r26             // r26: C offset to B
        st      z, r12              // store C
        std     z + 1, r13
        std     z + 2, r14
        std     z + 3, r15

        add     zl, r27             // r27: D offset to C
        st      z, r17              // store D
        std     z + 1, r18
        std     z + 2, r19
        std     z + 3, r16

        ret


//  void chacha_perm(uint8_t st[64], uint8_t dr)
//
//  In:   r25:r24   st, the 64-byte state, permuted in place
//        r22       dr, the number of double rounds
//
//  dr == 0 returns immediately without touching the state, like the C
//  version of this function. (Without the check the dec / brne loop below
//  would wrap around and run 256 double rounds.)

        .global chacha_perm
        .type   chacha_perm, @function

chacha_perm:

        tst     r22                 // dr == 0: nothing to do
        brne    .save
        ret

.save:
        push    r4                  // .qr uses the call-saved r4 - r17
        push    r5
        push    r6
        push    r7
        push    r8
        push    r9
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        push    r16
        push    r17

.iter:
        ldi     r20, 0              // QR( v[ 0], v[ 4], v[ 8], v[12] )
        ldi     r21, 16
        ldi     r26, 16
        ldi     r27, 16
        rcall   .qr

        ldi     r20, 4              // QR( v[ 1], v[ 5], v[ 9], v[13] )
        rcall   .qr

        ldi     r20, 8              // QR( v[ 2], v[ 6], v[10], v[14] )
        rcall   .qr

        ldi     r20, 12             // QR( v[ 3], v[ 7], v[11], v[15] )
        rcall   .qr

        ldi     r20, 0              // QR( v[ 0], v[ 5], v[10], v[15] )
        ldi     r21, 20
        ldi     r26, 20
        ldi     r27, 20
        rcall   .qr

        ldi     r20, 4              // QR( v[ 1], v[ 6], v[11], v[12] )
        ldi     r27, 4
        rcall   .qr

        ldi     r20, 8              // QR( v[ 2], v[ 7], v[ 8], v[13] )
        ldi     r26, 4
        ldi     r27, 20
        rcall   .qr

        ldi     r20, 12             // QR( v[ 3], v[ 4], v[ 9], v[14] )
        ldi     r21, 4
        ldi     r26, 20
        rcall   .qr

        dec     r22                 // r22 = "dr", double round count
        brne    .iter               // iterate

        pop     r17
        pop     r16
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
        pop     r9
        pop     r8
        pop     r7
        pop     r6
        pop     r5
        pop     r4

        ret
        .size   chacha_perm, .-chacha_perm