/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * AES-NI + SSE4.1 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 * Copyright 2024 Google LLC
 */

/* Provides the SYM_FUNC_START, SYM_FUNC_END and RET macros used below. */
#include <linux/linkage.h>

/*
 * This file is GNU as / AT&T syntax (source operand first, destination last)
 * and is run through the C preprocessor.
 *
 * Register allocation shared by every routine in this file:
 *
 *   STATE0-STATE4  the five 128-bit state words
 *   KEY / MSG      deliberately the same register: KEY is only referenced by
 *                  aegis128_aesni_init, which never references MSG
 *   T0             scratch, clobbered by aegis128_update
 *   T1             scratch, not touched by aegis128_update
 */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

/*
 * Initialization constants.  Read back to back, the 32 bytes are the
 * Fibonacci sequence 0, 1, 1, 2, 3, 5, ... reduced modulo 256.
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * 16 bytes of 0xff followed by 16 bytes of 0x00.  aegis128_aesni_dec_tail
 * does an unaligned 16-byte load from .Lzeropad_mask + 16 - len, which yields
 * a mask whose first len bytes are 0xff and whose remaining bytes are zero.
 */
.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32
.align 32
.Lzeropad_mask:
	.octa 0xffffffffffffffffffffffffffffffff
	.octa 0

.text

/*
 * aegis128_update
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 *
 * Each state register is run through one AES round (aesenc) with its
 * neighbour as the round key.  The results are left rotated by one register:
 * the new word 0 is in STATE4, the new word 1 in STATE0, ..., the new word 4
 * in STATE3.  The message block is NOT mixed in here; callers XOR it into the
 * register that now holds word 0 right after expanding the macro.
 */
.macro aegis128_update
	movdqa STATE4, T0		/* save STATE4: it is overwritten first */
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3		/* uses the saved, old STATE4 */
.endm

/*
 * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register
 * MSG and zeroize any remaining bytes.  Clobbers %rax, %rcx, and %r8.
*/
.macro load_partial
	/*
	 * Every path below reads only bytes inside [SRC, SRC + LEN): lengths
	 * that are not a power of two are assembled from two overlapping
	 * fixed-size loads.  All labels carry the assembler's per-expansion
	 * suffix so that the macro can be expanded more than once per file.
	 */
	sub $8, %ecx			/* LEN - 8 */
	jle .Lle8\@

	/* Load 9 <= LEN <= 15 bytes: */
	movq (SRC), MSG			/* Load first 8 bytes */
	mov (SRC, %rcx), %rax		/* Load last 8 bytes */
	/*
	 * %ecx becomes 8 * (8 - LEN).  A 64-bit shift uses only the low six
	 * bits of %cl, so the shr below shifts by 8 * (16 - LEN) bits, i.e. it
	 * drops the 16 - LEN bytes that the movq above already loaded and
	 * leaves zeroes in the top of %rax.
	 */
	neg %ecx
	shl $3, %ecx
	shr %cl, %rax			/* Discard overlapping bytes */
	pinsrq $1, %rax, MSG
	jmp .Ldone\@

.Lle8\@:
	add $4, %ecx			/* LEN - 4 */
	jl .Llt4\@

	/* Load 4 <= LEN <= 8 bytes: */
	mov (SRC), %eax			/* Load first 4 bytes */
	mov (SRC, %rcx), %r8d		/* Load last 4 bytes */
	jmp .Lcombine\@

.Llt4\@:
	/* Load 1 <= LEN <= 3 bytes: */
	add $2, %ecx			/* LEN - 2 */
	/* movzbl does not alter the flags, so the jl still tests the add. */
	movzbl (SRC), %eax		/* Load first byte */
	jl .Lmovq\@
	movzwl (SRC, %rcx), %r8d	/* Load last 2 bytes */
.Lcombine\@:
	/*
	 * %ecx is the byte offset of the tail part held in %r8.  Shift the
	 * tail up to that offset and merge it with the head in %rax.  Bytes
	 * covered by both parts were read from the same addresses, so the OR
	 * leaves them unchanged.
	 */
	shl $3, %ecx
	shl %cl, %r8
	or %r8, %rax			/* Combine the two parts */
.Lmovq\@:
	/* movq from a GPR clears the upper 64 bits of the xmm register. */
	movq %rax, MSG
.Ldone\@:
.endm

/*
 * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer
 * DST.  Clobbers %rax, %rcx, and %r8.
 *
 * The register \msg itself is only read.  No byte at or beyond DST + LEN is
 * written: the longer cases use two overlapping fixed-size stores.
 */
.macro store_partial msg
	sub $8, %ecx			/* LEN - 8 */
	jl .Llt8\@

	/* Store 8 <= LEN <= 15 bytes: */
	/*
	 * Rotate the high qword right by 8 * (LEN - 8) bits so that its low
	 * LEN - 8 bytes end up in the top of %rax, then store it at
	 * DST + LEN - 8.  The low bytes of that store land inside DST[0..7]
	 * and are overwritten with the real data by the movq that follows, so
	 * the order of the two stores matters.
	 */
	pextrq $1, \msg, %rax
	mov %ecx, %r8d
	shl $3, %ecx
	ror %cl, %rax
	mov %rax, (DST, %r8)		/* Store last LEN - 8 bytes */
	movq \msg, (DST)		/* Store first 8 bytes */
	jmp .Ldone\@

.Llt8\@:
	add $4, %ecx			/* LEN - 4 */
	jl .Llt4\@

	/* Store 4 <= LEN <= 7 bytes: */
	/* Same rotate-and-overlap scheme as above, on 32-bit halves. */
	pextrd $1, \msg, %eax
	mov %ecx, %r8d
	shl $3, %ecx
	ror %cl, %eax
	mov %eax, (DST, %r8)		/* Store last LEN - 4 bytes */
	movd \msg, (DST)		/* Store first 4 bytes */
	jmp .Ldone\@

.Llt4\@:
	/* Store 1 <= LEN <= 3 bytes: */
	/*
	 * %ecx is LEN - 4 here, i.e. -3, -2 or -1.  pextrb does not alter the
	 * flags, so a single cmp serves both the jl (LEN == 1) and the je
	 * (LEN == 2) below.
	 */
	pextrb $0, \msg, 0(DST)
	cmp $-2, %ecx			/* LEN - 4 == -2, i.e. LEN == 2? */
	jl .Ldone\@
	pextrb $1, \msg, 1(DST)
	je .Ldone\@
	pextrb $2, \msg, 2(DST)
.Ldone\@:
.endm

/*
 * void aegis128_aesni_init(struct aegis_state *state,
 *			    const struct aegis_block *key,
 *			    const u8 iv[AEGIS128_NONCE_SIZE]);
 *
 * Builds the initial state from the key and the IV and writes the five
 * 16-byte state words to offsets 0x00-0x40 of *state.  The key is read with
 * an aligned load (movdqa) and therefore must be 16-byte aligned; the IV and
 * the state are accessed with unaligned moves.
 */
SYM_FUNC_START(aegis128_aesni_init)
	.set STATEP, %rdi
	.set KEYP, %rsi
	.set IVP, %rdx

	/* load IV: */
	movdqu (IVP), T1

	/* load key: */
	movdqa (KEYP), KEY
	pxor KEY, T1			/* T1 = KEY xor IV from here on */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants: */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4
	/*
	 * Now: STATE0 = KEY xor IV, STATE1 = const_1, STATE2 = const_0,
	 * STATE3 = KEY xor const_0, STATE4 = KEY xor const_1.
	 */

	/* update 10 times with KEY / KEY xor IV: */
	/*
	 * aegis128_update leaves the new word 0 one register further down on
	 * every call, hence the pxor target cycling STATE4, 3, 2, 1, 0.  After
	 * ten updates the rotation is back at its starting position, so the
	 * state is stored in natural register order.
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state: */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	RET
SYM_FUNC_END(aegis128_aesni_init)

/*
 * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data,
 *			  unsigned int len);
 *
 * len must be a multiple of 16.
*/
SYM_FUNC_START(aegis128_aesni_ad)
	.set STATEP, %rdi
	.set SRC, %rsi
	.set LEN, %edx

	/*
	 * Private helpers for this function.  Macro definitions emit no code;
	 * they sit here because they rely on the STATEP/SRC/LEN bindings made
	 * just above.
	 *
	 * ad_absorb_block i, target, out
	 *   Absorb the 16-byte block at SRC + 16 * i: update the state, XOR
	 *   the block into \target (the register that holds state word 0 after
	 *   this update), take 16 off LEN and leave through .Lad_out_<out>
	 *   once LEN reaches zero.  Clobbers MSG and T0.
	 *
	 * ad_write_state r0, r1, r2, r3, r4
	 *   Store the five given registers as state words 0-4 at STATEP.
	 */
	.macro ad_absorb_block i, target, out
		movdqu (\i * 0x10)(SRC), MSG
		aegis128_update
		pxor MSG, \target
		sub $0x10, LEN
		jz .Lad_out_\out
	.endm

	.macro ad_write_state r0, r1, r2, r3, r4
		movdqu \r0, 0x00(STATEP)
		movdqu \r1, 0x10(STATEP)
		movdqu \r2, 0x20(STATEP)
		movdqu \r3, 0x30(STATEP)
		movdqu \r4, 0x40(STATEP)
	.endm

	/* Zero length: return without reading or writing the state. */
	test LEN, LEN
	jz .Lad_out

	/* Bring the state into registers. */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/*
	 * Five blocks per trip: after five updates the register rotation done
	 * by aegis128_update is back where it started.
	 */
.align 8
.Lad_loop:
	ad_absorb_block 0, STATE4, 1
	ad_absorb_block 1, STATE3, 2
	ad_absorb_block 2, STATE2, 3
	ad_absorb_block 3, STATE1, 4
	ad_absorb_block 4, STATE0, 0

	add $0x50, SRC
	jmp .Lad_loop

	/*
	 * Exits.  .Lad_out_<n> is reached with the registers rotated by n
	 * positions, so each one writes them back in the matching order.
	 */
.Lad_out_0:
	ad_write_state STATE0, STATE1, STATE2, STATE3, STATE4
	RET

.Lad_out_1:
	ad_write_state STATE4, STATE0, STATE1, STATE2, STATE3
	RET

.Lad_out_2:
	ad_write_state STATE3, STATE4, STATE0, STATE1, STATE2
	RET

.Lad_out_3:
	ad_write_state STATE2, STATE3, STATE4, STATE0, STATE1
	RET

.Lad_out_4:
	ad_write_state STATE1, STATE2, STATE3, STATE4, STATE0
.Lad_out:
	RET
SYM_FUNC_END(aegis128_aesni_ad)

/*
 * encrypt_block s0, s1, s2, s3, s4, blk
 *
 * Encrypt the 16-byte block number \blk (counted from the current SRC/DST).
 * \s0-\s4 name the registers that hold state words 0-4 on entry; \s0 is
 * accepted only so that the call sites read as a full state listing.
 *
 *   ciphertext = plaintext xor \s1 xor \s4 xor (\s2 and \s3)
 *
 * The plaintext is then absorbed: after aegis128_update the register \s4
 * holds the new word 0, which is where MSG is XORed in.  LEN is decremented
 * by 16 and control leaves through .Lenc_out_<blk> when it reaches zero.
 * Clobbers MSG, T0 and T1.
 */
.macro encrypt_block s0, s1, s2, s3, s4, blk
	movdqu (\blk * 0x10)(SRC), MSG

	/* T0 = MSG xor keystream */
	movdqa MSG, T0
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdqu T0, (\blk * 0x10)(DST)

	/* T0 has been stored, so aegis128_update is free to clobber it. */
	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	jz .Lenc_out_\blk
.endm

/*
 * void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, u8 *dst,
 *			   unsigned int len);
 *
 * len must be nonzero and a multiple of 16.
*/
SYM_FUNC_START(aegis128_aesni_enc)
	.set STATEP, %rdi
	.set SRC, %rsi
	.set DST, %rdx
	.set LEN, %ecx

	/*
	 * NOTE(review): there is no len == 0 guard here.  The first
	 * "sub $0x10, LEN" would wrap instead of reaching zero, so callers
	 * must honour the "nonzero" requirement stated above.
	 */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/*
	 * Five blocks per trip.  Each encrypt_block call names the registers
	 * in the order in which they currently hold state words 0-4; that
	 * order rotates by one per block and is back at STATE0..STATE4 after
	 * five blocks.  encrypt_block itself branches to .Lenc_out_<i> when
	 * LEN reaches zero.
	 */
.align 8
.Lenc_loop:
	encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_loop

	/* store the state: */
	/* .Lenc_out_<i>: i + 1 blocks were processed since the last full trip */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	RET

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	RET

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	RET

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	RET

.Lenc_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	/* NOTE(review): no branch in the visible source targets .Lenc_out. */
.Lenc_out:
	RET
SYM_FUNC_END(aegis128_aesni_enc)

/*
 * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src,
 *				u8 *dst, unsigned int len);
 *
 * Encrypts one final partial block.  load_partial and store_partial are
 * specified for 1 <= len <= 15 only.
 */
SYM_FUNC_START(aegis128_aesni_enc_tail)
	.set STATEP, %rdi
	.set SRC, %rsi
	.set DST, %rdx
	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	mov LEN, %r9d			/* load_partial clobbers %ecx */
	load_partial			/* MSG = plaintext, zero-padded */

	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	mov %r9d, LEN			/* restore the length for the store */
	store_partial T0		/* writes only the first LEN bytes */

	/*
	 * MSG still holds the zero-padded plaintext, which is what the state
	 * absorbs.  T0 has been written out, so the update may clobber it.
	 */
	aegis128_update
	pxor MSG, STATE4

	/* store the state: */
	/* one update was done, so the registers are rotated by one */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	RET
SYM_FUNC_END(aegis128_aesni_enc_tail)

/*
 * decrypt_block s0 s1 s2 s3 s4 i
 *
 * Decrypt the 16-byte block number \i (counted from the current SRC/DST).
 * \s0-\s4 name the registers holding state words 0-4 on entry; \s0 is not
 * referenced by the body.
 *
 *   plaintext = ciphertext xor \s1 xor \s4 xor (\s2 and \s3)
 *
 * The decryption is done in place in MSG, so after the store MSG holds the
 * plaintext, which is then absorbed into the register holding the new word 0
 * (\s4 after aegis128_update).  LEN is decremented by 16 and control leaves
 * through .Ldec_out_<i> when it reaches zero.  Clobbers MSG, T0 and T1.
 */
.macro decrypt_block s0 s1 s2 s3 s4 i
	movdqu (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdqu MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	jz .Ldec_out_\i
.endm

/*
 * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst,
 *			   unsigned int len);
 *
 * len must be nonzero and a multiple of 16.
 */
SYM_FUNC_START(aegis128_aesni_dec)
	.set STATEP, %rdi
	.set SRC, %rsi
	.set DST, %rdx
	.set LEN, %ecx

	/*
	 * NOTE(review): as in aegis128_aesni_enc, a zero len is not guarded
	 * against; the loop only terminates when LEN hits exactly zero.
	 */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/*
	 * Five blocks per trip, with the register names rotated by one for
	 * each block; decrypt_block branches to .Ldec_out_<i> when LEN
	 * reaches zero.
	 */
.align 8
.Ldec_loop:
	decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_loop

	/* store the state: */
	/* .Ldec_out_<i>: i + 1 blocks were processed since the last full trip */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	RET

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	RET

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	RET

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	RET

.Ldec_out_4:
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	/* NOTE(review): no branch in the visible source targets .Ldec_out. */
.Ldec_out:
	RET
SYM_FUNC_END(aegis128_aesni_dec)

/*
 * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src,
 *				u8 *dst, unsigned int len);
 *
 * Decrypts one final partial block.  load_partial and store_partial are
 * specified for 1 <= len <= 15 only.
 */
SYM_FUNC_START(aegis128_aesni_dec_tail)
	.set STATEP, %rdi
	.set SRC, %rsi
	.set DST, %rdx
	.set LEN, %ecx	/* {load,store}_partial rely on this being %ecx */

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	mov LEN, %r9d			/* load_partial clobbers %ecx */
	load_partial			/* MSG = ciphertext, zero-padded */

	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	mov %r9d, LEN			/* restore the length for the store */
	store_partial MSG		/* only reads MSG */

	/* mask with byte count: */
	/*
	 * Bytes LEN..15 of MSG are raw keystream (zero padding xor keystream).
	 * Clear them so that the state absorbs the zero-padded plaintext.  The
	 * 16-byte load from .Lzeropad_mask + 16 - LEN yields LEN bytes of 0xff
	 * followed by zeroes.  %r9 is LEN zero-extended, because it was
	 * written through %r9d above.
	 */
	lea .Lzeropad_mask+16(%rip), %rax
	sub %r9, %rax
	movdqu (%rax), T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state: */
	/* one update was done, so the registers are rotated by one */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	RET
SYM_FUNC_END(aegis128_aesni_dec_tail)

/*
 * void aegis128_aesni_final(struct aegis_state *state,
 *			     struct aegis_block *tag_xor,
 *			     unsigned int assoclen, unsigned int cryptlen);
 *
 * Runs seven more state updates keyed by the two lengths and XORs the
 * resulting tag into *tag_xor in place.  The state in memory is only read;
 * nothing is written back to *state.
 */
SYM_FUNC_START(aegis128_aesni_final)
	.set STATEP, %rdi
	.set TAG_XOR, %rsi
	.set ASSOCLEN, %edx
	.set CRYPTLEN, %ecx

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* prepare length block: */
	/*
	 * movd clears the rest of MSG; pinsrd $2 places CRYPTLEN in bits
	 * 64-95.  Each 64-bit half therefore holds one zero-extended length,
	 * and psllq scales both halves at once.
	 */
	movd ASSOCLEN, MSG
	pinsrd $2, CRYPTLEN, MSG
	psllq $3, MSG	/* multiply by 8 (to get bit count) */

	pxor STATE3, MSG

	/* update state: */
	/* the same MSG is absorbed by each of the seven updates */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/* xor tag: */
	/*
	 * All five state words are folded in, so the register rotation left
	 * behind by the updates does not matter here.
	 */
	movdqu (TAG_XOR), MSG

	pxor STATE0, MSG
	pxor STATE1, MSG
	pxor STATE2, MSG
	pxor STATE3, MSG
	pxor STATE4, MSG

	movdqu MSG, (TAG_XOR)
	RET
SYM_FUNC_END(aegis128_aesni_final)