diff options
author | Linus Torvalds | 2015-04-15 10:42:15 -0700 |
---|---|---|
committer | Linus Torvalds | 2015-04-15 10:42:15 -0700 |
commit | cb906953d2c3fd450655d9fa833f03690ad50c23 (patch) | |
tree | 06c5665afb24baee3ac49f62db61ca97918079b4 /arch | |
parent | 6c373ca89399c5a3f7ef210ad8f63dc3437da345 (diff) | |
parent | 3abafaf2192b1712079edfd4232b19877d6f41a5 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
"Here is the crypto update for 4.1:
New interfaces:
- user-space interface for AEAD
- user-space interface for RNG (i.e., pseudo RNG)
New hashes:
- ARMv8 SHA1/256
- ARMv8 AES
- ARMv8 GHASH
- ARM assembler and NEON SHA256
- MIPS OCTEON SHA1/256/512
- MIPS img-hash SHA1/256 and MD5
- Power 8 VMX AES/CBC/CTR/GHASH
- PPC assembler AES, SHA1/256 and MD5
- Broadcom IPROC RNG driver
Cleanups/fixes:
- prevent internal helper algos from being exposed to user-space
- merge common code from assembly/C SHA implementations
- misc fixes"
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (169 commits)
crypto: arm - workaround for building with old binutils
crypto: arm/sha256 - avoid sha256 code on ARMv7-M
crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to base layer
crypto: x86/sha256_ssse3 - move SHA-224/256 SSSE3 implementation to base layer
crypto: x86/sha1_ssse3 - move SHA-1 SSSE3 implementation to base layer
crypto: arm64/sha2-ce - move SHA-224/256 ARMv8 implementation to base layer
crypto: arm64/sha1-ce - move SHA-1 ARMv8 implementation to base layer
crypto: arm/sha2-ce - move SHA-224/256 ARMv8 implementation to base layer
crypto: arm/sha256 - move SHA-224/256 ASM/NEON implementation to base layer
crypto: arm/sha1-ce - move SHA-1 ARMv8 implementation to base layer
crypto: arm/sha1_neon - move SHA-1 NEON implementation to base layer
crypto: arm/sha1 - move SHA-1 ARM asm implementation to base layer
crypto: sha512-generic - move to generic glue implementation
crypto: sha256-generic - move to generic glue implementation
crypto: sha1-generic - move to generic glue implementation
crypto: sha512 - implement base layer for SHA-512
crypto: sha256 - implement base layer for SHA-256
crypto: sha1 - implement base layer for SHA-1
crypto: api - remove instance when test failed
crypto: api - Move alg ref count init to crypto_check_alg
...
Diffstat (limited to 'arch')
68 files changed, 10913 insertions, 1129 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index da1266c53c13..7cbf4ef5c6fd 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2175,6 +2175,9 @@ source "arch/arm/Kconfig.debug" source "security/Kconfig" source "crypto/Kconfig" +if CRYPTO +source "arch/arm/crypto/Kconfig" +endif source "lib/Kconfig" diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig new file mode 100644 index 000000000000..8da2207b0072 --- /dev/null +++ b/arch/arm/crypto/Kconfig @@ -0,0 +1,130 @@ + +menuconfig ARM_CRYPTO + bool "ARM Accelerated Cryptographic Algorithms" + depends on ARM + help + Say Y here to choose from a selection of cryptographic algorithms + implemented using ARM specific CPU features or instructions. + +if ARM_CRYPTO + +config CRYPTO_SHA1_ARM + tristate "SHA1 digest algorithm (ARM-asm)" + select CRYPTO_SHA1 + select CRYPTO_HASH + help + SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented + using optimized ARM assembler. + +config CRYPTO_SHA1_ARM_NEON + tristate "SHA1 digest algorithm (ARM NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SHA1_ARM + select CRYPTO_SHA1 + select CRYPTO_HASH + help + SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented + using optimized ARM NEON assembly, when NEON instructions are + available. + +config CRYPTO_SHA1_ARM_CE + tristate "SHA1 digest algorithm (ARM v8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_SHA1_ARM + select CRYPTO_HASH + help + SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented + using special ARMv8 Crypto Extensions. + +config CRYPTO_SHA2_ARM_CE + tristate "SHA-224/256 digest algorithm (ARM v8 Crypto Extensions)" + depends on KERNEL_MODE_NEON + select CRYPTO_SHA256_ARM + select CRYPTO_HASH + help + SHA-256 secure hash standard (DFIPS 180-2) implemented + using special ARMv8 Crypto Extensions. + +config CRYPTO_SHA256_ARM + tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)" + select CRYPTO_HASH + depends on !CPU_V7M + help + SHA-256 secure hash standard (DFIPS 180-2) implemented + using optimized ARM assembler and NEON, when available. + +config CRYPTO_SHA512_ARM_NEON + tristate "SHA384 and SHA512 digest algorithm (ARM NEON)" + depends on KERNEL_MODE_NEON + select CRYPTO_SHA512 + select CRYPTO_HASH + help + SHA-512 secure hash standard (DFIPS 180-2) implemented + using ARM NEON instructions, when available. + + This version of SHA implements a 512 bit hash with 256 bits of + security against collision attacks. + + This code also includes SHA-384, a 384 bit hash with 192 bits + of security against collision attacks. + +config CRYPTO_AES_ARM + tristate "AES cipher algorithms (ARM-asm)" + depends on ARM + select CRYPTO_ALGAPI + select CRYPTO_AES + help + Use optimized AES assembler routines for ARM platforms. + + AES cipher algorithms (FIPS-197). AES uses the Rijndael + algorithm. + + Rijndael appears to be consistently a very good performer in + both hardware and software across a wide range of computing + environments regardless of its use in feedback or non-feedback + modes. Its key setup time is excellent, and its key agility is + good. Rijndael's very low memory requirements make it very well + suited for restricted-space environments, in which it also + demonstrates excellent performance. Rijndael's operations are + among the easiest to defend against power and timing attacks. + + The AES specifies three key sizes: 128, 192 and 256 bits + + See <http://csrc.nist.gov/encryption/aes/> for more information. + +config CRYPTO_AES_ARM_BS + tristate "Bit sliced AES using NEON instructions" + depends on KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_AES_ARM + select CRYPTO_ABLK_HELPER + help + Use a faster and more secure NEON based implementation of AES in CBC, + CTR and XTS modes + + Bit sliced AES gives around 45% speedup on Cortex-A15 for CTR mode + and for XTS mode encryption, CBC and XTS mode decryption speedup is + around 25%. (CBC encryption speed is not affected by this driver.) + This implementation does not rely on any lookup tables so it is + believed to be invulnerable to cache timing attacks. + +config CRYPTO_AES_ARM_CE + tristate "Accelerated AES using ARMv8 Crypto Extensions" + depends on KERNEL_MODE_NEON + select CRYPTO_ALGAPI + select CRYPTO_ABLK_HELPER + help + Use an implementation of AES in CBC, CTR and XTS modes that uses + ARMv8 Crypto Extensions + +config CRYPTO_GHASH_ARM_CE + tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions" + depends on KERNEL_MODE_NEON + select CRYPTO_HASH + select CRYPTO_CRYPTD + help + Use an implementation of GHASH (used by the GCM AEAD chaining mode) + that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64) + that is part of the ARMv8 Crypto Extensions + +endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile index b48fa341648d..6ea828241fcb 100644 --- a/arch/arm/crypto/Makefile +++ b/arch/arm/crypto/Makefile @@ -6,13 +6,35 @@ obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o +ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o +ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o +ce-obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o +ce-obj-$(CONFIG_CRYPTO_GHASH_ARM_CE) += ghash-arm-ce.o + +ifneq ($(ce-obj-y)$(ce-obj-m),) +ifeq ($(call as-instr,.fpu crypto-neon-fp-armv8,y,n),y) +obj-y += $(ce-obj-y) +obj-m += $(ce-obj-m) +else +$(warning These ARMv8 Crypto Extensions modules need binutils 2.23 or higher) +$(warning $(ce-obj-y) $(ce-obj-m)) +endif +endif + aes-arm-y := aes-armv4.o aes_glue.o aes-arm-bs-y := aesbs-core.o aesbs-glue.o sha1-arm-y := sha1-armv4-large.o sha1_glue.o sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y) sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o +sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o +sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o +aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o +ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o quiet_cmd_perl = PERL $@ cmd_perl = $(PERL) $(<) > $(@) @@ -20,4 +42,7 @@ quiet_cmd_perl = PERL $@ $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl $(call cmd,perl) -.PRECIOUS: $(obj)/aesbs-core.S +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl + $(call cmd,perl) + +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S new file mode 100644 index 000000000000..8cfa468ee570 --- /dev/null +++ b/arch/arm/crypto/aes-ce-core.S @@ -0,0 +1,518 @@ +/* + * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .fpu crypto-neon-fp-armv8 + .align 3 + + .macro enc_round, state, key + aese.8 \state, \key + aesmc.8 \state, \state + .endm + + .macro dec_round, state, key + aesd.8 \state, \key + aesimc.8 \state, \state + .endm + + .macro enc_dround, key1, key2 + enc_round q0, \key1 + enc_round q0, \key2 + .endm + + .macro dec_dround, key1, key2 + dec_round q0, \key1 + dec_round q0, \key2 + .endm + + .macro enc_fround, key1, key2, key3 + enc_round q0, \key1 + aese.8 q0, \key2 + veor q0, q0, \key3 + .endm + + .macro dec_fround, key1, key2, key3 + dec_round q0, \key1 + aesd.8 q0, \key2 + veor q0, q0, \key3 + .endm + + .macro enc_dround_3x, key1, key2 + enc_round q0, \key1 + enc_round q1, \key1 + enc_round q2, \key1 + enc_round q0, \key2 + enc_round q1, \key2 + enc_round q2, \key2 + .endm + + .macro dec_dround_3x, key1, key2 + dec_round q0, \key1 + dec_round q1, \key1 + dec_round q2, \key1 + dec_round q0, \key2 + dec_round q1, \key2 + dec_round q2, \key2 + .endm + + .macro enc_fround_3x, key1, key2, key3 + enc_round q0, \key1 + enc_round q1, \key1 + enc_round q2, \key1 + aese.8 q0, \key2 + aese.8 q1, \key2 + aese.8 q2, \key2 + veor q0, q0, \key3 + veor q1, q1, \key3 + veor q2, q2, \key3 + .endm + + .macro dec_fround_3x, key1, key2, key3 + dec_round q0, \key1 + dec_round q1, \key1 + dec_round q2, \key1 + aesd.8 q0, \key2 + aesd.8 q1, \key2 + aesd.8 q2, \key2 + veor q0, q0, \key3 + veor q1, q1, \key3 + veor q2, q2, \key3 + .endm + + .macro do_block, dround, fround + cmp r3, #12 @ which key size? + vld1.8 {q10-q11}, [ip]! + \dround q8, q9 + vld1.8 {q12-q13}, [ip]! + \dround q10, q11 + vld1.8 {q10-q11}, [ip]! + \dround q12, q13 + vld1.8 {q12-q13}, [ip]! + \dround q10, q11 + blo 0f @ AES-128: 10 rounds + vld1.8 {q10-q11}, [ip]! + beq 1f @ AES-192: 12 rounds + \dround q12, q13 + vld1.8 {q12-q13}, [ip] + \dround q10, q11 +0: \fround q12, q13, q14 + bx lr + +1: \dround q12, q13 + \fround q10, q11, q14 + bx lr + .endm + + /* + * Internal, non-AAPCS compliant functions that implement the core AES + * transforms. These should preserve all registers except q0 - q2 and ip + * Arguments: + * q0 : first in/output block + * q1 : second in/output block (_3x version only) + * q2 : third in/output block (_3x version only) + * q8 : first round key + * q9 : secound round key + * ip : address of 3rd round key + * q14 : final round key + * r3 : number of rounds + */ + .align 6 +aes_encrypt: + add ip, r2, #32 @ 3rd round key +.Laes_encrypt_tweak: + do_block enc_dround, enc_fround +ENDPROC(aes_encrypt) + + .align 6 +aes_decrypt: + add ip, r2, #32 @ 3rd round key + do_block dec_dround, dec_fround +ENDPROC(aes_decrypt) + + .align 6 +aes_encrypt_3x: + add ip, r2, #32 @ 3rd round key + do_block enc_dround_3x, enc_fround_3x +ENDPROC(aes_encrypt_3x) + + .align 6 +aes_decrypt_3x: + add ip, r2, #32 @ 3rd round key + do_block dec_dround_3x, dec_fround_3x +ENDPROC(aes_decrypt_3x) + + .macro prepare_key, rk, rounds + add ip, \rk, \rounds, lsl #4 + vld1.8 {q8-q9}, [\rk] @ load first 2 round keys + vld1.8 {q14}, [ip] @ load last round key + .endm + + /* + * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks) + */ +ENTRY(ce_aes_ecb_encrypt) + push {r4, lr} + ldr r4, [sp, #8] + prepare_key r2, r3 +.Lecbencloop3x: + subs r4, r4, #3 + bmi .Lecbenc1x + vld1.8 {q0-q1}, [r1, :64]! + vld1.8 {q2}, [r1, :64]! + bl aes_encrypt_3x + vst1.8 {q0-q1}, [r0, :64]! + vst1.8 {q2}, [r0, :64]! + b .Lecbencloop3x +.Lecbenc1x: + adds r4, r4, #3 + beq .Lecbencout +.Lecbencloop: + vld1.8 {q0}, [r1, :64]! + bl aes_encrypt + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + bne .Lecbencloop +.Lecbencout: + pop {r4, pc} +ENDPROC(ce_aes_ecb_encrypt) + +ENTRY(ce_aes_ecb_decrypt) + push {r4, lr} + ldr r4, [sp, #8] + prepare_key r2, r3 +.Lecbdecloop3x: + subs r4, r4, #3 + bmi .Lecbdec1x + vld1.8 {q0-q1}, [r1, :64]! + vld1.8 {q2}, [r1, :64]! + bl aes_decrypt_3x + vst1.8 {q0-q1}, [r0, :64]! + vst1.8 {q2}, [r0, :64]! + b .Lecbdecloop3x +.Lecbdec1x: + adds r4, r4, #3 + beq .Lecbdecout +.Lecbdecloop: + vld1.8 {q0}, [r1, :64]! + bl aes_decrypt + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + bne .Lecbdecloop +.Lecbdecout: + pop {r4, pc} +ENDPROC(ce_aes_ecb_decrypt) + + /* + * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 iv[]) + */ +ENTRY(ce_aes_cbc_encrypt) + push {r4-r6, lr} + ldrd r4, r5, [sp, #16] + vld1.8 {q0}, [r5] + prepare_key r2, r3 +.Lcbcencloop: + vld1.8 {q1}, [r1, :64]! @ get next pt block + veor q0, q0, q1 @ ..and xor with iv + bl aes_encrypt + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + bne .Lcbcencloop + vst1.8 {q0}, [r5] + pop {r4-r6, pc} +ENDPROC(ce_aes_cbc_encrypt) + +ENTRY(ce_aes_cbc_decrypt) + push {r4-r6, lr} + ldrd r4, r5, [sp, #16] + vld1.8 {q6}, [r5] @ keep iv in q6 + prepare_key r2, r3 +.Lcbcdecloop3x: + subs r4, r4, #3 + bmi .Lcbcdec1x + vld1.8 {q0-q1}, [r1, :64]! + vld1.8 {q2}, [r1, :64]! + vmov q3, q0 + vmov q4, q1 + vmov q5, q2 + bl aes_decrypt_3x + veor q0, q0, q6 + veor q1, q1, q3 + veor q2, q2, q4 + vmov q6, q5 + vst1.8 {q0-q1}, [r0, :64]! + vst1.8 {q2}, [r0, :64]! + b .Lcbcdecloop3x +.Lcbcdec1x: + adds r4, r4, #3 + beq .Lcbcdecout + vmov q15, q14 @ preserve last round key +.Lcbcdecloop: + vld1.8 {q0}, [r1, :64]! @ get next ct block + veor q14, q15, q6 @ combine prev ct with last key + vmov q6, q0 + bl aes_decrypt + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + bne .Lcbcdecloop +.Lcbcdecout: + vst1.8 {q6}, [r5] @ keep iv in q6 + pop {r4-r6, pc} +ENDPROC(ce_aes_cbc_decrypt) + + /* + * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds, + * int blocks, u8 ctr[]) + */ +ENTRY(ce_aes_ctr_encrypt) + push {r4-r6, lr} + ldrd r4, r5, [sp, #16] + vld1.8 {q6}, [r5] @ load ctr + prepare_key r2, r3 + vmov r6, s27 @ keep swabbed ctr in r6 + rev r6, r6 + cmn r6, r4 @ 32 bit overflow? + bcs .Lctrloop +.Lctrloop3x: + subs r4, r4, #3 + bmi .Lctr1x + add r6, r6, #1 + vmov q0, q6 + vmov q1, q6 + rev ip, r6 + add r6, r6, #1 + vmov q2, q6 + vmov s7, ip + rev ip, r6 + add r6, r6, #1 + vmov s11, ip + vld1.8 {q3-q4}, [r1, :64]! + vld1.8 {q5}, [r1, :64]! + bl aes_encrypt_3x + veor q0, q0, q3 + veor q1, q1, q4 + veor q2, q2, q5 + rev ip, r6 + vst1.8 {q0-q1}, [r0, :64]! + vst1.8 {q2}, [r0, :64]! + vmov s27, ip + b .Lctrloop3x +.Lctr1x: + adds r4, r4, #3 + beq .Lctrout +.Lctrloop: + vmov q0, q6 + bl aes_encrypt + subs r4, r4, #1 + bmi .Lctrhalfblock @ blocks < 0 means 1/2 block + vld1.8 {q3}, [r1, :64]! + veor q3, q0, q3 + vst1.8 {q3}, [r0, :64]! + + adds r6, r6, #1 @ increment BE ctr + rev ip, r6 + vmov s27, ip + bcs .Lctrcarry + teq r4, #0 + bne .Lctrloop +.Lctrout: + vst1.8 {q6}, [r5] + pop {r4-r6, pc} + +.Lctrhalfblock: + vld1.8 {d1}, [r1, :64] + veor d0, d0, d1 + vst1.8 {d0}, [r0, :64] + pop {r4-r6, pc} + +.Lctrcarry: + .irp sreg, s26, s25, s24 + vmov ip, \sreg @ load next word of ctr + rev ip, ip @ ... to handle the carry + adds ip, ip, #1 + rev ip, ip + vmov \sreg, ip + bcc 0f + .endr +0: teq r4, #0 + beq .Lctrout + b .Lctrloop +ENDPROC(ce_aes_ctr_encrypt) + + /* + * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int blocks, u8 iv[], u8 const rk2[], int first) + * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds, + * int blocks, u8 iv[], u8 const rk2[], int first) + */ + + .macro next_tweak, out, in, const, tmp + vshr.s64 \tmp, \in, #63 + vand \tmp, \tmp, \const + vadd.u64 \out, \in, \in + vext.8 \tmp, \tmp, \tmp, #8 + veor \out, \out, \tmp + .endm + + .align 3 +.Lxts_mul_x: + .quad 1, 0x87 + +ce_aes_xts_init: + vldr d14, .Lxts_mul_x + vldr d15, .Lxts_mul_x + 8 + + ldrd r4, r5, [sp, #16] @ load args + ldr r6, [sp, #28] + vld1.8 {q0}, [r5] @ load iv + teq r6, #1 @ start of a block? + bxne lr + + @ Encrypt the IV in q0 with the second AES key. This should only + @ be done at the start of a block. + ldr r6, [sp, #24] @ load AES key 2 + prepare_key r6, r3 + add ip, r6, #32 @ 3rd round key of key 2 + b .Laes_encrypt_tweak @ tail call +ENDPROC(ce_aes_xts_init) + +ENTRY(ce_aes_xts_encrypt) + push {r4-r6, lr} + + bl ce_aes_xts_init @ run shared prologue + prepare_key r2, r3 + vmov q3, q0 + + teq r6, #0 @ start of a block? + bne .Lxtsenc3x + +.Lxtsencloop3x: + next_tweak q3, q3, q7, q6 +.Lxtsenc3x: + subs r4, r4, #3 + bmi .Lxtsenc1x + vld1.8 {q0-q1}, [r1, :64]! @ get 3 pt blocks + vld1.8 {q2}, [r1, :64]! + next_tweak q4, q3, q7, q6 + veor q0, q0, q3 + next_tweak q5, q4, q7, q6 + veor q1, q1, q4 + veor q2, q2, q5 + bl aes_encrypt_3x + veor q0, q0, q3 + veor q1, q1, q4 + veor q2, q2, q5 + vst1.8 {q0-q1}, [r0, :64]! @ write 3 ct blocks + vst1.8 {q2}, [r0, :64]! + vmov q3, q5 + teq r4, #0 + beq .Lxtsencout + b .Lxtsencloop3x +.Lxtsenc1x: + adds r4, r4, #3 + beq .Lxtsencout +.Lxtsencloop: + vld1.8 {q0}, [r1, :64]! + veor q0, q0, q3 + bl aes_encrypt + veor q0, q0, q3 + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + beq .Lxtsencout + next_tweak q3, q3, q7, q6 + b .Lxtsencloop +.Lxtsencout: + vst1.8 {q3}, [r5] + pop {r4-r6, pc} +ENDPROC(ce_aes_xts_encrypt) + + +ENTRY(ce_aes_xts_decrypt) + push {r4-r6, lr} + + bl ce_aes_xts_init @ run shared prologue + prepare_key r2, r3 + vmov q3, q0 + + teq r6, #0 @ start of a block? + bne .Lxtsdec3x + +.Lxtsdecloop3x: + next_tweak q3, q3, q7, q6 +.Lxtsdec3x: + subs r4, r4, #3 + bmi .Lxtsdec1x + vld1.8 {q0-q1}, [r1, :64]! @ get 3 ct blocks + vld1.8 {q2}, [r1, :64]! + next_tweak q4, q3, q7, q6 + veor q0, q0, q3 + next_tweak q5, q4, q7, q6 + veor q1, q1, q4 + veor q2, q2, q5 + bl aes_decrypt_3x + veor q0, q0, q3 + veor q1, q1, q4 + veor q2, q2, q5 + vst1.8 {q0-q1}, [r0, :64]! @ write 3 pt blocks + vst1.8 {q2}, [r0, :64]! + vmov q3, q5 + teq r4, #0 + beq .Lxtsdecout + b .Lxtsdecloop3x +.Lxtsdec1x: + adds r4, r4, #3 + beq .Lxtsdecout +.Lxtsdecloop: + vld1.8 {q0}, [r1, :64]! + veor q0, q0, q3 + add ip, r2, #32 @ 3rd round key + bl aes_decrypt + veor q0, q0, q3 + vst1.8 {q0}, [r0, :64]! + subs r4, r4, #1 + beq .Lxtsdecout + next_tweak q3, q3, q7, q6 + b .Lxtsdecloop +.Lxtsdecout: + vst1.8 {q3}, [r5] + pop {r4-r6, pc} +ENDPROC(ce_aes_xts_decrypt) + + /* + * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the + * AES sbox substitution on each byte in + * 'input' + */ +ENTRY(ce_aes_sub) + vdup.32 q1, r0 + veor q0, q0, q0 + aese.8 q0, q1 + vmov r0, s0 + bx lr +ENDPROC(ce_aes_sub) + + /* + * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns + * operation on round key *src + */ +ENTRY(ce_aes_invert) + vld1.8 {q0}, [r1] + aesimc.8 q0, q0 + vst1.8 {q0}, [r0] + bx lr +ENDPROC(ce_aes_invert) diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c new file mode 100644 index 000000000000..b445a5d56f43 --- /dev/null +++ b/arch/arm/crypto/aes-ce-glue.c @@ -0,0 +1,524 @@ +/* + * aes-ce-glue.c - wrapper code for ARMv8 AES + * + * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/hwcap.h> +#include <crypto/aes.h> +#include <crypto/ablk_helper.h> +#include <crypto/algapi.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +/* defined in aes-ce-core.S */ +asmlinkage u32 ce_aes_sub(u32 input); +asmlinkage void ce_aes_invert(void *dst, void *src); + +asmlinkage void ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); +asmlinkage void ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks); + +asmlinkage void ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); +asmlinkage void ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 iv[]); + +asmlinkage void ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], + int rounds, int blocks, u8 ctr[]); + +asmlinkage void ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], + int rounds, int blocks, u8 iv[], + u8 const rk2[], int first); +asmlinkage void ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], + int rounds, int blocks, u8 iv[], + u8 const rk2[], int first); + +struct aes_block { + u8 b[AES_BLOCK_SIZE]; +}; + +static int num_rounds(struct crypto_aes_ctx *ctx) +{ + /* + * # of rounds specified by AES: + * 128 bit key 10 rounds + * 192 bit key 12 rounds + * 256 bit key 14 rounds + * => n byte key => 6 + (n/4) rounds + */ + return 6 + ctx->key_length / 4; +} + +static int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key, + unsigned int key_len) +{ + /* + * The AES key schedule round constants + */ + static u8 const rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, + }; + + u32 kwords = key_len / sizeof(u32); + struct aes_block *key_enc, *key_dec; + int i, j; + + if (key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) + return -EINVAL; + + memcpy(ctx->key_enc, in_key, key_len); + ctx->key_length = key_len; + + kernel_neon_begin(); + for (i = 0; i < sizeof(rcon); i++) { + u32 *rki = ctx->key_enc + (i * kwords); + u32 *rko = rki + kwords; + + rko[0] = ror32(ce_aes_sub(rki[kwords - 1]), 8); + rko[0] = rko[0] ^ rki[0] ^ rcon[i]; + rko[1] = rko[0] ^ rki[1]; + rko[2] = rko[1] ^ rki[2]; + rko[3] = rko[2] ^ rki[3]; + + if (key_len == AES_KEYSIZE_192) { + if (i >= 7) + break; + rko[4] = rko[3] ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + } else if (key_len == AES_KEYSIZE_256) { + if (i >= 6) + break; + rko[4] = ce_aes_sub(rko[3]) ^ rki[4]; + rko[5] = rko[4] ^ rki[5]; + rko[6] = rko[5] ^ rki[6]; + rko[7] = rko[6] ^ rki[7]; + } + } + + /* + * Generate the decryption keys for the Equivalent Inverse Cipher. + * This involves reversing the order of the round keys, and applying + * the Inverse Mix Columns transformation on all but the first and + * the last one. + */ + key_enc = (struct aes_block *)ctx->key_enc; + key_dec = (struct aes_block *)ctx->key_dec; + j = num_rounds(ctx); + + key_dec[0] = key_enc[j]; + for (i = 1, j--; j > 0; i++, j--) + ce_aes_invert(key_dec + i, key_enc + j); + key_dec[i] = key_enc[0]; + + kernel_neon_end(); + return 0; +} + +static int ce_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); + int ret; + + ret = ce_aes_expandkey(ctx, in_key, key_len); + if (!ret) + return 0; + + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; +} + +struct crypto_aes_xts_ctx { + struct crypto_aes_ctx key1; + struct crypto_aes_ctx __aligned(8) key2; +}; + +static int xts_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct crypto_aes_xts_ctx *ctx = crypto_tfm_ctx(tfm); + int ret; + + ret = ce_aes_expandkey(&ctx->key1, in_key, key_len / 2); + if (!ret) + ret = ce_aes_expandkey(&ctx->key2, &in_key[key_len / 2], + key_len / 2); + if (!ret) + return 0; + + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int blocks; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + ce_aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, num_rounds(ctx), blocks); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + return err; +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int blocks; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + ce_aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_dec, num_rounds(ctx), blocks); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + return err; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int blocks; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + ce_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, num_rounds(ctx), blocks, + walk.iv); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + return err; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int blocks; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + ce_aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_dec, num_rounds(ctx), blocks, + walk.iv); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + return err; +} + +static int ctr_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + int err, blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + + kernel_neon_begin(); + while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) { + ce_aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key_enc, num_rounds(ctx), blocks, + walk.iv); + nbytes -= blocks * AES_BLOCK_SIZE; + if (nbytes && nbytes == walk.nbytes % AES_BLOCK_SIZE) + break; + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + if (nbytes) { + u8 *tdst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE; + u8 *tsrc = walk.src.virt.addr + blocks * AES_BLOCK_SIZE; + u8 __aligned(8) tail[AES_BLOCK_SIZE]; + + /* + * Minimum alignment is 8 bytes, so if nbytes is <= 8, we need + * to tell aes_ctr_encrypt() to only read half a block. + */ + blocks = (nbytes <= 8) ? -1 : 1; + + ce_aes_ctr_encrypt(tail, tsrc, (u8 *)ctx->key_enc, + num_rounds(ctx), blocks, walk.iv); + memcpy(tdst, tail, nbytes); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_neon_end(); + + return err; +} + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = num_rounds(&ctx->key1); + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + ce_aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key1.key_enc, rounds, blocks, + walk.iv, (u8 *)ctx->key2.key_enc, first); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct crypto_aes_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + int err, first, rounds = num_rounds(&ctx->key1); + struct blkcipher_walk walk; + unsigned int blocks; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + kernel_neon_begin(); + for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) { + ce_aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr, + (u8 *)ctx->key1.key_dec, rounds, blocks, + walk.iv, (u8 *)ctx->key2.key_enc, first); + err = blkcipher_walk_done(desc, &walk, + walk.nbytes % AES_BLOCK_SIZE); + } + kernel_neon_end(); + + return err; +} + +static struct crypto_alg aes_algs[] = { { + .cra_name = "__ecb-aes-ce", + .cra_driver_name = "__driver-ecb-aes-ce", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ce_aes_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, +}, { + .cra_name = "__cbc-aes-ce", + .cra_driver_name = "__driver-cbc-aes-ce", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ce_aes_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, +}, { + .cra_name = "__ctr-aes-ce", + .cra_driver_name = "__driver-ctr-aes-ce", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ce_aes_setkey, + .encrypt = ctr_encrypt, + .decrypt = ctr_encrypt, + }, +}, { + .cra_name = "__xts-aes-ce", + .cra_driver_name = "__driver-xts-aes-ce", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_set_key, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +}, { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 7, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_ablkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + } +} }; + +static int __init aes_init(void) +{ + if (!(elf_hwcap2 & HWCAP2_AES)) + return -ENODEV; + return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static void __exit aes_exit(void) +{ + crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +module_init(aes_init); +module_exit(aes_exit); diff --git a/arch/arm/crypto/aesbs-glue.c b/arch/arm/crypto/aesbs-glue.c index 15468fbbdea3..6d685298690e 100644 --- a/arch/arm/crypto/aesbs-glue.c +++ b/arch/arm/crypto/aesbs-glue.c @@ -301,7 +301,8 @@ static struct crypto_alg aesbs_algs[] = { { .cra_name = "__cbc-aes-neonbs", .cra_driver_name = "__driver-cbc-aes-neonbs", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesbs_cbc_ctx), .cra_alignmask = 7, @@ -319,7 +320,8 @@ static struct crypto_alg aesbs_algs[] = { { .cra_name = "__ctr-aes-neonbs", .cra_driver_name = "__driver-ctr-aes-neonbs", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesbs_ctr_ctx), .cra_alignmask = 7, @@ -337,7 +339,8 @@ static struct crypto_alg aesbs_algs[] = { { .cra_name = "__xts-aes-neonbs", .cra_driver_name = "__driver-xts-aes-neonbs", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesbs_xts_ctx), .cra_alignmask = 7, diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S new file mode 100644 index 000000000000..f6ab8bcc9efe --- /dev/null +++ b/arch/arm/crypto/ghash-ce-core.S @@ -0,0 +1,94 @@ +/* + * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. + * + * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + SHASH .req q0 + SHASH2 .req q1 + T1 .req q2 + T2 .req q3 + MASK .req q4 + XL .req q5 + XM .req q6 + XH .req q7 + IN1 .req q7 + + SHASH_L .req d0 + SHASH_H .req d1 + SHASH2_L .req d2 + T1_L .req d4 + MASK_L .req d8 + XL_L .req d10 + XL_H .req d11 + XM_L .req d12 + XM_H .req d13 + XH_L .req d14 + + .text + .fpu crypto-neon-fp-armv8 + + /* + * void pmull_ghash_update(int blocks, u64 dg[], const char *src, + * struct ghash_key const *k, const char *head) + */ +ENTRY(pmull_ghash_update) + vld1.64 {SHASH}, [r3] + vld1.64 {XL}, [r1] + vmov.i8 MASK, #0xe1 + vext.8 SHASH2, SHASH, SHASH, #8 + vshl.u64 MASK, MASK, #57 + veor SHASH2, SHASH2, SHASH + + /* do the head block first, if supplied */ + ldr ip, [sp] + teq ip, #0 + beq 0f + vld1.64 {T1}, [ip] + teq r0, #0 + b 1f + +0: vld1.64 {T1}, [r2]! + subs r0, r0, #1 + +1: /* multiply XL by SHASH in GF(2^128) */ +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev64.8 T1, T1 +#endif + vext.8 T2, XL, XL, #8 + vext.8 IN1, T1, T1, #8 + veor T1, T1, T2 + veor XL, XL, IN1 + + vmull.p64 XH, SHASH_H, XL_H @ a1 * b1 + veor T1, T1, XL + vmull.p64 XL, SHASH_L, XL_L @ a0 * b0 + vmull.p64 XM, SHASH2_L, T1_L @ (a1 + a0)(b1 + b0) + + vext.8 T1, XL, XH, #8 + veor T2, XL, XH + veor XM, XM, T1 + veor XM, XM, T2 + vmull.p64 T2, XL_L, MASK_L + + vmov XH_L, XM_H + vmov XM_H, XL_L + + veor XL, XM, T2 + vext.8 T2, XL, XL, #8 + vmull.p64 XL, XL_L, MASK_L + veor T2, T2, XH + veor XL, XL, T2 + + bne 0b + + vst1.64 {XL}, [r1] + bx lr +ENDPROC(pmull_ghash_update) diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c new file mode 100644 index 000000000000..03a39fe29246 --- /dev/null +++ b/arch/arm/crypto/ghash-ce-glue.c @@ -0,0 +1,320 @@ +/* + * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions. + * + * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> +#include <asm/unaligned.h> +#include <crypto/cryptd.h> +#include <crypto/internal/hash.h> +#include <crypto/gf128mul.h> +#include <linux/crypto.h> +#include <linux/module.h> + +MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_key { + u64 a; + u64 b; +}; + +struct ghash_desc_ctx { + u64 digest[GHASH_DIGEST_SIZE/sizeof(u64)]; + u8 buf[GHASH_BLOCK_SIZE]; + u32 count; +}; + +struct ghash_async_ctx { + struct cryptd_ahash *cryptd_tfm; +}; + +asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src, + struct ghash_key const *k, const char *head); + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + + *ctx = (struct ghash_desc_ctx){}; + return 0; +} + +static int ghash_update(struct shash_desc *desc, const u8 *src, + unsigned int len) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + ctx->count += len; + + if ((partial + len) >= GHASH_BLOCK_SIZE) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + int blocks; + + if (partial) { + int p = GHASH_BLOCK_SIZE - partial; + + memcpy(ctx->buf + partial, src, p); + src += p; + len -= p; + } + + blocks = len / GHASH_BLOCK_SIZE; + len %= GHASH_BLOCK_SIZE; + + kernel_neon_begin(); + pmull_ghash_update(blocks, ctx->digest, src, key, + partial ? ctx->buf : NULL); + kernel_neon_end(); + src += blocks * GHASH_BLOCK_SIZE; + partial = 0; + } + if (len) + memcpy(ctx->buf + partial, src, len); + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *ctx = shash_desc_ctx(desc); + unsigned int partial = ctx->count % GHASH_BLOCK_SIZE; + + if (partial) { + struct ghash_key *key = crypto_shash_ctx(desc->tfm); + + memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial); + kernel_neon_begin(); + pmull_ghash_update(1, ctx->digest, ctx->buf, key, NULL); + kernel_neon_end(); + } + put_unaligned_be64(ctx->digest[1], dst); + put_unaligned_be64(ctx->digest[0], dst + 8); + + *ctx = (struct ghash_desc_ctx){}; + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *inkey, unsigned int keylen) +{ + struct ghash_key *key = crypto_shash_ctx(tfm); + u64 a, b; + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + /* perform multiplication by 'x' in GF(2^128) */ + b = get_unaligned_be64(inkey); + a = get_unaligned_be64(inkey + 8); + + key->a = (a << 1) | (b >> 63); + key->b = (b << 1) | (a >> 63); + + if (b >> 63) + key->b ^= 0xc200000000000000UL; + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "ghash", + .cra_driver_name = "__driver-ghash-ce", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_INTERNAL, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_key), + .cra_module = THIS_MODULE, + }, +}; + +static int ghash_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!may_use_simd()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_init(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); + } +} + +static int ghash_async_update(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!may_use_simd()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_update(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return shash_ahash_update(req, desc); + } +} + +static int ghash_async_final(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!may_use_simd()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_final(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return crypto_shash_final(desc, req->result); + } +} + +static int ghash_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!may_use_simd()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_digest(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return shash_ahash_digest(req, desc); + } +} + +static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_ahash *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ahash_setkey(child, key, keylen); + crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) + & CRYPTO_TFM_RES_MASK); + + return err; +} + +static int ghash_async_init_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_ahash *cryptd_tfm; + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_tfm = cryptd_alloc_ahash("__driver-ghash-ce", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void ghash_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ahash(ctx->cryptd_tfm); +} + +static struct ahash_alg ghash_async_alg = { + .init = ghash_async_init, + .update = ghash_async_update, + .final = ghash_async_final, + .setkey = ghash_async_setkey, + .digest = ghash_async_digest, + .halg.digestsize = GHASH_DIGEST_SIZE, + .halg.base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_ctxsize = sizeof(struct ghash_async_ctx), + .cra_module = THIS_MODULE, + .cra_init = ghash_async_init_tfm, + .cra_exit = ghash_async_exit_tfm, + }, +}; + +static int __init ghash_ce_mod_init(void) +{ + int err; + + if (!(elf_hwcap2 & HWCAP2_PMULL)) + return -ENODEV; + + err = crypto_register_shash(&ghash_alg); + if (err) + return err; + err = crypto_register_ahash(&ghash_async_alg); + if (err) + goto err_shash; + + return 0; + +err_shash: + crypto_unregister_shash(&ghash_alg); + return err; +} + +static void __exit ghash_ce_mod_exit(void) +{ + crypto_unregister_ahash(&ghash_async_alg); + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_ce_mod_init); +module_exit(ghash_ce_mod_exit); diff --git a/arch/arm/crypto/sha1-ce-core.S b/arch/arm/crypto/sha1-ce-core.S new file mode 100644 index 000000000000..b623f51ccbcf --- /dev/null +++ b/arch/arm/crypto/sha1-ce-core.S @@ -0,0 +1,125 @@ +/* + * sha1-ce-core.S - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .fpu crypto-neon-fp-armv8 + + k0 .req q0 + k1 .req q1 + k2 .req q2 + k3 .req q3 + + ta0 .req q4 + ta1 .req q5 + tb0 .req q5 + tb1 .req q4 + + dga .req q6 + dgb .req q7 + dgbs .req s28 + + dg0 .req q12 + dg1a0 .req q13 + dg1a1 .req q14 + dg1b0 .req q14 + dg1b1 .req q13 + + .macro add_only, op, ev, rc, s0, dg1 + .ifnb \s0 + vadd.u32 tb\ev, q\s0, \rc + .endif + sha1h.32 dg1b\ev, dg0 + .ifb \dg1 + sha1\op\().32 dg0, dg1a\ev, ta\ev + .else + sha1\op\().32 dg0, \dg1, ta\ev + .endif + .endm + + .macro add_update, op, ev, rc, s0, s1, s2, s3, dg1 + sha1su0.32 q\s0, q\s1, q\s2 + add_only \op, \ev, \rc, \s1, \dg1 + sha1su1.32 q\s0, q\s3 + .endm + + .align 6 +.Lsha1_rcon: + .word 0x5a827999, 0x5a827999, 0x5a827999, 0x5a827999 + .word 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1, 0x6ed9eba1 + .word 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc, 0x8f1bbcdc + .word 0xca62c1d6, 0xca62c1d6, 0xca62c1d6, 0xca62c1d6 + + /* + * void sha1_ce_transform(struct sha1_state *sst, u8 const *src, + * int blocks); + */ +ENTRY(sha1_ce_transform) + /* load round constants */ + adr ip, .Lsha1_rcon + vld1.32 {k0-k1}, [ip, :128]! + vld1.32 {k2-k3}, [ip, :128] + + /* load state */ + vld1.32 {dga}, [r0] + vldr dgbs, [r0, #16] + + /* load input */ +0: vld1.32 {q8-q9}, [r1]! + vld1.32 {q10-q11}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 +#endif + + vadd.u32 ta0, q8, k0 + vmov dg0, dga + + add_update c, 0, k0, 8, 9, 10, 11, dgb + add_update c, 1, k0, 9, 10, 11, 8 + add_update c, 0, k0, 10, 11, 8, 9 + add_update c, 1, k0, 11, 8, 9, 10 + add_update c, 0, k1, 8, 9, 10, 11 + + add_update p, 1, k1, 9, 10, 11, 8 + add_update p, 0, k1, 10, 11, 8, 9 + add_update p, 1, k1, 11, 8, 9, 10 + add_update p, 0, k1, 8, 9, 10, 11 + add_update p, 1, k2, 9, 10, 11, 8 + + add_update m, 0, k2, 10, 11, 8, 9 + add_update m, 1, k2, 11, 8, 9, 10 + add_update m, 0, k2, 8, 9, 10, 11 + add_update m, 1, k2, 9, 10, 11, 8 + add_update m, 0, k3, 10, 11, 8, 9 + + add_update p, 1, k3, 11, 8, 9, 10 + add_only p, 0, k3, 9 + add_only p, 1, k3, 10 + add_only p, 0, k3, 11 + add_only p, 1 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1a0 + bne 0b + + /* store new state */ + vst1.32 {dga}, [r0] + vstr dgbs, [r0, #16] + bx lr +ENDPROC(sha1_ce_transform) diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c new file mode 100644 index 000000000000..80bc2fcd241a --- /dev/null +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -0,0 +1,96 @@ +/* + * sha1-ce-glue.c - SHA-1 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <crypto/sha1_base.h> +#include <linux/crypto.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +#include "sha1.h" + +MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha1_ce_transform(struct sha1_state *sst, u8 const *src, + int blocks); + +static int sha1_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + if (!may_use_simd() || + (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) + return sha1_update_arm(desc, data, len); + + kernel_neon_begin(); + sha1_base_do_update(desc, data, len, sha1_ce_transform); + kernel_neon_end(); + + return 0; +} + +static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) + return sha1_finup_arm(desc, data, len, out); + + kernel_neon_begin(); + if (len) + sha1_base_do_update(desc, data, len, sha1_ce_transform); + sha1_base_do_finalize(desc, sha1_ce_transform); + kernel_neon_end(); + + return sha1_base_finish(desc, out); +} + +static int sha1_ce_final(struct shash_desc *desc, u8 *out) +{ + return sha1_ce_finup(desc, NULL, 0, out); +} + +static struct shash_alg alg = { + .init = sha1_base_init, + .update = sha1_ce_update, + .final = sha1_ce_final, + .finup = sha1_ce_finup, + .descsize = sizeof(struct sha1_state), + .digestsize = SHA1_DIGEST_SIZE, + .base = { + .cra_name = "sha1", + .cra_driver_name = "sha1-ce", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init sha1_ce_mod_init(void) +{ + if (!(elf_hwcap2 & HWCAP2_SHA1)) + return -ENODEV; + return crypto_register_shash(&alg); +} + +static void __exit sha1_ce_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(sha1_ce_mod_init); +module_exit(sha1_ce_mod_fini); diff --git a/arch/arm/include/asm/crypto/sha1.h b/arch/arm/crypto/sha1.h index 75e6a417416b..ffd8bd08b1a7 100644 --- a/arch/arm/include/asm/crypto/sha1.h +++ b/arch/arm/crypto/sha1.h @@ -7,4 +7,7 @@ extern int sha1_update_arm(struct shash_desc *desc, const u8 *data, unsigned int len); +extern int sha1_finup_arm(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out); + #endif diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c index e31b0440c613..6fc73bf8766d 100644 --- a/arch/arm/crypto/sha1_glue.c +++ b/arch/arm/crypto/sha1_glue.c @@ -22,127 +22,47 @@ #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> +#include <crypto/sha1_base.h> #include <asm/byteorder.h> -#include <asm/crypto/sha1.h> +#include "sha1.h" asmlinkage void sha1_block_data_order(u32 *digest, const unsigned char *data, unsigned int rounds); - -static int sha1_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha1_state){ - .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, - }; - - return 0; -} - - -static int __sha1_update(struct sha1_state *sctx, const u8 *data, - unsigned int len, unsigned int partial) -{ - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA1_BLOCK_SIZE - partial; - memcpy(sctx->buffer + partial, data, done); - sha1_block_data_order(sctx->state, sctx->buffer, 1); - } - - if (len - done >= SHA1_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; - sha1_block_data_order(sctx->state, data + done, rounds); - done += rounds * SHA1_BLOCK_SIZE; - } - - memcpy(sctx->buffer, data + done, len - done); - return 0; -} - - int sha1_update_arm(struct shash_desc *desc, const u8 *data, unsigned int len) { - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - int res; + /* make sure casting to sha1_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - /* Handle the fast case right here */ - if (partial + len < SHA1_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buffer + partial, data, len); - return 0; - } - res = __sha1_update(sctx, data, len, partial); - return res; + return sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_block_data_order); } EXPORT_SYMBOL_GPL(sha1_update_arm); - -/* Add padding and return the message digest. */ static int sha1_final(struct shash_desc *desc, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA1_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); - /* We need to fill a whole block for __sha1_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buffer + index, padding, padlen); - } else { - __sha1_update(sctx, padding, padlen, index); - } - __sha1_update(sctx, (const u8 *)&bits, sizeof(bits), 56); - - /* Store state in digest */ - for (i = 0; i < 5; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); - return 0; + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_block_data_order); + return sha1_base_finish(desc, out); } - -static int sha1_export(struct shash_desc *desc, void *out) +int sha1_finup_arm(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - memcpy(out, sctx, sizeof(*sctx)); - return 0; + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_block_data_order); + return sha1_final(desc, out); } - - -static int sha1_import(struct shash_desc *desc, const void *in) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - memcpy(sctx, in, sizeof(*sctx)); - return 0; -} - +EXPORT_SYMBOL_GPL(sha1_finup_arm); static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_init, + .init = sha1_base_init, .update = sha1_update_arm, .final = sha1_final, - .export = sha1_export, - .import = sha1_import, + .finup = sha1_finup_arm, .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-asm", diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c index 0b0083757d47..4e22f122f966 100644 --- a/arch/arm/crypto/sha1_neon_glue.c +++ b/arch/arm/crypto/sha1_neon_glue.c @@ -25,147 +25,60 @@ #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> -#include <asm/byteorder.h> +#include <crypto/sha1_base.h> #include <asm/neon.h> #include <asm/simd.h> -#include <asm/crypto/sha1.h> +#include "sha1.h" asmlinkage void sha1_transform_neon(void *state_h, const char *data, unsigned int rounds); - -static int sha1_neon_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha1_state){ - .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, - }; - - return 0; -} - -static int __sha1_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA1_BLOCK_SIZE - partial; - memcpy(sctx->buffer + partial, data, done); - sha1_transform_neon(sctx->state, sctx->buffer, 1); - } - - if (len - done >= SHA1_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; - - sha1_transform_neon(sctx->state, data + done, rounds); - done += rounds * SHA1_BLOCK_SIZE; - } - - memcpy(sctx->buffer, data + done, len - done); - - return 0; -} - static int sha1_neon_update(struct shash_desc *desc, const u8 *data, - unsigned int len) + unsigned int len) { struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - int res; - /* Handle the fast case right here */ - if (partial + len < SHA1_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buffer + partial, data, len); + if (!may_use_simd() || + (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) + return sha1_update_arm(desc, data, len); - return 0; - } - - if (!may_use_simd()) { - res = sha1_update_arm(desc, data, len); - } else { - kernel_neon_begin(); - res = __sha1_neon_update(desc, data, len, partial); - kernel_neon_end(); - } - - return res; -} - - -/* Add padding and return the message digest. */ -static int sha1_neon_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA1_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); - if (!may_use_simd()) { - sha1_update_arm(desc, padding, padlen); - sha1_update_arm(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_neon_begin(); - /* We need to fill a whole block for __sha1_neon_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buffer + index, padding, padlen); - } else { - __sha1_neon_update(desc, padding, padlen, index); - } - __sha1_neon_update(desc, (const u8 *)&bits, sizeof(bits), 56); - kernel_neon_end(); - } - - /* Store state in digest */ - for (i = 0; i < 5; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); + kernel_neon_begin(); + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_neon); + kernel_neon_end(); return 0; } -static int sha1_neon_export(struct shash_desc *desc, void *out) +static int sha1_neon_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); + if (!may_use_simd()) + return sha1_finup_arm(desc, data, len, out); - memcpy(out, sctx, sizeof(*sctx)); + kernel_neon_begin(); + if (len) + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_neon); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_neon); + kernel_neon_end(); - return 0; + return sha1_base_finish(desc, out); } -static int sha1_neon_import(struct shash_desc *desc, const void *in) +static int sha1_neon_final(struct shash_desc *desc, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; + return sha1_neon_finup(desc, NULL, 0, out); } static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_neon_init, + .init = sha1_base_init, .update = sha1_neon_update, .final = sha1_neon_final, - .export = sha1_neon_export, - .import = sha1_neon_import, + .finup = sha1_neon_finup, .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name = "sha1-neon", diff --git a/arch/arm/crypto/sha2-ce-core.S b/arch/arm/crypto/sha2-ce-core.S new file mode 100644 index 000000000000..87ec11a5f405 --- /dev/null +++ b/arch/arm/crypto/sha2-ce-core.S @@ -0,0 +1,125 @@ +/* + * sha2-ce-core.S - SHA-224/256 secure hash using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd. + * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> + + .text + .fpu crypto-neon-fp-armv8 + + k0 .req q7 + k1 .req q8 + rk .req r3 + + ta0 .req q9 + ta1 .req q10 + tb0 .req q10 + tb1 .req q9 + + dga .req q11 + dgb .req q12 + + dg0 .req q13 + dg1 .req q14 + dg2 .req q15 + + .macro add_only, ev, s0 + vmov dg2, dg0 + .ifnb \s0 + vld1.32 {k\ev}, [rk, :128]! + .endif + sha256h.32 dg0, dg1, tb\ev + sha256h2.32 dg1, dg2, tb\ev + .ifnb \s0 + vadd.u32 ta\ev, q\s0, k\ev + .endif + .endm + + .macro add_update, ev, s0, s1, s2, s3 + sha256su0.32 q\s0, q\s1 + add_only \ev, \s1 + sha256su1.32 q\s0, q\s2, q\s3 + .endm + + .align 6 +.Lsha256_rcon: + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + + /* + * void sha2_ce_transform(struct sha256_state *sst, u8 const *src, + int blocks); + */ +ENTRY(sha2_ce_transform) + /* load state */ + vld1.32 {dga-dgb}, [r0] + + /* load input */ +0: vld1.32 {q0-q1}, [r1]! + vld1.32 {q2-q3}, [r1]! + subs r2, r2, #1 + +#ifndef CONFIG_CPU_BIG_ENDIAN + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 +#endif + + /* load first round constant */ + adr rk, .Lsha256_rcon + vld1.32 {k0}, [rk, :128]! + + vadd.u32 ta0, q0, k0 + vmov dg0, dga + vmov dg1, dgb + + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + add_update 1, 0, 1, 2, 3 + add_update 0, 1, 2, 3, 0 + add_update 1, 2, 3, 0, 1 + add_update 0, 3, 0, 1, 2 + + add_only 1, 1 + add_only 0, 2 + add_only 1, 3 + add_only 0 + + /* update state */ + vadd.u32 dga, dga, dg0 + vadd.u32 dgb, dgb, dg1 + bne 0b + + /* store new state */ + vst1.32 {dga-dgb}, [r0] + bx lr +ENDPROC(sha2_ce_transform) diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c new file mode 100644 index 000000000000..0755b2d657f3 --- /dev/null +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -0,0 +1,114 @@ +/* + * sha2-ce-glue.c - SHA-224/SHA-256 using ARMv8 Crypto Extensions + * + * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <crypto/internal/hash.h> +#include <crypto/sha.h> +#include <crypto/sha256_base.h> +#include <linux/crypto.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/simd.h> +#include <asm/neon.h> +#include <asm/unaligned.h> + +#include "sha256_glue.h" + +MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); + +asmlinkage void sha2_ce_transform(struct sha256_state *sst, u8 const *src, + int blocks); + +static int sha2_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + if (!may_use_simd() || + (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) + return crypto_sha256_arm_update(desc, data, len); + + kernel_neon_begin(); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha2_ce_transform); + kernel_neon_end(); + + return 0; +} + +static int sha2_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) + return crypto_sha256_arm_finup(desc, data, len, out); + + kernel_neon_begin(); + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha2_ce_transform); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); + kernel_neon_end(); + + return sha256_base_finish(desc, out); +} + +static int sha2_ce_final(struct shash_desc *desc, u8 *out) +{ + return sha2_ce_finup(desc, NULL, 0, out); +} + +static struct shash_alg algs[] = { { + .init = sha224_base_init, + .update = sha2_ce_update, + .final = sha2_ce_final, + .finup = sha2_ce_finup, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA224_DIGEST_SIZE, + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .init = sha256_base_init, + .update = sha2_ce_update, + .final = sha2_ce_final, + .finup = sha2_ce_finup, + .descsize = sizeof(struct sha256_state), + .digestsize = SHA256_DIGEST_SIZE, + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-ce", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init sha2_ce_mod_init(void) +{ + if (!(elf_hwcap2 & HWCAP2_SHA2)) + return -ENODEV; + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit sha2_ce_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(sha2_ce_mod_init); +module_exit(sha2_ce_mod_fini); diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl new file mode 100644 index 000000000000..fac0533ea633 --- /dev/null +++ b/arch/arm/crypto/sha256-armv4.pl @@ -0,0 +1,716 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# +# Permission to use under GPL terms is granted. +# ==================================================================== + +# SHA256 block procedure for ARMv4. May 2007. + +# Performance is ~2x better than gcc 3.4 generated code and in "abso- +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +# byte [on single-issue Xscale PXA250 core]. + +# July 2010. +# +# Rescheduling for dual-issue pipeline resulted in 22% improvement on +# Cortex A8 core and ~20 cycles per processed byte. + +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +# September 2013. +# +# Add NEON implementation. On Cortex A8 it was measured to process one +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +# code (meaning that latter performs sub-optimally, nothing was done +# about it). + +# May 2014. +# +# Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$ctx="r0"; $t0="r0"; +$inp="r1"; $t4="r1"; +$len="r2"; $t1="r2"; +$T1="r3"; $t3="r3"; +$A="r4"; +$B="r5"; +$C="r6"; +$D="r7"; +$E="r8"; +$F="r9"; +$G="r10"; +$H="r11"; +@V=($A,$B,$C,$D,$E,$F,$G,$H); +$t2="r12"; +$Ktbl="r14"; + +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + @ ldr $t1,[$inp],#4 @ $i +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +# ifndef __ARMEB__ + rev $t1,$t1 +# endif +#else + @ ldrb $t1,[$inp,#3] @ $i + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + ldrb $t2,[$inp,#2] + ldrb $t0,[$inp,#1] + orr $t1,$t1,$t2,lsl#8 + ldrb $t2,[$inp],#4 + orr $t1,$t1,$t0,lsl#16 +# if $i==15 + str $inp,[sp,#17*4] @ make room for $t4 +# endif + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` + orr $t1,$t1,$t2,lsl#24 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) +#endif +___ +$code.=<<___; + ldr $t2,[$Ktbl],#4 @ *K256++ + add $h,$h,$t1 @ h+=X[i] + str $t1,[sp,#`$i%16`*4] + eor $t1,$f,$g + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e) + and $t1,$t1,$e + add $h,$h,$t2 @ h+=K256[i] + eor $t1,$t1,$g @ Ch(e,f,g) + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]` + add $h,$h,$t1 @ h+=Ch(e,f,g) +#if $i==31 + and $t2,$t2,#0xff + cmp $t2,#0xf2 @ done? +#endif +#if $i<15 +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 @ prefetch +# else + ldrb $t1,[$inp,#3] +# endif + eor $t2,$a,$b @ a^b, b^c in next round +#else + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx + eor $t2,$a,$b @ a^b, b^c in next round + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx +#endif + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a) + and $t3,$t3,$t2 @ (b^c)&=(a^b) + add $d,$d,$h @ d+=h + eor $t3,$t3,$b @ Maj(a,b,c) + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a) + @ add $h,$h,$t3 @ h+=Maj(a,b,c) +___ + ($t2,$t3)=($t3,$t2); +} + +sub BODY_16_XX { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t4,[sp,#`($i+14)%16`*4] + mov $t0,$t1,ror#$sigma0[0] + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past + mov $t2,$t4,ror#$sigma1[0] + eor $t0,$t0,$t1,ror#$sigma0[1] + eor $t2,$t2,$t4,ror#$sigma1[1] + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) + ldr $t1,[sp,#`($i+0)%16`*4] + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14]) + ldr $t4,[sp,#`($i+9)%16`*4] + + add $t2,$t2,$t0 + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15 + add $t1,$t1,$t2 + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e) + add $t1,$t1,$t4 @ X[i] +___ + &BODY_00_15(@_); +} + +$code=<<___; +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +# define adrl adr +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,sha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} + sub $Ktbl,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr $t1,[$inp],#4 +# else + ldrb $t1,[$inp,#3] +# endif + eor $t3,$B,$C @ magic + eor $t2,$t2,$t2 +___ +for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=".Lrounds_16_xx:\n"; +for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq $t3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t0,[$t3,#0] + ldr $t1,[$t3,#4] + ldr $t2,[$t3,#8] + add $A,$A,$t0 + ldr $t0,[$t3,#12] + add $B,$B,$t1 + ldr $t1,[$t3,#16] + add $C,$C,$t2 + ldr $t2,[$t3,#20] + add $D,$D,$t0 + ldr $t0,[$t3,#24] + add $E,$E,$t1 + ldr $t1,[$t3,#28] + add $F,$F,$t2 + ldr $inp,[sp,#17*4] @ pull inp + ldr $t2,[sp,#18*4] @ pull inp+len + add $G,$G,$t0 + add $H,$H,$t1 + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H} + cmp $inp,$t2 + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#`16+3`*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +___ +###################################################################### +# NEON stuff +# +{{{ +my @X=map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25"); +my $Xfer=$t4; +my $j=0; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &vext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vext_8 ($T1,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &veor ($T5,$T5,$T4); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + while($#insns>=2) { eval(shift(@insns)); } + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vld1_32 ("{$T0}","[$Ktbl,:128]!"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vrev32_8 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vadd_i32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &vst1_32 ("{$T0}","[$Xfer,:128]!"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&eor ($t1,$f,$g)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&and ($t1,$t1,$e)', + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&eor ($t1,$t1,$g)', # Ch(e,f,g) + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e) + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&ldr ($t1,"[sp,#64]") if ($j==31)', + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&add ($d,$d,$h)', # d+=h + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a) + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub $H,sp,#16*4+16 + adrl $Ktbl,K256 + bic $H,$H,#15 @ align for 128-bit stores + mov $t2,sp + mov sp,$H @ alloca + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + + vld1.8 {@X[0]},[$inp]! + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + vld1.32 {$T0},[$Ktbl,:128]! + vld1.32 {$T1},[$Ktbl,:128]! + vld1.32 {$T2},[$Ktbl,:128]! + vld1.32 {$T3},[$Ktbl,:128]! + vrev32.8 @X[0],@X[0] @ yes, even on + str $ctx,[sp,#64] + vrev32.8 @X[1],@X[1] @ big-endian + str $inp,[sp,#68] + mov $Xfer,sp + vrev32.8 @X[2],@X[2] + str $len,[sp,#72] + vrev32.8 @X[3],@X[3] + str $t2,[sp,#76] @ save original sp + vadd.i32 $T0,$T0,@X[0] + vadd.i32 $T1,$T1,@X[1] + vst1.32 {$T0},[$Xfer,:128]! + vadd.i32 $T2,$T2,@X[2] + vst1.32 {$T1},[$Xfer,:128]! + vadd.i32 $T3,$T3,@X[3] + vst1.32 {$T2},[$Xfer,:128]! + vst1.32 {$T3},[$Xfer,:128]! + + ldmia $ctx,{$A-$H} + sub $Xfer,$Xfer,#64 + ldr $t1,[sp,#0] + eor $t2,$t2,$t2 + eor $t3,$B,$C + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + teq $t1,#0 @ check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + ldr $inp,[sp,#68] + ldr $t0,[sp,#72] + sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl + teq $inp,$t0 + it eq + subeq $inp,$inp,#64 @ avoid SEGV + vld1.8 {@X[0]},[$inp]! @ load next input block + vld1.8 {@X[1]},[$inp]! + vld1.8 {@X[2]},[$inp]! + vld1.8 {@X[3]},[$inp]! + it ne + strne $inp,[sp,#68] + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + ldr $t0,[$t1,#0] + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past + ldr $t2,[$t1,#4] + ldr $t3,[$t1,#8] + ldr $t4,[$t1,#12] + add $A,$A,$t0 @ accumulate + ldr $t0,[$t1,#16] + add $B,$B,$t2 + ldr $t2,[$t1,#20] + add $C,$C,$t3 + ldr $t3,[$t1,#24] + add $D,$D,$t4 + ldr $t4,[$t1,#28] + add $E,$E,$t0 + str $A,[$t1],#4 + add $F,$F,$t2 + str $B,[$t1],#4 + add $G,$G,$t3 + str $C,[$t1],#4 + add $H,$H,$t4 + str $D,[$t1],#4 + stmia $t1,{$E-$H} + + ittte ne + movne $Xfer,sp + ldrne $t1,[sp,#0] + eorne $t2,$t2,$t2 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne $t3,$B,$C + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +___ +}}} +###################################################################### +# ARMv8 stuff +# +{{{ +my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2)); +my @MSG=map("q$_",(8..11)); +my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15)); +my $Ktbl="r3"; + +$code.=<<___; +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {$ABCD,$EFGH},[$ctx] +# ifdef __thumb2__ + adr $Ktbl,.LARMv8 + sub $Ktbl,$Ktbl,#.LARMv8-K256 +# else + adrl $Ktbl,K256 +# endif + add $len,$inp,$len,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {@MSG[0]-@MSG[1]},[$inp]! + vld1.8 {@MSG[2]-@MSG[3]},[$inp]! + vld1.32 {$W0},[$Ktbl]! + vrev32.8 @MSG[0],@MSG[0] + vrev32.8 @MSG[1],@MSG[1] + vrev32.8 @MSG[2],@MSG[2] + vrev32.8 @MSG[3],@MSG[3] + vmov $ABCD_SAVE,$ABCD @ offload + vmov $EFGH_SAVE,$EFGH + teq $inp,$len +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + vld1.32 {$W1},[$Ktbl]! + vadd.i32 $W0,$W0,@MSG[0] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vld1.32 {$W0},[$Ktbl]! + vadd.i32 $W1,$W1,@MSG[1] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vld1.32 {$W1},[$Ktbl] + vadd.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#256-16 @ rewind + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + vadd.i32 $W1,$W1,@MSG[3] + vmov $abcd,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + vadd.i32 $ABCD,$ABCD,$ABCD_SAVE + vadd.i32 $EFGH,$EFGH,$EFGH_SAVE + it ne + bne .Loop_v8 + + vst1.32 {$ABCD,$EFGH},[$ctx] + + ret @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +___ +}}} +$code.=<<___; +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif +___ + +open SELF,$0; +while(<SELF>) { + next if (/^#!/); + last if (!s/^#/@/ and !/^$/); + print; +} +close SELF; + +{ my %opcode = ( + "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40, + "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<17)|(($2&8)<<4) + |(($3&7)<<1) |(($3&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, + $mnemonic,$arg; + } + } +} + +foreach (split($/,$code)) { + + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo; + + s/\bret\b/bx lr/go or + s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4 + + print $_,"\n"; +} + +close STDOUT; # enforce flush diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped new file mode 100644 index 000000000000..555a1a8eec90 --- /dev/null +++ b/arch/arm/crypto/sha256-core.S_shipped @@ -0,0 +1,2808 @@ + +@ ==================================================================== +@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +@ project. The module is, however, dual licensed under OpenSSL and +@ CRYPTOGAMS licenses depending on where you obtain it. For further +@ details see http://www.openssl.org/~appro/cryptogams/. +@ +@ Permission to use under GPL terms is granted. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifndef __KERNEL__ +# include "arm_arch.h" +#else +# define __ARM_ARCH__ __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +.text +#if __ARM_ARCH__<7 +.code 32 +#else +.syntax unified +# ifdef __thumb2__ +# define adrl adr +.thumb +# else +.code 32 +# endif +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha256_block_data_order +#endif +.align 5 + +.global sha256_block_data_order +.type sha256_block_data_order,%function +sha256_block_data_order: +#if __ARM_ARCH__<7 + sub r3,pc,#8 @ sha256_block_data_order +#else + adr r3,sha256_block_data_order +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#ARMV8_SHA256 + bne .LARMv8 + tst r12,#ARMV7_NEON + bne .LNEON +#endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r14,r3,#256+32 @ K256 + sub sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 0<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 1<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 2<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 3<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 4<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 5<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 6<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 7<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 8<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 9<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 10<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 11<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 12<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 13<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 14<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 15<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 16<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 17<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 18<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 19<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 20<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 21<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 22<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 23<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 24<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 25<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 26<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 27<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 28<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 29<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? +#endif +#if 30<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? +#endif +#if 31<15 +# if __ARM_ARCH__>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH__>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + .word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order,.-sha256_block_data_order +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.global sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 4 +sha256_block_data_order_neon: +.LNEON: + stmdb sp!,{r4-r12,lr} + + sub r11,sp,#16*4+16 + adrl r14,K256 + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! + + ldmia r0,{r4-r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! + add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8-r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4-r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# ifdef __thumb2__ +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.type sha256_block_data_order_armv8,%function +.align 5 +sha256_block_data_order_armv8: +.LARMv8: + vld1.32 {q0,q1},[r0] +# ifdef __thumb2__ + adr r3,.LARMv8 + sub r3,r3,#.LARMv8-K256 +# else + adrl r3,K256 +# endif + add r2,r1,r2,lsl#6 @ len to point at the end of inp + +.Loop_v8: + vld1.8 {q8-q9},[r1]! + vld1.8 {q10-q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 +#endif +.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro@openssl.org>" +.align 2 +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) +.comm OPENSSL_armcap_P,4,4 +#endif diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c new file mode 100644 index 000000000000..a84e869ef900 --- /dev/null +++ b/arch/arm/crypto/sha256_glue.c @@ -0,0 +1,128 @@ +/* + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation + * using optimized ARM assembler and NEON instructions. + * + * Copyright © 2015 Google Inc. + * + * This file is based on sha256_ssse3_glue.c: + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/string.h> +#include <crypto/sha.h> +#include <crypto/sha256_base.h> +#include <asm/simd.h> +#include <asm/neon.h> + +#include "sha256_glue.h" + +asmlinkage void sha256_block_data_order(u32 *digest, const void *data, + unsigned int num_blks); + +int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + /* make sure casting to sha256_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); + + return sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); +} +EXPORT_SYMBOL(crypto_sha256_arm_update); + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order); + return sha256_base_finish(desc, out); +} + +int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order); + return sha256_final(desc, out); +} +EXPORT_SYMBOL(crypto_sha256_arm_finup); + +static struct shash_alg algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = crypto_sha256_arm_update, + .final = sha256_final, + .finup = crypto_sha256_arm_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-asm", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = crypto_sha256_arm_update, + .final = sha256_final, + .finup = crypto_sha256_arm_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-asm", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init sha256_mod_init(void) +{ + int res = crypto_register_shashes(algs, ARRAY_SIZE(algs)); + + if (res < 0) + return res; + + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) { + res = crypto_register_shashes(sha256_neon_algs, + ARRAY_SIZE(sha256_neon_algs)); + + if (res < 0) + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + } + + return res; +} + +static void __exit sha256_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); + + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) + crypto_unregister_shashes(sha256_neon_algs, + ARRAY_SIZE(sha256_neon_algs)); +} + +module_init(sha256_mod_init); +module_exit(sha256_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA256 Secure Hash Algorithm (ARM), including NEON"); + +MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/arm/crypto/sha256_glue.h b/arch/arm/crypto/sha256_glue.h new file mode 100644 index 000000000000..7cf0bf786ada --- /dev/null +++ b/arch/arm/crypto/sha256_glue.h @@ -0,0 +1,14 @@ +#ifndef _CRYPTO_SHA256_GLUE_H +#define _CRYPTO_SHA256_GLUE_H + +#include <linux/crypto.h> + +extern struct shash_alg sha256_neon_algs[2]; + +int crypto_sha256_arm_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +int crypto_sha256_arm_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +#endif /* _CRYPTO_SHA256_GLUE_H */ diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c new file mode 100644 index 000000000000..39ccd658817e --- /dev/null +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -0,0 +1,101 @@ +/* + * Glue code for the SHA256 Secure Hash Algorithm assembly implementation + * using NEON instructions. + * + * Copyright © 2015 Google Inc. + * + * This file is based on sha512_neon_glue.c: + * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <linux/string.h> +#include <crypto/sha.h> +#include <crypto/sha256_base.h> +#include <asm/byteorder.h> +#include <asm/simd.h> +#include <asm/neon.h> + +#include "sha256_glue.h" + +asmlinkage void sha256_block_data_order_neon(u32 *digest, const void *data, + unsigned int num_blks); + +static int sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + if (!may_use_simd() || + (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) + return crypto_sha256_arm_update(desc, data, len); + + kernel_neon_begin(); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order_neon); + kernel_neon_end(); + + return 0; +} + +static int sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + if (!may_use_simd()) + return crypto_sha256_arm_finup(desc, data, len, out); + + kernel_neon_begin(); + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_block_data_order_neon); + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha256_block_data_order_neon); + kernel_neon_end(); + + return sha256_base_finish(desc, out); +} + +static int sha256_final(struct shash_desc *desc, u8 *out) +{ + return sha256_finup(desc, NULL, 0, out); +} + +struct shash_alg sha256_neon_algs[] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = sha256_base_init, + .update = sha256_update, + .final = sha256_final, + .finup = sha256_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name = "sha256-neon", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = sha224_base_init, + .update = sha256_update, + .final = sha256_final, + .finup = sha256_finup, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name = "sha224-neon", + .cra_priority = 250, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index b1b5b893eb20..05d9e16c0dfd 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -284,7 +284,8 @@ static struct crypto_alg aes_algs[] = { { .cra_name = "__ecb-aes-" MODE, .cra_driver_name = "__driver-ecb-aes-" MODE, .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_alignmask = 7, @@ -302,7 +303,8 @@ static struct crypto_alg aes_algs[] = { { .cra_name = "__cbc-aes-" MODE, .cra_driver_name = "__driver-cbc-aes-" MODE, .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_alignmask = 7, @@ -320,7 +322,8 @@ static struct crypto_alg aes_algs[] = { { .cra_name = "__ctr-aes-" MODE, .cra_driver_name = "__driver-ctr-aes-" MODE, .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx), .cra_alignmask = 7, @@ -338,7 +341,8 @@ static struct crypto_alg aes_algs[] = { { .cra_name = "__xts-aes-" MODE, .cra_driver_name = "__driver-xts-aes-" MODE, .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_xts_ctx), .cra_alignmask = 7, diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S index 09d57d98609c..033aae6d732a 100644 --- a/arch/arm64/crypto/sha1-ce-core.S +++ b/arch/arm64/crypto/sha1-ce-core.S @@ -66,8 +66,8 @@ .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 /* - * void sha1_ce_transform(int blocks, u8 const *src, u32 *state, - * u8 *head, long bytes) + * void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + * int blocks) */ ENTRY(sha1_ce_transform) /* load round constants */ @@ -78,25 +78,22 @@ ENTRY(sha1_ce_transform) ld1r {k3.4s}, [x6] /* load state */ - ldr dga, [x2] - ldr dgb, [x2, #16] + ldr dga, [x0] + ldr dgb, [x0, #16] - /* load partial state (if supplied) */ - cbz x3, 0f - ld1 {v8.4s-v11.4s}, [x3] - b 1f + /* load sha1_ce_state::finalize */ + ldr w4, [x0, #:lo12:sha1_ce_offsetof_finalize] /* load input */ 0: ld1 {v8.4s-v11.4s}, [x1], #64 - sub w0, w0, #1 + sub w2, w2, #1 -1: CPU_LE( rev32 v8.16b, v8.16b ) CPU_LE( rev32 v9.16b, v9.16b ) CPU_LE( rev32 v10.16b, v10.16b ) CPU_LE( rev32 v11.16b, v11.16b ) -2: add t0.4s, v8.4s, k0.4s +1: add t0.4s, v8.4s, k0.4s mov dg0v.16b, dgav.16b add_update c, ev, k0, 8, 9, 10, 11, dgb @@ -127,15 +124,15 @@ CPU_LE( rev32 v11.16b, v11.16b ) add dgbv.2s, dgbv.2s, dg1v.2s add dgav.4s, dgav.4s, dg0v.4s - cbnz w0, 0b + cbnz w2, 0b /* * Final block: add padding and total bit count. - * Skip if we have no total byte count in x4. In that case, the input - * size was not a round multiple of the block size, and the padding is - * handled by the C code. + * Skip if the input size was not a round multiple of the block size, + * the padding is handled by the C code in that case. */ cbz x4, 3f + ldr x4, [x0, #:lo12:sha1_ce_offsetof_count] movi v9.2d, #0 mov x8, #0x80000000 movi v10.2d, #0 @@ -144,10 +141,10 @@ CPU_LE( rev32 v11.16b, v11.16b ) mov x4, #0 mov v11.d[0], xzr mov v11.d[1], x7 - b 2b + b 1b /* store new state */ -3: str dga, [x2] - str dgb, [x2, #16] +3: str dga, [x0] + str dgb, [x0, #16] ret ENDPROC(sha1_ce_transform) diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index 6fe83f37a750..114e7cc5de8c 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -12,144 +12,81 @@ #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> +#include <crypto/sha1_base.h> #include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> +#define ASM_EXPORT(sym, val) \ + asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); + MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); -asmlinkage void sha1_ce_transform(int blocks, u8 const *src, u32 *state, - u8 *head, long bytes); +struct sha1_ce_state { + struct sha1_state sst; + u32 finalize; +}; -static int sha1_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); +asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src, + int blocks); - *sctx = (struct sha1_state){ - .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, - }; - return 0; -} - -static int sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha1_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - - sctx->count += len; - - if ((partial + len) >= SHA1_BLOCK_SIZE) { - int blocks; - - if (partial) { - int p = SHA1_BLOCK_SIZE - partial; + struct sha1_ce_state *sctx = shash_desc_ctx(desc); - memcpy(sctx->buffer + partial, data, p); - data += p; - len -= p; - } - - blocks = len / SHA1_BLOCK_SIZE; - len %= SHA1_BLOCK_SIZE; - - kernel_neon_begin_partial(16); - sha1_ce_transform(blocks, data, sctx->state, - partial ? sctx->buffer : NULL, 0); - kernel_neon_end(); + sctx->finalize = 0; + kernel_neon_begin_partial(16); + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_ce_transform); + kernel_neon_end(); - data += blocks * SHA1_BLOCK_SIZE; - partial = 0; - } - if (len) - memcpy(sctx->buffer + partial, data, len); return 0; } -static int sha1_final(struct shash_desc *desc, u8 *out) +static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + struct sha1_ce_state *sctx = shash_desc_ctx(desc); + bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - struct sha1_state *sctx = shash_desc_ctx(desc); - __be64 bits = cpu_to_be64(sctx->count << 3); - __be32 *dst = (__be32 *)out; - int i; - - u32 padlen = SHA1_BLOCK_SIZE - - ((sctx->count + sizeof(bits)) % SHA1_BLOCK_SIZE); - - sha1_update(desc, padding, padlen); - sha1_update(desc, (const u8 *)&bits, sizeof(bits)); - - for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); - - *sctx = (struct sha1_state){}; - return 0; -} - -static int sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - int blocks; - int i; - - if (sctx->count || !len || (len % SHA1_BLOCK_SIZE)) { - sha1_update(desc, data, len); - return sha1_final(desc, out); - } + ASM_EXPORT(sha1_ce_offsetof_count, + offsetof(struct sha1_ce_state, sst.count)); + ASM_EXPORT(sha1_ce_offsetof_finalize, + offsetof(struct sha1_ce_state, finalize)); /* - * Use a fast path if the input is a multiple of 64 bytes. In - * this case, there is no need to copy data around, and we can - * perform the entire digest calculation in a single invocation - * of sha1_ce_transform() + * Allow the asm code to perform the finalization if there is no + * partial data and the input is a round multiple of the block size. */ - blocks = len / SHA1_BLOCK_SIZE; + sctx->finalize = finalize; kernel_neon_begin_partial(16); - sha1_ce_transform(blocks, data, sctx->state, NULL, len); + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_ce_transform); + if (!finalize) + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform); kernel_neon_end(); - - for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); - - *sctx = (struct sha1_state){}; - return 0; + return sha1_base_finish(desc, out); } -static int sha1_export(struct shash_desc *desc, void *out) +static int sha1_ce_final(struct shash_desc *desc, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - struct sha1_state *dst = out; - - *dst = *sctx; - return 0; -} - -static int sha1_import(struct shash_desc *desc, const void *in) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - struct sha1_state const *src = in; - - *sctx = *src; - return 0; + kernel_neon_begin_partial(16); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform); + kernel_neon_end(); + return sha1_base_finish(desc, out); } static struct shash_alg alg = { - .init = sha1_init, - .update = sha1_update, - .final = sha1_final, - .finup = sha1_finup, - .export = sha1_export, - .import = sha1_import, - .descsize = sizeof(struct sha1_state), + .init = sha1_base_init, + .update = sha1_ce_update, + .final = sha1_ce_final, + .finup = sha1_ce_finup, + .descsize = sizeof(struct sha1_ce_state), .digestsize = SHA1_DIGEST_SIZE, - .statesize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name = "sha1-ce", diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S index 7f29fc031ea8..5df9d9d470ad 100644 --- a/arch/arm64/crypto/sha2-ce-core.S +++ b/arch/arm64/crypto/sha2-ce-core.S @@ -73,8 +73,8 @@ .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 /* - * void sha2_ce_transform(int blocks, u8 const *src, u32 *state, - * u8 *head, long bytes) + * void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + * int blocks) */ ENTRY(sha2_ce_transform) /* load round constants */ @@ -85,24 +85,21 @@ ENTRY(sha2_ce_transform) ld1 {v12.4s-v15.4s}, [x8] /* load state */ - ldp dga, dgb, [x2] + ldp dga, dgb, [x0] - /* load partial input (if supplied) */ - cbz x3, 0f - ld1 {v16.4s-v19.4s}, [x3] - b 1f + /* load sha256_ce_state::finalize */ + ldr w4, [x0, #:lo12:sha256_ce_offsetof_finalize] /* load input */ 0: ld1 {v16.4s-v19.4s}, [x1], #64 - sub w0, w0, #1 + sub w2, w2, #1 -1: CPU_LE( rev32 v16.16b, v16.16b ) CPU_LE( rev32 v17.16b, v17.16b ) CPU_LE( rev32 v18.16b, v18.16b ) CPU_LE( rev32 v19.16b, v19.16b ) -2: add t0.4s, v16.4s, v0.4s +1: add t0.4s, v16.4s, v0.4s mov dg0v.16b, dgav.16b mov dg1v.16b, dgbv.16b @@ -131,15 +128,15 @@ CPU_LE( rev32 v19.16b, v19.16b ) add dgbv.4s, dgbv.4s, dg1v.4s /* handled all input blocks? */ - cbnz w0, 0b + cbnz w2, 0b /* * Final block: add padding and total bit count. - * Skip if we have no total byte count in x4. In that case, the input - * size was not a round multiple of the block size, and the padding is - * handled by the C code. + * Skip if the input size was not a round multiple of the block size, + * the padding is handled by the C code in that case. */ cbz x4, 3f + ldr x4, [x0, #:lo12:sha256_ce_offsetof_count] movi v17.2d, #0 mov x8, #0x80000000 movi v18.2d, #0 @@ -148,9 +145,9 @@ CPU_LE( rev32 v19.16b, v19.16b ) mov x4, #0 mov v19.d[0], xzr mov v19.d[1], x7 - b 2b + b 1b /* store new state */ -3: stp dga, dgb, [x2] +3: stp dga, dgb, [x0] ret ENDPROC(sha2_ce_transform) diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index ae67e88c28b9..1340e44c048b 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -12,206 +12,82 @@ #include <asm/unaligned.h> #include <crypto/internal/hash.h> #include <crypto/sha.h> +#include <crypto/sha256_base.h> #include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> +#define ASM_EXPORT(sym, val) \ + asm(".globl " #sym "; .set " #sym ", %0" :: "I"(val)); + MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions"); MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); MODULE_LICENSE("GPL v2"); -asmlinkage int sha2_ce_transform(int blocks, u8 const *src, u32 *state, - u8 *head, long bytes); - -static int sha224_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha256_state){ - .state = { - SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, - SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, - } - }; - return 0; -} - -static int sha256_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha256_state){ - .state = { - SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, - SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, - } - }; - return 0; -} - -static int sha2_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; - - sctx->count += len; - - if ((partial + len) >= SHA256_BLOCK_SIZE) { - int blocks; - - if (partial) { - int p = SHA256_BLOCK_SIZE - partial; - - memcpy(sctx->buf + partial, data, p); - data += p; - len -= p; - } +struct sha256_ce_state { + struct sha256_state sst; + u32 finalize; +}; - blocks = len / SHA256_BLOCK_SIZE; - len %= SHA256_BLOCK_SIZE; +asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src, + int blocks); - kernel_neon_begin_partial(28); - sha2_ce_transform(blocks, data, sctx->state, - partial ? sctx->buf : NULL, 0); - kernel_neon_end(); - - data += blocks * SHA256_BLOCK_SIZE; - partial = 0; - } - if (len) - memcpy(sctx->buf + partial, data, len); - return 0; -} - -static void sha2_final(struct shash_desc *desc) +static int sha256_ce_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { - static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; - - struct sha256_state *sctx = shash_desc_ctx(desc); - __be64 bits = cpu_to_be64(sctx->count << 3); - u32 padlen = SHA256_BLOCK_SIZE - - ((sctx->count + sizeof(bits)) % SHA256_BLOCK_SIZE); - - sha2_update(desc, padding, padlen); - sha2_update(desc, (const u8 *)&bits, sizeof(bits)); -} - -static int sha224_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - int i; - - sha2_final(desc); - - for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); - - *sctx = (struct sha256_state){}; - return 0; -} + struct sha256_ce_state *sctx = shash_desc_ctx(desc); -static int sha256_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - int i; - - sha2_final(desc); - - for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); + sctx->finalize = 0; + kernel_neon_begin_partial(28); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha2_ce_transform); + kernel_neon_end(); - *sctx = (struct sha256_state){}; return 0; } -static void sha2_finup(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha256_state *sctx = shash_desc_ctx(desc); - int blocks; + struct sha256_ce_state *sctx = shash_desc_ctx(desc); + bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - if (sctx->count || !len || (len % SHA256_BLOCK_SIZE)) { - sha2_update(desc, data, len); - sha2_final(desc); - return; - } + ASM_EXPORT(sha256_ce_offsetof_count, + offsetof(struct sha256_ce_state, sst.count)); + ASM_EXPORT(sha256_ce_offsetof_finalize, + offsetof(struct sha256_ce_state, finalize)); /* - * Use a fast path if the input is a multiple of 64 bytes. In - * this case, there is no need to copy data around, and we can - * perform the entire digest calculation in a single invocation - * of sha2_ce_transform() + * Allow the asm code to perform the finalization if there is no + * partial data and the input is a round multiple of the block size. */ - blocks = len / SHA256_BLOCK_SIZE; + sctx->finalize = finalize; kernel_neon_begin_partial(28); - sha2_ce_transform(blocks, data, sctx->state, NULL, len); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha2_ce_transform); + if (!finalize) + sha256_base_do_finalize(desc, + (sha256_block_fn *)sha2_ce_transform); kernel_neon_end(); + return sha256_base_finish(desc, out); } -static int sha224_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) +static int sha256_ce_final(struct shash_desc *desc, u8 *out) { - struct sha256_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - int i; - - sha2_finup(desc, data, len); - - for (i = 0; i < SHA224_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); - - *sctx = (struct sha256_state){}; - return 0; -} - -static int sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - __be32 *dst = (__be32 *)out; - int i; - - sha2_finup(desc, data, len); - - for (i = 0; i < SHA256_DIGEST_SIZE / sizeof(__be32); i++) - put_unaligned_be32(sctx->state[i], dst++); - - *sctx = (struct sha256_state){}; - return 0; -} - -static int sha2_export(struct shash_desc *desc, void *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - struct sha256_state *dst = out; - - *dst = *sctx; - return 0; -} - -static int sha2_import(struct shash_desc *desc, const void *in) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - struct sha256_state const *src = in; - - *sctx = *src; - return 0; + kernel_neon_begin_partial(28); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform); + kernel_neon_end(); + return sha256_base_finish(desc, out); } static struct shash_alg algs[] = { { - .init = sha224_init, - .update = sha2_update, - .final = sha224_final, - .finup = sha224_finup, - .export = sha2_export, - .import = sha2_import, - .descsize = sizeof(struct sha256_state), + .init = sha224_base_init, + .update = sha256_ce_update, + .final = sha256_ce_final, + .finup = sha256_ce_finup, + .descsize = sizeof(struct sha256_ce_state), .digestsize = SHA224_DIGEST_SIZE, - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name = "sha224-ce", @@ -221,15 +97,12 @@ static struct shash_alg algs[] = { { .cra_module = THIS_MODULE, } }, { - .init = sha256_init, - .update = sha2_update, - .final = sha256_final, - .finup = sha256_finup, - .export = sha2_export, - .import = sha2_import, - .descsize = sizeof(struct sha256_state), + .init = sha256_base_init, + .update = sha256_ce_update, + .final = sha256_ce_final, + .finup = sha256_ce_finup, + .descsize = sizeof(struct sha256_ce_state), .digestsize = SHA256_DIGEST_SIZE, - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name = "sha256-ce", diff --git a/arch/mips/cavium-octeon/crypto/Makefile b/arch/mips/cavium-octeon/crypto/Makefile index a74f76d85a2f..f7aa9d5d3b87 100644 --- a/arch/mips/cavium-octeon/crypto/Makefile +++ b/arch/mips/cavium-octeon/crypto/Makefile @@ -4,4 +4,7 @@ obj-y += octeon-crypto.o -obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o +obj-$(CONFIG_CRYPTO_MD5_OCTEON) += octeon-md5.o +obj-$(CONFIG_CRYPTO_SHA1_OCTEON) += octeon-sha1.o +obj-$(CONFIG_CRYPTO_SHA256_OCTEON) += octeon-sha256.o +obj-$(CONFIG_CRYPTO_SHA512_OCTEON) += octeon-sha512.o diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.c b/arch/mips/cavium-octeon/crypto/octeon-crypto.c index 7c82ff463b65..f66bd1adc7ff 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-crypto.c +++ b/arch/mips/cavium-octeon/crypto/octeon-crypto.c @@ -17,7 +17,7 @@ * crypto operations in calls to octeon_crypto_enable/disable in order to make * sure the state of COP2 isn't corrupted if userspace is also performing * hardware crypto operations. Allocate the state parameter on the stack. - * Preemption must be disabled to prevent context switches. + * Returns with preemption disabled. * * @state: Pointer to state structure to store current COP2 state in. * @@ -28,6 +28,7 @@ unsigned long octeon_crypto_enable(struct octeon_cop2_state *state) int status; unsigned long flags; + preempt_disable(); local_irq_save(flags); status = read_c0_status(); write_c0_status(status | ST0_CU2); @@ -62,5 +63,6 @@ void octeon_crypto_disable(struct octeon_cop2_state *state, else write_c0_status(read_c0_status() & ~ST0_CU2); local_irq_restore(flags); + preempt_enable(); } EXPORT_SYMBOL_GPL(octeon_crypto_disable); diff --git a/arch/mips/cavium-octeon/crypto/octeon-crypto.h b/arch/mips/cavium-octeon/crypto/octeon-crypto.h index e2a4aece9c24..355072535110 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-crypto.h +++ b/arch/mips/cavium-octeon/crypto/octeon-crypto.h @@ -5,7 +5,8 @@ * * Copyright (C) 2012-2013 Cavium Inc., All Rights Reserved. * - * MD5 instruction definitions added by Aaro Koskinen <aaro.koskinen@iki.fi>. + * MD5/SHA1/SHA256/SHA512 instruction definitions added by + * Aaro Koskinen <aaro.koskinen@iki.fi>. * */ #ifndef __LINUX_OCTEON_CRYPTO_H @@ -21,11 +22,11 @@ extern void octeon_crypto_disable(struct octeon_cop2_state *state, unsigned long flags); /* - * Macros needed to implement MD5: + * Macros needed to implement MD5/SHA1/SHA256: */ /* - * The index can be 0-1. + * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). */ #define write_octeon_64bit_hash_dword(value, index) \ do { \ @@ -36,7 +37,7 @@ do { \ } while (0) /* - * The index can be 0-1. + * The index can be 0-1 (MD5) or 0-2 (SHA1), 0-3 (SHA256). */ #define read_octeon_64bit_hash_dword(index) \ ({ \ @@ -72,4 +73,78 @@ do { \ : [rt] "d" (value)); \ } while (0) +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha1_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x4057" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block dword (64-bit). + */ +#define octeon_sha256_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x404f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * Macros needed to implement SHA512: + */ + +/* + * The index can be 0-7. + */ +#define write_octeon_64bit_hash_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0250+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The index can be 0-7. + */ +#define read_octeon_64bit_hash_sha512(index) \ +({ \ + u64 __value; \ + \ + __asm__ __volatile__ ( \ + "dmfc2 %[rt],0x0250+" STR(index) \ + : [rt] "=d" (__value) \ + : ); \ + \ + __value; \ +}) + +/* + * The index can be 0-14. + */ +#define write_octeon_64bit_block_sha512(value, index) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x0240+" STR(index) \ + : \ + : [rt] "d" (value)); \ +} while (0) + +/* + * The value is the final block word (64-bit). + */ +#define octeon_sha512_start(value) \ +do { \ + __asm__ __volatile__ ( \ + "dmtc2 %[rt],0x424f" \ + : \ + : [rt] "d" (value)); \ +} while (0) + #endif /* __LINUX_OCTEON_CRYPTO_H */ diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c index b909881ba6c1..12dccdb38286 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-md5.c +++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c @@ -97,8 +97,6 @@ static int octeon_md5_update(struct shash_desc *desc, const u8 *data, memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), data, avail); - local_bh_disable(); - preempt_disable(); flags = octeon_crypto_enable(&state); octeon_md5_store_hash(mctx); @@ -114,8 +112,6 @@ static int octeon_md5_update(struct shash_desc *desc, const u8 *data, octeon_md5_read_hash(mctx); octeon_crypto_disable(&state, flags); - preempt_enable(); - local_bh_enable(); memcpy(mctx->block, data, len); @@ -133,8 +129,6 @@ static int octeon_md5_final(struct shash_desc *desc, u8 *out) *p++ = 0x80; - local_bh_disable(); - preempt_disable(); flags = octeon_crypto_enable(&state); octeon_md5_store_hash(mctx); @@ -152,8 +146,6 @@ static int octeon_md5_final(struct shash_desc *desc, u8 *out) octeon_md5_read_hash(mctx); octeon_crypto_disable(&state, flags); - preempt_enable(); - local_bh_enable(); memcpy(out, mctx->hash, sizeof(mctx->hash)); memset(mctx, 0, sizeof(*mctx)); diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c new file mode 100644 index 000000000000..2b74b5b67cae --- /dev/null +++ b/arch/mips/cavium-octeon/crypto/octeon-sha1.c @@ -0,0 +1,241 @@ +/* + * Cryptographic API. + * + * SHA1 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha1_generic.c, which is: + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/mm.h> +#include <crypto/sha.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/module.h> +#include <asm/byteorder.h> +#include <asm/octeon/octeon.h> +#include <crypto/internal/hash.h> + +#include "octeon-crypto.h" + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. + */ + +static void octeon_sha1_store_hash(struct sha1_state *sctx) +{ + u64 *hash = (u64 *)sctx->state; + union { + u32 word[2]; + u64 dword; + } hash_tail = { { sctx->state[4], } }; + + write_octeon_64bit_hash_dword(hash[0], 0); + write_octeon_64bit_hash_dword(hash[1], 1); + write_octeon_64bit_hash_dword(hash_tail.dword, 2); + memzero_explicit(&hash_tail.word[0], sizeof(hash_tail.word[0])); +} + +static void octeon_sha1_read_hash(struct sha1_state *sctx) +{ + u64 *hash = (u64 *)sctx->state; + union { + u32 word[2]; + u64 dword; + } hash_tail; + + hash[0] = read_octeon_64bit_hash_dword(0); + hash[1] = read_octeon_64bit_hash_dword(1); + hash_tail.dword = read_octeon_64bit_hash_dword(2); + sctx->state[4] = hash_tail.word[0]; + memzero_explicit(&hash_tail.dword, sizeof(hash_tail.dword)); +} + +static void octeon_sha1_transform(const void *_block) +{ + const u64 *block = _block; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha1_start(block[7]); +} + +static int octeon_sha1_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA1_H0; + sctx->state[1] = SHA1_H1; + sctx->state[2] = SHA1_H2; + sctx->state[3] = SHA1_H3; + sctx->state[4] = SHA1_H4; + sctx->count = 0; + + return 0; +} + +static void __octeon_sha1_update(struct sha1_state *sctx, const u8 *data, + unsigned int len) +{ + unsigned int partial; + unsigned int done; + const u8 *src; + + partial = sctx->count % SHA1_BLOCK_SIZE; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) >= SHA1_BLOCK_SIZE) { + if (partial) { + done = -partial; + memcpy(sctx->buffer + partial, data, + done + SHA1_BLOCK_SIZE); + src = sctx->buffer; + } + + do { + octeon_sha1_transform(src); + done += SHA1_BLOCK_SIZE; + src = data + done; + } while (done + SHA1_BLOCK_SIZE <= len); + + partial = 0; + } + memcpy(sctx->buffer + partial, src, len - done); +} + +static int octeon_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + struct octeon_cop2_state state; + unsigned long flags; + + /* + * Small updates never reach the crypto engine, so the generic sha1 is + * faster because of the heavyweight octeon_crypto_enable() / + * octeon_crypto_disable(). + */ + if ((sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) + return crypto_sha1_update(desc, data, len); + + flags = octeon_crypto_enable(&state); + octeon_sha1_store_hash(sctx); + + __octeon_sha1_update(sctx, data, len); + + octeon_sha1_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + return 0; +} + +static int octeon_sha1_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + static const u8 padding[64] = { 0x80, }; + struct octeon_cop2_state state; + __be32 *dst = (__be32 *)out; + unsigned int pad_len; + unsigned long flags; + unsigned int index; + __be64 bits; + int i; + + /* Save number of bits. */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + + flags = octeon_crypto_enable(&state); + octeon_sha1_store_hash(sctx); + + __octeon_sha1_update(sctx, padding, pad_len); + + /* Append length (before padding). */ + __octeon_sha1_update(sctx, (const u8 *)&bits, sizeof(bits)); + + octeon_sha1_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int octeon_sha1_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int octeon_sha1_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg octeon_sha1_alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = octeon_sha1_init, + .update = octeon_sha1_update, + .final = octeon_sha1_final, + .export = octeon_sha1_export, + .import = octeon_sha1_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "octeon-sha1", + .cra_priority = OCTEON_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init octeon_sha1_mod_init(void) +{ + if (!octeon_has_crypto()) + return -ENOTSUPP; + return crypto_register_shash(&octeon_sha1_alg); +} + +static void __exit octeon_sha1_mod_fini(void) +{ + crypto_unregister_shash(&octeon_sha1_alg); +} + +module_init(octeon_sha1_mod_init); +module_exit(octeon_sha1_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm (OCTEON)"); +MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>"); diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c new file mode 100644 index 000000000000..97e96fead08a --- /dev/null +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -0,0 +1,280 @@ +/* + * Cryptographic API. + * + * SHA-224 and SHA-256 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha256_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/mm.h> +#include <crypto/sha.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/module.h> +#include <asm/byteorder.h> +#include <asm/octeon/octeon.h> +#include <crypto/internal/hash.h> + +#include "octeon-crypto.h" + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. + */ + +static void octeon_sha256_store_hash(struct sha256_state *sctx) +{ + u64 *hash = (u64 *)sctx->state; + + write_octeon_64bit_hash_dword(hash[0], 0); + write_octeon_64bit_hash_dword(hash[1], 1); + write_octeon_64bit_hash_dword(hash[2], 2); + write_octeon_64bit_hash_dword(hash[3], 3); +} + +static void octeon_sha256_read_hash(struct sha256_state *sctx) +{ + u64 *hash = (u64 *)sctx->state; + + hash[0] = read_octeon_64bit_hash_dword(0); + hash[1] = read_octeon_64bit_hash_dword(1); + hash[2] = read_octeon_64bit_hash_dword(2); + hash[3] = read_octeon_64bit_hash_dword(3); +} + +static void octeon_sha256_transform(const void *_block) +{ + const u64 *block = _block; + + write_octeon_64bit_block_dword(block[0], 0); + write_octeon_64bit_block_dword(block[1], 1); + write_octeon_64bit_block_dword(block[2], 2); + write_octeon_64bit_block_dword(block[3], 3); + write_octeon_64bit_block_dword(block[4], 4); + write_octeon_64bit_block_dword(block[5], 5); + write_octeon_64bit_block_dword(block[6], 6); + octeon_sha256_start(block[7]); +} + +static int octeon_sha224_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int octeon_sha256_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +static void __octeon_sha256_update(struct sha256_state *sctx, const u8 *data, + unsigned int len) +{ + unsigned int partial; + unsigned int done; + const u8 *src; + + partial = sctx->count % SHA256_BLOCK_SIZE; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) >= SHA256_BLOCK_SIZE) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, + done + SHA256_BLOCK_SIZE); + src = sctx->buf; + } + + do { + octeon_sha256_transform(src); + done += SHA256_BLOCK_SIZE; + src = data + done; + } while (done + SHA256_BLOCK_SIZE <= len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); +} + +static int octeon_sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + struct octeon_cop2_state state; + unsigned long flags; + + /* + * Small updates never reach the crypto engine, so the generic sha256 is + * faster because of the heavyweight octeon_crypto_enable() / + * octeon_crypto_disable(). + */ + if ((sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) + return crypto_sha256_update(desc, data, len); + + flags = octeon_crypto_enable(&state); + octeon_sha256_store_hash(sctx); + + __octeon_sha256_update(sctx, data, len); + + octeon_sha256_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + return 0; +} + +static int octeon_sha256_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + static const u8 padding[64] = { 0x80, }; + struct octeon_cop2_state state; + __be32 *dst = (__be32 *)out; + unsigned int pad_len; + unsigned long flags; + unsigned int index; + __be64 bits; + int i; + + /* Save number of bits. */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + + flags = octeon_crypto_enable(&state); + octeon_sha256_store_hash(sctx); + + __octeon_sha256_update(sctx, padding, pad_len); + + /* Append length (before padding). */ + __octeon_sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + octeon_sha256_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int octeon_sha224_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[SHA256_DIGEST_SIZE]; + + octeon_sha256_final(desc, D); + + memcpy(hash, D, SHA224_DIGEST_SIZE); + memzero_explicit(D, SHA256_DIGEST_SIZE); + + return 0; +} + +static int octeon_sha256_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int octeon_sha256_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg octeon_sha256_algs[2] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = octeon_sha256_init, + .update = octeon_sha256_update, + .final = octeon_sha256_final, + .export = octeon_sha256_export, + .import = octeon_sha256_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name= "octeon-sha256", + .cra_priority = OCTEON_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = octeon_sha224_init, + .update = octeon_sha256_update, + .final = octeon_sha224_final, + .descsize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name= "octeon-sha224", + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init octeon_sha256_mod_init(void) +{ + if (!octeon_has_crypto()) + return -ENOTSUPP; + return crypto_register_shashes(octeon_sha256_algs, + ARRAY_SIZE(octeon_sha256_algs)); +} + +static void __exit octeon_sha256_mod_fini(void) +{ + crypto_unregister_shashes(octeon_sha256_algs, + ARRAY_SIZE(octeon_sha256_algs)); +} + +module_init(octeon_sha256_mod_init); +module_exit(octeon_sha256_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm (OCTEON)"); +MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>"); diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c new file mode 100644 index 000000000000..d5fb3c6f22ae --- /dev/null +++ b/arch/mips/cavium-octeon/crypto/octeon-sha512.c @@ -0,0 +1,277 @@ +/* + * Cryptographic API. + * + * SHA-512 and SHA-384 Secure Hash Algorithm. + * + * Adapted for OCTEON by Aaro Koskinen <aaro.koskinen@iki.fi>. + * + * Based on crypto/sha512_generic.c, which is: + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2003 Kyle McMartin <kyle@debian.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + */ + +#include <linux/mm.h> +#include <crypto/sha.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/module.h> +#include <asm/byteorder.h> +#include <asm/octeon/octeon.h> +#include <crypto/internal/hash.h> + +#include "octeon-crypto.h" + +/* + * We pass everything as 64-bit. OCTEON can handle misaligned data. + */ + +static void octeon_sha512_store_hash(struct sha512_state *sctx) +{ + write_octeon_64bit_hash_sha512(sctx->state[0], 0); + write_octeon_64bit_hash_sha512(sctx->state[1], 1); + write_octeon_64bit_hash_sha512(sctx->state[2], 2); + write_octeon_64bit_hash_sha512(sctx->state[3], 3); + write_octeon_64bit_hash_sha512(sctx->state[4], 4); + write_octeon_64bit_hash_sha512(sctx->state[5], 5); + write_octeon_64bit_hash_sha512(sctx->state[6], 6); + write_octeon_64bit_hash_sha512(sctx->state[7], 7); +} + +static void octeon_sha512_read_hash(struct sha512_state *sctx) +{ + sctx->state[0] = read_octeon_64bit_hash_sha512(0); + sctx->state[1] = read_octeon_64bit_hash_sha512(1); + sctx->state[2] = read_octeon_64bit_hash_sha512(2); + sctx->state[3] = read_octeon_64bit_hash_sha512(3); + sctx->state[4] = read_octeon_64bit_hash_sha512(4); + sctx->state[5] = read_octeon_64bit_hash_sha512(5); + sctx->state[6] = read_octeon_64bit_hash_sha512(6); + sctx->state[7] = read_octeon_64bit_hash_sha512(7); +} + +static void octeon_sha512_transform(const void *_block) +{ + const u64 *block = _block; + + write_octeon_64bit_block_sha512(block[0], 0); + write_octeon_64bit_block_sha512(block[1], 1); + write_octeon_64bit_block_sha512(block[2], 2); + write_octeon_64bit_block_sha512(block[3], 3); + write_octeon_64bit_block_sha512(block[4], 4); + write_octeon_64bit_block_sha512(block[5], 5); + write_octeon_64bit_block_sha512(block[6], 6); + write_octeon_64bit_block_sha512(block[7], 7); + write_octeon_64bit_block_sha512(block[8], 8); + write_octeon_64bit_block_sha512(block[9], 9); + write_octeon_64bit_block_sha512(block[10], 10); + write_octeon_64bit_block_sha512(block[11], 11); + write_octeon_64bit_block_sha512(block[12], 12); + write_octeon_64bit_block_sha512(block[13], 13); + write_octeon_64bit_block_sha512(block[14], 14); + octeon_sha512_start(block[15]); +} + +static int octeon_sha512_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA512_H0; + sctx->state[1] = SHA512_H1; + sctx->state[2] = SHA512_H2; + sctx->state[3] = SHA512_H3; + sctx->state[4] = SHA512_H4; + sctx->state[5] = SHA512_H5; + sctx->state[6] = SHA512_H6; + sctx->state[7] = SHA512_H7; + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static int octeon_sha384_init(struct shash_desc *desc) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA384_H0; + sctx->state[1] = SHA384_H1; + sctx->state[2] = SHA384_H2; + sctx->state[3] = SHA384_H3; + sctx->state[4] = SHA384_H4; + sctx->state[5] = SHA384_H5; + sctx->state[6] = SHA384_H6; + sctx->state[7] = SHA384_H7; + sctx->count[0] = sctx->count[1] = 0; + + return 0; +} + +static void __octeon_sha512_update(struct sha512_state *sctx, const u8 *data, + unsigned int len) +{ + unsigned int part_len; + unsigned int index; + unsigned int i; + + /* Compute number of bytes mod 128. */ + index = sctx->count[0] % SHA512_BLOCK_SIZE; + + /* Update number of bytes. */ + if ((sctx->count[0] += len) < len) + sctx->count[1]++; + + part_len = SHA512_BLOCK_SIZE - index; + + /* Transform as many times as possible. */ + if (len >= part_len) { + memcpy(&sctx->buf[index], data, part_len); + octeon_sha512_transform(sctx->buf); + + for (i = part_len; i + SHA512_BLOCK_SIZE <= len; + i += SHA512_BLOCK_SIZE) + octeon_sha512_transform(&data[i]); + + index = 0; + } else { + i = 0; + } + + /* Buffer remaining input. */ + memcpy(&sctx->buf[index], &data[i], len - i); +} + +static int octeon_sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + struct octeon_cop2_state state; + unsigned long flags; + + /* + * Small updates never reach the crypto engine, so the generic sha512 is + * faster because of the heavyweight octeon_crypto_enable() / + * octeon_crypto_disable(). + */ + if ((sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) + return crypto_sha512_update(desc, data, len); + + flags = octeon_crypto_enable(&state); + octeon_sha512_store_hash(sctx); + + __octeon_sha512_update(sctx, data, len); + + octeon_sha512_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + return 0; +} + +static int octeon_sha512_final(struct shash_desc *desc, u8 *hash) +{ + struct sha512_state *sctx = shash_desc_ctx(desc); + static u8 padding[128] = { 0x80, }; + struct octeon_cop2_state state; + __be64 *dst = (__be64 *)hash; + unsigned int pad_len; + unsigned long flags; + unsigned int index; + __be64 bits[2]; + int i; + + /* Save number of bits. */ + bits[1] = cpu_to_be64(sctx->count[0] << 3); + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); + + /* Pad out to 112 mod 128. */ + index = sctx->count[0] & 0x7f; + pad_len = (index < 112) ? (112 - index) : ((128+112) - index); + + flags = octeon_crypto_enable(&state); + octeon_sha512_store_hash(sctx); + + __octeon_sha512_update(sctx, padding, pad_len); + + /* Append length (before padding). */ + __octeon_sha512_update(sctx, (const u8 *)bits, sizeof(bits)); + + octeon_sha512_read_hash(sctx); + octeon_crypto_disable(&state, flags); + + /* Store state in digest. */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be64(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(struct sha512_state)); + + return 0; +} + +static int octeon_sha384_final(struct shash_desc *desc, u8 *hash) +{ + u8 D[64]; + + octeon_sha512_final(desc, D); + + memcpy(hash, D, 48); + memzero_explicit(D, 64); + + return 0; +} + +static struct shash_alg octeon_sha512_algs[2] = { { + .digestsize = SHA512_DIGEST_SIZE, + .init = octeon_sha512_init, + .update = octeon_sha512_update, + .final = octeon_sha512_final, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha512", + .cra_driver_name= "octeon-sha512", + .cra_priority = OCTEON_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA512_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA384_DIGEST_SIZE, + .init = octeon_sha384_init, + .update = octeon_sha512_update, + .final = octeon_sha384_final, + .descsize = sizeof(struct sha512_state), + .base = { + .cra_name = "sha384", + .cra_driver_name= "octeon-sha384", + .cra_priority = OCTEON_CR_OPCODE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA384_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init octeon_sha512_mod_init(void) +{ + if (!octeon_has_crypto()) + return -ENOTSUPP; + return crypto_register_shashes(octeon_sha512_algs, + ARRAY_SIZE(octeon_sha512_algs)); +} + +static void __exit octeon_sha512_mod_fini(void) +{ + crypto_unregister_shashes(octeon_sha512_algs, + ARRAY_SIZE(octeon_sha512_algs)); +} + +module_init(octeon_sha512_mod_init); +module_exit(octeon_sha512_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-512 and SHA-384 Secure Hash Algorithms (OCTEON)"); +MODULE_AUTHOR("Aaro Koskinen <aaro.koskinen@iki.fi>"); diff --git a/arch/mips/include/asm/mach-bcm63xx/bcm63xx_regs.h b/arch/mips/include/asm/mach-bcm63xx/bcm63xx_regs.h index 4794067cb5a7..5035f09c5427 100644 --- a/arch/mips/include/asm/mach-bcm63xx/bcm63xx_regs.h +++ b/arch/mips/include/asm/mach-bcm63xx/bcm63xx_regs.h @@ -1259,20 +1259,6 @@ #define M2M_DSTID_REG(x) ((x) * 0x40 + 0x18) /************************************************************************* - * _REG relative to RSET_RNG - *************************************************************************/ - -#define RNG_CTRL 0x00 -#define RNG_EN (1 << 0) - -#define RNG_STAT 0x04 -#define RNG_AVAIL_MASK (0xff000000) - -#define RNG_DATA 0x08 -#define RNG_THRES 0x0c -#define RNG_MASK 0x10 - -/************************************************************************* * _REG relative to RSET_SPI *************************************************************************/ diff --git a/arch/powerpc/crypto/Makefile b/arch/powerpc/crypto/Makefile index 2926fb9c570a..9c221b69c181 100644 --- a/arch/powerpc/crypto/Makefile +++ b/arch/powerpc/crypto/Makefile @@ -4,6 +4,14 @@ # Arch-specific CryptoAPI modules. # +obj-$(CONFIG_CRYPTO_AES_PPC_SPE) += aes-ppc-spe.o +obj-$(CONFIG_CRYPTO_MD5_PPC) += md5-ppc.o obj-$(CONFIG_CRYPTO_SHA1_PPC) += sha1-powerpc.o +obj-$(CONFIG_CRYPTO_SHA1_PPC_SPE) += sha1-ppc-spe.o +obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o +aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o +md5-ppc-y := md5-asm.o md5-glue.o sha1-powerpc-y := sha1-powerpc-asm.o sha1.o +sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o +sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o diff --git a/arch/powerpc/crypto/aes-spe-core.S b/arch/powerpc/crypto/aes-spe-core.S new file mode 100644 index 000000000000..5dc6bce90a77 --- /dev/null +++ b/arch/powerpc/crypto/aes-spe-core.S @@ -0,0 +1,351 @@ +/* + * Fast AES implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/ppc_asm.h> +#include "aes-spe-regs.h" + +#define EAD(in, bpos) \ + rlwimi rT0,in,28-((bpos+3)%4)*8,20,27; + +#define DAD(in, bpos) \ + rlwimi rT1,in,24-((bpos+3)%4)*8,24,31; + +#define LWH(out, off) \ + evlwwsplat out,off(rT0); /* load word high */ + +#define LWL(out, off) \ + lwz out,off(rT0); /* load word low */ + +#define LBZ(out, tab, off) \ + lbz out,off(tab); /* load byte */ + +#define LAH(out, in, bpos, off) \ + EAD(in, bpos) /* calc addr + load word high */ \ + LWH(out, off) + +#define LAL(out, in, bpos, off) \ + EAD(in, bpos) /* calc addr + load word low */ \ + LWL(out, off) + +#define LAE(out, in, bpos) \ + EAD(in, bpos) /* calc addr + load enc byte */ \ + LBZ(out, rT0, 8) + +#define LBE(out) \ + LBZ(out, rT0, 8) /* load enc byte */ + +#define LAD(out, in, bpos) \ + DAD(in, bpos) /* calc addr + load dec byte */ \ + LBZ(out, rT1, 0) + +#define LBD(out) \ + LBZ(out, rT1, 0) + +/* + * ppc_encrypt_block: The central encryption function for a single 16 bytes + * block. It does no stack handling or register saving to support fast calls + * via bl/blr. It expects that caller has pre-xored input data with first + * 4 words of encryption key into rD0-rD3. Pointer/counter registers must + * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 + * and rW0-rW3 and caller must execute a final xor on the ouput registers. + * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. + * + */ +_GLOBAL(ppc_encrypt_block) + LAH(rW4, rD1, 2, 4) + LAH(rW6, rD0, 3, 0) + LAH(rW3, rD0, 1, 8) +ppc_encrypt_block_loop: + LAH(rW0, rD3, 0, 12) + LAL(rW0, rD0, 0, 12) + LAH(rW1, rD1, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAL(rW3, rD1, 1, 8) + LAL(rW4, rD2, 2, 4) + LAL(rW6, rD1, 3, 0) + LAH(rW5, rD3, 2, 4) + LAL(rW5, rD0, 2, 4) + LAH(rW7, rD2, 3, 0) + evldw rD1,16(rKP) + EAD(rD3, 3) + evxor rW2,rW2,rW4 + LWL(rW7, 0) + evxor rW2,rW2,rW6 + EAD(rD2, 0) + evxor rD1,rD1,rW2 + LWL(rW1, 12) + evxor rD1,rD1,rW0 + evldw rD3,24(rKP) + evmergehi rD0,rD0,rD1 + EAD(rD1, 2) + evxor rW3,rW3,rW5 + LWH(rW4, 4) + evxor rW3,rW3,rW7 + EAD(rD0, 3) + evxor rD3,rD3,rW3 + LWH(rW6, 0) + evxor rD3,rD3,rW1 + EAD(rD0, 1) + evmergehi rD2,rD2,rD3 + LWH(rW3, 8) + LAH(rW0, rD3, 0, 12) + LAL(rW0, rD0, 0, 12) + LAH(rW1, rD1, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAL(rW3, rD1, 1, 8) + LAL(rW4, rD2, 2, 4) + LAL(rW6, rD1, 3, 0) + LAH(rW5, rD3, 2, 4) + LAL(rW5, rD0, 2, 4) + LAH(rW7, rD2, 3, 0) + evldw rD1,32(rKP) + EAD(rD3, 3) + evxor rW2,rW2,rW4 + LWL(rW7, 0) + evxor rW2,rW2,rW6 + EAD(rD2, 0) + evxor rD1,rD1,rW2 + LWL(rW1, 12) + evxor rD1,rD1,rW0 + evldw rD3,40(rKP) + evmergehi rD0,rD0,rD1 + EAD(rD1, 2) + evxor rW3,rW3,rW5 + LWH(rW4, 4) + evxor rW3,rW3,rW7 + EAD(rD0, 3) + evxor rD3,rD3,rW3 + LWH(rW6, 0) + evxor rD3,rD3,rW1 + EAD(rD0, 1) + evmergehi rD2,rD2,rD3 + LWH(rW3, 8) + addi rKP,rKP,32 + bdnz ppc_encrypt_block_loop + LAH(rW0, rD3, 0, 12) + LAL(rW0, rD0, 0, 12) + LAH(rW1, rD1, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAL(rW3, rD1, 1, 8) + LAL(rW4, rD2, 2, 4) + LAH(rW5, rD3, 2, 4) + LAL(rW6, rD1, 3, 0) + LAL(rW5, rD0, 2, 4) + LAH(rW7, rD2, 3, 0) + evldw rD1,16(rKP) + EAD(rD3, 3) + evxor rW2,rW2,rW4 + LWL(rW7, 0) + evxor rW2,rW2,rW6 + EAD(rD2, 0) + evxor rD1,rD1,rW2 + LWL(rW1, 12) + evxor rD1,rD1,rW0 + evldw rD3,24(rKP) + evmergehi rD0,rD0,rD1 + EAD(rD1, 0) + evxor rW3,rW3,rW5 + LBE(rW2) + evxor rW3,rW3,rW7 + EAD(rD0, 1) + evxor rD3,rD3,rW3 + LBE(rW6) + evxor rD3,rD3,rW1 + EAD(rD0, 0) + evmergehi rD2,rD2,rD3 + LBE(rW1) + LAE(rW0, rD3, 0) + LAE(rW1, rD0, 0) + LAE(rW4, rD2, 1) + LAE(rW5, rD3, 1) + LAE(rW3, rD2, 0) + LAE(rW7, rD1, 1) + rlwimi rW0,rW4,8,16,23 + rlwimi rW1,rW5,8,16,23 + LAE(rW4, rD1, 2) + LAE(rW5, rD2, 2) + rlwimi rW2,rW6,8,16,23 + rlwimi rW3,rW7,8,16,23 + LAE(rW6, rD3, 2) + LAE(rW7, rD0, 2) + rlwimi rW0,rW4,16,8,15 + rlwimi rW1,rW5,16,8,15 + LAE(rW4, rD0, 3) + LAE(rW5, rD1, 3) + rlwimi rW2,rW6,16,8,15 + lwz rD0,32(rKP) + rlwimi rW3,rW7,16,8,15 + lwz rD1,36(rKP) + LAE(rW6, rD2, 3) + LAE(rW7, rD3, 3) + rlwimi rW0,rW4,24,0,7 + lwz rD2,40(rKP) + rlwimi rW1,rW5,24,0,7 + lwz rD3,44(rKP) + rlwimi rW2,rW6,24,0,7 + rlwimi rW3,rW7,24,0,7 + blr + +/* + * ppc_decrypt_block: The central decryption function for a single 16 bytes + * block. It does no stack handling or register saving to support fast calls + * via bl/blr. It expects that caller has pre-xored input data with first + * 4 words of encryption key into rD0-rD3. Pointer/counter registers must + * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3 + * and rW0-rW3 and caller must execute a final xor on the ouput registers. + * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing. + * + */ +_GLOBAL(ppc_decrypt_block) + LAH(rW0, rD1, 0, 12) + LAH(rW6, rD0, 3, 0) + LAH(rW3, rD0, 1, 8) +ppc_decrypt_block_loop: + LAH(rW1, rD3, 0, 12) + LAL(rW0, rD2, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAH(rW4, rD3, 2, 4) + LAL(rW4, rD0, 2, 4) + LAL(rW6, rD1, 3, 0) + LAH(rW5, rD1, 2, 4) + LAH(rW7, rD2, 3, 0) + LAL(rW7, rD3, 3, 0) + LAL(rW3, rD1, 1, 8) + evldw rD1,16(rKP) + EAD(rD0, 0) + evxor rW4,rW4,rW6 + LWL(rW1, 12) + evxor rW0,rW0,rW4 + EAD(rD2, 2) + evxor rW0,rW0,rW2 + LWL(rW5, 4) + evxor rD1,rD1,rW0 + evldw rD3,24(rKP) + evmergehi rD0,rD0,rD1 + EAD(rD1, 0) + evxor rW3,rW3,rW7 + LWH(rW0, 12) + evxor rW3,rW3,rW1 + EAD(rD0, 3) + evxor rD3,rD3,rW3 + LWH(rW6, 0) + evxor rD3,rD3,rW5 + EAD(rD0, 1) + evmergehi rD2,rD2,rD3 + LWH(rW3, 8) + LAH(rW1, rD3, 0, 12) + LAL(rW0, rD2, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAH(rW4, rD3, 2, 4) + LAL(rW4, rD0, 2, 4) + LAL(rW6, rD1, 3, 0) + LAH(rW5, rD1, 2, 4) + LAH(rW7, rD2, 3, 0) + LAL(rW7, rD3, 3, 0) + LAL(rW3, rD1, 1, 8) + evldw rD1,32(rKP) + EAD(rD0, 0) + evxor rW4,rW4,rW6 + LWL(rW1, 12) + evxor rW0,rW0,rW4 + EAD(rD2, 2) + evxor rW0,rW0,rW2 + LWL(rW5, 4) + evxor rD1,rD1,rW0 + evldw rD3,40(rKP) + evmergehi rD0,rD0,rD1 + EAD(rD1, 0) + evxor rW3,rW3,rW7 + LWH(rW0, 12) + evxor rW3,rW3,rW1 + EAD(rD0, 3) + evxor rD3,rD3,rW3 + LWH(rW6, 0) + evxor rD3,rD3,rW5 + EAD(rD0, 1) + evmergehi rD2,rD2,rD3 + LWH(rW3, 8) + addi rKP,rKP,32 + bdnz ppc_decrypt_block_loop + LAH(rW1, rD3, 0, 12) + LAL(rW0, rD2, 0, 12) + LAH(rW2, rD2, 1, 8) + LAL(rW2, rD3, 1, 8) + LAH(rW4, rD3, 2, 4) + LAL(rW4, rD0, 2, 4) + LAL(rW6, rD1, 3, 0) + LAH(rW5, rD1, 2, 4) + LAH(rW7, rD2, 3, 0) + LAL(rW7, rD3, 3, 0) + LAL(rW3, rD1, 1, 8) + evldw rD1,16(rKP) + EAD(rD0, 0) + evxor rW4,rW4,rW6 + LWL(rW1, 12) + evxor rW0,rW0,rW4 + EAD(rD2, 2) + evxor rW0,rW0,rW2 + LWL(rW5, 4) + evxor rD1,rD1,rW0 + evldw rD3,24(rKP) + evmergehi rD0,rD0,rD1 + DAD(rD1, 0) + evxor rW3,rW3,rW7 + LBD(rW0) + evxor rW3,rW3,rW1 + DAD(rD0, 1) + evxor rD3,rD3,rW3 + LBD(rW6) + evxor rD3,rD3,rW5 + DAD(rD0, 0) + evmergehi rD2,rD2,rD3 + LBD(rW3) + LAD(rW2, rD3, 0) + LAD(rW1, rD2, 0) + LAD(rW4, rD2, 1) + LAD(rW5, rD3, 1) + LAD(rW7, rD1, 1) + rlwimi rW0,rW4,8,16,23 + rlwimi rW1,rW5,8,16,23 + LAD(rW4, rD3, 2) + LAD(rW5, rD0, 2) + rlwimi rW2,rW6,8,16,23 + rlwimi rW3,rW7,8,16,23 + LAD(rW6, rD1, 2) + LAD(rW7, rD2, 2) + rlwimi rW0,rW4,16,8,15 + rlwimi rW1,rW5,16,8,15 + LAD(rW4, rD0, 3) + LAD(rW5, rD1, 3) + rlwimi rW2,rW6,16,8,15 + lwz rD0,32(rKP) + rlwimi rW3,rW7,16,8,15 + lwz rD1,36(rKP) + LAD(rW6, rD2, 3) + LAD(rW7, rD3, 3) + rlwimi rW0,rW4,24,0,7 + lwz rD2,40(rKP) + rlwimi rW1,rW5,24,0,7 + lwz rD3,44(rKP) + rlwimi rW2,rW6,24,0,7 + rlwimi rW3,rW7,24,0,7 + blr diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c new file mode 100644 index 000000000000..bd5e63f72ad4 --- /dev/null +++ b/arch/powerpc/crypto/aes-spe-glue.c @@ -0,0 +1,512 @@ +/* + * Glue code for AES implementation for SPE instructions (PPC) + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/aes.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/crypto.h> +#include <asm/byteorder.h> +#include <asm/switch_to.h> +#include <crypto/algapi.h> + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). e500 cores can issue two + * instructions per clock cycle using one 32/64 bit unit (SU1) and one 32 + * bit unit (SU2). One of these can be a memory access that is executed via + * a single load and store unit (LSU). XTS-AES-256 takes ~780 operations per + * 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data + * will need an estimated maximum of 20,000 cycles. Headroom for cache misses + * included. Even with the low end model clocked at 667 MHz this equals to a + * critical time window of less than 30us. The value has been choosen to + * process a 512 byte disk block in one or a large 1400 bytes IPsec network + * packet in two runs. + * + */ +#define MAX_BYTES 768 + +struct ppc_aes_ctx { + u32 key_enc[AES_MAX_KEYLENGTH_U32]; + u32 key_dec[AES_MAX_KEYLENGTH_U32]; + u32 rounds; +}; + +struct ppc_xts_ctx { + u32 key_enc[AES_MAX_KEYLENGTH_U32]; + u32 key_dec[AES_MAX_KEYLENGTH_U32]; + u32 key_twk[AES_MAX_KEYLENGTH_U32]; + u32 rounds; +}; + +extern void ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc, u32 rounds); +extern void ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec, u32 rounds); +extern void ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc, u32 rounds, + u32 bytes); +extern void ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec, u32 rounds, + u32 bytes); +extern void ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc, u32 rounds, + u32 bytes, u8 *iv); +extern void ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec, u32 rounds, + u32 bytes, u8 *iv); +extern void ppc_crypt_ctr (u8 *out, const u8 *in, u32 *key_enc, u32 rounds, + u32 bytes, u8 *iv); +extern void ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc, u32 rounds, + u32 bytes, u8 *iv, u32 *key_twk); +extern void ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec, u32 rounds, + u32 bytes, u8 *iv, u32 *key_twk); + +extern void ppc_expand_key_128(u32 *key_enc, const u8 *key); +extern void ppc_expand_key_192(u32 *key_enc, const u8 *key); +extern void ppc_expand_key_256(u32 *key_enc, const u8 *key); + +extern void ppc_generate_decrypt_key(u32 *key_dec,u32 *key_enc, + unsigned int key_len); + +static void spe_begin(void) +{ + /* disable preemption and save users SPE registers if required */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + /* reenable preemption */ + preempt_enable(); +} + +static int ppc_aes_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + if (key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + switch (key_len) { + case AES_KEYSIZE_128: + ctx->rounds = 4; + ppc_expand_key_128(ctx->key_enc, in_key); + break; + case AES_KEYSIZE_192: + ctx->rounds = 5; + ppc_expand_key_192(ctx->key_enc, in_key); + break; + case AES_KEYSIZE_256: + ctx->rounds = 6; + ppc_expand_key_256(ctx->key_enc, in_key); + break; + } + + ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len); + + return 0; +} + +static int ppc_xts_setkey(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct ppc_xts_ctx *ctx = crypto_tfm_ctx(tfm); + + key_len >>= 1; + + if (key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) { + tfm->crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + switch (key_len) { + case AES_KEYSIZE_128: + ctx->rounds = 4; + ppc_expand_key_128(ctx->key_enc, in_key); + ppc_expand_key_128(ctx->key_twk, in_key + AES_KEYSIZE_128); + break; + case AES_KEYSIZE_192: + ctx->rounds = 5; + ppc_expand_key_192(ctx->key_enc, in_key); + ppc_expand_key_192(ctx->key_twk, in_key + AES_KEYSIZE_192); + break; + case AES_KEYSIZE_256: + ctx->rounds = 6; + ppc_expand_key_256(ctx->key_enc, in_key); + ppc_expand_key_256(ctx->key_twk, in_key + AES_KEYSIZE_256); + break; + } + + ppc_generate_decrypt_key(ctx->key_dec, ctx->key_enc, key_len); + + return 0; +} + +static void ppc_aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + spe_begin(); + ppc_encrypt_aes(out, in, ctx->key_enc, ctx->rounds); + spe_end(); +} + +static void ppc_aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) +{ + struct ppc_aes_ctx *ctx = crypto_tfm_ctx(tfm); + + spe_begin(); + ppc_decrypt_aes(out, in, ctx->key_dec, ctx->rounds); + spe_end(); +} + +static int ppc_ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_encrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes); + spe_end(); + + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_decrypt_ecb(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes); + spe_end(); + + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_encrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes, walk.iv); + spe_end(); + + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_decrypt_cbc(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes, walk.iv); + spe_end(); + + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int pbytes, ubytes; + int err; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + + while ((pbytes = walk.nbytes)) { + pbytes = pbytes > MAX_BYTES ? MAX_BYTES : pbytes; + pbytes = pbytes == nbytes ? + nbytes : pbytes & ~(AES_BLOCK_SIZE - 1); + ubytes = walk.nbytes - pbytes; + + spe_begin(); + ppc_crypt_ctr(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, pbytes , walk.iv); + spe_end(); + + nbytes -= pbytes; + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + u32 *twk; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + twk = ctx->key_twk; + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_encrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_enc, ctx->rounds, nbytes, walk.iv, twk); + spe_end(); + + twk = NULL; + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +static int ppc_xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct ppc_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + unsigned int ubytes; + int err; + u32 *twk; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + twk = ctx->key_twk; + + while ((nbytes = walk.nbytes)) { + ubytes = nbytes > MAX_BYTES ? + nbytes - MAX_BYTES : nbytes & (AES_BLOCK_SIZE - 1); + nbytes -= ubytes; + + spe_begin(); + ppc_decrypt_xts(walk.dst.virt.addr, walk.src.virt.addr, + ctx->key_dec, ctx->rounds, nbytes, walk.iv, twk); + spe_end(); + + twk = NULL; + err = blkcipher_walk_done(desc, &walk, ubytes); + } + + return err; +} + +/* + * Algorithm definitions. Disabling alignment (cra_alignmask=0) was chosen + * because the e500 platform can handle unaligned reads/writes very efficently. + * This improves IPsec thoughput by another few percent. Additionally we assume + * that AES context is always aligned to at least 8 bytes because it is created + * with kmalloc() in the crypto infrastructure + * + */ +static struct crypto_alg aes_algs[] = { { + .cra_name = "aes", + .cra_driver_name = "aes-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ppc_aes_ctx), + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = ppc_aes_setkey, + .cia_encrypt = ppc_aes_encrypt, + .cia_decrypt = ppc_aes_decrypt + } + } +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ppc_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_aes_setkey, + .encrypt = ppc_ecb_encrypt, + .decrypt = ppc_ecb_decrypt, + } + } +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ppc_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_aes_setkey, + .encrypt = ppc_cbc_encrypt, + .decrypt = ppc_cbc_decrypt, + } + } +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct ppc_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_aes_setkey, + .encrypt = ppc_ctr_crypt, + .decrypt = ppc_ctr_crypt, + } + } +}, { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ppc_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE * 2, + .max_keysize = AES_MAX_KEY_SIZE * 2, + .ivsize = AES_BLOCK_SIZE, + .setkey = ppc_xts_setkey, + .encrypt = ppc_xts_encrypt, + .decrypt = ppc_xts_decrypt, + } + } +} }; + +static int __init ppc_aes_mod_init(void) +{ + return crypto_register_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +static void __exit ppc_aes_mod_fini(void) +{ + crypto_unregister_algs(aes_algs, ARRAY_SIZE(aes_algs)); +} + +module_init(ppc_aes_mod_init); +module_exit(ppc_aes_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS, SPE optimized"); + +MODULE_ALIAS_CRYPTO("aes"); +MODULE_ALIAS_CRYPTO("ecb(aes)"); +MODULE_ALIAS_CRYPTO("cbc(aes)"); +MODULE_ALIAS_CRYPTO("ctr(aes)"); +MODULE_ALIAS_CRYPTO("xts(aes)"); +MODULE_ALIAS_CRYPTO("aes-ppc-spe"); diff --git a/arch/powerpc/crypto/aes-spe-keys.S b/arch/powerpc/crypto/aes-spe-keys.S new file mode 100644 index 000000000000..be8090f3d700 --- /dev/null +++ b/arch/powerpc/crypto/aes-spe-keys.S @@ -0,0 +1,283 @@ +/* + * Key handling functions for PPC AES implementation + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define LOAD_KEY(d, s, off) \ + lwz d,off(s); +#else +#define LOAD_KEY(d, s, off) \ + li r0,off; \ + lwbrx d,s,r0; +#endif + +#define INITIALIZE_KEY \ + stwu r1,-32(r1); /* create stack frame */ \ + stw r14,8(r1); /* save registers */ \ + stw r15,12(r1); \ + stw r16,16(r1); + +#define FINALIZE_KEY \ + lwz r14,8(r1); /* restore registers */ \ + lwz r15,12(r1); \ + lwz r16,16(r1); \ + xor r5,r5,r5; /* clear sensitive data */ \ + xor r6,r6,r6; \ + xor r7,r7,r7; \ + xor r8,r8,r8; \ + xor r9,r9,r9; \ + xor r10,r10,r10; \ + xor r11,r11,r11; \ + xor r12,r12,r12; \ + addi r1,r1,32; /* cleanup stack */ + +#define LS_BOX(r, t1, t2) \ + lis t2,PPC_AES_4K_ENCTAB@h; \ + ori t2,t2,PPC_AES_4K_ENCTAB@l; \ + rlwimi t2,r,4,20,27; \ + lbz t1,8(t2); \ + rlwimi r,t1,0,24,31; \ + rlwimi t2,r,28,20,27; \ + lbz t1,8(t2); \ + rlwimi r,t1,8,16,23; \ + rlwimi t2,r,20,20,27; \ + lbz t1,8(t2); \ + rlwimi r,t1,16,8,15; \ + rlwimi t2,r,12,20,27; \ + lbz t1,8(t2); \ + rlwimi r,t1,24,0,7; + +#define GF8_MUL(out, in, t1, t2) \ + lis t1,0x8080; /* multiplication in GF8 */ \ + ori t1,t1,0x8080; \ + and t1,t1,in; \ + srwi t1,t1,7; \ + mulli t1,t1,0x1b; \ + lis t2,0x7f7f; \ + ori t2,t2,0x7f7f; \ + and t2,t2,in; \ + slwi t2,t2,1; \ + xor out,t1,t2; + +/* + * ppc_expand_key_128(u32 *key_enc, const u8 *key) + * + * Expand 128 bit key into 176 bytes encryption key. It consists of + * key itself plus 10 rounds with 16 bytes each + * + */ +_GLOBAL(ppc_expand_key_128) + INITIALIZE_KEY + LOAD_KEY(r5,r4,0) + LOAD_KEY(r6,r4,4) + LOAD_KEY(r7,r4,8) + LOAD_KEY(r8,r4,12) + stw r5,0(r3) /* key[0..3] = input data */ + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + li r16,10 /* 10 expansion rounds */ + lis r0,0x0100 /* RCO(1) */ +ppc_expand_128_loop: + addi r3,r3,16 + mr r14,r8 /* apply LS_BOX to 4th temp */ + rotlwi r14,r14,8 + LS_BOX(r14, r15, r4) + xor r14,r14,r0 + xor r5,r5,r14 /* xor next 4 keys */ + xor r6,r6,r5 + xor r7,r7,r6 + xor r8,r8,r7 + stw r5,0(r3) /* store next 4 keys */ + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + GF8_MUL(r0, r0, r4, r14) /* multiply RCO by 2 in GF */ + subi r16,r16,1 + cmpwi r16,0 + bt eq,ppc_expand_128_end + b ppc_expand_128_loop +ppc_expand_128_end: + FINALIZE_KEY + blr + +/* + * ppc_expand_key_192(u32 *key_enc, const u8 *key) + * + * Expand 192 bit key into 208 bytes encryption key. It consists of key + * itself plus 12 rounds with 16 bytes each + * + */ +_GLOBAL(ppc_expand_key_192) + INITIALIZE_KEY + LOAD_KEY(r5,r4,0) + LOAD_KEY(r6,r4,4) + LOAD_KEY(r7,r4,8) + LOAD_KEY(r8,r4,12) + LOAD_KEY(r9,r4,16) + LOAD_KEY(r10,r4,20) + stw r5,0(r3) + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + stw r9,16(r3) + stw r10,20(r3) + li r16,8 /* 8 expansion rounds */ + lis r0,0x0100 /* RCO(1) */ +ppc_expand_192_loop: + addi r3,r3,24 + mr r14,r10 /* apply LS_BOX to 6th temp */ + rotlwi r14,r14,8 + LS_BOX(r14, r15, r4) + xor r14,r14,r0 + xor r5,r5,r14 /* xor next 6 keys */ + xor r6,r6,r5 + xor r7,r7,r6 + xor r8,r8,r7 + xor r9,r9,r8 + xor r10,r10,r9 + stw r5,0(r3) + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + subi r16,r16,1 + cmpwi r16,0 /* last round early kick out */ + bt eq,ppc_expand_192_end + stw r9,16(r3) + stw r10,20(r3) + GF8_MUL(r0, r0, r4, r14) /* multiply RCO GF8 */ + b ppc_expand_192_loop +ppc_expand_192_end: + FINALIZE_KEY + blr + +/* + * ppc_expand_key_256(u32 *key_enc, const u8 *key) + * + * Expand 256 bit key into 240 bytes encryption key. It consists of key + * itself plus 14 rounds with 16 bytes each + * + */ +_GLOBAL(ppc_expand_key_256) + INITIALIZE_KEY + LOAD_KEY(r5,r4,0) + LOAD_KEY(r6,r4,4) + LOAD_KEY(r7,r4,8) + LOAD_KEY(r8,r4,12) + LOAD_KEY(r9,r4,16) + LOAD_KEY(r10,r4,20) + LOAD_KEY(r11,r4,24) + LOAD_KEY(r12,r4,28) + stw r5,0(r3) + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + stw r9,16(r3) + stw r10,20(r3) + stw r11,24(r3) + stw r12,28(r3) + li r16,7 /* 7 expansion rounds */ + lis r0,0x0100 /* RCO(1) */ +ppc_expand_256_loop: + addi r3,r3,32 + mr r14,r12 /* apply LS_BOX to 8th temp */ + rotlwi r14,r14,8 + LS_BOX(r14, r15, r4) + xor r14,r14,r0 + xor r5,r5,r14 /* xor 4 keys */ + xor r6,r6,r5 + xor r7,r7,r6 + xor r8,r8,r7 + mr r14,r8 + LS_BOX(r14, r15, r4) /* apply LS_BOX to 4th temp */ + xor r9,r9,r14 /* xor 4 keys */ + xor r10,r10,r9 + xor r11,r11,r10 + xor r12,r12,r11 + stw r5,0(r3) + stw r6,4(r3) + stw r7,8(r3) + stw r8,12(r3) + subi r16,r16,1 + cmpwi r16,0 /* last round early kick out */ + bt eq,ppc_expand_256_end + stw r9,16(r3) + stw r10,20(r3) + stw r11,24(r3) + stw r12,28(r3) + GF8_MUL(r0, r0, r4, r14) + b ppc_expand_256_loop +ppc_expand_256_end: + FINALIZE_KEY + blr + +/* + * ppc_generate_decrypt_key: derive decryption key from encryption key + * number of bytes to handle are calculated from length of key (16/24/32) + * + */ +_GLOBAL(ppc_generate_decrypt_key) + addi r6,r5,24 + slwi r6,r6,2 + lwzx r7,r4,r6 /* first/last 4 words are same */ + stw r7,0(r3) + lwz r7,0(r4) + stwx r7,r3,r6 + addi r6,r6,4 + lwzx r7,r4,r6 + stw r7,4(r3) + lwz r7,4(r4) + stwx r7,r3,r6 + addi r6,r6,4 + lwzx r7,r4,r6 + stw r7,8(r3) + lwz r7,8(r4) + stwx r7,r3,r6 + addi r6,r6,4 + lwzx r7,r4,r6 + stw r7,12(r3) + lwz r7,12(r4) + stwx r7,r3,r6 + addi r3,r3,16 + add r4,r4,r6 + subi r4,r4,28 + addi r5,r5,20 + srwi r5,r5,2 +ppc_generate_decrypt_block: + li r6,4 + mtctr r6 +ppc_generate_decrypt_word: + lwz r6,0(r4) + GF8_MUL(r7, r6, r0, r7) + GF8_MUL(r8, r7, r0, r8) + GF8_MUL(r9, r8, r0, r9) + xor r10,r9,r6 + xor r11,r7,r8 + xor r11,r11,r9 + xor r12,r7,r10 + rotrwi r12,r12,24 + xor r11,r11,r12 + xor r12,r8,r10 + rotrwi r12,r12,16 + xor r11,r11,r12 + rotrwi r12,r10,8 + xor r11,r11,r12 + stw r11,0(r3) + addi r3,r3,4 + addi r4,r4,4 + bdnz ppc_generate_decrypt_word + subi r4,r4,32 + subi r5,r5,1 + cmpwi r5,0 + bt gt,ppc_generate_decrypt_block + blr diff --git a/arch/powerpc/crypto/aes-spe-modes.S b/arch/powerpc/crypto/aes-spe-modes.S new file mode 100644 index 000000000000..ad48032ca8e0 --- /dev/null +++ b/arch/powerpc/crypto/aes-spe-modes.S @@ -0,0 +1,630 @@ +/* + * AES modes (ECB/CBC/CTR/XTS) for PPC AES implementation + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/ppc_asm.h> +#include "aes-spe-regs.h" + +#ifdef __BIG_ENDIAN__ /* Macros for big endian builds */ + +#define LOAD_DATA(reg, off) \ + lwz reg,off(rSP); /* load with offset */ +#define SAVE_DATA(reg, off) \ + stw reg,off(rDP); /* save with offset */ +#define NEXT_BLOCK \ + addi rSP,rSP,16; /* increment pointers per bloc */ \ + addi rDP,rDP,16; +#define LOAD_IV(reg, off) \ + lwz reg,off(rIP); /* IV loading with offset */ +#define SAVE_IV(reg, off) \ + stw reg,off(rIP); /* IV saving with offset */ +#define START_IV /* nothing to reset */ +#define CBC_DEC 16 /* CBC decrement per block */ +#define CTR_DEC 1 /* CTR decrement one byte */ + +#else /* Macros for little endian */ + +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rSP; /* load reversed */ \ + addi rSP,rSP,4; /* and increment pointer */ +#define SAVE_DATA(reg, off) \ + stwbrx reg,0,rDP; /* save reversed */ \ + addi rDP,rDP,4; /* and increment pointer */ +#define NEXT_BLOCK /* nothing todo */ +#define LOAD_IV(reg, off) \ + lwbrx reg,0,rIP; /* load reversed */ \ + addi rIP,rIP,4; /* and increment pointer */ +#define SAVE_IV(reg, off) \ + stwbrx reg,0,rIP; /* load reversed */ \ + addi rIP,rIP,4; /* and increment pointer */ +#define START_IV \ + subi rIP,rIP,16; /* must reset pointer */ +#define CBC_DEC 32 /* 2 blocks because of incs */ +#define CTR_DEC 17 /* 1 block because of incs */ + +#endif + +#define SAVE_0_REGS +#define LOAD_0_REGS + +#define SAVE_4_REGS \ + stw rI0,96(r1); /* save 32 bit registers */ \ + stw rI1,100(r1); \ + stw rI2,104(r1); \ + stw rI3,108(r1); + +#define LOAD_4_REGS \ + lwz rI0,96(r1); /* restore 32 bit registers */ \ + lwz rI1,100(r1); \ + lwz rI2,104(r1); \ + lwz rI3,108(r1); + +#define SAVE_8_REGS \ + SAVE_4_REGS \ + stw rG0,112(r1); /* save 32 bit registers */ \ + stw rG1,116(r1); \ + stw rG2,120(r1); \ + stw rG3,124(r1); + +#define LOAD_8_REGS \ + LOAD_4_REGS \ + lwz rG0,112(r1); /* restore 32 bit registers */ \ + lwz rG1,116(r1); \ + lwz rG2,120(r1); \ + lwz rG3,124(r1); + +#define INITIALIZE_CRYPT(tab,nr32bitregs) \ + mflr r0; \ + stwu r1,-160(r1); /* create stack frame */ \ + lis rT0,tab@h; /* en-/decryption table pointer */ \ + stw r0,8(r1); /* save link register */ \ + ori rT0,rT0,tab@l; \ + evstdw r14,16(r1); \ + mr rKS,rKP; \ + evstdw r15,24(r1); /* We must save non volatile */ \ + evstdw r16,32(r1); /* registers. Take the chance */ \ + evstdw r17,40(r1); /* and save the SPE part too */ \ + evstdw r18,48(r1); \ + evstdw r19,56(r1); \ + evstdw r20,64(r1); \ + evstdw r21,72(r1); \ + evstdw r22,80(r1); \ + evstdw r23,88(r1); \ + SAVE_##nr32bitregs##_REGS + +#define FINALIZE_CRYPT(nr32bitregs) \ + lwz r0,8(r1); \ + evldw r14,16(r1); /* restore SPE registers */ \ + evldw r15,24(r1); \ + evldw r16,32(r1); \ + evldw r17,40(r1); \ + evldw r18,48(r1); \ + evldw r19,56(r1); \ + evldw r20,64(r1); \ + evldw r21,72(r1); \ + evldw r22,80(r1); \ + evldw r23,88(r1); \ + LOAD_##nr32bitregs##_REGS \ + mtlr r0; /* restore link register */ \ + xor r0,r0,r0; \ + stw r0,16(r1); /* delete sensitive data */ \ + stw r0,24(r1); /* that we might have pushed */ \ + stw r0,32(r1); /* from other context that runs */ \ + stw r0,40(r1); /* the same code */ \ + stw r0,48(r1); \ + stw r0,56(r1); \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + stw r0,88(r1); \ + addi r1,r1,160; /* cleanup stack frame */ + +#define ENDIAN_SWAP(t0, t1, s0, s1) \ + rotrwi t0,s0,8; /* swap endianness for 2 GPRs */ \ + rotrwi t1,s1,8; \ + rlwimi t0,s0,8,8,15; \ + rlwimi t1,s1,8,8,15; \ + rlwimi t0,s0,8,24,31; \ + rlwimi t1,s1,8,24,31; + +#define GF128_MUL(d0, d1, d2, d3, t0) \ + li t0,0x87; /* multiplication in GF128 */ \ + cmpwi d3,-1; \ + iselgt t0,0,t0; \ + rlwimi d3,d2,0,0,0; /* propagate "carry" bits */ \ + rotlwi d3,d3,1; \ + rlwimi d2,d1,0,0,0; \ + rotlwi d2,d2,1; \ + rlwimi d1,d0,0,0,0; \ + slwi d0,d0,1; /* shift left 128 bit */ \ + rotlwi d1,d1,1; \ + xor d0,d0,t0; + +#define START_KEY(d0, d1, d2, d3) \ + lwz rW0,0(rKP); \ + mtctr rRR; \ + lwz rW1,4(rKP); \ + lwz rW2,8(rKP); \ + lwz rW3,12(rKP); \ + xor rD0,d0,rW0; \ + xor rD1,d1,rW1; \ + xor rD2,d2,rW2; \ + xor rD3,d3,rW3; + +/* + * ppc_encrypt_aes(u8 *out, const u8 *in, u32 *key_enc, + * u32 rounds) + * + * called from glue layer to encrypt a single 16 byte block + * round values are AES128 = 4, AES192 = 5, AES256 = 6 + * + */ +_GLOBAL(ppc_encrypt_aes) + INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0) + LOAD_DATA(rD0, 0) + LOAD_DATA(rD1, 4) + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_encrypt_block + xor rD0,rD0,rW0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rW1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rW2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rW3 + SAVE_DATA(rD3, 12) + FINALIZE_CRYPT(0) + blr + +/* + * ppc_decrypt_aes(u8 *out, const u8 *in, u32 *key_dec, + * u32 rounds) + * + * called from glue layer to decrypt a single 16 byte block + * round values are AES128 = 4, AES192 = 5, AES256 = 6 + * + */ +_GLOBAL(ppc_decrypt_aes) + INITIALIZE_CRYPT(PPC_AES_4K_DECTAB,0) + LOAD_DATA(rD0, 0) + addi rT1,rT0,4096 + LOAD_DATA(rD1, 4) + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_decrypt_block + xor rD0,rD0,rW0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rW1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rW2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rW3 + SAVE_DATA(rD3, 12) + FINALIZE_CRYPT(0) + blr + +/* + * ppc_encrypt_ecb(u8 *out, const u8 *in, u32 *key_enc, + * u32 rounds, u32 bytes); + * + * called from glue layer to encrypt multiple blocks via ECB + * Bytes must be larger or equal 16 and only whole blocks are + * processed. round values are AES128 = 4, AES192 = 5 and + * AES256 = 6 + * + */ +_GLOBAL(ppc_encrypt_ecb) + INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 0) +ppc_encrypt_ecb_loop: + LOAD_DATA(rD0, 0) + mr rKP,rKS + LOAD_DATA(rD1, 4) + subi rLN,rLN,16 + LOAD_DATA(rD2, 8) + cmpwi rLN,15 + LOAD_DATA(rD3, 12) + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_encrypt_block + xor rD0,rD0,rW0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rW1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rW2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rW3 + SAVE_DATA(rD3, 12) + NEXT_BLOCK + bt gt,ppc_encrypt_ecb_loop + FINALIZE_CRYPT(0) + blr + +/* + * ppc_decrypt_ecb(u8 *out, const u8 *in, u32 *key_dec, + * u32 rounds, u32 bytes); + * + * called from glue layer to decrypt multiple blocks via ECB + * Bytes must be larger or equal 16 and only whole blocks are + * processed. round values are AES128 = 4, AES192 = 5 and + * AES256 = 6 + * + */ +_GLOBAL(ppc_decrypt_ecb) + INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 0) + addi rT1,rT0,4096 +ppc_decrypt_ecb_loop: + LOAD_DATA(rD0, 0) + mr rKP,rKS + LOAD_DATA(rD1, 4) + subi rLN,rLN,16 + LOAD_DATA(rD2, 8) + cmpwi rLN,15 + LOAD_DATA(rD3, 12) + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_decrypt_block + xor rD0,rD0,rW0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rW1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rW2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rW3 + SAVE_DATA(rD3, 12) + NEXT_BLOCK + bt gt,ppc_decrypt_ecb_loop + FINALIZE_CRYPT(0) + blr + +/* + * ppc_encrypt_cbc(u8 *out, const u8 *in, u32 *key_enc, + * 32 rounds, u32 bytes, u8 *iv); + * + * called from glue layer to encrypt multiple blocks via CBC + * Bytes must be larger or equal 16 and only whole blocks are + * processed. round values are AES128 = 4, AES192 = 5 and + * AES256 = 6 + * + */ +_GLOBAL(ppc_encrypt_cbc) + INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4) + LOAD_IV(rI0, 0) + LOAD_IV(rI1, 4) + LOAD_IV(rI2, 8) + LOAD_IV(rI3, 12) +ppc_encrypt_cbc_loop: + LOAD_DATA(rD0, 0) + mr rKP,rKS + LOAD_DATA(rD1, 4) + subi rLN,rLN,16 + LOAD_DATA(rD2, 8) + cmpwi rLN,15 + LOAD_DATA(rD3, 12) + xor rD0,rD0,rI0 + xor rD1,rD1,rI1 + xor rD2,rD2,rI2 + xor rD3,rD3,rI3 + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_encrypt_block + xor rI0,rD0,rW0 + SAVE_DATA(rI0, 0) + xor rI1,rD1,rW1 + SAVE_DATA(rI1, 4) + xor rI2,rD2,rW2 + SAVE_DATA(rI2, 8) + xor rI3,rD3,rW3 + SAVE_DATA(rI3, 12) + NEXT_BLOCK + bt gt,ppc_encrypt_cbc_loop + START_IV + SAVE_IV(rI0, 0) + SAVE_IV(rI1, 4) + SAVE_IV(rI2, 8) + SAVE_IV(rI3, 12) + FINALIZE_CRYPT(4) + blr + +/* + * ppc_decrypt_cbc(u8 *out, const u8 *in, u32 *key_dec, + * u32 rounds, u32 bytes, u8 *iv); + * + * called from glue layer to decrypt multiple blocks via CBC + * round values are AES128 = 4, AES192 = 5, AES256 = 6 + * + */ +_GLOBAL(ppc_decrypt_cbc) + INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 4) + li rT1,15 + LOAD_IV(rI0, 0) + andc rLN,rLN,rT1 + LOAD_IV(rI1, 4) + subi rLN,rLN,16 + LOAD_IV(rI2, 8) + add rSP,rSP,rLN /* reverse processing */ + LOAD_IV(rI3, 12) + add rDP,rDP,rLN + LOAD_DATA(rD0, 0) + addi rT1,rT0,4096 + LOAD_DATA(rD1, 4) + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + START_IV + SAVE_IV(rD0, 0) + SAVE_IV(rD1, 4) + SAVE_IV(rD2, 8) + cmpwi rLN,16 + SAVE_IV(rD3, 12) + bt lt,ppc_decrypt_cbc_end +ppc_decrypt_cbc_loop: + mr rKP,rKS + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_decrypt_block + subi rLN,rLN,16 + subi rSP,rSP,CBC_DEC + xor rW0,rD0,rW0 + LOAD_DATA(rD0, 0) + xor rW1,rD1,rW1 + LOAD_DATA(rD1, 4) + xor rW2,rD2,rW2 + LOAD_DATA(rD2, 8) + xor rW3,rD3,rW3 + LOAD_DATA(rD3, 12) + xor rW0,rW0,rD0 + SAVE_DATA(rW0, 0) + xor rW1,rW1,rD1 + SAVE_DATA(rW1, 4) + xor rW2,rW2,rD2 + SAVE_DATA(rW2, 8) + xor rW3,rW3,rD3 + SAVE_DATA(rW3, 12) + cmpwi rLN,15 + subi rDP,rDP,CBC_DEC + bt gt,ppc_decrypt_cbc_loop +ppc_decrypt_cbc_end: + mr rKP,rKS + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_decrypt_block + xor rW0,rW0,rD0 + xor rW1,rW1,rD1 + xor rW2,rW2,rD2 + xor rW3,rW3,rD3 + xor rW0,rW0,rI0 /* decrypt with initial IV */ + SAVE_DATA(rW0, 0) + xor rW1,rW1,rI1 + SAVE_DATA(rW1, 4) + xor rW2,rW2,rI2 + SAVE_DATA(rW2, 8) + xor rW3,rW3,rI3 + SAVE_DATA(rW3, 12) + FINALIZE_CRYPT(4) + blr + +/* + * ppc_crypt_ctr(u8 *out, const u8 *in, u32 *key_enc, + * u32 rounds, u32 bytes, u8 *iv); + * + * called from glue layer to encrypt/decrypt multiple blocks + * via CTR. Number of bytes does not need to be a multiple of + * 16. Round values are AES128 = 4, AES192 = 5, AES256 = 6 + * + */ +_GLOBAL(ppc_crypt_ctr) + INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 4) + LOAD_IV(rI0, 0) + LOAD_IV(rI1, 4) + LOAD_IV(rI2, 8) + cmpwi rLN,16 + LOAD_IV(rI3, 12) + START_IV + bt lt,ppc_crypt_ctr_partial +ppc_crypt_ctr_loop: + mr rKP,rKS + START_KEY(rI0, rI1, rI2, rI3) + bl ppc_encrypt_block + xor rW0,rD0,rW0 + xor rW1,rD1,rW1 + xor rW2,rD2,rW2 + xor rW3,rD3,rW3 + LOAD_DATA(rD0, 0) + subi rLN,rLN,16 + LOAD_DATA(rD1, 4) + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + xor rD0,rD0,rW0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rW1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rW2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rW3 + SAVE_DATA(rD3, 12) + addic rI3,rI3,1 /* increase counter */ + addze rI2,rI2 + addze rI1,rI1 + addze rI0,rI0 + NEXT_BLOCK + cmpwi rLN,15 + bt gt,ppc_crypt_ctr_loop +ppc_crypt_ctr_partial: + cmpwi rLN,0 + bt eq,ppc_crypt_ctr_end + mr rKP,rKS + START_KEY(rI0, rI1, rI2, rI3) + bl ppc_encrypt_block + xor rW0,rD0,rW0 + SAVE_IV(rW0, 0) + xor rW1,rD1,rW1 + SAVE_IV(rW1, 4) + xor rW2,rD2,rW2 + SAVE_IV(rW2, 8) + xor rW3,rD3,rW3 + SAVE_IV(rW3, 12) + mtctr rLN + subi rIP,rIP,CTR_DEC + subi rSP,rSP,1 + subi rDP,rDP,1 +ppc_crypt_ctr_xorbyte: + lbzu rW4,1(rIP) /* bytewise xor for partial block */ + lbzu rW5,1(rSP) + xor rW4,rW4,rW5 + stbu rW4,1(rDP) + bdnz ppc_crypt_ctr_xorbyte + subf rIP,rLN,rIP + addi rIP,rIP,1 + addic rI3,rI3,1 + addze rI2,rI2 + addze rI1,rI1 + addze rI0,rI0 +ppc_crypt_ctr_end: + SAVE_IV(rI0, 0) + SAVE_IV(rI1, 4) + SAVE_IV(rI2, 8) + SAVE_IV(rI3, 12) + FINALIZE_CRYPT(4) + blr + +/* + * ppc_encrypt_xts(u8 *out, const u8 *in, u32 *key_enc, + * u32 rounds, u32 bytes, u8 *iv, u32 *key_twk); + * + * called from glue layer to encrypt multiple blocks via XTS + * If key_twk is given, the initial IV encryption will be + * processed too. Round values are AES128 = 4, AES192 = 5, + * AES256 = 6 + * + */ +_GLOBAL(ppc_encrypt_xts) + INITIALIZE_CRYPT(PPC_AES_4K_ENCTAB, 8) + LOAD_IV(rI0, 0) + LOAD_IV(rI1, 4) + LOAD_IV(rI2, 8) + cmpwi rKT,0 + LOAD_IV(rI3, 12) + bt eq,ppc_encrypt_xts_notweak + mr rKP,rKT + START_KEY(rI0, rI1, rI2, rI3) + bl ppc_encrypt_block + xor rI0,rD0,rW0 + xor rI1,rD1,rW1 + xor rI2,rD2,rW2 + xor rI3,rD3,rW3 +ppc_encrypt_xts_notweak: + ENDIAN_SWAP(rG0, rG1, rI0, rI1) + ENDIAN_SWAP(rG2, rG3, rI2, rI3) +ppc_encrypt_xts_loop: + LOAD_DATA(rD0, 0) + mr rKP,rKS + LOAD_DATA(rD1, 4) + subi rLN,rLN,16 + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + xor rD0,rD0,rI0 + xor rD1,rD1,rI1 + xor rD2,rD2,rI2 + xor rD3,rD3,rI3 + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_encrypt_block + xor rD0,rD0,rW0 + xor rD1,rD1,rW1 + xor rD2,rD2,rW2 + xor rD3,rD3,rW3 + xor rD0,rD0,rI0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rI1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rI2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rI3 + SAVE_DATA(rD3, 12) + GF128_MUL(rG0, rG1, rG2, rG3, rW0) + ENDIAN_SWAP(rI0, rI1, rG0, rG1) + ENDIAN_SWAP(rI2, rI3, rG2, rG3) + cmpwi rLN,0 + NEXT_BLOCK + bt gt,ppc_encrypt_xts_loop + START_IV + SAVE_IV(rI0, 0) + SAVE_IV(rI1, 4) + SAVE_IV(rI2, 8) + SAVE_IV(rI3, 12) + FINALIZE_CRYPT(8) + blr + +/* + * ppc_decrypt_xts(u8 *out, const u8 *in, u32 *key_dec, + * u32 rounds, u32 blocks, u8 *iv, u32 *key_twk); + * + * called from glue layer to decrypt multiple blocks via XTS + * If key_twk is given, the initial IV encryption will be + * processed too. Round values are AES128 = 4, AES192 = 5, + * AES256 = 6 + * + */ +_GLOBAL(ppc_decrypt_xts) + INITIALIZE_CRYPT(PPC_AES_4K_DECTAB, 8) + LOAD_IV(rI0, 0) + addi rT1,rT0,4096 + LOAD_IV(rI1, 4) + LOAD_IV(rI2, 8) + cmpwi rKT,0 + LOAD_IV(rI3, 12) + bt eq,ppc_decrypt_xts_notweak + subi rT0,rT0,4096 + mr rKP,rKT + START_KEY(rI0, rI1, rI2, rI3) + bl ppc_encrypt_block + xor rI0,rD0,rW0 + xor rI1,rD1,rW1 + xor rI2,rD2,rW2 + xor rI3,rD3,rW3 + addi rT0,rT0,4096 +ppc_decrypt_xts_notweak: + ENDIAN_SWAP(rG0, rG1, rI0, rI1) + ENDIAN_SWAP(rG2, rG3, rI2, rI3) +ppc_decrypt_xts_loop: + LOAD_DATA(rD0, 0) + mr rKP,rKS + LOAD_DATA(rD1, 4) + subi rLN,rLN,16 + LOAD_DATA(rD2, 8) + LOAD_DATA(rD3, 12) + xor rD0,rD0,rI0 + xor rD1,rD1,rI1 + xor rD2,rD2,rI2 + xor rD3,rD3,rI3 + START_KEY(rD0, rD1, rD2, rD3) + bl ppc_decrypt_block + xor rD0,rD0,rW0 + xor rD1,rD1,rW1 + xor rD2,rD2,rW2 + xor rD3,rD3,rW3 + xor rD0,rD0,rI0 + SAVE_DATA(rD0, 0) + xor rD1,rD1,rI1 + SAVE_DATA(rD1, 4) + xor rD2,rD2,rI2 + SAVE_DATA(rD2, 8) + xor rD3,rD3,rI3 + SAVE_DATA(rD3, 12) + GF128_MUL(rG0, rG1, rG2, rG3, rW0) + ENDIAN_SWAP(rI0, rI1, rG0, rG1) + ENDIAN_SWAP(rI2, rI3, rG2, rG3) + cmpwi rLN,0 + NEXT_BLOCK + bt gt,ppc_decrypt_xts_loop + START_IV + SAVE_IV(rI0, 0) + SAVE_IV(rI1, 4) + SAVE_IV(rI2, 8) + SAVE_IV(rI3, 12) + FINALIZE_CRYPT(8) + blr diff --git a/arch/powerpc/crypto/aes-spe-regs.h b/arch/powerpc/crypto/aes-spe-regs.h new file mode 100644 index 000000000000..30d217b399c3 --- /dev/null +++ b/arch/powerpc/crypto/aes-spe-regs.h @@ -0,0 +1,42 @@ +/* + * Common registers for PPC AES implementation + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#define rKS r0 /* copy of en-/decryption key pointer */ +#define rDP r3 /* destination pointer */ +#define rSP r4 /* source pointer */ +#define rKP r5 /* pointer to en-/decryption key pointer */ +#define rRR r6 /* en-/decryption rounds */ +#define rLN r7 /* length of data to be processed */ +#define rIP r8 /* potiner to IV (CBC/CTR/XTS modes) */ +#define rKT r9 /* pointer to tweak key (XTS mode) */ +#define rT0 r11 /* pointers to en-/decrpytion tables */ +#define rT1 r10 +#define rD0 r9 /* data */ +#define rD1 r14 +#define rD2 r12 +#define rD3 r15 +#define rW0 r16 /* working registers */ +#define rW1 r17 +#define rW2 r18 +#define rW3 r19 +#define rW4 r20 +#define rW5 r21 +#define rW6 r22 +#define rW7 r23 +#define rI0 r24 /* IV */ +#define rI1 r25 +#define rI2 r26 +#define rI3 r27 +#define rG0 r28 /* endian reversed tweak (XTS mode) */ +#define rG1 r29 +#define rG2 r30 +#define rG3 r31 diff --git a/arch/powerpc/crypto/aes-tab-4k.S b/arch/powerpc/crypto/aes-tab-4k.S new file mode 100644 index 000000000000..701e60240dc3 --- /dev/null +++ b/arch/powerpc/crypto/aes-tab-4k.S @@ -0,0 +1,331 @@ +/* + * 4K AES tables for PPC AES implementation + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +/* + * These big endian AES encryption/decryption tables have been taken from + * crypto/aes_generic.c and are designed to be simply accessed by a combination + * of rlwimi/lwz instructions with a minimum of table registers (usually only + * one required). Thus they are aligned to 4K. The locality of rotated values + * is derived from the reduced offsets that are available in the SPE load + * instructions. E.g. evldw, evlwwsplat, ... + * + * For the safety-conscious it has to be noted that they might be vulnerable + * to cache timing attacks because of their size. Nevertheless in contrast to + * the generic tables they have been reduced from 16KB to 8KB + 256 bytes. + * This is a quite good tradeoff for low power devices (e.g. routers) without + * dedicated encryption hardware where we usually have no multiuser + * environment. + * + */ + +#define R(a, b, c, d) \ + 0x##a##b##c##d, 0x##d##a##b##c, 0x##c##d##a##b, 0x##b##c##d##a + +.data +.align 12 +.globl PPC_AES_4K_ENCTAB +PPC_AES_4K_ENCTAB: +/* encryption table, same as crypto_ft_tab in crypto/aes-generic.c */ + .long R(c6, 63, 63, a5), R(f8, 7c, 7c, 84) + .long R(ee, 77, 77, 99), R(f6, 7b, 7b, 8d) + .long R(ff, f2, f2, 0d), R(d6, 6b, 6b, bd) + .long R(de, 6f, 6f, b1), R(91, c5, c5, 54) + .long R(60, 30, 30, 50), R(02, 01, 01, 03) + .long R(ce, 67, 67, a9), R(56, 2b, 2b, 7d) + .long R(e7, fe, fe, 19), R(b5, d7, d7, 62) + .long R(4d, ab, ab, e6), R(ec, 76, 76, 9a) + .long R(8f, ca, ca, 45), R(1f, 82, 82, 9d) + .long R(89, c9, c9, 40), R(fa, 7d, 7d, 87) + .long R(ef, fa, fa, 15), R(b2, 59, 59, eb) + .long R(8e, 47, 47, c9), R(fb, f0, f0, 0b) + .long R(41, ad, ad, ec), R(b3, d4, d4, 67) + .long R(5f, a2, a2, fd), R(45, af, af, ea) + .long R(23, 9c, 9c, bf), R(53, a4, a4, f7) + .long R(e4, 72, 72, 96), R(9b, c0, c0, 5b) + .long R(75, b7, b7, c2), R(e1, fd, fd, 1c) + .long R(3d, 93, 93, ae), R(4c, 26, 26, 6a) + .long R(6c, 36, 36, 5a), R(7e, 3f, 3f, 41) + .long R(f5, f7, f7, 02), R(83, cc, cc, 4f) + .long R(68, 34, 34, 5c), R(51, a5, a5, f4) + .long R(d1, e5, e5, 34), R(f9, f1, f1, 08) + .long R(e2, 71, 71, 93), R(ab, d8, d8, 73) + .long R(62, 31, 31, 53), R(2a, 15, 15, 3f) + .long R(08, 04, 04, 0c), R(95, c7, c7, 52) + .long R(46, 23, 23, 65), R(9d, c3, c3, 5e) + .long R(30, 18, 18, 28), R(37, 96, 96, a1) + .long R(0a, 05, 05, 0f), R(2f, 9a, 9a, b5) + .long R(0e, 07, 07, 09), R(24, 12, 12, 36) + .long R(1b, 80, 80, 9b), R(df, e2, e2, 3d) + .long R(cd, eb, eb, 26), R(4e, 27, 27, 69) + .long R(7f, b2, b2, cd), R(ea, 75, 75, 9f) + .long R(12, 09, 09, 1b), R(1d, 83, 83, 9e) + .long R(58, 2c, 2c, 74), R(34, 1a, 1a, 2e) + .long R(36, 1b, 1b, 2d), R(dc, 6e, 6e, b2) + .long R(b4, 5a, 5a, ee), R(5b, a0, a0, fb) + .long R(a4, 52, 52, f6), R(76, 3b, 3b, 4d) + .long R(b7, d6, d6, 61), R(7d, b3, b3, ce) + .long R(52, 29, 29, 7b), R(dd, e3, e3, 3e) + .long R(5e, 2f, 2f, 71), R(13, 84, 84, 97) + .long R(a6, 53, 53, f5), R(b9, d1, d1, 68) + .long R(00, 00, 00, 00), R(c1, ed, ed, 2c) + .long R(40, 20, 20, 60), R(e3, fc, fc, 1f) + .long R(79, b1, b1, c8), R(b6, 5b, 5b, ed) + .long R(d4, 6a, 6a, be), R(8d, cb, cb, 46) + .long R(67, be, be, d9), R(72, 39, 39, 4b) + .long R(94, 4a, 4a, de), R(98, 4c, 4c, d4) + .long R(b0, 58, 58, e8), R(85, cf, cf, 4a) + .long R(bb, d0, d0, 6b), R(c5, ef, ef, 2a) + .long R(4f, aa, aa, e5), R(ed, fb, fb, 16) + .long R(86, 43, 43, c5), R(9a, 4d, 4d, d7) + .long R(66, 33, 33, 55), R(11, 85, 85, 94) + .long R(8a, 45, 45, cf), R(e9, f9, f9, 10) + .long R(04, 02, 02, 06), R(fe, 7f, 7f, 81) + .long R(a0, 50, 50, f0), R(78, 3c, 3c, 44) + .long R(25, 9f, 9f, ba), R(4b, a8, a8, e3) + .long R(a2, 51, 51, f3), R(5d, a3, a3, fe) + .long R(80, 40, 40, c0), R(05, 8f, 8f, 8a) + .long R(3f, 92, 92, ad), R(21, 9d, 9d, bc) + .long R(70, 38, 38, 48), R(f1, f5, f5, 04) + .long R(63, bc, bc, df), R(77, b6, b6, c1) + .long R(af, da, da, 75), R(42, 21, 21, 63) + .long R(20, 10, 10, 30), R(e5, ff, ff, 1a) + .long R(fd, f3, f3, 0e), R(bf, d2, d2, 6d) + .long R(81, cd, cd, 4c), R(18, 0c, 0c, 14) + .long R(26, 13, 13, 35), R(c3, ec, ec, 2f) + .long R(be, 5f, 5f, e1), R(35, 97, 97, a2) + .long R(88, 44, 44, cc), R(2e, 17, 17, 39) + .long R(93, c4, c4, 57), R(55, a7, a7, f2) + .long R(fc, 7e, 7e, 82), R(7a, 3d, 3d, 47) + .long R(c8, 64, 64, ac), R(ba, 5d, 5d, e7) + .long R(32, 19, 19, 2b), R(e6, 73, 73, 95) + .long R(c0, 60, 60, a0), R(19, 81, 81, 98) + .long R(9e, 4f, 4f, d1), R(a3, dc, dc, 7f) + .long R(44, 22, 22, 66), R(54, 2a, 2a, 7e) + .long R(3b, 90, 90, ab), R(0b, 88, 88, 83) + .long R(8c, 46, 46, ca), R(c7, ee, ee, 29) + .long R(6b, b8, b8, d3), R(28, 14, 14, 3c) + .long R(a7, de, de, 79), R(bc, 5e, 5e, e2) + .long R(16, 0b, 0b, 1d), R(ad, db, db, 76) + .long R(db, e0, e0, 3b), R(64, 32, 32, 56) + .long R(74, 3a, 3a, 4e), R(14, 0a, 0a, 1e) + .long R(92, 49, 49, db), R(0c, 06, 06, 0a) + .long R(48, 24, 24, 6c), R(b8, 5c, 5c, e4) + .long R(9f, c2, c2, 5d), R(bd, d3, d3, 6e) + .long R(43, ac, ac, ef), R(c4, 62, 62, a6) + .long R(39, 91, 91, a8), R(31, 95, 95, a4) + .long R(d3, e4, e4, 37), R(f2, 79, 79, 8b) + .long R(d5, e7, e7, 32), R(8b, c8, c8, 43) + .long R(6e, 37, 37, 59), R(da, 6d, 6d, b7) + .long R(01, 8d, 8d, 8c), R(b1, d5, d5, 64) + .long R(9c, 4e, 4e, d2), R(49, a9, a9, e0) + .long R(d8, 6c, 6c, b4), R(ac, 56, 56, fa) + .long R(f3, f4, f4, 07), R(cf, ea, ea, 25) + .long R(ca, 65, 65, af), R(f4, 7a, 7a, 8e) + .long R(47, ae, ae, e9), R(10, 08, 08, 18) + .long R(6f, ba, ba, d5), R(f0, 78, 78, 88) + .long R(4a, 25, 25, 6f), R(5c, 2e, 2e, 72) + .long R(38, 1c, 1c, 24), R(57, a6, a6, f1) + .long R(73, b4, b4, c7), R(97, c6, c6, 51) + .long R(cb, e8, e8, 23), R(a1, dd, dd, 7c) + .long R(e8, 74, 74, 9c), R(3e, 1f, 1f, 21) + .long R(96, 4b, 4b, dd), R(61, bd, bd, dc) + .long R(0d, 8b, 8b, 86), R(0f, 8a, 8a, 85) + .long R(e0, 70, 70, 90), R(7c, 3e, 3e, 42) + .long R(71, b5, b5, c4), R(cc, 66, 66, aa) + .long R(90, 48, 48, d8), R(06, 03, 03, 05) + .long R(f7, f6, f6, 01), R(1c, 0e, 0e, 12) + .long R(c2, 61, 61, a3), R(6a, 35, 35, 5f) + .long R(ae, 57, 57, f9), R(69, b9, b9, d0) + .long R(17, 86, 86, 91), R(99, c1, c1, 58) + .long R(3a, 1d, 1d, 27), R(27, 9e, 9e, b9) + .long R(d9, e1, e1, 38), R(eb, f8, f8, 13) + .long R(2b, 98, 98, b3), R(22, 11, 11, 33) + .long R(d2, 69, 69, bb), R(a9, d9, d9, 70) + .long R(07, 8e, 8e, 89), R(33, 94, 94, a7) + .long R(2d, 9b, 9b, b6), R(3c, 1e, 1e, 22) + .long R(15, 87, 87, 92), R(c9, e9, e9, 20) + .long R(87, ce, ce, 49), R(aa, 55, 55, ff) + .long R(50, 28, 28, 78), R(a5, df, df, 7a) + .long R(03, 8c, 8c, 8f), R(59, a1, a1, f8) + .long R(09, 89, 89, 80), R(1a, 0d, 0d, 17) + .long R(65, bf, bf, da), R(d7, e6, e6, 31) + .long R(84, 42, 42, c6), R(d0, 68, 68, b8) + .long R(82, 41, 41, c3), R(29, 99, 99, b0) + .long R(5a, 2d, 2d, 77), R(1e, 0f, 0f, 11) + .long R(7b, b0, b0, cb), R(a8, 54, 54, fc) + .long R(6d, bb, bb, d6), R(2c, 16, 16, 3a) +.globl PPC_AES_4K_DECTAB +PPC_AES_4K_DECTAB: +/* decryption table, same as crypto_it_tab in crypto/aes-generic.c */ + .long R(51, f4, a7, 50), R(7e, 41, 65, 53) + .long R(1a, 17, a4, c3), R(3a, 27, 5e, 96) + .long R(3b, ab, 6b, cb), R(1f, 9d, 45, f1) + .long R(ac, fa, 58, ab), R(4b, e3, 03, 93) + .long R(20, 30, fa, 55), R(ad, 76, 6d, f6) + .long R(88, cc, 76, 91), R(f5, 02, 4c, 25) + .long R(4f, e5, d7, fc), R(c5, 2a, cb, d7) + .long R(26, 35, 44, 80), R(b5, 62, a3, 8f) + .long R(de, b1, 5a, 49), R(25, ba, 1b, 67) + .long R(45, ea, 0e, 98), R(5d, fe, c0, e1) + .long R(c3, 2f, 75, 02), R(81, 4c, f0, 12) + .long R(8d, 46, 97, a3), R(6b, d3, f9, c6) + .long R(03, 8f, 5f, e7), R(15, 92, 9c, 95) + .long R(bf, 6d, 7a, eb), R(95, 52, 59, da) + .long R(d4, be, 83, 2d), R(58, 74, 21, d3) + .long R(49, e0, 69, 29), R(8e, c9, c8, 44) + .long R(75, c2, 89, 6a), R(f4, 8e, 79, 78) + .long R(99, 58, 3e, 6b), R(27, b9, 71, dd) + .long R(be, e1, 4f, b6), R(f0, 88, ad, 17) + .long R(c9, 20, ac, 66), R(7d, ce, 3a, b4) + .long R(63, df, 4a, 18), R(e5, 1a, 31, 82) + .long R(97, 51, 33, 60), R(62, 53, 7f, 45) + .long R(b1, 64, 77, e0), R(bb, 6b, ae, 84) + .long R(fe, 81, a0, 1c), R(f9, 08, 2b, 94) + .long R(70, 48, 68, 58), R(8f, 45, fd, 19) + .long R(94, de, 6c, 87), R(52, 7b, f8, b7) + .long R(ab, 73, d3, 23), R(72, 4b, 02, e2) + .long R(e3, 1f, 8f, 57), R(66, 55, ab, 2a) + .long R(b2, eb, 28, 07), R(2f, b5, c2, 03) + .long R(86, c5, 7b, 9a), R(d3, 37, 08, a5) + .long R(30, 28, 87, f2), R(23, bf, a5, b2) + .long R(02, 03, 6a, ba), R(ed, 16, 82, 5c) + .long R(8a, cf, 1c, 2b), R(a7, 79, b4, 92) + .long R(f3, 07, f2, f0), R(4e, 69, e2, a1) + .long R(65, da, f4, cd), R(06, 05, be, d5) + .long R(d1, 34, 62, 1f), R(c4, a6, fe, 8a) + .long R(34, 2e, 53, 9d), R(a2, f3, 55, a0) + .long R(05, 8a, e1, 32), R(a4, f6, eb, 75) + .long R(0b, 83, ec, 39), R(40, 60, ef, aa) + .long R(5e, 71, 9f, 06), R(bd, 6e, 10, 51) + .long R(3e, 21, 8a, f9), R(96, dd, 06, 3d) + .long R(dd, 3e, 05, ae), R(4d, e6, bd, 46) + .long R(91, 54, 8d, b5), R(71, c4, 5d, 05) + .long R(04, 06, d4, 6f), R(60, 50, 15, ff) + .long R(19, 98, fb, 24), R(d6, bd, e9, 97) + .long R(89, 40, 43, cc), R(67, d9, 9e, 77) + .long R(b0, e8, 42, bd), R(07, 89, 8b, 88) + .long R(e7, 19, 5b, 38), R(79, c8, ee, db) + .long R(a1, 7c, 0a, 47), R(7c, 42, 0f, e9) + .long R(f8, 84, 1e, c9), R(00, 00, 00, 00) + .long R(09, 80, 86, 83), R(32, 2b, ed, 48) + .long R(1e, 11, 70, ac), R(6c, 5a, 72, 4e) + .long R(fd, 0e, ff, fb), R(0f, 85, 38, 56) + .long R(3d, ae, d5, 1e), R(36, 2d, 39, 27) + .long R(0a, 0f, d9, 64), R(68, 5c, a6, 21) + .long R(9b, 5b, 54, d1), R(24, 36, 2e, 3a) + .long R(0c, 0a, 67, b1), R(93, 57, e7, 0f) + .long R(b4, ee, 96, d2), R(1b, 9b, 91, 9e) + .long R(80, c0, c5, 4f), R(61, dc, 20, a2) + .long R(5a, 77, 4b, 69), R(1c, 12, 1a, 16) + .long R(e2, 93, ba, 0a), R(c0, a0, 2a, e5) + .long R(3c, 22, e0, 43), R(12, 1b, 17, 1d) + .long R(0e, 09, 0d, 0b), R(f2, 8b, c7, ad) + .long R(2d, b6, a8, b9), R(14, 1e, a9, c8) + .long R(57, f1, 19, 85), R(af, 75, 07, 4c) + .long R(ee, 99, dd, bb), R(a3, 7f, 60, fd) + .long R(f7, 01, 26, 9f), R(5c, 72, f5, bc) + .long R(44, 66, 3b, c5), R(5b, fb, 7e, 34) + .long R(8b, 43, 29, 76), R(cb, 23, c6, dc) + .long R(b6, ed, fc, 68), R(b8, e4, f1, 63) + .long R(d7, 31, dc, ca), R(42, 63, 85, 10) + .long R(13, 97, 22, 40), R(84, c6, 11, 20) + .long R(85, 4a, 24, 7d), R(d2, bb, 3d, f8) + .long R(ae, f9, 32, 11), R(c7, 29, a1, 6d) + .long R(1d, 9e, 2f, 4b), R(dc, b2, 30, f3) + .long R(0d, 86, 52, ec), R(77, c1, e3, d0) + .long R(2b, b3, 16, 6c), R(a9, 70, b9, 99) + .long R(11, 94, 48, fa), R(47, e9, 64, 22) + .long R(a8, fc, 8c, c4), R(a0, f0, 3f, 1a) + .long R(56, 7d, 2c, d8), R(22, 33, 90, ef) + .long R(87, 49, 4e, c7), R(d9, 38, d1, c1) + .long R(8c, ca, a2, fe), R(98, d4, 0b, 36) + .long R(a6, f5, 81, cf), R(a5, 7a, de, 28) + .long R(da, b7, 8e, 26), R(3f, ad, bf, a4) + .long R(2c, 3a, 9d, e4), R(50, 78, 92, 0d) + .long R(6a, 5f, cc, 9b), R(54, 7e, 46, 62) + .long R(f6, 8d, 13, c2), R(90, d8, b8, e8) + .long R(2e, 39, f7, 5e), R(82, c3, af, f5) + .long R(9f, 5d, 80, be), R(69, d0, 93, 7c) + .long R(6f, d5, 2d, a9), R(cf, 25, 12, b3) + .long R(c8, ac, 99, 3b), R(10, 18, 7d, a7) + .long R(e8, 9c, 63, 6e), R(db, 3b, bb, 7b) + .long R(cd, 26, 78, 09), R(6e, 59, 18, f4) + .long R(ec, 9a, b7, 01), R(83, 4f, 9a, a8) + .long R(e6, 95, 6e, 65), R(aa, ff, e6, 7e) + .long R(21, bc, cf, 08), R(ef, 15, e8, e6) + .long R(ba, e7, 9b, d9), R(4a, 6f, 36, ce) + .long R(ea, 9f, 09, d4), R(29, b0, 7c, d6) + .long R(31, a4, b2, af), R(2a, 3f, 23, 31) + .long R(c6, a5, 94, 30), R(35, a2, 66, c0) + .long R(74, 4e, bc, 37), R(fc, 82, ca, a6) + .long R(e0, 90, d0, b0), R(33, a7, d8, 15) + .long R(f1, 04, 98, 4a), R(41, ec, da, f7) + .long R(7f, cd, 50, 0e), R(17, 91, f6, 2f) + .long R(76, 4d, d6, 8d), R(43, ef, b0, 4d) + .long R(cc, aa, 4d, 54), R(e4, 96, 04, df) + .long R(9e, d1, b5, e3), R(4c, 6a, 88, 1b) + .long R(c1, 2c, 1f, b8), R(46, 65, 51, 7f) + .long R(9d, 5e, ea, 04), R(01, 8c, 35, 5d) + .long R(fa, 87, 74, 73), R(fb, 0b, 41, 2e) + .long R(b3, 67, 1d, 5a), R(92, db, d2, 52) + .long R(e9, 10, 56, 33), R(6d, d6, 47, 13) + .long R(9a, d7, 61, 8c), R(37, a1, 0c, 7a) + .long R(59, f8, 14, 8e), R(eb, 13, 3c, 89) + .long R(ce, a9, 27, ee), R(b7, 61, c9, 35) + .long R(e1, 1c, e5, ed), R(7a, 47, b1, 3c) + .long R(9c, d2, df, 59), R(55, f2, 73, 3f) + .long R(18, 14, ce, 79), R(73, c7, 37, bf) + .long R(53, f7, cd, ea), R(5f, fd, aa, 5b) + .long R(df, 3d, 6f, 14), R(78, 44, db, 86) + .long R(ca, af, f3, 81), R(b9, 68, c4, 3e) + .long R(38, 24, 34, 2c), R(c2, a3, 40, 5f) + .long R(16, 1d, c3, 72), R(bc, e2, 25, 0c) + .long R(28, 3c, 49, 8b), R(ff, 0d, 95, 41) + .long R(39, a8, 01, 71), R(08, 0c, b3, de) + .long R(d8, b4, e4, 9c), R(64, 56, c1, 90) + .long R(7b, cb, 84, 61), R(d5, 32, b6, 70) + .long R(48, 6c, 5c, 74), R(d0, b8, 57, 42) +.globl PPC_AES_4K_DECTAB2 +PPC_AES_4K_DECTAB2: +/* decryption table, same as crypto_il_tab in crypto/aes-generic.c */ + .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d diff --git a/arch/powerpc/crypto/md5-asm.S b/arch/powerpc/crypto/md5-asm.S new file mode 100644 index 000000000000..10cdf5bceebb --- /dev/null +++ b/arch/powerpc/crypto/md5-asm.S @@ -0,0 +1,243 @@ +/* + * Fast MD5 implementation for PPC + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 +#define rWP r4 + +#define rH0 r0 +#define rH1 r6 +#define rH2 r7 +#define rH3 r5 + +#define rW00 r8 +#define rW01 r9 +#define rW02 r10 +#define rW03 r11 +#define rW04 r12 +#define rW05 r14 +#define rW06 r15 +#define rW07 r16 +#define rW08 r17 +#define rW09 r18 +#define rW10 r19 +#define rW11 r20 +#define rW12 r21 +#define rW13 r22 +#define rW14 r23 +#define rW15 r24 + +#define rT0 r25 +#define rT1 r26 + +#define INITIALIZE \ + PPC_STLU r1,-INT_FRAME_SIZE(r1); \ + SAVE_8GPRS(14, r1); /* push registers onto stack */ \ + SAVE_4GPRS(22, r1); \ + SAVE_GPR(26, r1) + +#define FINALIZE \ + REST_8GPRS(14, r1); /* pop registers from stack */ \ + REST_4GPRS(22, r1); \ + REST_GPR(26, r1); \ + addi r1,r1,INT_FRAME_SIZE; + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ +#define INC_PTR \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#else +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define INC_PTR /* nothing to do */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#endif + +#define R_00_15(a, b, c, d, w0, w1, p, q, off, k0h, k0l, k1h, k1l) \ + LOAD_DATA(w0, off) /* W */ \ + and rT0,b,c; /* 1: f = b and c */ \ + INC_PTR /* ptr++ */ \ + andc rT1,d,b; /* 1: f' = ~b and d */ \ + LOAD_DATA(w1, off+4) /* W */ \ + or rT0,rT0,rT1; /* 1: f = f or f' */ \ + addi w0,w0,k0l; /* 1: wk = w + k */ \ + add a,a,rT0; /* 1: a = a + f */ \ + addis w0,w0,k0h; /* 1: wk = w + k' */ \ + addis w1,w1,k1h; /* 2: wk = w + k */ \ + add a,a,w0; /* 1: a = a + wk */ \ + addi w1,w1,k1l; /* 2: wk = w + k' */ \ + rotrwi a,a,p; /* 1: a = a rotl x */ \ + add d,d,w1; /* 2: a = a + wk */ \ + add a,a,b; /* 1: a = a + b */ \ + and rT0,a,b; /* 2: f = b and c */ \ + andc rT1,c,a; /* 2: f' = ~b and d */ \ + or rT0,rT0,rT1; /* 2: f = f or f' */ \ + add d,d,rT0; /* 2: a = a + f */ \ + INC_PTR /* ptr++ */ \ + rotrwi d,d,q; /* 2: a = a rotl x */ \ + add d,d,a; /* 2: a = a + b */ + +#define R_16_31(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \ + andc rT0,c,d; /* 1: f = c and ~d */ \ + and rT1,b,d; /* 1: f' = b and d */ \ + addi w0,w0,k0l; /* 1: wk = w + k */ \ + or rT0,rT0,rT1; /* 1: f = f or f' */ \ + addis w0,w0,k0h; /* 1: wk = w + k' */ \ + add a,a,rT0; /* 1: a = a + f */ \ + addi w1,w1,k1l; /* 2: wk = w + k */ \ + add a,a,w0; /* 1: a = a + wk */ \ + addis w1,w1,k1h; /* 2: wk = w + k' */ \ + andc rT0,b,c; /* 2: f = c and ~d */ \ + rotrwi a,a,p; /* 1: a = a rotl x */ \ + add a,a,b; /* 1: a = a + b */ \ + add d,d,w1; /* 2: a = a + wk */ \ + and rT1,a,c; /* 2: f' = b and d */ \ + or rT0,rT0,rT1; /* 2: f = f or f' */ \ + add d,d,rT0; /* 2: a = a + f */ \ + rotrwi d,d,q; /* 2: a = a rotl x */ \ + add d,d,a; /* 2: a = a +b */ + +#define R_32_47(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \ + xor rT0,b,c; /* 1: f' = b xor c */ \ + addi w0,w0,k0l; /* 1: wk = w + k */ \ + xor rT1,rT0,d; /* 1: f = f xor f' */ \ + addis w0,w0,k0h; /* 1: wk = w + k' */ \ + add a,a,rT1; /* 1: a = a + f */ \ + addi w1,w1,k1l; /* 2: wk = w + k */ \ + add a,a,w0; /* 1: a = a + wk */ \ + addis w1,w1,k1h; /* 2: wk = w + k' */ \ + rotrwi a,a,p; /* 1: a = a rotl x */ \ + add d,d,w1; /* 2: a = a + wk */ \ + add a,a,b; /* 1: a = a + b */ \ + xor rT1,rT0,a; /* 2: f = b xor f' */ \ + add d,d,rT1; /* 2: a = a + f */ \ + rotrwi d,d,q; /* 2: a = a rotl x */ \ + add d,d,a; /* 2: a = a + b */ + +#define R_48_63(a, b, c, d, w0, w1, p, q, k0h, k0l, k1h, k1l) \ + addi w0,w0,k0l; /* 1: w = w + k */ \ + orc rT0,b,d; /* 1: f = b or ~d */ \ + addis w0,w0,k0h; /* 1: w = w + k' */ \ + xor rT0,rT0,c; /* 1: f = f xor c */ \ + add a,a,w0; /* 1: a = a + wk */ \ + addi w1,w1,k1l; /* 2: w = w + k */ \ + add a,a,rT0; /* 1: a = a + f */ \ + addis w1,w1,k1h; /* 2: w = w + k' */ \ + rotrwi a,a,p; /* 1: a = a rotl x */ \ + add a,a,b; /* 1: a = a + b */ \ + orc rT0,a,c; /* 2: f = b or ~d */ \ + add d,d,w1; /* 2: a = a + wk */ \ + xor rT0,rT0,b; /* 2: f = f xor c */ \ + add d,d,rT0; /* 2: a = a + f */ \ + rotrwi d,d,q; /* 2: a = a rotl x */ \ + add d,d,a; /* 2: a = a + b */ + +_GLOBAL(ppc_md5_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + +ppc_md5_main: + R_00_15(rH0, rH1, rH2, rH3, rW00, rW01, 25, 20, 0, + 0xd76b, -23432, 0xe8c8, -18602) + R_00_15(rH2, rH3, rH0, rH1, rW02, rW03, 15, 10, 8, + 0x2420, 0x70db, 0xc1be, -12562) + R_00_15(rH0, rH1, rH2, rH3, rW04, rW05, 25, 20, 16, + 0xf57c, 0x0faf, 0x4788, -14806) + R_00_15(rH2, rH3, rH0, rH1, rW06, rW07, 15, 10, 24, + 0xa830, 0x4613, 0xfd47, -27391) + R_00_15(rH0, rH1, rH2, rH3, rW08, rW09, 25, 20, 32, + 0x6981, -26408, 0x8b45, -2129) + R_00_15(rH2, rH3, rH0, rH1, rW10, rW11, 15, 10, 40, + 0xffff, 0x5bb1, 0x895d, -10306) + R_00_15(rH0, rH1, rH2, rH3, rW12, rW13, 25, 20, 48, + 0x6b90, 0x1122, 0xfd98, 0x7193) + R_00_15(rH2, rH3, rH0, rH1, rW14, rW15, 15, 10, 56, + 0xa679, 0x438e, 0x49b4, 0x0821) + + R_16_31(rH0, rH1, rH2, rH3, rW01, rW06, 27, 23, + 0x0d56, 0x6e0c, 0x1810, 0x6d2d) + R_16_31(rH2, rH3, rH0, rH1, rW11, rW00, 18, 12, + 0x9d02, -32109, 0x124c, 0x2332) + R_16_31(rH0, rH1, rH2, rH3, rW05, rW10, 27, 23, + 0x8ea7, 0x4a33, 0x0245, -18270) + R_16_31(rH2, rH3, rH0, rH1, rW15, rW04, 18, 12, + 0x8eee, -8608, 0xf258, -5095) + R_16_31(rH0, rH1, rH2, rH3, rW09, rW14, 27, 23, + 0x969d, -10697, 0x1cbe, -15288) + R_16_31(rH2, rH3, rH0, rH1, rW03, rW08, 18, 12, + 0x3317, 0x3e99, 0xdbd9, 0x7c15) + R_16_31(rH0, rH1, rH2, rH3, rW13, rW02, 27, 23, + 0xac4b, 0x7772, 0xd8cf, 0x331d) + R_16_31(rH2, rH3, rH0, rH1, rW07, rW12, 18, 12, + 0x6a28, 0x6dd8, 0x219a, 0x3b68) + + R_32_47(rH0, rH1, rH2, rH3, rW05, rW08, 28, 21, + 0x29cb, 0x28e5, 0x4218, -7788) + R_32_47(rH2, rH3, rH0, rH1, rW11, rW14, 16, 9, + 0x473f, 0x06d1, 0x3aae, 0x3036) + R_32_47(rH0, rH1, rH2, rH3, rW01, rW04, 28, 21, + 0xaea1, -15134, 0x640b, -11295) + R_32_47(rH2, rH3, rH0, rH1, rW07, rW10, 16, 9, + 0x8f4c, 0x4887, 0xbc7c, -22499) + R_32_47(rH0, rH1, rH2, rH3, rW13, rW00, 28, 21, + 0x7eb8, -27199, 0x00ea, 0x6050) + R_32_47(rH2, rH3, rH0, rH1, rW03, rW06, 16, 9, + 0xe01a, 0x22fe, 0x4447, 0x69c5) + R_32_47(rH0, rH1, rH2, rH3, rW09, rW12, 28, 21, + 0xb7f3, 0x0253, 0x59b1, 0x4d5b) + R_32_47(rH2, rH3, rH0, rH1, rW15, rW02, 16, 9, + 0x4701, -27017, 0xc7bd, -19859) + + R_48_63(rH0, rH1, rH2, rH3, rW00, rW07, 26, 22, + 0x0988, -1462, 0x4c70, -19401) + R_48_63(rH2, rH3, rH0, rH1, rW14, rW05, 17, 11, + 0xadaf, -5221, 0xfc99, 0x66f7) + R_48_63(rH0, rH1, rH2, rH3, rW12, rW03, 26, 22, + 0x7e80, -16418, 0xba1e, -25587) + R_48_63(rH2, rH3, rH0, rH1, rW10, rW01, 17, 11, + 0x4130, 0x380d, 0xe0c5, 0x738d) + lwz rW00,0(rHP) + R_48_63(rH0, rH1, rH2, rH3, rW08, rW15, 26, 22, + 0xe837, -30770, 0xde8a, 0x69e8) + lwz rW14,4(rHP) + R_48_63(rH2, rH3, rH0, rH1, rW06, rW13, 17, 11, + 0x9e79, 0x260f, 0x256d, -27941) + lwz rW12,8(rHP) + R_48_63(rH0, rH1, rH2, rH3, rW04, rW11, 26, 22, + 0xab75, -20775, 0x4f9e, -28397) + lwz rW10,12(rHP) + R_48_63(rH2, rH3, rH0, rH1, rW02, rW09, 17, 11, + 0x662b, 0x7c56, 0x11b2, 0x0358) + + add rH0,rH0,rW00 + stw rH0,0(rHP) + add rH1,rH1,rW14 + stw rH1,4(rHP) + add rH2,rH2,rW12 + stw rH2,8(rHP) + add rH3,rH3,rW10 + stw rH3,12(rHP) + NEXT_BLOCK + + bdnz ppc_md5_main + + FINALIZE + blr diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c new file mode 100644 index 000000000000..452fb4dc575f --- /dev/null +++ b/arch/powerpc/crypto/md5-glue.c @@ -0,0 +1,165 @@ +/* + * Glue code for MD5 implementation for PPC assembler + * + * Based on generic implementation. + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/md5.h> +#include <asm/byteorder.h> + +extern void ppc_md5_transform(u32 *state, const u8 *src, u32 blocks); + +static inline void ppc_md5_clear_context(struct md5_state *sctx) +{ + int count = sizeof(struct md5_state) >> 2; + u32 *ptr = (u32 *)sctx; + + /* make sure we can clear the fast way */ + BUILD_BUG_ON(sizeof(struct md5_state) % 4); + do { *ptr++ = 0; } while (--count); +} + +static int ppc_md5_init(struct shash_desc *desc) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + + sctx->hash[0] = 0x67452301; + sctx->hash[1] = 0xefcdab89; + sctx->hash[2] = 0x98badcfe; + sctx->hash[3] = 0x10325476; + sctx->byte_count = 0; + + return 0; +} + +static int ppc_md5_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->byte_count & 0x3f; + unsigned int avail = 64 - offset; + const u8 *src = data; + + sctx->byte_count += len; + + if (avail > len) { + memcpy((char *)sctx->block + offset, src, len); + return 0; + } + + if (offset) { + memcpy((char *)sctx->block + offset, src, avail); + ppc_md5_transform(sctx->hash, (const u8 *)sctx->block, 1); + len -= avail; + src += avail; + } + + if (len > 63) { + ppc_md5_transform(sctx->hash, src, len >> 6); + src += len & ~0x3f; + len &= 0x3f; + } + + memcpy((char *)sctx->block, src, len); + return 0; +} + +static int ppc_md5_final(struct shash_desc *desc, u8 *out) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->byte_count & 0x3f; + const u8 *src = (const u8 *)sctx->block; + u8 *p = (u8 *)src + offset; + int padlen = 55 - offset; + __le64 *pbits = (__le64 *)((char *)sctx->block + 56); + __le32 *dst = (__le32 *)out; + + *p++ = 0x80; + + if (padlen < 0) { + memset(p, 0x00, padlen + sizeof (u64)); + ppc_md5_transform(sctx->hash, src, 1); + p = (char *)sctx->block; + padlen = 56; + } + + memset(p, 0, padlen); + *pbits = cpu_to_le64(sctx->byte_count << 3); + ppc_md5_transform(sctx->hash, src, 1); + + dst[0] = cpu_to_le32(sctx->hash[0]); + dst[1] = cpu_to_le32(sctx->hash[1]); + dst[2] = cpu_to_le32(sctx->hash[2]); + dst[3] = cpu_to_le32(sctx->hash[3]); + + ppc_md5_clear_context(sctx); + return 0; +} + +static int ppc_md5_export(struct shash_desc *desc, void *out) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int ppc_md5_import(struct shash_desc *desc, const void *in) +{ + struct md5_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg alg = { + .digestsize = MD5_DIGEST_SIZE, + .init = ppc_md5_init, + .update = ppc_md5_update, + .final = ppc_md5_final, + .export = ppc_md5_export, + .import = ppc_md5_import, + .descsize = sizeof(struct md5_state), + .statesize = sizeof(struct md5_state), + .base = { + .cra_name = "md5", + .cra_driver_name= "md5-ppc", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = MD5_HMAC_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init ppc_md5_mod_init(void) +{ + return crypto_register_shash(&alg); +} + +static void __exit ppc_md5_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(ppc_md5_mod_init); +module_exit(ppc_md5_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, PPC assembler"); + +MODULE_ALIAS_CRYPTO("md5"); +MODULE_ALIAS_CRYPTO("md5-ppc"); diff --git a/arch/powerpc/crypto/sha1-spe-asm.S b/arch/powerpc/crypto/sha1-spe-asm.S new file mode 100644 index 000000000000..fcb6cf002889 --- /dev/null +++ b/arch/powerpc/crypto/sha1-spe-asm.S @@ -0,0 +1,299 @@ +/* + * Fast SHA-1 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash value */ +#define rWP r4 /* pointer to input */ +#define rKP r5 /* pointer to constants */ + +#define rW0 r14 /* 64 bit round words */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rH0 r6 /* 32 bit hash values */ +#define rH1 r7 +#define rH2 r8 +#define rH3 r9 +#define rH4 r10 + +#define rT0 r22 /* 64 bit temporary */ +#define rT1 r0 /* 32 bit temporaries */ +#define rT2 r11 +#define rT3 r12 + +#define rK r23 /* 64 bit constant in volatile register */ + +#define LOAD_K01 + +#define LOAD_K11 \ + evlwwsplat rK,0(rKP); + +#define LOAD_K21 \ + evlwwsplat rK,4(rKP); + +#define LOAD_K31 \ + evlwwsplat rK,8(rKP); + +#define LOAD_K41 \ + evlwwsplat rK,12(rKP); + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* were already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_00_15(a, b, c, d, e, w0, w1, k, off) \ + LOAD_DATA(w0, off) /* 1: W */ \ + and rT2,b,c; /* 1: F' = B and C */ \ + LOAD_K##k##1 \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + rotrwi rT0,a,27; /* 1: A' = A rotl 5 */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + add e,e,rT0; /* 1: E = E + A' */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,w0; /* 1: E = E + W */ \ + LOAD_DATA(w1, off+4) /* 2: W */ \ + add e,e,rT2; /* 1: E = E + F */ \ + and rT1,a,b; /* 2: F' = B and C */ \ + add e,e,rK; /* 1: E = E + K */ \ + andc rT2,c,a; /* 2: F" = ~B and D */ \ + add d,d,rK; /* 2: E = E + K */ \ + or rT2,rT2,rT1; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,w1; /* 2: E = E + W */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + evmergelo w1,w1,w0; /* mix W[0]/W[1] */ \ + add d,d,rT2 /* 2: E = E + F */ + +#define R_16_19(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + andc rT1,d,b; /* 1: F" = ~B and D */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + or rT1,rT1,rT2; /* 1: F = F' or F" */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT1; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + andc rT1,c,a; /* 2: F" = ~B and D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + or rT1,rT1,rT2; /* 2: F = F' or F" */ \ + add d,d,rT0; /* 2: E = E + A' */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT1 /* 2: E = E + F */ + +#define R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + xor rT2,b,c; /* 1: F' = B xor C */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + xor rT2,rT2,d; /* 1: F = F' xor D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + xor rT2,a,b; /* 2: F' = B xor C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + xor rT2,rT2,c; /* 2: F = F' xor D */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_40_59(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + and rT2,b,c; /* 1: F' = B and C */ \ + evmergelohi rT0,w7,w6; /* W[-3] */ \ + or rT1,b,c; /* 1: F" = B or C */ \ + evxor w0,w0,rT0; /* W = W[-16] xor W[-3] */ \ + and rT1,d,rT1; /* 1: F" = F" and D */ \ + evxor w0,w0,w4; /* W = W xor W[-8] */ \ + or rT2,rT2,rT1; /* 1: F = F' or F" */ \ + evxor w0,w0,w1; /* W = W xor W[-14] */ \ + add e,e,rT2; /* 1: E = E + F */ \ + evrlwi w0,w0,1; /* W = W rotl 1 */ \ + rotrwi rT2,a,27; /* 1: A' = A rotl 5 */ \ + evaddw rT0,w0,rK; /* WK = W + K */ \ + add e,e,rT2; /* 1: E = E + A' */ \ + LOAD_K##k##1 \ + evmergehi rT1,rT1,rT0; /* WK1/WK2 */ \ + rotrwi b,b,2; /* 1: B = B rotl 30 */ \ + add e,e,rT0; /* 1: E = E + WK */ \ + and rT2,a,b; /* 2: F' = B and C */ \ + or rT0,a,b; /* 2: F" = B or C */ \ + add d,d,rT1; /* 2: E = E + WK */ \ + and rT0,c,rT0; /* 2: F" = F" and D */ \ + rotrwi a,a,2; /* 2: B = B rotl 30 */ \ + or rT2,rT2,rT0; /* 2: F = F' or F" */ \ + rotrwi rT0,e,27; /* 2: A' = A rotl 5 */ \ + add d,d,rT2; /* 2: E = E + F */ \ + add d,d,rT0 /* 2: E = E + A' */ + +#define R_60_79(a, b, c, d, e, w0, w1, w4, w6, w7, k) \ + R_20_39(a, b, c, d, e, w0, w1, w4, w6, w7, k) + +_GLOBAL(ppc_spe_sha1_transform) + INITIALIZE + + lwz rH0,0(rHP) + lwz rH1,4(rHP) + mtctr r5 + lwz rH2,8(rHP) + lis rKP,PPC_SPE_SHA1_K@h + lwz rH3,12(rHP) + ori rKP,rKP,PPC_SPE_SHA1_K@l + lwz rH4,16(rHP) + +ppc_spe_sha1_main: + R_00_15(rH0, rH1, rH2, rH3, rH4, rW1, rW0, 1, 0) + R_00_15(rH3, rH4, rH0, rH1, rH2, rW2, rW1, 0, 8) + R_00_15(rH1, rH2, rH3, rH4, rH0, rW3, rW2, 0, 16) + R_00_15(rH4, rH0, rH1, rH2, rH3, rW4, rW3, 0, 24) + R_00_15(rH2, rH3, rH4, rH0, rH1, rW5, rW4, 0, 32) + R_00_15(rH0, rH1, rH2, rH3, rH4, rW6, rW5, 0, 40) + R_00_15(rH3, rH4, rH0, rH1, rH2, rT3, rW6, 0, 48) + R_00_15(rH1, rH2, rH3, rH4, rH0, rT3, rW7, 0, 56) + + R_16_19(rH4, rH0, rH1, rH2, rH3, rW0, rW1, rW4, rW6, rW7, 0) + R_16_19(rH2, rH3, rH4, rH0, rH1, rW1, rW2, rW5, rW7, rW0, 2) + + R_20_39(rH0, rH1, rH2, rH3, rH4, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW3, rW4, rW7, rW1, rW2, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW4, rW5, rW0, rW2, rW3, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW5, rW6, rW1, rW3, rW4, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW6, rW7, rW2, rW4, rW5, 0) + R_20_39(rH0, rH1, rH2, rH3, rH4, rW7, rW0, rW3, rW5, rW6, 0) + R_20_39(rH3, rH4, rH0, rH1, rH2, rW0, rW1, rW4, rW6, rW7, 0) + R_20_39(rH1, rH2, rH3, rH4, rH0, rW1, rW2, rW5, rW7, rW0, 0) + R_20_39(rH4, rH0, rH1, rH2, rH3, rW2, rW3, rW6, rW0, rW1, 0) + R_20_39(rH2, rH3, rH4, rH0, rH1, rW3, rW4, rW7, rW1, rW2, 3) + + R_40_59(rH0, rH1, rH2, rH3, rH4, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW5, rW6, rW1, rW3, rW4, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW6, rW7, rW2, rW4, rW5, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW7, rW0, rW3, rW5, rW6, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW0, rW1, rW4, rW6, rW7, 0) + R_40_59(rH0, rH1, rH2, rH3, rH4, rW1, rW2, rW5, rW7, rW0, 0) + R_40_59(rH3, rH4, rH0, rH1, rH2, rW2, rW3, rW6, rW0, rW1, 0) + R_40_59(rH1, rH2, rH3, rH4, rH0, rW3, rW4, rW7, rW1, rW2, 0) + R_40_59(rH4, rH0, rH1, rH2, rH3, rW4, rW5, rW0, rW2, rW3, 0) + R_40_59(rH2, rH3, rH4, rH0, rH1, rW5, rW6, rW1, rW3, rW4, 4) + + R_60_79(rH0, rH1, rH2, rH3, rH4, rW6, rW7, rW2, rW4, rW5, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW7, rW0, rW3, rW5, rW6, 0) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW0, rW1, rW4, rW6, rW7, 0) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW1, rW2, rW5, rW7, rW0, 0) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW2, rW3, rW6, rW0, rW1, 0) + R_60_79(rH0, rH1, rH2, rH3, rH4, rW3, rW4, rW7, rW1, rW2, 0) + R_60_79(rH3, rH4, rH0, rH1, rH2, rW4, rW5, rW0, rW2, rW3, 0) + lwz rT3,0(rHP) + R_60_79(rH1, rH2, rH3, rH4, rH0, rW5, rW6, rW1, rW3, rW4, 0) + lwz rW1,4(rHP) + R_60_79(rH4, rH0, rH1, rH2, rH3, rW6, rW7, rW2, rW4, rW5, 0) + lwz rW2,8(rHP) + R_60_79(rH2, rH3, rH4, rH0, rH1, rW7, rW0, rW3, rW5, rW6, 0) + lwz rW3,12(rHP) + NEXT_BLOCK + lwz rW4,16(rHP) + + add rH0,rH0,rT3 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + + bdnz ppc_spe_sha1_main + + FINALIZE + blr + +.data +.align 4 +PPC_SPE_SHA1_K: + .long 0x5A827999,0x6ED9EBA1,0x8F1BBCDC,0xCA62C1D6 diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c new file mode 100644 index 000000000000..3e1d22212521 --- /dev/null +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -0,0 +1,210 @@ +/* + * Glue code for SHA-1 implementation for SPE instructions (PPC) + * + * Based on generic implementation. + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/switch_to.h> +#include <linux/hardirq.h> + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA1 takes ~1000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 2KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 2048 + +extern void ppc_spe_sha1_transform(u32 *state, const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + /* reenable preemption */ + preempt_enable(); +} + +static inline void ppc_sha1_clear_context(struct sha1_state *sctx) +{ + int count = sizeof(struct sha1_state) >> 2; + u32 *ptr = (u32 *)sctx; + + /* make sure we can clear the fast way */ + BUILD_BUG_ON(sizeof(struct sha1_state) % 4); + do { *ptr++ = 0; } while (--count); +} + +static int ppc_spe_sha1_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA1_H0; + sctx->state[1] = SHA1_H1; + sctx->state[2] = SHA1_H2; + sctx->state[3] = SHA1_H3; + sctx->state[4] = SHA1_H4; + sctx->count = 0; + + return 0; +} + +static int ppc_spe_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->count & 0x3f; + const unsigned int avail = 64 - offset; + unsigned int bytes; + const u8 *src = data; + + if (avail > len) { + sctx->count += len; + memcpy((char *)sctx->buffer + offset, src, len); + return 0; + } + + sctx->count += len; + + if (offset) { + memcpy((char *)sctx->buffer + offset, src, avail); + + spe_begin(); + ppc_spe_sha1_transform(sctx->state, (const u8 *)sctx->buffer, 1); + spe_end(); + + len -= avail; + src += avail; + } + + while (len > 63) { + bytes = (len > MAX_BYTES) ? MAX_BYTES : len; + bytes = bytes & ~0x3f; + + spe_begin(); + ppc_spe_sha1_transform(sctx->state, src, bytes >> 6); + spe_end(); + + src += bytes; + len -= bytes; + }; + + memcpy((char *)sctx->buffer, src, len); + return 0; +} + +static int ppc_spe_sha1_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->count & 0x3f; + char *p = (char *)sctx->buffer + offset; + int padlen; + __be64 *pbits = (__be64 *)(((char *)&sctx->buffer) + 56); + __be32 *dst = (__be32 *)out; + + padlen = 55 - offset; + *p++ = 0x80; + + spe_begin(); + + if (padlen < 0) { + memset(p, 0x00, padlen + sizeof (u64)); + ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1); + p = (char *)sctx->buffer; + padlen = 56; + } + + memset(p, 0, padlen); + *pbits = cpu_to_be64(sctx->count << 3); + ppc_spe_sha1_transform(sctx->state, sctx->buffer, 1); + + spe_end(); + + dst[0] = cpu_to_be32(sctx->state[0]); + dst[1] = cpu_to_be32(sctx->state[1]); + dst[2] = cpu_to_be32(sctx->state[2]); + dst[3] = cpu_to_be32(sctx->state[3]); + dst[4] = cpu_to_be32(sctx->state[4]); + + ppc_sha1_clear_context(sctx); + return 0; +} + +static int ppc_spe_sha1_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int ppc_spe_sha1_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = ppc_spe_sha1_init, + .update = ppc_spe_sha1_update, + .final = ppc_spe_sha1_final, + .export = ppc_spe_sha1_export, + .import = ppc_spe_sha1_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init ppc_spe_sha1_mod_init(void) +{ + return crypto_register_shash(&alg); +} + +static void __exit ppc_spe_sha1_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(ppc_spe_sha1_mod_init); +module_exit(ppc_spe_sha1_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, SPE optimized"); + +MODULE_ALIAS_CRYPTO("sha1"); +MODULE_ALIAS_CRYPTO("sha1-ppc-spe"); diff --git a/arch/powerpc/crypto/sha256-spe-asm.S b/arch/powerpc/crypto/sha256-spe-asm.S new file mode 100644 index 000000000000..2d10e4c08f03 --- /dev/null +++ b/arch/powerpc/crypto/sha256-spe-asm.S @@ -0,0 +1,323 @@ +/* + * Fast SHA-256 implementation for SPE instruction set (PPC) + * + * This code makes use of the SPE SIMD instruction set as defined in + * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf + * Implementation is based on optimization guide notes from + * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +#define rHP r3 /* pointer to hash values in memory */ +#define rKP r24 /* pointer to round constants */ +#define rWP r4 /* pointer to input data */ + +#define rH0 r5 /* 8 32 bit hash values in 8 registers */ +#define rH1 r6 +#define rH2 r7 +#define rH3 r8 +#define rH4 r9 +#define rH5 r10 +#define rH6 r11 +#define rH7 r12 + +#define rW0 r14 /* 64 bit registers. 16 words in 8 registers */ +#define rW1 r15 +#define rW2 r16 +#define rW3 r17 +#define rW4 r18 +#define rW5 r19 +#define rW6 r20 +#define rW7 r21 + +#define rT0 r22 /* 64 bit temporaries */ +#define rT1 r23 +#define rT2 r0 /* 32 bit temporaries */ +#define rT3 r25 + +#define CMP_KN_LOOP +#define CMP_KC_LOOP \ + cmpwi rT1,0; + +#define INITIALIZE \ + stwu r1,-128(r1); /* create stack frame */ \ + evstdw r14,8(r1); /* We must save non volatile */ \ + evstdw r15,16(r1); /* registers. Take the chance */ \ + evstdw r16,24(r1); /* and save the SPE part too */ \ + evstdw r17,32(r1); \ + evstdw r18,40(r1); \ + evstdw r19,48(r1); \ + evstdw r20,56(r1); \ + evstdw r21,64(r1); \ + evstdw r22,72(r1); \ + evstdw r23,80(r1); \ + stw r24,88(r1); /* save normal registers */ \ + stw r25,92(r1); + + +#define FINALIZE \ + evldw r14,8(r1); /* restore SPE registers */ \ + evldw r15,16(r1); \ + evldw r16,24(r1); \ + evldw r17,32(r1); \ + evldw r18,40(r1); \ + evldw r19,48(r1); \ + evldw r20,56(r1); \ + evldw r21,64(r1); \ + evldw r22,72(r1); \ + evldw r23,80(r1); \ + lwz r24,88(r1); /* restore normal registers */ \ + lwz r25,92(r1); \ + xor r0,r0,r0; \ + stw r0,8(r1); /* Delete sensitive data */ \ + stw r0,16(r1); /* that we might have pushed */ \ + stw r0,24(r1); /* from other context that runs */ \ + stw r0,32(r1); /* the same code. Assume that */ \ + stw r0,40(r1); /* the lower part of the GPRs */ \ + stw r0,48(r1); /* was already overwritten on */ \ + stw r0,56(r1); /* the way down to here */ \ + stw r0,64(r1); \ + stw r0,72(r1); \ + stw r0,80(r1); \ + addi r1,r1,128; /* cleanup stack frame */ + +#ifdef __BIG_ENDIAN__ +#define LOAD_DATA(reg, off) \ + lwz reg,off(rWP); /* load data */ +#define NEXT_BLOCK \ + addi rWP,rWP,64; /* increment per block */ +#else +#define LOAD_DATA(reg, off) \ + lwbrx reg,0,rWP; /* load data */ \ + addi rWP,rWP,4; /* increment per word */ +#define NEXT_BLOCK /* nothing to do */ +#endif + +#define R_LOAD_W(a, b, c, d, e, f, g, h, w, off) \ + LOAD_DATA(w, off) /* 1: W */ \ + rotrwi rT0,e,6; /* 1: S1 = e rotr 6 */ \ + rotrwi rT1,e,11; /* 1: S1' = e rotr 11 */ \ + rotrwi rT2,e,25; /* 1: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 1: S1 = S1 xor S1' */ \ + and rT3,e,f; /* 1: ch = e and f */ \ + xor rT0,rT0,rT2; /* 1: S1 = S1 xor S1" */ \ + andc rT1,g,e; /* 1: ch' = ~e and g */ \ + lwz rT2,off(rKP); /* 1: K */ \ + xor rT3,rT3,rT1; /* 1: ch = ch xor ch' */ \ + add h,h,rT0; /* 1: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 1: temp1' = ch + w */ \ + rotrwi rT0,a,2; /* 1: S0 = a rotr 2 */ \ + add h,h,rT3; /* 1: temp1 = temp1 + temp1' */ \ + rotrwi rT1,a,13; /* 1: S0' = a rotr 13 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + K */ \ + rotrwi rT3,a,22; /* 1: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 1: S0 = S0 xor S0' */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + xor rT3,rT0,rT3; /* 1: S0 = S0 xor S0" */ \ + evmergelo w,w,w; /* shift W */ \ + or rT2,a,b; /* 1: maj = a or b */ \ + and rT1,a,b; /* 1: maj' = a and b */ \ + and rT2,rT2,c; /* 1: maj = maj and c */ \ + LOAD_DATA(w, off+4) /* 2: W */ \ + or rT2,rT1,rT2; /* 1: maj = maj or maj' */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add rT3,rT3,rT2; /* 1: temp2 = S0 + maj */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + add h,h,rT3; /* 1: h = temp1 + temp2 */ \ + rotrwi rT2,d,25; /* 2: S1" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + lwz rT2,off+4(rKP); /* 2: K */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + add rT3,rT3,w; /* 2: temp1' = ch + w */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + temp1' */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + add g,g,rT2; /* 2: temp1 = temp1 + K */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +#define R_CALC_W(a, b, c, d, e, f, g, h, w0, w1, w4, w5, w7, k, off) \ + rotrwi rT2,e,6; /* 1: S1 = e rotr 6 */ \ + evmergelohi rT0,w0,w1; /* w[-15] */ \ + rotrwi rT3,e,11; /* 1: S1' = e rotr 11 */ \ + evsrwiu rT1,rT0,3; /* s0 = w[-15] >> 3 */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,25; /* s0' = w[-15] rotr 7 */ \ + rotrwi rT3,e,25; /* 1: S1' = e rotr 25 */ \ + evxor rT1,rT1,rT0; /* s0 = s0 xor s0' */ \ + xor rT2,rT2,rT3; /* 1: S1 = S1 xor S1' */ \ + evrlwi rT0,rT0,21; /* s0' = w[-15] rotr 18 */ \ + add h,h,rT2; /* 1: temp1 = h + S1 */ \ + evxor rT0,rT0,rT1; /* s0 = s0 xor s0' */ \ + and rT2,e,f; /* 1: ch = e and f */ \ + evaddw w0,w0,rT0; /* w = w[-16] + s0 */ \ + andc rT3,g,e; /* 1: ch' = ~e and g */ \ + evsrwiu rT0,w7,10; /* s1 = w[-2] >> 10 */ \ + xor rT2,rT2,rT3; /* 1: ch = ch xor ch' */ \ + evrlwi rT1,w7,15; /* s1' = w[-2] rotr 17 */ \ + add h,h,rT2; /* 1: temp1 = temp1 + ch */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + rotrwi rT2,a,2; /* 1: S0 = a rotr 2 */ \ + evrlwi rT1,w7,13; /* s1' = w[-2] rotr 19 */ \ + rotrwi rT3,a,13; /* 1: S0' = a rotr 13 */ \ + evxor rT0,rT0,rT1; /* s1 = s1 xor s1' */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evldw rT1,off(rKP); /* k */ \ + rotrwi rT3,a,22; /* 1: S0' = a rotr 22 */ \ + evaddw w0,w0,rT0; /* w = w + s1 */ \ + xor rT2,rT2,rT3; /* 1: S0 = S0 xor S0' */ \ + evmergelohi rT0,w4,w5; /* w[-7] */ \ + and rT3,a,b; /* 1: maj = a and b */ \ + evaddw w0,w0,rT0; /* w = w + w[-7] */ \ + CMP_K##k##_LOOP \ + add rT2,rT2,rT3; /* 1: temp2 = S0 + maj */ \ + evaddw rT1,rT1,w0; /* wk = w + k */ \ + xor rT3,a,b; /* 1: maj = a xor b */ \ + evmergehi rT0,rT1,rT1; /* wk1/wk2 */ \ + and rT3,rT3,c; /* 1: maj = maj and c */ \ + add h,h,rT0; /* 1: temp1 = temp1 + wk */ \ + add rT2,rT2,rT3; /* 1: temp2 = temp2 + maj */ \ + add g,g,rT1; /* 2: temp1 = temp1 + wk */ \ + add d,d,h; /* 1: d = d + temp1 */ \ + rotrwi rT0,d,6; /* 2: S1 = e rotr 6 */ \ + add h,h,rT2; /* 1: h = temp1 + temp2 */ \ + rotrwi rT1,d,11; /* 2: S1' = e rotr 11 */ \ + rotrwi rT2,d,25; /* 2: S" = e rotr 25 */ \ + xor rT0,rT0,rT1; /* 2: S1 = S1 xor S1' */ \ + and rT3,d,e; /* 2: ch = e and f */ \ + xor rT0,rT0,rT2; /* 2: S1 = S1 xor S1" */ \ + andc rT1,f,d; /* 2: ch' = ~e and g */ \ + add g,g,rT0; /* 2: temp1 = h + S1 */ \ + xor rT3,rT3,rT1; /* 2: ch = ch xor ch' */ \ + rotrwi rT0,h,2; /* 2: S0 = a rotr 2 */ \ + add g,g,rT3; /* 2: temp1 = temp1 + ch */ \ + rotrwi rT1,h,13; /* 2: S0' = a rotr 13 */ \ + rotrwi rT3,h,22; /* 2: S0" = a rotr 22 */ \ + xor rT0,rT0,rT1; /* 2: S0 = S0 xor S0' */ \ + or rT2,h,a; /* 2: maj = a or b */ \ + and rT1,h,a; /* 2: maj' = a and b */ \ + and rT2,rT2,b; /* 2: maj = maj and c */ \ + xor rT3,rT0,rT3; /* 2: S0 = S0 xor S0" */ \ + or rT2,rT1,rT2; /* 2: maj = maj or maj' */ \ + add c,c,g; /* 2: d = d + temp1 */ \ + add rT3,rT3,rT2; /* 2: temp2 = S0 + maj */ \ + add g,g,rT3 /* 2: h = temp1 + temp2 */ + +_GLOBAL(ppc_spe_sha256_transform) + INITIALIZE + + mtctr r5 + lwz rH0,0(rHP) + lwz rH1,4(rHP) + lwz rH2,8(rHP) + lwz rH3,12(rHP) + lwz rH4,16(rHP) + lwz rH5,20(rHP) + lwz rH6,24(rHP) + lwz rH7,28(rHP) + +ppc_spe_sha256_main: + lis rKP,PPC_SPE_SHA256_K@ha + addi rKP,rKP,PPC_SPE_SHA256_K@l + + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW0, 0) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW1, 8) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW2, 16) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW3, 24) + R_LOAD_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, rW4, 32) + R_LOAD_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, rW5, 40) + R_LOAD_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, rW6, 48) + R_LOAD_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, rW7, 56) +ppc_spe_sha256_16_rounds: + addi rKP,rKP,64 + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW0, rW1, rW4, rW5, rW7, N, 0) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW1, rW2, rW5, rW6, rW0, N, 8) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW2, rW3, rW6, rW7, rW1, N, 16) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW3, rW4, rW7, rW0, rW2, N, 24) + R_CALC_W(rH0, rH1, rH2, rH3, rH4, rH5, rH6, rH7, + rW4, rW5, rW0, rW1, rW3, N, 32) + R_CALC_W(rH6, rH7, rH0, rH1, rH2, rH3, rH4, rH5, + rW5, rW6, rW1, rW2, rW4, N, 40) + R_CALC_W(rH4, rH5, rH6, rH7, rH0, rH1, rH2, rH3, + rW6, rW7, rW2, rW3, rW5, N, 48) + R_CALC_W(rH2, rH3, rH4, rH5, rH6, rH7, rH0, rH1, + rW7, rW0, rW3, rW4, rW6, C, 56) + bt gt,ppc_spe_sha256_16_rounds + + lwz rW0,0(rHP) + NEXT_BLOCK + lwz rW1,4(rHP) + lwz rW2,8(rHP) + lwz rW3,12(rHP) + lwz rW4,16(rHP) + lwz rW5,20(rHP) + lwz rW6,24(rHP) + lwz rW7,28(rHP) + + add rH0,rH0,rW0 + stw rH0,0(rHP) + add rH1,rH1,rW1 + stw rH1,4(rHP) + add rH2,rH2,rW2 + stw rH2,8(rHP) + add rH3,rH3,rW3 + stw rH3,12(rHP) + add rH4,rH4,rW4 + stw rH4,16(rHP) + add rH5,rH5,rW5 + stw rH5,20(rHP) + add rH6,rH6,rW6 + stw rH6,24(rHP) + add rH7,rH7,rW7 + stw rH7,28(rHP) + + bdnz ppc_spe_sha256_main + + FINALIZE + blr + +.data +.align 5 +PPC_SPE_SHA256_K: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c new file mode 100644 index 000000000000..f4a616fe1a82 --- /dev/null +++ b/arch/powerpc/crypto/sha256-spe-glue.c @@ -0,0 +1,275 @@ +/* + * Glue code for SHA-256 implementation for SPE instructions (PPC) + * + * Based on generic implementation. The assembler module takes care + * about the SPE registers so it can run from interrupt context. + * + * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/switch_to.h> +#include <linux/hardirq.h> + +/* + * MAX_BYTES defines the number of bytes that are allowed to be processed + * between preempt_disable() and preempt_enable(). SHA256 takes ~2,000 + * operations per 64 bytes. e500 cores can issue two arithmetic instructions + * per clock cycle using one 32/64 bit unit (SU1) and one 32 bit unit (SU2). + * Thus 1KB of input data will need an estimated maximum of 18,000 cycles. + * Headroom for cache misses included. Even with the low end model clocked + * at 667 MHz this equals to a critical time window of less than 27us. + * + */ +#define MAX_BYTES 1024 + +extern void ppc_spe_sha256_transform(u32 *state, const u8 *src, u32 blocks); + +static void spe_begin(void) +{ + /* We just start SPE operations and will save SPE registers later. */ + preempt_disable(); + enable_kernel_spe(); +} + +static void spe_end(void) +{ + /* reenable preemption */ + preempt_enable(); +} + +static inline void ppc_sha256_clear_context(struct sha256_state *sctx) +{ + int count = sizeof(struct sha256_state) >> 2; + u32 *ptr = (u32 *)sctx; + + /* make sure we can clear the fast way */ + BUILD_BUG_ON(sizeof(struct sha256_state) % 4); + do { *ptr++ = 0; } while (--count); +} + +static int ppc_spe_sha256_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +static int ppc_spe_sha224_init(struct shash_desc *desc) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; + + return 0; +} + +static int ppc_spe_sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->count & 0x3f; + const unsigned int avail = 64 - offset; + unsigned int bytes; + const u8 *src = data; + + if (avail > len) { + sctx->count += len; + memcpy((char *)sctx->buf + offset, src, len); + return 0; + } + + sctx->count += len; + + if (offset) { + memcpy((char *)sctx->buf + offset, src, avail); + + spe_begin(); + ppc_spe_sha256_transform(sctx->state, (const u8 *)sctx->buf, 1); + spe_end(); + + len -= avail; + src += avail; + } + + while (len > 63) { + /* cut input data into smaller blocks */ + bytes = (len > MAX_BYTES) ? MAX_BYTES : len; + bytes = bytes & ~0x3f; + + spe_begin(); + ppc_spe_sha256_transform(sctx->state, src, bytes >> 6); + spe_end(); + + src += bytes; + len -= bytes; + }; + + memcpy((char *)sctx->buf, src, len); + return 0; +} + +static int ppc_spe_sha256_final(struct shash_desc *desc, u8 *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + const unsigned int offset = sctx->count & 0x3f; + char *p = (char *)sctx->buf + offset; + int padlen; + __be64 *pbits = (__be64 *)(((char *)&sctx->buf) + 56); + __be32 *dst = (__be32 *)out; + + padlen = 55 - offset; + *p++ = 0x80; + + spe_begin(); + + if (padlen < 0) { + memset(p, 0x00, padlen + sizeof (u64)); + ppc_spe_sha256_transform(sctx->state, sctx->buf, 1); + p = (char *)sctx->buf; + padlen = 56; + } + + memset(p, 0, padlen); + *pbits = cpu_to_be64(sctx->count << 3); + ppc_spe_sha256_transform(sctx->state, sctx->buf, 1); + + spe_end(); + + dst[0] = cpu_to_be32(sctx->state[0]); + dst[1] = cpu_to_be32(sctx->state[1]); + dst[2] = cpu_to_be32(sctx->state[2]); + dst[3] = cpu_to_be32(sctx->state[3]); + dst[4] = cpu_to_be32(sctx->state[4]); + dst[5] = cpu_to_be32(sctx->state[5]); + dst[6] = cpu_to_be32(sctx->state[6]); + dst[7] = cpu_to_be32(sctx->state[7]); + + ppc_sha256_clear_context(sctx); + return 0; +} + +static int ppc_spe_sha224_final(struct shash_desc *desc, u8 *out) +{ + u32 D[SHA256_DIGEST_SIZE >> 2]; + __be32 *dst = (__be32 *)out; + + ppc_spe_sha256_final(desc, (u8 *)D); + + /* avoid bytewise memcpy */ + dst[0] = D[0]; + dst[1] = D[1]; + dst[2] = D[2]; + dst[3] = D[3]; + dst[4] = D[4]; + dst[5] = D[5]; + dst[6] = D[6]; + + /* clear sensitive data */ + memzero_explicit(D, SHA256_DIGEST_SIZE); + return 0; +} + +static int ppc_spe_sha256_export(struct shash_desc *desc, void *out) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + return 0; +} + +static int ppc_spe_sha256_import(struct shash_desc *desc, const void *in) +{ + struct sha256_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + return 0; +} + +static struct shash_alg algs[2] = { { + .digestsize = SHA256_DIGEST_SIZE, + .init = ppc_spe_sha256_init, + .update = ppc_spe_sha256_update, + .final = ppc_spe_sha256_final, + .export = ppc_spe_sha256_export, + .import = ppc_spe_sha256_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha256", + .cra_driver_name= "sha256-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA256_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}, { + .digestsize = SHA224_DIGEST_SIZE, + .init = ppc_spe_sha224_init, + .update = ppc_spe_sha256_update, + .final = ppc_spe_sha224_final, + .export = ppc_spe_sha256_export, + .import = ppc_spe_sha256_import, + .descsize = sizeof(struct sha256_state), + .statesize = sizeof(struct sha256_state), + .base = { + .cra_name = "sha224", + .cra_driver_name= "sha224-ppc-spe", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA224_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +} }; + +static int __init ppc_spe_sha256_mod_init(void) +{ + return crypto_register_shashes(algs, ARRAY_SIZE(algs)); +} + +static void __exit ppc_spe_sha256_mod_fini(void) +{ + crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); +} + +module_init(ppc_spe_sha256_mod_init); +module_exit(ppc_spe_sha256_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, SPE optimized"); + +MODULE_ALIAS_CRYPTO("sha224"); +MODULE_ALIAS_CRYPTO("sha224-ppc-spe"); +MODULE_ALIAS_CRYPTO("sha256"); +MODULE_ALIAS_CRYPTO("sha256-ppc-spe"); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 54f60ab41c63..112cefacf2af 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm) PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); struct crypto_aead *cryptd_child; struct aesni_rfc4106_gcm_ctx *child_ctx; - cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); @@ -890,15 +892,12 @@ out_free_ablkcipher: return ret; } -static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) +static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, + unsigned int key_len) { int ret = 0; - struct crypto_tfm *tfm = crypto_aead_tfm(parent); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - struct aesni_rfc4106_gcm_ctx *child_ctx = - aesni_rfc4106_gcm_ctx_get(cryptd_child); + struct crypto_tfm *tfm = crypto_aead_tfm(aead); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); u8 *new_key_align, *new_key_mem = NULL; if (key_len < 4) { @@ -943,20 +942,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, goto exit; } ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); - memcpy(child_ctx, ctx, sizeof(*ctx)); exit: kfree(new_key_mem); return ret; } -/* This is the Integrity Check Value (aka the authentication tag length and can - * be 8, 12 or 16 bytes long. */ -static int rfc4106_set_authsize(struct crypto_aead *parent, - unsigned int authsize) +static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, + unsigned int key_len) { struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); + struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child); + struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm; + int ret; + ret = crypto_aead_setkey(child, key, key_len); + if (!ret) { + memcpy(ctx, c_ctx, sizeof(*ctx)); + ctx->cryptd_tfm = cryptd_tfm; + } + return ret; +} + +static int common_rfc4106_set_authsize(struct crypto_aead *aead, + unsigned int authsize) +{ switch (authsize) { case 8: case 12: @@ -965,51 +975,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent, default: return -EINVAL; } - crypto_aead_crt(parent)->authsize = authsize; - crypto_aead_crt(cryptd_child)->authsize = authsize; + crypto_aead_crt(aead)->authsize = authsize; return 0; } -static int rfc4106_encrypt(struct aead_request *req) -{ - int ret; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_aead_encrypt(cryptd_req); - } else { - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - kernel_fpu_begin(); - ret = cryptd_child->base.crt_aead.encrypt(req); - kernel_fpu_end(); - return ret; - } -} - -static int rfc4106_decrypt(struct aead_request *req) +/* This is the Integrity Check Value (aka the authentication tag length and can + * be 8, 12 or 16 bytes long. */ +static int rfc4106_set_authsize(struct crypto_aead *parent, + unsigned int authsize) { + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm); int ret; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); - if (!irq_fpu_usable()) { - struct aead_request *cryptd_req = - (struct aead_request *) aead_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_aead_decrypt(cryptd_req); - } else { - struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); - kernel_fpu_begin(); - ret = cryptd_child->base.crt_aead.decrypt(req); - kernel_fpu_end(); - return ret; - } + ret = crypto_aead_setauthsize(child, authsize); + if (!ret) + crypto_aead_crt(parent)->authsize = authsize; + return ret; } static int __driver_rfc4106_encrypt(struct aead_request *req) @@ -1185,6 +1167,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) } return retval; } + +static int rfc4106_encrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + ret = crypto_aead_encrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_encrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int rfc4106_decrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + ret = crypto_aead_decrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_decrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int helper_rfc4106_encrypt(struct aead_request *req) +{ + int ret; + + if (unlikely(!irq_fpu_usable())) { + WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); + ret = -EINVAL; + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_encrypt(req); + kernel_fpu_end(); + } + return ret; +} + +static int helper_rfc4106_decrypt(struct aead_request *req) +{ + int ret; + + if (unlikely(!irq_fpu_usable())) { + WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context"); + ret = -EINVAL; + } else { + kernel_fpu_begin(); + ret = __driver_rfc4106_decrypt(req); + kernel_fpu_end(); + } + return ret; +} #endif static struct crypto_alg aesni_algs[] = { { @@ -1210,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__aes-aesni", .cra_driver_name = "__driver-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1229,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__ecb-aes-aesni", .cra_driver_name = "__driver-ecb-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1249,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__cbc-aes-aesni", .cra_driver_name = "__driver-cbc-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1313,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__ctr-aes-aesni", .cra_driver_name = "__driver-ctr-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1, @@ -1357,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__gcm-aes-aesni", .cra_driver_name = "__driver-gcm-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, @@ -1366,8 +1423,12 @@ static struct crypto_alg aesni_algs[] = { { .cra_module = THIS_MODULE, .cra_u = { .aead = { - .encrypt = __driver_rfc4106_encrypt, - .decrypt = __driver_rfc4106_decrypt, + .setkey = common_rfc4106_set_key, + .setauthsize = common_rfc4106_set_authsize, + .encrypt = helper_rfc4106_encrypt, + .decrypt = helper_rfc4106_decrypt, + .ivsize = 8, + .maxauthsize = 16, }, }, }, { @@ -1423,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__lrw-aes-aesni", .cra_driver_name = "__driver-lrw-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesni_lrw_ctx), .cra_alignmask = 0, @@ -1444,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { { .cra_name = "__xts-aes-aesni", .cra_driver_name = "__driver-xts-aes-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = AES_BLOCK_SIZE, .cra_ctxsize = sizeof(struct aesni_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c index 9a07fafe3831..baf0ac21ace5 100644 --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c @@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ecb-camellia-aesni-avx2", .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__cbc-camellia-aesni-avx2", .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ctr-camellia-aesni-avx2", .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__lrw-camellia-aesni-avx2", .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_lrw_ctx), .cra_alignmask = 0, @@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__xts-camellia-aesni-avx2", .cra_driver_name = "__driver-xts-camellia-aesni-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index ed38d959add6..78818a1e73e3 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c @@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ecb-camellia-aesni", .cra_driver_name = "__driver-ecb-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__cbc-camellia-aesni", .cra_driver_name = "__driver-cbc-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__ctr-camellia-aesni", .cra_driver_name = "__driver-ctr-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct camellia_ctx), .cra_alignmask = 0, @@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__lrw-camellia-aesni", .cra_driver_name = "__driver-lrw-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_lrw_ctx), .cra_alignmask = 0, @@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { { .cra_name = "__xts-camellia-aesni", .cra_driver_name = "__driver-xts-camellia-aesni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAMELLIA_BLOCK_SIZE, .cra_ctxsize = sizeof(struct camellia_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 60ada677a928..236c80974457 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__ecb-cast5-avx", .cra_driver_name = "__driver-ecb-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, @@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__cbc-cast5-avx", .cra_driver_name = "__driver-cbc-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, @@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { { .cra_name = "__ctr-cast5-avx", .cra_driver_name = "__driver-ctr-cast5-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c index 0160f68a57ff..f448810ca4ac 100644 --- a/arch/x86/crypto/cast6_avx_glue.c +++ b/arch/x86/crypto/cast6_avx_glue.c @@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__ecb-cast6-avx", .cra_driver_name = "__driver-ecb-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__cbc-cast6-avx", .cra_driver_name = "__driver-cbc-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__ctr-cast6-avx", .cra_driver_name = "__driver-ctr-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct cast6_ctx), .cra_alignmask = 0, @@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__lrw-cast6-avx", .cra_driver_name = "__driver-lrw-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_lrw_ctx), .cra_alignmask = 0, @@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { { .cra_name = "__xts-cast6-avx", .cra_driver_name = "__driver-xts-cast6-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = CAST6_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast6_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 8253d85aa165..2079baf06bdd 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = { .cra_name = "__ghash", .cra_driver_name = "__ghash-pclmulqdqni", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_flags = CRYPTO_ALG_TYPE_SHASH | + CRYPTO_ALG_INTERNAL, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_ctxsize = sizeof(struct ghash_ctx), .cra_module = THIS_MODULE, @@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm) struct cryptd_ahash *cryptd_tfm; struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); - cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ctx->cryptd_tfm = cryptd_tfm; diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 432f1d76ceb8..6a85598931b5 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, le128_to_be128((be128 *)walk->iv, &ctrblk); } -EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, struct blkcipher_desc *desc, diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c index 437e47a4d302..2f63dc89e7a9 100644 --- a/arch/x86/crypto/serpent_avx2_glue.c +++ b/arch/x86/crypto/serpent_avx2_glue.c @@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__ecb-serpent-avx2", .cra_driver_name = "__driver-ecb-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__cbc-serpent-avx2", .cra_driver_name = "__driver-cbc-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__ctr-serpent-avx2", .cra_driver_name = "__driver-ctr-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__lrw-serpent-avx2", .cra_driver_name = "__driver-lrw-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { { .cra_name = "__xts-serpent-avx2", .cra_driver_name = "__driver-xts-serpent-avx2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 7e217398b4eb..c8d478af8456 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c @@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-avx", .cra_driver_name = "__driver-ecb-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__cbc-serpent-avx", .cra_driver_name = "__driver-cbc-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ctr-serpent-avx", .cra_driver_name = "__driver-ctr-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__lrw-serpent-avx", .cra_driver_name = "__driver-lrw-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__xts-serpent-avx", .cra_driver_name = "__driver-xts-serpent-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index bf025adaea01..3643dd508f45 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c @@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ecb-serpent-sse2", .cra_driver_name = "__driver-ecb-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__cbc-serpent-sse2", .cra_driver_name = "__driver-cbc-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__ctr-serpent-sse2", .cra_driver_name = "__driver-ctr-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct serpent_ctx), .cra_alignmask = 0, @@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__lrw-serpent-sse2", .cra_driver_name = "__driver-lrw-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_lrw_ctx), .cra_alignmask = 0, @@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { { .cra_name = "__xts-serpent-sse2", .cra_driver_name = "__driver-xts-serpent-sse2", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SERPENT_BLOCK_SIZE, .cra_ctxsize = sizeof(struct serpent_xts_ctx), .cra_alignmask = 0, diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c index fd9f6b035b16..e510b1c5d690 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb.c +++ b/arch/x86/crypto/sha-mb/sha1_mb.c @@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = { * use ASYNC flag as some buffers in multi-buffer * algo may not have completed before hashing thread sleep */ - .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC | + CRYPTO_ALG_INTERNAL, .cra_blocksize = SHA1_BLOCK_SIZE, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list), @@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm) struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm); struct mcryptd_hash_ctx *mctx; - mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0); + mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", + CRYPTO_ALG_INTERNAL, + CRYPTO_ALG_INTERNAL); if (IS_ERR(mcryptd_tfm)) return PTR_ERR(mcryptd_tfm); mctx = crypto_ahash_ctx(&mcryptd_tfm->base); @@ -828,7 +831,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate) while (!list_empty(&cstate->work_list)) { rctx = list_entry(cstate->work_list.next, struct mcryptd_hash_request_ctx, waiter); - if time_before(cur_time, rctx->tag.expire) + if (time_before(cur_time, rctx->tag.expire)) break; kernel_fpu_begin(); sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr); diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c index 4ca7e166a2aa..822acb5b464c 100644 --- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c +++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c @@ -56,7 +56,7 @@ void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state) { unsigned int j; - state->unused_lanes = 0xF76543210; + state->unused_lanes = 0xF76543210ULL; for (j = 0; j < 8; j++) { state->lens[j] = 0xFFFFFFFF; state->ldata[j].job_in_lane = NULL; diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 6c20fe04a738..33d1b9dc14cc 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -28,7 +28,7 @@ #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> -#include <asm/byteorder.h> +#include <crypto/sha1_base.h> #include <asm/i387.h> #include <asm/xcr.h> #include <asm/xsave.h> @@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data, #define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, - unsigned int rounds); + unsigned int rounds); #endif -static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); - - -static int sha1_ssse3_init(struct shash_desc *desc) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - - *sctx = (struct sha1_state){ - .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, - }; - - return 0; -} - -static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA1_BLOCK_SIZE - partial; - memcpy(sctx->buffer + partial, data, done); - sha1_transform_asm(sctx->state, sctx->buffer, 1); - } - - if (len - done >= SHA1_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; - - sha1_transform_asm(sctx->state, data + done, rounds); - done += rounds * SHA1_BLOCK_SIZE; - } - - memcpy(sctx->buffer, data + done, len - done); - - return 0; -} +static void (*sha1_transform_asm)(u32 *, const char *, unsigned int); static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; - int res; - /* Handle the fast case right here */ - if (partial + len < SHA1_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buffer + partial, data, len); + if (!irq_fpu_usable() || + (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) + return crypto_sha1_update(desc, data, len); - return 0; - } + /* make sure casting to sha1_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0); - if (!irq_fpu_usable()) { - res = crypto_sha1_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha1_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } - - return res; -} - - -/* Add padding and return the message digest. */ -static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) -{ - struct sha1_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA1_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); - if (!irq_fpu_usable()) { - crypto_sha1_update(desc, padding, padlen); - crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha1_ssse3_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buffer + index, padding, padlen); - } else { - __sha1_ssse3_update(desc, padding, padlen, index); - } - __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 5; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); + kernel_fpu_begin(); + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_asm); + kernel_fpu_end(); return 0; } -static int sha1_ssse3_export(struct shash_desc *desc, void *out) +static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); + if (!irq_fpu_usable()) + return crypto_sha1_finup(desc, data, len, out); - memcpy(out, sctx, sizeof(*sctx)); + kernel_fpu_begin(); + if (len) + sha1_base_do_update(desc, data, len, + (sha1_block_fn *)sha1_transform_asm); + sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm); + kernel_fpu_end(); - return 0; + return sha1_base_finish(desc, out); } -static int sha1_ssse3_import(struct shash_desc *desc, const void *in) +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) { - struct sha1_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; + return sha1_ssse3_finup(desc, NULL, 0, out); } #ifdef CONFIG_AS_AVX2 @@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data, static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, - .init = sha1_ssse3_init, + .init = sha1_base_init, .update = sha1_ssse3_update, .final = sha1_ssse3_final, - .export = sha1_ssse3_export, - .import = sha1_ssse3_import, + .finup = sha1_ssse3_finup, .descsize = sizeof(struct sha1_state), - .statesize = sizeof(struct sha1_state), .base = { .cra_name = "sha1", .cra_driver_name= "sha1-ssse3", diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S index 642f15687a0a..92b3b5d75ba9 100644 --- a/arch/x86/crypto/sha256-avx-asm.S +++ b/arch/x86/crypto/sha256-avx-asm.S @@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00 BYTE_FLIP_MASK = %xmm13 NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg -SRND = %rdi # clobbers INP +SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx @@ -342,8 +342,8 @@ a = TMP_ ######################################################################## ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S index 9e86944c539d..570ec5ec62d7 100644 --- a/arch/x86/crypto/sha256-avx2-asm.S +++ b/arch/x86/crypto/sha256-avx2-asm.S @@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg c = %ecx d = %r8d e = %edx # clobbers NUM_BLKS -y3 = %edi # clobbers INP +y3 = %esi # clobbers INP TBL = %rbp @@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE ######################################################################## ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S index f833b74d902b..2cedc44e8121 100644 --- a/arch/x86/crypto/sha256-ssse3-asm.S +++ b/arch/x86/crypto/sha256-ssse3-asm.S @@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00 BYTE_FLIP_MASK = %xmm12 NUM_BLKS = %rdx # 3rd arg -CTX = %rsi # 2nd arg -INP = %rdi # 1st arg +INP = %rsi # 2nd arg +CTX = %rdi # 1st arg -SRND = %rdi # clobbers INP +SRND = %rsi # clobbers INP c = %ecx d = %r8d e = %edx @@ -348,8 +348,8 @@ a = TMP_ ######################################################################## ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks) -## arg 1 : pointer to input data -## arg 2 : pointer to digest +## arg 1 : pointer to digest +## arg 2 : pointer to input data ## arg 3 : Num blocks ######################################################################## .text diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 8fad72f4dfd2..ccc338881ee8 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -36,195 +36,74 @@ #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> -#include <asm/byteorder.h> +#include <crypto/sha256_base.h> #include <asm/i387.h> #include <asm/xcr.h> #include <asm/xsave.h> #include <linux/string.h> -asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest, - u64 rounds); +asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, + u64 rounds); #ifdef CONFIG_AS_AVX -asmlinkage void sha256_transform_avx(const char *data, u32 *digest, +asmlinkage void sha256_transform_avx(u32 *digest, const char *data, u64 rounds); #endif #ifdef CONFIG_AS_AVX2 -asmlinkage void sha256_transform_rorx(const char *data, u32 *digest, - u64 rounds); +asmlinkage void sha256_transform_rorx(u32 *digest, const char *data, + u64 rounds); #endif -static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64); - - -static int sha256_ssse3_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; - - return 0; -} - -static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count += len; - - if (partial) { - done = SHA256_BLOCK_SIZE - partial; - memcpy(sctx->buf + partial, data, done); - sha256_transform_asm(sctx->buf, sctx->state, 1); - } - - if (len - done >= SHA256_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE; - - sha256_transform_asm(data + done, sctx->state, (u64) rounds); - - done += rounds * SHA256_BLOCK_SIZE; - } - - memcpy(sctx->buf, data + done, len - done); - - return 0; -} +static void (*sha256_transform_asm)(u32 *, const char *, u64); static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count % SHA256_BLOCK_SIZE; - int res; - /* Handle the fast case right here */ - if (partial + len < SHA256_BLOCK_SIZE) { - sctx->count += len; - memcpy(sctx->buf + partial, data, len); + if (!irq_fpu_usable() || + (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) + return crypto_sha256_update(desc, data, len); - return 0; - } - - if (!irq_fpu_usable()) { - res = crypto_sha256_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha256_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } - - return res; -} + /* make sure casting to sha256_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0); - -/* Add padding and return the message digest. */ -static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be32 *dst = (__be32 *)out; - __be64 bits; - static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, }; - - bits = cpu_to_be64(sctx->count << 3); - - /* Pad out to 56 mod 64 and append length */ - index = sctx->count % SHA256_BLOCK_SIZE; - padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index); - - if (!irq_fpu_usable()) { - crypto_sha256_update(desc, padding, padlen); - crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha256_ssse3_update() */ - if (padlen <= 56) { - sctx->count += padlen; - memcpy(sctx->buf + index, padding, padlen); - } else { - __sha256_ssse3_update(desc, padding, padlen, index); - } - __sha256_ssse3_update(desc, (const u8 *)&bits, - sizeof(bits), 56); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be32(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); + kernel_fpu_begin(); + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_transform_asm); + kernel_fpu_end(); return 0; } -static int sha256_ssse3_export(struct shash_desc *desc, void *out) +static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha256_state *sctx = shash_desc_ctx(desc); + if (!irq_fpu_usable()) + return crypto_sha256_finup(desc, data, len, out); - memcpy(out, sctx, sizeof(*sctx)); + kernel_fpu_begin(); + if (len) + sha256_base_do_update(desc, data, len, + (sha256_block_fn *)sha256_transform_asm); + sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm); + kernel_fpu_end(); - return 0; + return sha256_base_finish(desc, out); } -static int sha256_ssse3_import(struct shash_desc *desc, const void *in) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; -} - -static int sha224_ssse3_init(struct shash_desc *desc) -{ - struct sha256_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; - - return 0; -} - -static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash) +/* Add padding and return the message digest. */ +static int sha256_ssse3_final(struct shash_desc *desc, u8 *out) { - u8 D[SHA256_DIGEST_SIZE]; - - sha256_ssse3_final(desc, D); - - memcpy(hash, D, SHA224_DIGEST_SIZE); - memzero_explicit(D, SHA256_DIGEST_SIZE); - - return 0; + return sha256_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg algs[] = { { .digestsize = SHA256_DIGEST_SIZE, - .init = sha256_ssse3_init, + .init = sha256_base_init, .update = sha256_ssse3_update, .final = sha256_ssse3_final, - .export = sha256_ssse3_export, - .import = sha256_ssse3_import, + .finup = sha256_ssse3_finup, .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha256", .cra_driver_name = "sha256-ssse3", @@ -235,13 +114,11 @@ static struct shash_alg algs[] = { { } }, { .digestsize = SHA224_DIGEST_SIZE, - .init = sha224_ssse3_init, + .init = sha224_base_init, .update = sha256_ssse3_update, - .final = sha224_ssse3_final, - .export = sha256_ssse3_export, - .import = sha256_ssse3_import, + .final = sha256_ssse3_final, + .finup = sha256_ssse3_finup, .descsize = sizeof(struct sha256_state), - .statesize = sizeof(struct sha256_state), .base = { .cra_name = "sha224", .cra_driver_name = "sha224-ssse3", diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S index 974dde9bc6cd..565274d6a641 100644 --- a/arch/x86/crypto/sha512-avx-asm.S +++ b/arch/x86/crypto/sha512-avx-asm.S @@ -54,9 +54,9 @@ # Virtual Registers # ARG1 -msg = %rdi +digest = %rdi # ARG2 -digest = %rsi +msg = %rsi # ARG3 msglen = %rdx T1 = %rcx @@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_avx(const void* M, void* D, u64 L) +# void sha512_transform_avx(void* D, const void* M, u64 L) # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index 568b96105f5c..a4771dcd1fcf 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -70,9 +70,9 @@ XFER = YTMP0 BYTE_FLIP_MASK = %ymm9 # 1st arg -INP = %rdi +CTX = %rdi # 2nd arg -CTX = %rsi +INP = %rsi # 3rd arg NUM_BLKS = %rdx @@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_rorx(const void* M, void* D, uint64_t L)# +# void sha512_transform_rorx(void* D, const void* M, uint64_t L)# # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S index fb56855d51f5..e610e29cbc81 100644 --- a/arch/x86/crypto/sha512-ssse3-asm.S +++ b/arch/x86/crypto/sha512-ssse3-asm.S @@ -53,9 +53,9 @@ # Virtual Registers # ARG1 -msg = %rdi +digest = %rdi # ARG2 -digest = %rsi +msg = %rsi # ARG3 msglen = %rdx T1 = %rcx @@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE .endm ######################################################################## -# void sha512_transform_ssse3(const void* M, void* D, u64 L)# +# void sha512_transform_ssse3(void* D, const void* M, u64 L)# # Purpose: Updates the SHA512 digest stored at D with the message stored in M. # The size of the message pointed to by M must be an integer multiple of SHA512 # message blocks. diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 0b6af26832bf..d9fa4c1e063f 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,205 +34,75 @@ #include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> -#include <asm/byteorder.h> +#include <crypto/sha512_base.h> #include <asm/i387.h> #include <asm/xcr.h> #include <asm/xsave.h> #include <linux/string.h> -asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest, - u64 rounds); +asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, + u64 rounds); #ifdef CONFIG_AS_AVX -asmlinkage void sha512_transform_avx(const char *data, u64 *digest, +asmlinkage void sha512_transform_avx(u64 *digest, const char *data, u64 rounds); #endif #ifdef CONFIG_AS_AVX2 -asmlinkage void sha512_transform_rorx(const char *data, u64 *digest, - u64 rounds); +asmlinkage void sha512_transform_rorx(u64 *digest, const char *data, + u64 rounds); #endif -static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64); - - -static int sha512_ssse3_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA512_H0; - sctx->state[1] = SHA512_H1; - sctx->state[2] = SHA512_H2; - sctx->state[3] = SHA512_H3; - sctx->state[4] = SHA512_H4; - sctx->state[5] = SHA512_H5; - sctx->state[6] = SHA512_H6; - sctx->state[7] = SHA512_H7; - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} +static void (*sha512_transform_asm)(u64 *, const char *, u64); -static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len, unsigned int partial) +static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) { struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int done = 0; - - sctx->count[0] += len; - if (sctx->count[0] < len) - sctx->count[1]++; - if (partial) { - done = SHA512_BLOCK_SIZE - partial; - memcpy(sctx->buf + partial, data, done); - sha512_transform_asm(sctx->buf, sctx->state, 1); - } - - if (len - done >= SHA512_BLOCK_SIZE) { - const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE; + if (!irq_fpu_usable() || + (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) + return crypto_sha512_update(desc, data, len); - sha512_transform_asm(data + done, sctx->state, (u64) rounds); - - done += rounds * SHA512_BLOCK_SIZE; - } + /* make sure casting to sha512_block_fn() is safe */ + BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0); - memcpy(sctx->buf, data + done, len - done); + kernel_fpu_begin(); + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_transform_asm); + kernel_fpu_end(); return 0; } -static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data, - unsigned int len) +static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) { - struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE; - int res; - - /* Handle the fast case right here */ - if (partial + len < SHA512_BLOCK_SIZE) { - sctx->count[0] += len; - if (sctx->count[0] < len) - sctx->count[1]++; - memcpy(sctx->buf + partial, data, len); - - return 0; - } + if (!irq_fpu_usable()) + return crypto_sha512_finup(desc, data, len, out); - if (!irq_fpu_usable()) { - res = crypto_sha512_update(desc, data, len); - } else { - kernel_fpu_begin(); - res = __sha512_ssse3_update(desc, data, len, partial); - kernel_fpu_end(); - } + kernel_fpu_begin(); + if (len) + sha512_base_do_update(desc, data, len, + (sha512_block_fn *)sha512_transform_asm); + sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm); + kernel_fpu_end(); - return res; + return sha512_base_finish(desc, out); } - /* Add padding and return the message digest. */ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) { - struct sha512_state *sctx = shash_desc_ctx(desc); - unsigned int i, index, padlen; - __be64 *dst = (__be64 *)out; - __be64 bits[2]; - static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, }; - - /* save number of bits */ - bits[1] = cpu_to_be64(sctx->count[0] << 3); - bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); - - /* Pad out to 112 mod 128 and append length */ - index = sctx->count[0] & 0x7f; - padlen = (index < 112) ? (112 - index) : ((128+112) - index); - - if (!irq_fpu_usable()) { - crypto_sha512_update(desc, padding, padlen); - crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits)); - } else { - kernel_fpu_begin(); - /* We need to fill a whole block for __sha512_ssse3_update() */ - if (padlen <= 112) { - sctx->count[0] += padlen; - if (sctx->count[0] < padlen) - sctx->count[1]++; - memcpy(sctx->buf + index, padding, padlen); - } else { - __sha512_ssse3_update(desc, padding, padlen, index); - } - __sha512_ssse3_update(desc, (const u8 *)&bits, - sizeof(bits), 112); - kernel_fpu_end(); - } - - /* Store state in digest */ - for (i = 0; i < 8; i++) - dst[i] = cpu_to_be64(sctx->state[i]); - - /* Wipe context */ - memset(sctx, 0, sizeof(*sctx)); - - return 0; -} - -static int sha512_ssse3_export(struct shash_desc *desc, void *out) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - memcpy(out, sctx, sizeof(*sctx)); - - return 0; -} - -static int sha512_ssse3_import(struct shash_desc *desc, const void *in) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - memcpy(sctx, in, sizeof(*sctx)); - - return 0; -} - -static int sha384_ssse3_init(struct shash_desc *desc) -{ - struct sha512_state *sctx = shash_desc_ctx(desc); - - sctx->state[0] = SHA384_H0; - sctx->state[1] = SHA384_H1; - sctx->state[2] = SHA384_H2; - sctx->state[3] = SHA384_H3; - sctx->state[4] = SHA384_H4; - sctx->state[5] = SHA384_H5; - sctx->state[6] = SHA384_H6; - sctx->state[7] = SHA384_H7; - - sctx->count[0] = sctx->count[1] = 0; - - return 0; -} - -static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash) -{ - u8 D[SHA512_DIGEST_SIZE]; - - sha512_ssse3_final(desc, D); - - memcpy(hash, D, SHA384_DIGEST_SIZE); - memzero_explicit(D, SHA512_DIGEST_SIZE); - - return 0; + return sha512_ssse3_finup(desc, NULL, 0, out); } static struct shash_alg algs[] = { { .digestsize = SHA512_DIGEST_SIZE, - .init = sha512_ssse3_init, + .init = sha512_base_init, .update = sha512_ssse3_update, .final = sha512_ssse3_final, - .export = sha512_ssse3_export, - .import = sha512_ssse3_import, + .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), - .statesize = sizeof(struct sha512_state), .base = { .cra_name = "sha512", .cra_driver_name = "sha512-ssse3", @@ -243,13 +113,11 @@ static struct shash_alg algs[] = { { } }, { .digestsize = SHA384_DIGEST_SIZE, - .init = sha384_ssse3_init, + .init = sha384_base_init, .update = sha512_ssse3_update, - .final = sha384_ssse3_final, - .export = sha512_ssse3_export, - .import = sha512_ssse3_import, + .final = sha512_ssse3_final, + .finup = sha512_ssse3_finup, .descsize = sizeof(struct sha512_state), - .statesize = sizeof(struct sha512_state), .base = { .cra_name = "sha384", .cra_driver_name = "sha384-ssse3", diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index 1ac531ea9bcc..b5e2d5651851 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c @@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__ecb-twofish-avx", .cra_driver_name = "__driver-ecb-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__cbc-twofish-avx", .cra_driver_name = "__driver-cbc-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__ctr-twofish-avx", .cra_driver_name = "__driver-ctr-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct twofish_ctx), .cra_alignmask = 0, @@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__lrw-twofish-avx", .cra_driver_name = "__driver-lrw-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_lrw_ctx), .cra_alignmask = 0, @@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { { .cra_name = "__xts-twofish-avx", .cra_driver_name = "__driver-xts-twofish-avx", .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_INTERNAL, .cra_blocksize = TF_BLOCK_SIZE, .cra_ctxsize = sizeof(struct twofish_xts_ctx), .cra_alignmask = 0, |