crypto: aegis - avoid prerotated AES tables

The generic AES code provides four sets of lookup tables, where each set consists of four tables containing the same 32-bit values, but rotated by 0, 8, 16 and 24 bits, respectively. This makes sense for CISC architectures such as x86 which support memory operands, but for other architectures, the rotates are quite cheap, and using all four tables needlessly thrashes the D-cache, and actually hurts rather than helps performance. Since x86 already has its own implementation of AEGIS based on AES-NI instructions, let's tweak the generic implementation towards other architectures, and avoid the prerotated tables, and perform the rotations inline. On ARM Cortex-A53, this results in a ~8% speedup. Acked-by: Ondrej Mosnacek <omosnace@redhat.com> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
author: Ard Biesheuvel 2019-07-03 10:55:09 +0200
committer: Herbert Xu 2019-07-26 15:03:57 +1000
commit: 521cdde758bf331d4e264ef3deef5a26d5ce0b4f (patch)
tree: f356195f6c5753583c26d4d3512bc88508ae64ef /crypto
parent: 368b1bdc0a5983c8c908ce2001229e2291eac583 (diff)
1 files changed, 6 insertions, 8 deletions
diff --git a/crypto/aegis.h b/crypto/aegis.h
index 41a3090cda8e..3308066ddde0 100644
--- a/crypto/aegis.h
+++ b/crypto/aegis.h
@@ -10,6 +10,7 @@
 #define _CRYPTO_AEGIS_H
 
 #include <crypto/aes.h>
+#include <linux/bitops.h>
 #include <linux/types.h>
 
 #define AEGIS_BLOCK_SIZE 16
@@ -53,16 +54,13 @@ static void crypto_aegis_aesenc(union aegis_block *dst,
 				const union aegis_block *key)
 {
 	const u8  *s  = src->bytes;
-	const u32 *t0 = crypto_ft_tab[0];
-	const u32 *t1 = crypto_ft_tab[1];
-	const u32 *t2 = crypto_ft_tab[2];
-	const u32 *t3 = crypto_ft_tab[3];
+	const u32 *t = crypto_ft_tab[0];
 	u32 d0, d1, d2, d3;
 
-	d0 = t0[s[ 0]] ^ t1[s[ 5]] ^ t2[s[10]] ^ t3[s[15]];
-	d1 = t0[s[ 4]] ^ t1[s[ 9]] ^ t2[s[14]] ^ t3[s[ 3]];
-	d2 = t0[s[ 8]] ^ t1[s[13]] ^ t2[s[ 2]] ^ t3[s[ 7]];
-	d3 = t0[s[12]] ^ t1[s[ 1]] ^ t2[s[ 6]] ^ t3[s[11]];
+	d0 = t[s[ 0]] ^ rol32(t[s[ 5]], 8) ^ rol32(t[s[10]], 16) ^ rol32(t[s[15]], 24);
+	d1 = t[s[ 4]] ^ rol32(t[s[ 9]], 8) ^ rol32(t[s[14]], 16) ^ rol32(t[s[ 3]], 24);
+	d2 = t[s[ 8]] ^ rol32(t[s[13]], 8) ^ rol32(t[s[ 2]], 16) ^ rol32(t[s[ 7]], 24);
+	d3 = t[s[12]] ^ rol32(t[s[ 1]], 8) ^ rol32(t[s[ 6]], 16) ^ rol32(t[s[11]], 24);
 
 	dst->words32[0] = cpu_to_le32(d0) ^ key->words32[0];
 	dst->words32[1] = cpu_to_le32(d1) ^ key->words32[1];
author	Ard Biesheuvel	2019-07-03 10:55:09 +0200
committer	Herbert Xu	2019-07-26 15:03:57 +1000
commit	521cdde758bf331d4e264ef3deef5a26d5ce0b4f (patch)
tree	f356195f6c5753583c26d4d3512bc88508ae64ef /crypto
parent	368b1bdc0a5983c8c908ce2001229e2291eac583 (diff)