author     Linus Torvalds  2018-10-25 16:43:35 -0700
committer  Linus Torvalds  2018-10-25 16:43:35 -0700
commit     62606c224d72a98c35d21a849f95cccf95b0a252 (patch)
tree       6f6f3466451edf9baa2ea8b5f9fc558aa555c69a /arch/arm
parent     24ed334f33666f2ae929ccc08f72e7e72e353c64 (diff)
parent     a1c6fd4308d37f072e939a2782f24214115fc7e8 (diff)
Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "API:
   - Remove VLA usage
   - Add cryptostat user-space interface
   - Add notifier for new crypto algorithms

  Algorithms:
   - Add OFB mode
   - Remove speck

  Drivers:
   - Remove x86/sha*-mb as they are buggy
   - Remove pcbc(aes) from x86/aesni
   - Improve performance of arm/ghash-ce by up to 85%
   - Implement CTS-CBC in arm64/aes-blk, faster by up to 50%
   - Remove PMULL based arm64/crc32 driver
   - Use PMULL in arm64/crct10dif
   - Add aes-ctr support in s5p-sss
   - Add caam/qi2 driver

  Others:
   - Pick better transform if one becomes available in crc-t10dif"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (124 commits)
  crypto: chelsio - Update ntx queue received from cxgb4
  crypto: ccree - avoid implicit enum conversion
  crypto: caam - add SPDX license identifier to all files
  crypto: caam/qi - simplify CGR allocation, freeing
  crypto: mxs-dcp - make symbols 'sha1_null_hash' and 'sha256_null_hash' static
  crypto: arm64/aes-blk - ensure XTS mask is always loaded
  crypto: testmgr - fix sizeof() on COMP_BUF_SIZE
  crypto: chtls - remove set but not used variable 'csk'
  crypto: axis - fix platform_no_drv_owner.cocci warnings
  crypto: x86/aes-ni - fix build error following fpu template removal
  crypto: arm64/aes - fix handling sub-block CTS-CBC inputs
  crypto: caam/qi2 - avoid double export
  crypto: mxs-dcp - Fix AES issues
  crypto: mxs-dcp - Fix SHA null hashes and output length
  crypto: mxs-dcp - Implement sha import/export
  crypto: aegis/generic - fix for big endian systems
  crypto: morus/generic - fix for big endian systems
  crypto: lrw - fix rebase error after out of bounds fix
  crypto: cavium/nitrox - use pci_alloc_irq_vectors() while enabling MSI-X.
  crypto: cavium/nitrox - NITROX command queue changes.
  ...
Diffstat (limited to 'arch/arm')
-rw-r--r--  arch/arm/crypto/Kconfig                 7
-rw-r--r--  arch/arm/crypto/Makefile                2
-rw-r--r--  arch/arm/crypto/chacha20-neon-core.S  277
-rw-r--r--  arch/arm/crypto/crc32-ce-glue.c         2
-rw-r--r--  arch/arm/crypto/ghash-ce-core.S       108
-rw-r--r--  arch/arm/crypto/ghash-ce-glue.c        38
-rw-r--r--  arch/arm/crypto/speck-neon-core.S     434
-rw-r--r--  arch/arm/crypto/speck-neon-glue.c     288
8 files changed, 275 insertions(+), 881 deletions(-)
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 925d1364727a..ef0c7feea6e2 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_CRYPTD
+ select CRYPTO_GF128MUL
help
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
@@ -121,10 +122,4 @@ config CRYPTO_CHACHA20_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
-config CRYPTO_SPECK_NEON
- tristate "NEON accelerated Speck cipher algorithms"
- depends on KERNEL_MODE_NEON
- select CRYPTO_BLKCIPHER
- select CRYPTO_SPECK
-
endif
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 8de542c48ade..bd5bceef0605 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
-obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -54,7 +53,6 @@ ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
-speck-neon-y := speck-neon-core.o speck-neon-glue.o
ifdef REGENERATE_ARM_CRYPTO
quiet_cmd_perl = PERL $@
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 451a849ad518..50e7b9896818 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -18,6 +18,34 @@
* (at your option) any later version.
*/
+ /*
+ * NEON doesn't have a rotate instruction. The alternatives are, more or less:
+ *
+ * (a) vshl.u32 + vsri.u32 (needs temporary register)
+ * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register)
+ * (c) vrev32.16 (16-bit rotations only)
+ * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only,
+ * needs index vector)
+ *
+ * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit
+ * rotations, the only choices are (a) and (b). We use (a) since it takes
+ * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53.
+ *
+ * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
+ * and doesn't need a temporary register.
+ *
+ * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
+ * is twice as fast as (a), even when doing (a) on multiple registers
+ * simultaneously to eliminate the stall between vshl and vsri. Also, it
+ * parallelizes better when temporary registers are scarce.
+ *
+ * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
+ * (a), so the need to load the rotation table actually makes the vtbl method
+ * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
+ * seems to be a good compromise to get a more significant speed boost on some
+ * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
+ */
+
#include <linux/linkage.h>
.text
@@ -46,7 +74,9 @@ ENTRY(chacha20_block_xor_neon)
vmov q10, q2
vmov q11, q3
+ adr ip, .Lrol8_table
mov r3, #10
+ vld1.8 {d10}, [ip, :64]
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@@ -62,9 +92,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@@ -92,9 +122,9 @@ ENTRY(chacha20_block_xor_neon)
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
- veor q4, q3, q0
- vshl.u32 q3, q4, #8
- vsri.u32 q3, q4, #24
+ veor q3, q3, q0
+ vtbl.8 d6, {d6}, d10
+ vtbl.8 d7, {d7}, d10
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
@@ -139,13 +169,17 @@ ENTRY(chacha20_block_xor_neon)
bx lr
ENDPROC(chacha20_block_xor_neon)
+ .align 4
+.Lctrinc: .word 0, 1, 2, 3
+.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
+
.align 5
ENTRY(chacha20_4block_xor_neon)
- push {r4-r6, lr}
- mov ip, sp // preserve the stack pointer
- sub r3, sp, #0x20 // allocate a 32 byte buffer
- bic r3, r3, #0x1f // aligned to 32 bytes
- mov sp, r3
+ push {r4-r5}
+ mov r4, sp // preserve the stack pointer
+ sub ip, sp, #0x20 // allocate a 32 byte buffer
+ bic ip, ip, #0x1f // aligned to 32 bytes
+ mov sp, ip
// r0: Input state matrix, s
// r1: 4 data blocks output, o
@@ -155,25 +189,24 @@ ENTRY(chacha20_4block_xor_neon)
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
- // requires no word shuffling. For final XORing step we transpose the
- // matrix by interleaving 32- and then 64-bit words, which allows us to
- // do XOR in NEON registers.
+ // requires no word shuffling. The words are re-interleaved before the
+ // final addition of the original state and the XORing step.
//
- // x0..15[0-3] = s0..3[0..3]
- add r3, r0, #0x20
+ // x0..15[0-3] = s0..15[0-3]
+ add ip, r0, #0x20
vld1.32 {q0-q1}, [r0]
- vld1.32 {q2-q3}, [r3]
+ vld1.32 {q2-q3}, [ip]
- adr r3, CTRINC
+ adr r5, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
- vld1.32 {q11}, [r3, :128]
+ vld1.32 {q4}, [r5, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
- vadd.i32 q12, q12, q11 // x12 += counter values 0-3
vdup.32 q11, d5[1]
vdup.32 q10, d5[0]
+ vadd.u32 q12, q12, q4 // x12 += counter values 0-3
vdup.32 q9, d4[1]
vdup.32 q8, d4[0]
vdup.32 q7, d3[1]
@@ -185,9 +218,13 @@ ENTRY(chacha20_4block_xor_neon)
vdup.32 q1, d0[1]
vdup.32 q0, d0[0]
+ adr ip, .Lrol8_table
mov r3, #10
+ b 1f
.Ldoubleround4:
+ vld1.32 {q8-q9}, [sp, :256]
+1:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
@@ -236,24 +273,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q4
vadd.i32 q1, q1, q5
vadd.i32 q2, q2, q6
vadd.i32 q3, q3, q7
- veor q8, q12, q0
- veor q9, q13, q1
- vshl.u32 q12, q8, #8
- vshl.u32 q13, q9, #8
- vsri.u32 q12, q8, #24
- vsri.u32 q13, q9, #24
+ veor q12, q12, q0
+ veor q13, q13, q1
+ veor q14, q14, q2
+ veor q15, q15, q3
- veor q8, q14, q2
- veor q9, q15, q3
- vshl.u32 q14, q8, #8
- vshl.u32 q15, q9, #8
- vsri.u32 q14, q8, #24
- vsri.u32 q15, q9, #24
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
vld1.32 {q8-q9}, [sp, :256]
@@ -332,24 +370,25 @@ ENTRY(chacha20_4block_xor_neon)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vld1.8 {d16}, [ip, :64]
vadd.i32 q0, q0, q5
vadd.i32 q1, q1, q6
vadd.i32 q2, q2, q7
vadd.i32 q3, q3, q4
- veor q8, q15, q0
- veor q9, q12, q1
- vshl.u32 q15, q8, #8
- vshl.u32 q12, q9, #8
- vsri.u32 q15, q8, #24
- vsri.u32 q12, q9, #24
+ veor q15, q15, q0
+ veor q12, q12, q1
+ veor q13, q13, q2
+ veor q14, q14, q3
- veor q8, q13, q2
- veor q9, q14, q3
- vshl.u32 q13, q8, #8
- vshl.u32 q14, q9, #8
- vsri.u32 q13, q8, #24
- vsri.u32 q14, q9, #24
+ vtbl.8 d30, {d30}, d16
+ vtbl.8 d31, {d31}, d16
+ vtbl.8 d24, {d24}, d16
+ vtbl.8 d25, {d25}, d16
+ vtbl.8 d26, {d26}, d16
+ vtbl.8 d27, {d27}, d16
+ vtbl.8 d28, {d28}, d16
+ vtbl.8 d29, {d29}, d16
vld1.32 {q8-q9}, [sp, :256]
@@ -379,104 +418,76 @@ ENTRY(chacha20_4block_xor_neon)
vsri.u32 q6, q9, #25
subs r3, r3, #1
- beq 0f
-
- vld1.32 {q8-q9}, [sp, :256]
- b .Ldoubleround4
-
- // x0[0-3] += s0[0]
- // x1[0-3] += s0[1]
- // x2[0-3] += s0[2]
- // x3[0-3] += s0[3]
-0: ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q0, q0, q8
- vadd.i32 q1, q1, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q2, q2, q8
- vadd.i32 q3, q3, q9
-
- // x4[0-3] += s1[0]
- // x5[0-3] += s1[1]
- // x6[0-3] += s1[2]
- // x7[0-3] += s1[3]
- ldmia r0!, {r3-r6}
- vdup.32 q8, r3
- vdup.32 q9, r4
- vadd.i32 q4, q4, q8
- vadd.i32 q5, q5, q9
- vdup.32 q8, r5
- vdup.32 q9, r6
- vadd.i32 q6, q6, q8
- vadd.i32 q7, q7, q9
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q0, q1
- vzip.32 q2, q3
- vzip.32 q4, q5
- vzip.32 q6, q7
-
- // interleave 64-bit words in state n, n+2
+ bne .Ldoubleround4
+
+ // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
+ // x8..9[0-3] are on the stack.
+
+ // Re-interleave the words in the first two rows of each block (x0..7).
+ // Also add the counter values 0-3 to x12[0-3].
+ vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
+ vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
+ vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
+ vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
+ vadd.u32 q12, q8 // x12 += counter values 0-3
vswp d1, d4
vswp d3, d6
+ vld1.32 {q8-q9}, [r0]! // load s0..7
vswp d9, d12
vswp d11, d14
- // xor with corresponding input, write to output
+ // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
+ // after XORing the first 32 bytes.
+ vswp q1, q4
+
+ // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
+
+ // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
+ vadd.u32 q0, q0, q8
+ vadd.u32 q2, q2, q8
+ vadd.u32 q4, q4, q8
+ vadd.u32 q3, q3, q8
+
+ // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
+ vadd.u32 q1, q1, q9
+ vadd.u32 q6, q6, q9
+ vadd.u32 q5, q5, q9
+ vadd.u32 q7, q7, q9
+
+ // XOR first 32 bytes using keystream from first two rows of first block
vld1.8 {q8-q9}, [r2]!
veor q8, q8, q0
- veor q9, q9, q4
+ veor q9, q9, q1
vst1.8 {q8-q9}, [r1]!
+ // Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
-
- // x8[0-3] += s2[0]
- // x9[0-3] += s2[1]
- // x10[0-3] += s2[2]
- // x11[0-3] += s2[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- vadd.i32 q8, q8, q0
- vadd.i32 q9, q9, q4
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q10, q10, q0
- vadd.i32 q11, q11, q4
-
- // x12[0-3] += s3[0]
- // x13[0-3] += s3[1]
- // x14[0-3] += s3[2]
- // x15[0-3] += s3[3]
- ldmia r0!, {r3-r6}
- vdup.32 q0, r3
- vdup.32 q4, r4
- adr r3, CTRINC
- vadd.i32 q12, q12, q0
- vld1.32 {q0}, [r3, :128]
- vadd.i32 q13, q13, q4
- vadd.i32 q12, q12, q0 // x12 += counter values 0-3
-
- vdup.32 q0, r5
- vdup.32 q4, r6
- vadd.i32 q14, q14, q0
- vadd.i32 q15, q15, q4
-
- // interleave 32-bit words in state n, n+1
- vzip.32 q8, q9
- vzip.32 q10, q11
- vzip.32 q12, q13
- vzip.32 q14, q15
-
- // interleave 64-bit words in state n, n+2
- vswp d17, d20
- vswp d19, d22
+ vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
+ vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
+ vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
+ vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
+ vld1.32 {q0-q1}, [r0] // load s8..15
vswp d25, d28
vswp d27, d30
+ vswp d17, d20
+ vswp d19, d22
+
+ // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
+
+ // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
+ vadd.u32 q8, q8, q0
+ vadd.u32 q10, q10, q0
+ vadd.u32 q9, q9, q0
+ vadd.u32 q11, q11, q0
+
+ // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
+ vadd.u32 q12, q12, q1
+ vadd.u32 q14, q14, q1
+ vadd.u32 q13, q13, q1
+ vadd.u32 q15, q15, q1
- vmov q4, q1
+ // XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q8
@@ -509,13 +520,11 @@ ENTRY(chacha20_4block_xor_neon)
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
+ mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
- mov sp, ip
- pop {r4-r6, pc}
+ pop {r4-r5}
+ bx lr
ENDPROC(chacha20_4block_xor_neon)
-
- .align 4
-CTRINC: .word 0, 1, 2, 3
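
The comment block added to chacha20-neon-core.S above explains why the 8-bit rotation now goes through vtbl.8 and the .Lrol8_table byte pattern. A minimal user-space sketch of that equivalence, assuming a little-endian host as on ARM (rotl32 and rotl8_via_table are illustrative names, not from the patch): permuting the bytes of each 32-bit lane with the table { 3, 0, 1, 2 } is the same as rotating the lane left by 8; the table in the patch repeats the pattern ({ 3, 0, 1, 2, 7, 4, 5, 6 }) because one d register holds two lanes.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t rotl32(uint32_t x, unsigned int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Emulate vtbl.8 on a single 32-bit lane: out byte i = in byte table[i]. */
static uint32_t rotl8_via_table(uint32_t x)
{
	static const uint8_t table[4] = { 3, 0, 1, 2 };
	uint8_t in[4], out[4];
	uint32_t res;
	int i;

	memcpy(in, &x, sizeof(in));	/* little-endian byte view, as on ARM */
	for (i = 0; i < 4; i++)
		out[i] = in[table[i]];
	memcpy(&res, out, sizeof(res));
	return res;
}

int main(void)
{
	uint32_t x = 0x12345678;

	/* prints the same value twice on a little-endian host */
	printf("%08x %08x\n", rotl32(x, 8), rotl8_via_table(x));
	return 0;
}

The 4-block kernel's comment also says the words are re-interleaved before the final addition and XOR. The vzip.32 followed by vswp sequence amounts to a 4x4 transpose of 32-bit words, turning "word i of blocks 0..3" rows into "words 0..3 of block i" rows. A sketch modelling those two instructions in C (vzip32 and vswp64 are stand-in helpers, not real intrinsics):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* vzip.32 Qa, Qb: interleave the 32-bit lanes of two 4-lane vectors. */
static void vzip32(uint32_t a[4], uint32_t b[4])
{
	uint32_t ta[4], tb[4];

	ta[0] = a[0]; ta[1] = b[0]; ta[2] = a[1]; ta[3] = b[1];
	tb[0] = a[2]; tb[1] = b[2]; tb[2] = a[3]; tb[3] = b[3];
	memcpy(a, ta, sizeof(ta));
	memcpy(b, tb, sizeof(tb));
}

/* vswp Dx, Dy: swap two 64-bit halves (modelled as two 32-bit lanes). */
static void vswp64(uint32_t *x, uint32_t *y)
{
	uint32_t t[2];

	memcpy(t, x, sizeof(t));
	memcpy(x, y, sizeof(t));
	memcpy(y, t, sizeof(t));
}

int main(void)
{
	/* row r holds word r of blocks 0..3, value 0xWB = word W, block B */
	uint32_t q0[4] = { 0x00, 0x01, 0x02, 0x03 };
	uint32_t q1[4] = { 0x10, 0x11, 0x12, 0x13 };
	uint32_t q2[4] = { 0x20, 0x21, 0x22, 0x23 };
	uint32_t q3[4] = { 0x30, 0x31, 0x32, 0x33 };
	int i;

	vzip32(q0, q1);
	vzip32(q2, q3);
	vswp64(&q0[2], &q2[0]);		/* vswp d1, d4 */
	vswp64(&q1[2], &q3[0]);		/* vswp d3, d6 */

	/* each printed row is now words 0..3 of one block (0, 1, 2, 3) */
	for (i = 0; i < 4; i++) {
		const uint32_t *q[4] = { q0, q2, q1, q3 };

		printf("%02x %02x %02x %02x\n",
		       q[i][0], q[i][1], q[i][2], q[i][3]);
	}
	return 0;
}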
diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c
index 96e62ec105d0..cd9e93b46c2d 100644
--- a/arch/arm/crypto/crc32-ce-glue.c
+++ b/arch/arm/crypto/crc32-ce-glue.c
@@ -236,7 +236,7 @@ static void __exit crc32_pmull_mod_exit(void)
ARRAY_SIZE(crc32_pmull_algs));
}
-static const struct cpu_feature crc32_cpu_feature[] = {
+static const struct cpu_feature __maybe_unused crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index 2f78c10b1881..406009afa9cf 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -63,6 +63,33 @@
k48 .req d31
SHASH2_p64 .req d31
+ HH .req q10
+ HH3 .req q11
+ HH4 .req q12
+ HH34 .req q13
+
+ HH_L .req d20
+ HH_H .req d21
+ HH3_L .req d22
+ HH3_H .req d23
+ HH4_L .req d24
+ HH4_H .req d25
+ HH34_L .req d26
+ HH34_H .req d27
+ SHASH2_H .req d29
+
+ XL2 .req q5
+ XM2 .req q6
+ XH2 .req q7
+ T3 .req q8
+
+ XL2_L .req d10
+ XL2_H .req d11
+ XM2_L .req d12
+ XM2_H .req d13
+ T3_L .req d16
+ T3_H .req d17
+
.text
.fpu crypto-neon-fp-armv8
@@ -175,12 +202,77 @@
beq 0f
vld1.64 {T1}, [ip]
teq r0, #0
- b 1f
+ b 3f
+
+0: .ifc \pn, p64
+ tst r0, #3 // skip until #blocks is a
+ bne 2f // round multiple of 4
+
+ vld1.8 {XL2-XM2}, [r2]!
+1: vld1.8 {T3-T2}, [r2]!
+ vrev64.8 XL2, XL2
+ vrev64.8 XM2, XM2
+
+ subs r0, r0, #4
+
+ vext.8 T1, XL2, XL2, #8
+ veor XL2_H, XL2_H, XL_L
+ veor XL, XL, T1
+
+ vrev64.8 T3, T3
+ vrev64.8 T1, T2
+
+ vmull.p64 XH, HH4_H, XL_H // a1 * b1
+ veor XL2_H, XL2_H, XL_H
+ vmull.p64 XL, HH4_L, XL_L // a0 * b0
+ vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0)
+
+ vmull.p64 XH2, HH3_H, XM2_L // a1 * b1
+ veor XM2_L, XM2_L, XM2_H
+ vmull.p64 XL2, HH3_L, XM2_H // a0 * b0
+ vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, HH_H, T3_L // a1 * b1
+ veor T3_L, T3_L, T3_H
+ vmull.p64 XL2, HH_L, T3_H // a0 * b0
+ vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
+
+ vmull.p64 XH2, SHASH_H, T1_L // a1 * b1
+ veor T1_L, T1_L, T1_H
+ vmull.p64 XL2, SHASH_L, T1_H // a0 * b0
+ vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
+
+ veor XH, XH, XH2
+ veor XL, XL, XL2
+ veor XM, XM, XM2
-0: vld1.64 {T1}, [r2]!
+ beq 4f
+
+ vld1.8 {XL2-XM2}, [r2]!
+
+ veor T1, XL, XH
+ veor XM, XM, T1
+
+ __pmull_reduce_p64
+
+ veor T1, T1, XH
+ veor XL, XL, T1
+
+ b 1b
+ .endif
+
+2: vld1.64 {T1}, [r2]!
subs r0, r0, #1
-1: /* multiply XL by SHASH in GF(2^128) */
+3: /* multiply XL by SHASH in GF(2^128) */
#ifndef CONFIG_CPU_BIG_ENDIAN
vrev64.8 T1, T1
#endif
@@ -193,7 +285,7 @@
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
- veor T1, XL, XH
+4: veor T1, XL, XH
veor XM, XM, T1
__pmull_reduce_\pn
@@ -212,8 +304,14 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
- vld1.64 {SHASH}, [r3]
+ vld1.64 {SHASH}, [r3]!
+ vld1.64 {HH}, [r3]!
+ vld1.64 {HH3-HH4}, [r3]
+
veor SHASH2_p64, SHASH_L, SHASH_H
+ veor SHASH2_H, HH_L, HH_H
+ veor HH34_L, HH3_L, HH3_H
+ veor HH34_H, HH4_L, HH4_H
vmov.i8 MASK, #0xe1
vshl.u64 MASK, MASK, #57
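
The per-block multiplications in the new 4-way p64 path follow the three-multiply decomposition spelled out in its comments ("a1 * b1", "a0 * b0", "(a1 + a0)(b1 + b0)"), i.e. a Karatsuba-style split of each 128-bit carry-less product into three vmull.p64 operations on 64-bit halves. A self-contained sketch of the same decomposition one size down, so it stays in portable C (a 64-bit carry-less product from three 32x32 carry-less multiplies; clmul64_ref, clmul32 and clmul64_karatsuba are illustrative helpers, not kernel APIs), checked against a plain bit-by-bit reference:

#include <stdint.h>
#include <stdio.h>

/* reference: bit-by-bit carry-less 64x64 -> 128 multiply */
static void clmul64_ref(uint64_t a, uint64_t b, uint64_t r[2])
{
	int i;

	r[0] = r[1] = 0;
	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			r[0] ^= a << i;
			if (i)
				r[1] ^= a >> (64 - i);
		}
	}
}

/* carry-less 32x32 -> 64 multiply (stand-in for one polynomial multiply) */
static uint64_t clmul32(uint32_t a, uint32_t b)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 32; i++)
		if ((b >> i) & 1)
			r ^= (uint64_t)a << i;
	return r;
}

/* Karatsuba: three half-width multiplies instead of four */
static void clmul64_karatsuba(uint64_t a, uint64_t b, uint64_t r[2])
{
	uint32_t a1 = a >> 32, a0 = (uint32_t)a;
	uint32_t b1 = b >> 32, b0 = (uint32_t)b;
	uint64_t hi  = clmul32(a1, b1);			/* a1 * b1 */
	uint64_t lo  = clmul32(a0, b0);			/* a0 * b0 */
	uint64_t mid = clmul32(a1 ^ a0, b1 ^ b0)	/* (a1 + a0)(b1 + b0) */
		       ^ hi ^ lo;

	r[0] = lo ^ (mid << 32);
	r[1] = hi ^ (mid >> 32);
}

int main(void)
{
	uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
	uint64_t r1[2], r2[2];

	clmul64_ref(a, b, r1);
	clmul64_karatsuba(a, b, r2);
	/* prints the same 128-bit value twice */
	printf("%016llx%016llx\n%016llx%016llx\n",
	       (unsigned long long)r1[1], (unsigned long long)r1[0],
	       (unsigned long long)r2[1], (unsigned long long)r2[0]);
	return 0;
}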
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 8930fc4e7c22..b7d30b6cf49c 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
*
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GHASH_DIGEST_SIZE 16
struct ghash_key {
- u64 a;
- u64 b;
+ u64 h[2];
+ u64 h2[2];
+ u64 h3[2];
+ u64 h4[2];
};
struct ghash_desc_ctx {
@@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
return 0;
}
+static void ghash_reflect(u64 h[], const be128 *k)
+{
+ u64 carry = be64_to_cpu(k->a) >> 63;
+
+ h[0] = (be64_to_cpu(k->b) << 1) | carry;
+ h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
+
+ if (carry)
+ h[1] ^= 0xc200000000000000UL;
+}
+
static int ghash_setkey(struct crypto_shash *tfm,
const u8 *inkey, unsigned int keylen)
{
struct ghash_key *key = crypto_shash_ctx(tfm);
- u64 a, b;
+ be128 h, k;
if (keylen != GHASH_BLOCK_SIZE) {
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
- /* perform multiplication by 'x' in GF(2^128) */
- b = get_unaligned_be64(inkey);
- a = get_unaligned_be64(inkey + 8);
+ memcpy(&k, inkey, GHASH_BLOCK_SIZE);
+ ghash_reflect(key->h, &k);
+
+ h = k;
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h2, &h);
- key->a = (a << 1) | (b >> 63);
- key->b = (b << 1) | (a >> 63);
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h3, &h);
- if (b >> 63)
- key->b ^= 0xc200000000000000UL;
+ gf128mul_lle(&h, &k);
+ ghash_reflect(key->h4, &h);
return 0;
}
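
Together, the two ghash-ce changes implement aggregated reduction: ghash_setkey() now derives H^2, H^3 and H^4 with gf128mul_lle() (hence the new "select CRYPTO_GF128MUL" in Kconfig), and the assembly folds four blocks per iteration against those precomputed powers. The sketch below checks the identity this relies on, namely that ((((Y ^ X1)*H ^ X2)*H ^ X3)*H ^ X4)*H equals (Y ^ X1)*H^4 ^ X2*H^3 ^ X3*H^2 ^ X4*H. To stay self-contained it substitutes GF(2^8) with the AES polynomial for the GF(2^128) field GHASH really uses; that swap is purely illustrative and does not change the algebra being demonstrated (gmul and all constants here are made up for the demo).

#include <stdint.h>
#include <stdio.h>

/* carry-less multiply in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1 */
static uint8_t gmul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b) {
		if (b & 1)
			p ^= a;
		b >>= 1;
		a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
	}
	return p;
}

int main(void)
{
	uint8_t h = 0x57, y0 = 0xc9;
	uint8_t x[4] = { 0x11, 0x22, 0x33, 0x44 };
	/* setkey-style precomputation of the key powers */
	uint8_t h2 = gmul(h, h), h3 = gmul(h2, h), h4 = gmul(h3, h);
	uint8_t serial = y0, aggregated;
	int i;

	/* one block at a time: Y = (Y ^ X[i]) * H */
	for (i = 0; i < 4; i++)
		serial = gmul(serial ^ x[i], h);

	/* four blocks at once, as in the new 4-way code path */
	aggregated = gmul(y0 ^ x[0], h4) ^ gmul(x[1], h3) ^
		     gmul(x[2], h2) ^ gmul(x[3], h);

	/* prints the same byte twice */
	printf("%02x %02x\n", serial, aggregated);
	return 0;
}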
diff --git a/arch/arm/crypto/speck-neon-core.S b/arch/arm/crypto/speck-neon-core.S
deleted file mode 100644
index 57caa742016e..000000000000
--- a/arch/arm/crypto/speck-neon-core.S
+++ /dev/null
@@ -1,434 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Author: Eric Biggers <ebiggers@google.com>
- */
-
-#include <linux/linkage.h>
-
- .text
- .fpu neon
-
- // arguments
- ROUND_KEYS .req r0 // const {u64,u32} *round_keys
- NROUNDS .req r1 // int nrounds
- DST .req r2 // void *dst
- SRC .req r3 // const void *src
- NBYTES .req r4 // unsigned int nbytes
- TWEAK .req r5 // void *tweak
-
- // registers which hold the data being encrypted/decrypted
- X0 .req q0
- X0_L .req d0
- X0_H .req d1
- Y0 .req q1
- Y0_H .req d3
- X1 .req q2
- X1_L .req d4
- X1_H .req d5
- Y1 .req q3
- Y1_H .req d7
- X2 .req q4
- X2_L .req d8
- X2_H .req d9
- Y2 .req q5
- Y2_H .req d11
- X3 .req q6
- X3_L .req d12
- X3_H .req d13
- Y3 .req q7
- Y3_H .req d15
-
- // the round key, duplicated in all lanes
- ROUND_KEY .req q8
- ROUND_KEY_L .req d16
- ROUND_KEY_H .req d17
-
- // index vector for vtbl-based 8-bit rotates
- ROTATE_TABLE .req d18
-
- // multiplication table for updating XTS tweaks
- GF128MUL_TABLE .req d19
- GF64MUL_TABLE .req d19
-
- // current XTS tweak value(s)
- TWEAKV .req q10
- TWEAKV_L .req d20
- TWEAKV_H .req d21
-
- TMP0 .req q12
- TMP0_L .req d24
- TMP0_H .req d25
- TMP1 .req q13
- TMP2 .req q14
- TMP3 .req q15
-
- .align 4
-.Lror64_8_table:
- .byte 1, 2, 3, 4, 5, 6, 7, 0
-.Lror32_8_table:
- .byte 1, 2, 3, 0, 5, 6, 7, 4
-.Lrol64_8_table:
- .byte 7, 0, 1, 2, 3, 4, 5, 6
-.Lrol32_8_table:
- .byte 3, 0, 1, 2, 7, 4, 5, 6
-.Lgf128mul_table:
- .byte 0, 0x87
- .fill 14
-.Lgf64mul_table:
- .byte 0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
- .fill 12
-
-/*
- * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
- *
- * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
- * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
- * of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
- *
- * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
- * the vtbl approach is faster on some processors and the same speed on others.
- */
-.macro _speck_round_128bytes n
-
- // x = ror(x, 8)
- vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
- vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
- vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
- vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
- vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
- vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
- vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
- vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
-
- // x += y
- vadd.u\n X0, Y0
- vadd.u\n X1, Y1
- vadd.u\n X2, Y2
- vadd.u\n X3, Y3
-
- // x ^= k
- veor X0, ROUND_KEY
- veor X1, ROUND_KEY
- veor X2, ROUND_KEY
- veor X3, ROUND_KEY
-
- // y = rol(y, 3)
- vshl.u\n TMP0, Y0, #3
- vshl.u\n TMP1, Y1, #3
- vshl.u\n TMP2, Y2, #3
- vshl.u\n TMP3, Y3, #3
- vsri.u\n TMP0, Y0, #(\n - 3)
- vsri.u\n TMP1, Y1, #(\n - 3)
- vsri.u\n TMP2, Y2, #(\n - 3)
- vsri.u\n TMP3, Y3, #(\n - 3)
-
- // y ^= x
- veor Y0, TMP0, X0
- veor Y1, TMP1, X1
- veor Y2, TMP2, X2
- veor Y3, TMP3, X3
-.endm
-
-/*
- * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
- *
- * This is the inverse of _speck_round_128bytes().
- */
-.macro _speck_unround_128bytes n
-
- // y ^= x
- veor TMP0, Y0, X0
- veor TMP1, Y1, X1
- veor TMP2, Y2, X2
- veor TMP3, Y3, X3
-
- // y = ror(y, 3)
- vshr.u\n Y0, TMP0, #3
- vshr.u\n Y1, TMP1, #3
- vshr.u\n Y2, TMP2, #3
- vshr.u\n Y3, TMP3, #3
- vsli.u\n Y0, TMP0, #(\n - 3)
- vsli.u\n Y1, TMP1, #(\n - 3)
- vsli.u\n Y2, TMP2, #(\n - 3)
- vsli.u\n Y3, TMP3, #(\n - 3)
-
- // x ^= k
- veor X0, ROUND_KEY
- veor X1, ROUND_KEY
- veor X2, ROUND_KEY
- veor X3, ROUND_KEY
-
- // x -= y
- vsub.u\n X0, Y0
- vsub.u\n X1, Y1
- vsub.u\n X2, Y2
- vsub.u\n X3, Y3
-
- // x = rol(x, 8);
- vtbl.8 X0_L, {X0_L}, ROTATE_TABLE
- vtbl.8 X0_H, {X0_H}, ROTATE_TABLE
- vtbl.8 X1_L, {X1_L}, ROTATE_TABLE
- vtbl.8 X1_H, {X1_H}, ROTATE_TABLE
- vtbl.8 X2_L, {X2_L}, ROTATE_TABLE
- vtbl.8 X2_H, {X2_H}, ROTATE_TABLE
- vtbl.8 X3_L, {X3_L}, ROTATE_TABLE
- vtbl.8 X3_H, {X3_H}, ROTATE_TABLE
-.endm
-
-.macro _xts128_precrypt_one dst_reg, tweak_buf, tmp
-
- // Load the next source block
- vld1.8 {\dst_reg}, [SRC]!
-
- // Save the current tweak in the tweak buffer
- vst1.8 {TWEAKV}, [\tweak_buf:128]!
-
- // XOR the next source block with the current tweak
- veor \dst_reg, TWEAKV
-
- /*
- * Calculate the next tweak by multiplying the current one by x,
- * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
- */
- vshr.u64 \tmp, TWEAKV, #63
- vshl.u64 TWEAKV, #1
- veor TWEAKV_H, \tmp\()_L
- vtbl.8 \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
- veor TWEAKV_L, \tmp\()_H
-.endm
-
-.macro _xts64_precrypt_two dst_reg, tweak_buf, tmp
-
- // Load the next two source blocks
- vld1.8 {\dst_reg}, [SRC]!
-
- // Save the current two tweaks in the tweak buffer
- vst1.8 {TWEAKV}, [\tweak_buf:128]!
-
- // XOR the next two source blocks with the current two tweaks
- veor \dst_reg, TWEAKV
-
- /*
- * Calculate the next two tweaks by multiplying the current ones by x^2,
- * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
- */
- vshr.u64 \tmp, TWEAKV, #62
- vshl.u64 TWEAKV, #2
- vtbl.8 \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
- vtbl.8 \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
- veor TWEAKV, \tmp
-.endm
-
-/*
- * _speck_xts_crypt() - Speck-XTS encryption/decryption
- *
- * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
- * using Speck-XTS, specifically the variant with a block size of '2n' and round
- * count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
- * the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
- * nonzero multiple of 128.
- */
-.macro _speck_xts_crypt n, decrypting
- push {r4-r7}
- mov r7, sp
-
- /*
- * The first four parameters were passed in registers r0-r3. Load the
- * additional parameters, which were passed on the stack.
- */
- ldr NBYTES, [sp, #16]
- ldr TWEAK, [sp, #20]
-
- /*
- * If decrypting, modify the ROUND_KEYS parameter to point to the last
- * round key rather than the first, since for decryption the round keys
- * are used in reverse order.
- */
-.if \decrypting
-.if \n == 64
- add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
- sub ROUND_KEYS, #8
-.else
- add ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
- sub ROUND_KEYS, #4
-.endif
-.endif
-
- // Load the index vector for vtbl-based 8-bit rotates
-.if \decrypting
- ldr r12, =.Lrol\n\()_8_table
-.else
- ldr r12, =.Lror\n\()_8_table
-.endif
- vld1.8 {ROTATE_TABLE}, [r12:64]
-
- // One-time XTS preparation
-
- /*
- * Allocate stack space to store 128 bytes worth of tweaks. For
- * performance, this space is aligned to a 16-byte boundary so that we
- * can use the load/store instructions that declare 16-byte alignment.
- * For Thumb2 compatibility, don't do the 'bic' directly on 'sp'.
- */
- sub r12, sp, #128
- bic r12, #0xf
- mov sp, r12
-
-.if \n == 64
- // Load first tweak
- vld1.8 {TWEAKV}, [TWEAK]
-
- // Load GF(2^128) multiplication table
- ldr r12, =.Lgf128mul_table
- vld1.8 {GF128MUL_TABLE}, [r12:64]
-.else
- // Load first tweak
- vld1.8 {TWEAKV_L}, [TWEAK]
-
- // Load GF(2^64) multiplication table
- ldr r12, =.Lgf64mul_table
- vld1.8 {GF64MUL_TABLE}, [r12:64]
-
- // Calculate second tweak, packing it together with the first
- vshr.u64 TMP0_L, TWEAKV_L, #63
- vtbl.u8 TMP0_L, {GF64MUL_TABLE}, TMP0_L
- vshl.u64 TWEAKV_H, TWEAKV_L, #1
- veor TWEAKV_H, TMP0_L
-.endif
-
-.Lnext_128bytes_\@:
-
- /*
- * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
- * values, and save the tweaks on the stack for later. Then
- * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
- * that the X[0-3] registers contain only the second halves of blocks,
- * and the Y[0-3] registers contain only the first halves of blocks.
- * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
- */
- mov r12, sp
-.if \n == 64
- _xts128_precrypt_one X0, r12, TMP0
- _xts128_precrypt_one Y0, r12, TMP0
- _xts128_precrypt_one X1, r12, TMP0
- _xts128_precrypt_one Y1, r12, TMP0
- _xts128_precrypt_one X2, r12, TMP0
- _xts128_precrypt_one Y2, r12, TMP0
- _xts128_precrypt_one X3, r12, TMP0
- _xts128_precrypt_one Y3, r12, TMP0
- vswp X0_L, Y0_H
- vswp X1_L, Y1_H
- vswp X2_L, Y2_H
- vswp X3_L, Y3_H
-.else
- _xts64_precrypt_two X0, r12, TMP0
- _xts64_precrypt_two Y0, r12, TMP0
- _xts64_precrypt_two X1, r12, TMP0
- _xts64_precrypt_two Y1, r12, TMP0
- _xts64_precrypt_two X2, r12, TMP0
- _xts64_precrypt_two Y2, r12, TMP0
- _xts64_precrypt_two X3, r12, TMP0
- _xts64_precrypt_two Y3, r12, TMP0
- vuzp.32 Y0, X0
- vuzp.32 Y1, X1
- vuzp.32 Y2, X2
- vuzp.32 Y3, X3
-.endif
-
- // Do the cipher rounds
-
- mov r12, ROUND_KEYS
- mov r6, NROUNDS
-
-.Lnext_round_\@:
-.if \decrypting
-.if \n == 64
- vld1.64 ROUND_KEY_L, [r12]
- sub r12, #8
- vmov ROUND_KEY_H, ROUND_KEY_L
-.else
- vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
- sub r12, #4
-.endif
- _speck_unround_128bytes \n
-.else
-.if \n == 64
- vld1.64 ROUND_KEY_L, [r12]!
- vmov ROUND_KEY_H, ROUND_KEY_L
-.else
- vld1.32 {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
-.endif
- _speck_round_128bytes \n
-.endif
- subs r6, r6, #1
- bne .Lnext_round_\@
-
- // Re-interleave the 'x' and 'y' elements of each block
-.if \n == 64
- vswp X0_L, Y0_H
- vswp X1_L, Y1_H
- vswp X2_L, Y2_H
- vswp X3_L, Y3_H
-.else
- vzip.32 Y0, X0
- vzip.32 Y1, X1
- vzip.32 Y2, X2
- vzip.32 Y3, X3
-.endif
-
- // XOR the encrypted/decrypted blocks with the tweaks we saved earlier
- mov r12, sp
- vld1.8 {TMP0, TMP1}, [r12:128]!
- vld1.8 {TMP2, TMP3}, [r12:128]!
- veor X0, TMP0
- veor Y0, TMP1
- veor X1, TMP2
- veor Y1, TMP3
- vld1.8 {TMP0, TMP1}, [r12:128]!
- vld1.8 {TMP2, TMP3}, [r12:128]!
- veor X2, TMP0
- veor Y2, TMP1
- veor X3, TMP2
- veor Y3, TMP3
-
- // Store the ciphertext in the destination buffer
- vst1.8 {X0, Y0}, [DST]!
- vst1.8 {X1, Y1}, [DST]!
- vst1.8 {X2, Y2}, [DST]!
- vst1.8 {X3, Y3}, [DST]!
-
- // Continue if there are more 128-byte chunks remaining, else return
- subs NBYTES, #128
- bne .Lnext_128bytes_\@
-
- // Store the next tweak
-.if \n == 64
- vst1.8 {TWEAKV}, [TWEAK]
-.else
- vst1.8 {TWEAKV_L}, [TWEAK]
-.endif
-
- mov sp, r7
- pop {r4-r7}
- bx lr
-.endm
-
-ENTRY(speck128_xts_encrypt_neon)
- _speck_xts_crypt n=64, decrypting=0
-ENDPROC(speck128_xts_encrypt_neon)
-
-ENTRY(speck128_xts_decrypt_neon)
- _speck_xts_crypt n=64, decrypting=1
-ENDPROC(speck128_xts_decrypt_neon)
-
-ENTRY(speck64_xts_encrypt_neon)
- _speck_xts_crypt n=32, decrypting=0
-ENDPROC(speck64_xts_encrypt_neon)
-
-ENTRY(speck64_xts_decrypt_neon)
- _speck_xts_crypt n=32, decrypting=1
-ENDPROC(speck64_xts_decrypt_neon)
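
In the deleted speck-neon-core.S, _xts128_precrypt_one advances the XTS tweak by multiplying it by x modulo x^128 + x^7 + x^2 + x + 1, using the { 0, 0x87 } lookup table to fold the carry back into the low limb; the scalar fallback in the glue code does the same thing via gf128mul_x_ble(). A minimal sketch of that update, assuming the tweak is held as two little-endian 64-bit limbs (xts_mul_x is an illustrative name, not a kernel function):

#include <stdint.h>
#include <stdio.h>

/* t = t * x mod x^128 + x^7 + x^2 + x + 1; t[0] = low limb, t[1] = high */
static void xts_mul_x(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit that falls off the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* fold in x^7 + x^2 + x + 1 */
}

int main(void)
{
	uint64_t t[2] = { 0x8000000000000000ULL, 0x8000000000000000ULL };

	xts_mul_x(t);
	/* prints 0000000000000001 0000000000000087 */
	printf("%016llx %016llx\n",
	       (unsigned long long)t[1], (unsigned long long)t[0]);
	return 0;
}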
diff --git a/arch/arm/crypto/speck-neon-glue.c b/arch/arm/crypto/speck-neon-glue.c
deleted file mode 100644
index f012c3ea998f..000000000000
--- a/arch/arm/crypto/speck-neon-glue.c
+++ /dev/null
@@ -1,288 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
- *
- * Copyright (c) 2018 Google, Inc
- *
- * Note: the NIST recommendation for XTS only specifies a 128-bit block size,
- * but a 64-bit version (needed for Speck64) is fairly straightforward; the math
- * is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial
- * x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004:
- * "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes
- * OCB and PMAC"), represented as 0x1B.
- */
-
-#include <asm/hwcap.h>
-#include <asm/neon.h>
-#include <asm/simd.h>
-#include <crypto/algapi.h>
-#include <crypto/gf128mul.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/speck.h>
-#include <crypto/xts.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-
-/* The assembly functions only handle multiples of 128 bytes */
-#define SPECK_NEON_CHUNK_SIZE 128
-
-/* Speck128 */
-
-struct speck128_xts_tfm_ctx {
- struct speck128_tfm_ctx main_key;
- struct speck128_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck128_xts_crypt(struct skcipher_request *req,
- speck128_crypt_one_t crypt_one,
- speck128_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- le128 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- le128_xor((le128 *)dst, (const le128 *)src, &tweak);
- (*crypt_one)(&ctx->main_key, dst, dst);
- le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
- gf128mul_x_ble(&tweak, &tweak);
-
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck128_xts_encrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_encrypt,
- speck128_xts_encrypt_neon);
-}
-
-static int speck128_xts_decrypt(struct skcipher_request *req)
-{
- return __speck128_xts_crypt(req, crypto_speck128_decrypt,
- speck128_xts_decrypt_neon);
-}
-
-static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-/* Speck64 */
-
-struct speck64_xts_tfm_ctx {
- struct speck64_tfm_ctx main_key;
- struct speck64_tfm_ctx tweak_key;
-};
-
-asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
- void *dst, const void *src,
- unsigned int nbytes, void *tweak);
-
-typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
- u8 *, const u8 *);
-typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
- const void *, unsigned int, void *);
-
-static __always_inline int
-__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
- speck64_xts_crypt_many_t crypt_many)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- struct skcipher_walk walk;
- __le64 tweak;
- int err;
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
-
- while (walk.nbytes > 0) {
- unsigned int nbytes = walk.nbytes;
- u8 *dst = walk.dst.virt.addr;
- const u8 *src = walk.src.virt.addr;
-
- if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
- unsigned int count;
-
- count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
- kernel_neon_begin();
- (*crypt_many)(ctx->main_key.round_keys,
- ctx->main_key.nrounds,
- dst, src, count, &tweak);
- kernel_neon_end();
- dst += count;
- src += count;
- nbytes -= count;
- }
-
- /* Handle any remainder with generic code */
- while (nbytes >= sizeof(tweak)) {
- *(__le64 *)dst = *(__le64 *)src ^ tweak;
- (*crypt_one)(&ctx->main_key, dst, dst);
- *(__le64 *)dst ^= tweak;
- tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
- ((tweak & cpu_to_le64(1ULL << 63)) ?
- 0x1B : 0));
- dst += sizeof(tweak);
- src += sizeof(tweak);
- nbytes -= sizeof(tweak);
- }
- err = skcipher_walk_done(&walk, nbytes);
- }
-
- return err;
-}
-
-static int speck64_xts_encrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_encrypt,
- speck64_xts_encrypt_neon);
-}
-
-static int speck64_xts_decrypt(struct skcipher_request *req)
-{
- return __speck64_xts_crypt(req, crypto_speck64_decrypt,
- speck64_xts_decrypt_neon);
-}
-
-static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
- int err;
-
- err = xts_verify_key(tfm, key, keylen);
- if (err)
- return err;
-
- keylen /= 2;
-
- err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
- if (err)
- return err;
-
- return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
-}
-
-static struct skcipher_alg speck_algs[] = {
- {
- .base.cra_name = "xts(speck128)",
- .base.cra_driver_name = "xts-speck128-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK128_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK128_128_KEY_SIZE,
- .max_keysize = 2 * SPECK128_256_KEY_SIZE,
- .ivsize = SPECK128_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck128_xts_setkey,
- .encrypt = speck128_xts_encrypt,
- .decrypt = speck128_xts_decrypt,
- }, {
- .base.cra_name = "xts(speck64)",
- .base.cra_driver_name = "xts-speck64-neon",
- .base.cra_priority = 300,
- .base.cra_blocksize = SPECK64_BLOCK_SIZE,
- .base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
- .base.cra_alignmask = 7,
- .base.cra_module = THIS_MODULE,
- .min_keysize = 2 * SPECK64_96_KEY_SIZE,
- .max_keysize = 2 * SPECK64_128_KEY_SIZE,
- .ivsize = SPECK64_BLOCK_SIZE,
- .walksize = SPECK_NEON_CHUNK_SIZE,
- .setkey = speck64_xts_setkey,
- .encrypt = speck64_xts_encrypt,
- .decrypt = speck64_xts_decrypt,
- }
-};
-
-static int __init speck_neon_module_init(void)
-{
- if (!(elf_hwcap & HWCAP_NEON))
- return -ENODEV;
- return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-static void __exit speck_neon_module_exit(void)
-{
- crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
-}
-
-module_init(speck_neon_module_init);
-module_exit(speck_neon_module_exit);
-
-MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
-MODULE_ALIAS_CRYPTO("xts(speck128)");
-MODULE_ALIAS_CRYPTO("xts-speck128-neon");
-MODULE_ALIAS_CRYPTO("xts(speck64)");
-MODULE_ALIAS_CRYPTO("xts-speck64-neon");