diff options
author | Will Deacon | 2021-06-24 13:33:57 +0100 |
---|---|---|
committer | Will Deacon | 2021-06-24 13:33:57 +0100 |
commit | 5ceb045541ad979fd304ca2321bf1fbb76189867 (patch) | |
tree | 1dbc577394ea33a53d1f1071cebd93cba2994df3 | |
parent | 25377204ebd4db2048c873b7c68874247a391998 (diff) | |
parent | 6b8f648959e5036695f056a60e3444f4753f643e (diff) |
Merge branch 'for-next/cortex-strings' into for-next/core
Update our kernel string routines to the latest Cortex Strings
implementation.
* for-next/cortex-strings:
arm64: update string routine copyrights and URLs
arm64: Rewrite __arch_clear_user()
arm64: Better optimised memchr()
arm64: Import latest memcpy()/memmove() implementation
arm64: Add assembly annotations for weak-PI-alias madness
arm64: Import latest version of Cortex Strings' strncmp
arm64: Import updated version of Cortex Strings' strlen
arm64: Import latest version of Cortex Strings' strcmp
arm64: Import latest version of Cortex Strings' memcmp
-rw-r--r-- | arch/arm64/include/asm/linkage.h | 8 | ||||
-rw-r--r-- | arch/arm64/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/arm64/lib/clear_user.S | 47 | ||||
-rw-r--r-- | arch/arm64/lib/memchr.S | 65 | ||||
-rw-r--r-- | arch/arm64/lib/memcmp.S | 346 | ||||
-rw-r--r-- | arch/arm64/lib/memcpy.S | 272 | ||||
-rw-r--r-- | arch/arm64/lib/memmove.S | 189 | ||||
-rw-r--r-- | arch/arm64/lib/strcmp.S | 289 | ||||
-rw-r--r-- | arch/arm64/lib/strlen.S | 258 | ||||
-rw-r--r-- | arch/arm64/lib/strncmp.S | 406 |
10 files changed, 915 insertions, 967 deletions
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h index ba89a9af820a..9906541a6861 100644 --- a/arch/arm64/include/asm/linkage.h +++ b/arch/arm64/include/asm/linkage.h @@ -56,8 +56,16 @@ SYM_FUNC_START_ALIAS(__pi_##x); \ SYM_FUNC_START_WEAK(x) +#define SYM_FUNC_START_WEAK_ALIAS_PI(x) \ + SYM_FUNC_START_ALIAS(__pi_##x); \ + SYM_START(x, SYM_L_WEAK, SYM_A_ALIGN) + #define SYM_FUNC_END_PI(x) \ SYM_FUNC_END(x); \ SYM_FUNC_END_ALIAS(__pi_##x) +#define SYM_FUNC_END_ALIAS_PI(x) \ + SYM_FUNC_END_ALIAS(x); \ + SYM_FUNC_END_ALIAS(__pi_##x) + #endif diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index d31e1169d9b8..01c596aa539c 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 lib-y := clear_user.o delay.o copy_from_user.o \ copy_to_user.o copy_in_user.o copy_page.o \ - clear_page.o csum.o memchr.o memcpy.o memmove.o \ + clear_page.o csum.o memchr.o memcpy.o \ memset.o memcmp.o strcmp.o strncmp.o strlen.o \ strnlen.o strchr.o strrchr.o tishift.o diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index af9afcbec92c..a7efb2ad2a1c 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -1,12 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Based on arch/arm/lib/clear_user.S - * - * Copyright (C) 2012 ARM Ltd. + * Copyright (C) 2021 Arm Ltd. */ -#include <linux/linkage.h> -#include <asm/asm-uaccess.h> +#include <linux/linkage.h> #include <asm/assembler.h> .text @@ -19,25 +16,33 @@ * * Alignment fixed up by hardware. */ + + .p2align 4 + // Alignment is for the loop, but since the prologue (including BTI) + // is also 16 bytes we can keep any padding outside the function SYM_FUNC_START(__arch_clear_user) - mov x2, x1 // save the size for fixup return + add x2, x0, x1 subs x1, x1, #8 b.mi 2f 1: -user_ldst 9f, sttr, xzr, x0, 8 +USER(9f, sttr xzr, [x0]) + add x0, x0, #8 subs x1, x1, #8 - b.pl 1b -2: adds x1, x1, #4 - b.mi 3f -user_ldst 9f, sttr, wzr, x0, 4 - sub x1, x1, #4 -3: adds x1, x1, #2 - b.mi 4f -user_ldst 9f, sttrh, wzr, x0, 2 - sub x1, x1, #2 -4: adds x1, x1, #1 - b.mi 5f -user_ldst 9f, sttrb, wzr, x0, 0 + b.hi 1b +USER(9f, sttr xzr, [x2, #-8]) + mov x0, #0 + ret + +2: tbz x1, #2, 3f +USER(9f, sttr wzr, [x0]) +USER(8f, sttr wzr, [x2, #-4]) + mov x0, #0 + ret + +3: tbz x1, #1, 4f +USER(9f, sttrh wzr, [x0]) +4: tbz x1, #0, 5f +USER(7f, sttrb wzr, [x2, #-1]) 5: mov x0, #0 ret SYM_FUNC_END(__arch_clear_user) @@ -45,6 +50,8 @@ EXPORT_SYMBOL(__arch_clear_user) .section .fixup,"ax" .align 2 -9: mov x0, x2 // return the original size +7: sub x0, x2, #5 // Adjust for faulting on the final byte... +8: add x0, x0, #4 // ...or the second word of the 4-7 byte case +9: sub x0, x2, x0 ret .previous diff --git a/arch/arm64/lib/memchr.S b/arch/arm64/lib/memchr.S index edf6b970a277..7c2276fdab54 100644 --- a/arch/arm64/lib/memchr.S +++ b/arch/arm64/lib/memchr.S @@ -1,9 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Based on arch/arm/lib/memchr.S - * - * Copyright (C) 1995-2000 Russell King - * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2021 Arm Ltd. */ #include <linux/linkage.h> @@ -19,16 +16,60 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ + +#define L(label) .L ## label + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#define srcin x0 +#define chrin w1 +#define cntin x2 + +#define result x0 + +#define wordcnt x3 +#define rep01 x4 +#define repchr x5 +#define cur_word x6 +#define cur_byte w6 +#define tmp x7 +#define tmp2 x8 + + .p2align 4 + nop SYM_FUNC_START_WEAK_PI(memchr) - and w1, w1, #0xff -1: subs x2, x2, #1 - b.mi 2f - ldrb w3, [x0], #1 - cmp w3, w1 - b.ne 1b - sub x0, x0, #1 + and chrin, chrin, #0xff + lsr wordcnt, cntin, #3 + cbz wordcnt, L(byte_loop) + mov rep01, #REP8_01 + mul repchr, x1, rep01 + and cntin, cntin, #7 +L(word_loop): + ldr cur_word, [srcin], #8 + sub wordcnt, wordcnt, #1 + eor cur_word, cur_word, repchr + sub tmp, cur_word, rep01 + orr tmp2, cur_word, #REP8_7f + bics tmp, tmp, tmp2 + b.ne L(found_word) + cbnz wordcnt, L(word_loop) +L(byte_loop): + cbz cntin, L(not_found) + ldrb cur_byte, [srcin], #1 + sub cntin, cntin, #1 + cmp cur_byte, chrin + b.ne L(byte_loop) + sub srcin, srcin, #1 + ret +L(found_word): +CPU_LE( rev tmp, tmp) + clz tmp, tmp + sub tmp, tmp, #64 + add result, srcin, tmp, asr #3 ret -2: mov x0, #0 +L(not_found): + mov result, #0 ret SYM_FUNC_END_PI(memchr) EXPORT_SYMBOL_NOKASAN(memchr) diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S index c0671e793ea9..7d956384222f 100644 --- a/arch/arm64/lib/memcmp.S +++ b/arch/arm64/lib/memcmp.S @@ -1,247 +1,139 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013-2021, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S */ #include <linux/linkage.h> #include <asm/assembler.h> -/* -* compare memory areas(when two memory areas' offset are different, -* alignment handled by the hardware) -* -* Parameters: -* x0 - const memory area 1 pointer -* x1 - const memory area 2 pointer -* x2 - the maximal compare byte length -* Returns: -* x0 - a compare result, maybe less than, equal to, or greater than ZERO -*/ +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#define L(label) .L ## label /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -limit .req x2 -result .req x0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 /* Internal variables. */ -data1 .req x3 -data1w .req w3 -data2 .req x4 -data2w .req w4 -has_nul .req x5 -diff .req x6 -endloop .req x7 -tmp1 .req x8 -tmp2 .req x9 -tmp3 .req x10 -pos .req x11 -limit_wd .req x12 -mask .req x13 +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 SYM_FUNC_START_WEAK_PI(memcmp) - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - /* - * The input source addresses are at alignment boundary. - * Directly compare eight bytes each time. - */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, cs /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned - - /* Not reached the limit, must have found a diff. */ - tbz limit_wd, #63, .Lnot_limit - - /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ - ands limit, limit, #7 - b.eq .Lnot_limit - /* - * The remained bytes less than 8. It is needed to extract valid data - * from last eight bytes of the intended memory range. - */ - lsl limit, limit, #3 /* bytes-> bits. */ - mov mask, #~0 -CPU_BE( lsr mask, mask, limit ) -CPU_LE( lsl mask, mask, limit ) - bic data1, data1, mask - bic data2, data2, mask - - orr diff, diff, mask - b .Lnot_limit - -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that precede the start point. - */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - ldr data2, [src2], #8 - /* - * We can not add limit with alignment offset(tmp1) here. Since the - * addition probably make the limit overflown. - */ - sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - add tmp3, tmp3, tmp1 - add limit_wd, limit_wd, tmp3, lsr #3 - add limit, limit, tmp1/* Adjust the limit for the extra. */ - - lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ - neg tmp1, tmp1/* Bits to alignment -64. */ - mov tmp2, #~0 - /*mask off the non-intended bytes before the start address.*/ -CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ - /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) - - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b .Lstart_realigned - - /*src1 and src2 have different alignment offset.*/ -.Lmisaligned8: - cmp limit, #8 - b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ - - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ - - sub limit, limit, pos - /*compare the proceeding bytes in the first 8 byte segment.*/ -.Ltinycmp: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*diff occurred before the last byte.*/ - cmp data1w, data2w - b.eq .Lstart_align -1: - sub result, data1, data2 + subs limit, limit, 8 + b.lo L(less8) + + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) + + subs limit, limit, 8 + b.gt L(more16) + + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) + +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) + + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) + + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) + + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 + + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) + + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo ret -.Lstart_align: - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - - ands xzr, src1, #7 - b.eq .Lrecal_offset - /*process more leading bytes to make src1 aligned...*/ - add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ - add src2, src2, tmp3 - sub limit, limit, tmp3 - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - /*load 8 bytes from aligned SRC1..*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /*Non-zero if differences found.*/ - csinv endloop, diff, xzr, ne - cbnz endloop, .Lunequal_proc - /*How far is the current SRC2 from the alignment boundary...*/ - and tmp3, tmp3, #7 - -.Lrecal_offset:/*src1 is aligned now..*/ - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes and compare from - * the SRC2 alignment boundary. If all 8 bytes are equal,then start - * the second part's comparison. Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - eor diff, data1, data2 /* Non-zero if differences found. */ - cbnz diff, .Lnot_limit - - /*The second part process*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - eor diff, data1, data2 /* Non-zero if differences found. */ - subs limit_wd, limit_wd, #1 - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - cbz endloop, .Lloopcmp_proc -.Lunequal_proc: - cbz diff, .Lremain8 - -/* There is difference occurred in the latest comparison. */ -.Lnot_limit: -/* -* For little endian,reverse the low significant equal bits into MSB,then -* following CLZ can find how many equal bits exist. -*/ -CPU_LE( rev diff, diff ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - - /* - * The MS-non-zero bit of DIFF marks either the first bit - * that is different, or the end of the significant data. - * Shifting left now will bring the critical information into the - * top bits. - */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* - * We need to zero-extend (char is unsigned) the value and then - * perform a signed subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 + cmp data1w, data2w + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ + b.eq L(byte_loop) + sub result, data1w, data2w ret -.Lremain8: - /* Limit % 8 == 0 =>. all data are equal.*/ - ands limit, limit, #7 - b.eq .Lret0 - -.Ltiny8proc: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ - b.eq .Ltiny8proc - sub result, data1, data2 - ret -.Lret0: - mov result, #0 - ret SYM_FUNC_END_PI(memcmp) EXPORT_SYMBOL_NOKASAN(memcmp) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index dc8d2a216a6e..b82fd64ee1e1 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,66 +1,252 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2012-2021, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S */ #include <linux/linkage.h> #include <asm/assembler.h> -#include <asm/cache.h> -/* - * Copy a buffer from src to dest (alignment handled by the hardware) +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. * - * Parameters: - * x0 - dest - * x1 - src - * x2 - n - * Returns: - * x0 - dest */ - .macro ldrb1 reg, ptr, val - ldrb \reg, [\ptr], \val - .endm - - .macro strb1 reg, ptr, val - strb \reg, [\ptr], \val - .endm - .macro ldrh1 reg, ptr, val - ldrh \reg, [\ptr], \val - .endm +#define L(label) .L ## label - .macro strh1 reg, ptr, val - strh \reg, [\ptr], \val - .endm +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 - .macro ldr1 reg, ptr, val - ldr \reg, [\ptr], \val - .endm +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. - .macro str1 reg, ptr, val - str \reg, [\ptr], \val - .endm + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. - .macro ldp1 reg1, reg2, ptr, val - ldp \reg1, \reg2, [\ptr], \val - .endm - - .macro stp1 reg1, reg2, ptr, val - stp \reg1, \reg2, [\ptr], \val - .endm + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ +SYM_FUNC_START_ALIAS(__memmove) +SYM_FUNC_START_WEAK_ALIAS_PI(memmove) SYM_FUNC_START_ALIAS(__memcpy) SYM_FUNC_START_WEAK_PI(memcpy) -#include "copy_template.S" + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. */ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + SYM_FUNC_END_PI(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_END_ALIAS(__memcpy) EXPORT_SYMBOL(__memcpy) +SYM_FUNC_END_ALIAS_PI(memmove) +EXPORT_SYMBOL(memmove) +SYM_FUNC_END_ALIAS(__memmove) +EXPORT_SYMBOL(__memmove) diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S deleted file mode 100644 index 1035dce4bdaf..000000000000 --- a/arch/arm64/lib/memmove.S +++ /dev/null @@ -1,189 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. - * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/cache.h> - -/* - * Move a buffer from src to test (alignment handled by the hardware). - * If dest <= src, call memcpy, otherwise copy in reverse order. - * - * Parameters: - * x0 - dest - * x1 - src - * x2 - n - * Returns: - * x0 - dest - */ -dstin .req x0 -src .req x1 -count .req x2 -tmp1 .req x3 -tmp1w .req w3 -tmp2 .req x4 -tmp2w .req w4 -tmp3 .req x5 -tmp3w .req w5 -dst .req x6 - -A_l .req x7 -A_h .req x8 -B_l .req x9 -B_h .req x10 -C_l .req x11 -C_h .req x12 -D_l .req x13 -D_h .req x14 - -SYM_FUNC_START_ALIAS(__memmove) -SYM_FUNC_START_WEAK_PI(memmove) - cmp dstin, src - b.lo __memcpy - add tmp1, src, count - cmp dstin, tmp1 - b.hs __memcpy /* No overlap. */ - - add dst, dstin, count - add src, src, count - cmp count, #16 - b.lo .Ltail15 /*probably non-alignment accesses.*/ - - ands tmp2, src, #15 /* Bytes to reach alignment. */ - b.eq .LSrcAligned - sub count, count, tmp2 - /* - * process the aligned offset length to make the src aligned firstly. - * those extra instructions' cost is acceptable. It also make the - * coming accesses are based on aligned address. - */ - tbz tmp2, #0, 1f - ldrb tmp1w, [src, #-1]! - strb tmp1w, [dst, #-1]! -1: - tbz tmp2, #1, 2f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -2: - tbz tmp2, #2, 3f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -3: - tbz tmp2, #3, .LSrcAligned - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! - -.LSrcAligned: - cmp count, #64 - b.ge .Lcpy_over64 - - /* - * Deal with small copies quickly by dropping straight into the - * exit block. - */ -.Ltail63: - /* - * Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. - */ - ands tmp1, count, #0x30 - b.eq .Ltail15 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! -1: - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! -2: - ldp A_l, A_h, [src, #-16]! - stp A_l, A_h, [dst, #-16]! - -.Ltail15: - tbz count, #3, 1f - ldr tmp1, [src, #-8]! - str tmp1, [dst, #-8]! -1: - tbz count, #2, 2f - ldr tmp1w, [src, #-4]! - str tmp1w, [dst, #-4]! -2: - tbz count, #1, 3f - ldrh tmp1w, [src, #-2]! - strh tmp1w, [dst, #-2]! -3: - tbz count, #0, .Lexitfunc - ldrb tmp1w, [src, #-1] - strb tmp1w, [dst, #-1] - -.Lexitfunc: - ret - -.Lcpy_over64: - subs count, count, #128 - b.ge .Lcpy_body_large - /* - * Less than 128 bytes to copy, so handle 64 bytes here and then jump - * to the tail. - */ - ldp A_l, A_h, [src, #-16] - stp A_l, A_h, [dst, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - ldp D_l, D_h, [src, #-64]! - stp D_l, D_h, [dst, #-64]! - - tst count, #0x3f - b.ne .Ltail63 - ret - - /* - * Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. - */ - .p2align L1_CACHE_SHIFT -.Lcpy_body_large: - /* pre-load 64 bytes data. */ - ldp A_l, A_h, [src, #-16] - ldp B_l, B_h, [src, #-32] - ldp C_l, C_h, [src, #-48] - ldp D_l, D_h, [src, #-64]! -1: - /* - * interlace the load of next 64 bytes data block with store of the last - * loaded 64 bytes data. - */ - stp A_l, A_h, [dst, #-16] - ldp A_l, A_h, [src, #-16] - stp B_l, B_h, [dst, #-32] - ldp B_l, B_h, [src, #-32] - stp C_l, C_h, [dst, #-48] - ldp C_l, C_h, [src, #-48] - stp D_l, D_h, [dst, #-64]! - ldp D_l, D_h, [src, #-64]! - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst, #-16] - stp B_l, B_h, [dst, #-32] - stp C_l, C_h, [dst, #-48] - stp D_l, D_h, [dst, #-64]! - - tst count, #0x3f - b.ne .Ltail63 - ret -SYM_FUNC_END_PI(memmove) -EXPORT_SYMBOL(memmove) -SYM_FUNC_END_ALIAS(__memmove) -EXPORT_SYMBOL(__memmove) diff --git a/arch/arm64/lib/strcmp.S b/arch/arm64/lib/strcmp.S index 4e79566726c8..d7bee210a798 100644 --- a/arch/arm64/lib/strcmp.S +++ b/arch/arm64/lib/strcmp.S @@ -1,84 +1,123 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2012-2021, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/strcmp.S */ #include <linux/linkage.h> #include <asm/assembler.h> -/* - * compare two strings +/* Assumptions: * - * Parameters: - * x0 - const string 1 pointer - * x1 - const string 2 pointer - * Returns: - * x0 - an integer less than, equal to, or greater than zero - * if s1 is found, respectively, to be less than, to match, - * or be greater than s2. + * ARMv8-a, AArch64 */ +#define L(label) .L ## label + #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -result .req x0 +#define src1 x0 +#define src2 x1 +#define result x0 /* Internal variables. */ -data1 .req x2 -data1w .req w2 -data2 .req x3 -data2w .req w3 -has_nul .req x4 -diff .req x5 -syndrome .req x6 -tmp1 .req x7 -tmp2 .req x8 -tmp3 .req x9 -zeroones .req x10 -pos .req x11 - +#define data1 x2 +#define data1w w2 +#define data2 x3 +#define data2w w3 +#define has_nul x4 +#define diff x5 +#define syndrome x6 +#define tmp1 x7 +#define tmp2 x8 +#define tmp3 x9 +#define zeroones x10 +#define pos x11 + + /* Start of performance-critical section -- one 64B cache line. */ + .align 6 SYM_FUNC_START_WEAK_PI(strcmp) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 - b.ne .Lmisaligned8 + b.ne L(misaligned8) ands tmp1, src1, #7 - b.ne .Lmutual_align - - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ -.Lloop_aligned: + b.ne L(mutual_align) + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f eor diff, data1, data2 /* Non-zero if differences found. */ bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul - cbz syndrome, .Lloop_aligned - b .Lcal_cmpresult + cbz syndrome, L(loop_aligned) + /* End of performance-critical section -- one 64B cache line. */ + +L(end): +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that preceed the start point. - */ +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that preceed the start point. */ bic src1, src1, #7 bic src2, src2, #7 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ @@ -86,138 +125,52 @@ SYM_FUNC_START_WEAK_PI(strcmp) neg tmp1, tmp1 /* Bits to alignment -64. */ ldr data2, [src2], #8 mov tmp2, #~0 +#ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ - + lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */ +#endif orr data1, data1, tmp2 orr data2, data2, tmp2 - b .Lstart_realigned - -.Lmisaligned8: - /* - * Get the align offset length to compare per byte first. - * After this process, one string's address will be aligned. - */ - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8 - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8 - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum. */ -.Ltinycmp: + b L(start_realigned) + +L(misaligned8): + /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always + checking to make sure that we don't access beyond page boundary in + SRC2. */ + tst src1, #7 + b.eq L(loop_misaligned) +L(do_misaligned): ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*find the null or unequal...*/ cmp data1w, #1 - ccmp data1w, data2w, #0, cs - b.eq .Lstart_align /*the last bytes are equal....*/ -1: - sub result, data1, data2 - ret - -.Lstart_align: - ands xzr, src1, #7 - b.eq .Lrecal_offset - /*process more leading bytes to make str1 aligned...*/ - add src1, src1, tmp3 - add src2, src2, tmp3 - /*load 8 bytes from aligned str1 and non-aligned str2..*/ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + tst src1, #7 + b.ne L(do_misaligned) + +L(loop_misaligned): + /* Test if we are within the last dword of the end of a 4K page. If + yes then jump back to the misaligned loop to copy a byte at a time. */ + and tmp1, src2, #0xff8 + eor tmp1, tmp1, #0xff8 + cbz tmp1, L(do_misaligned) ldr data1, [src1], #8 ldr data2, [src2], #8 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - cbnz syndrome, .Lcal_cmpresult - /*How far is the current str2 from the alignment boundary...*/ - and tmp3, tmp3, #7 -.Lrecal_offset: - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes from the SRC2 alignment - * boundary,then compare with the relative bytes from SRC1. - * If all 8 bytes are equal,then start the second part's comparison. - * Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. */ - orr syndrome, diff, has_nul - cbnz syndrome, .Lcal_cmpresult - - /*The second part process*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul, tmp1, tmp2 - eor diff, data1, data2 /* Non-zero if differences found. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + bic has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ orr syndrome, diff, has_nul - cbz syndrome, .Lloopcmp_proc + cbz syndrome, L(loop_misaligned) + b L(end) -.Lcal_cmpresult: - /* - * reversed the byte-order as big-endian,then CLZ can find the most - * significant zero bits. - */ -CPU_LE( rev syndrome, syndrome ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - - /* - * For big-endian we cannot use the trick with the syndrome value - * as carry-propagation can corrupt the upper bits if the trailing - * bytes in the string contain 0x01. - * However, if there is no NUL byte in the dword, we can generate - * the result directly. We cannot just subtract the bytes as the - * MSB might be significant. - */ -CPU_BE( cbnz has_nul, 1f ) -CPU_BE( cmp data1, data2 ) -CPU_BE( cset result, ne ) -CPU_BE( cneg result, result, lo ) -CPU_BE( ret ) -CPU_BE( 1: ) - /*Re-compute the NUL-byte detection, using a byte-reversed value. */ -CPU_BE( rev tmp3, data1 ) -CPU_BE( sub tmp1, tmp3, zeroones ) -CPU_BE( orr tmp2, tmp3, #REP8_7f ) -CPU_BE( bic has_nul, tmp1, tmp2 ) -CPU_BE( rev has_nul, has_nul ) -CPU_BE( orr syndrome, diff, has_nul ) - - clz pos, syndrome - /* - * The MS-non-zero bit of the syndrome marks either the first bit - * that is different, or the top bit of the first zero byte. - * Shifting left now will bring the critical information into the - * top bits. - */ - lsl data1, data1, pos - lsl data2, data2, pos - /* - * But we need to zero-extend (char is unsigned) the value and then - * perform a signed 32-bit subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 +L(done): + sub result, data1, data2 ret + SYM_FUNC_END_PI(strcmp) EXPORT_SYMBOL_NOKASAN(strcmp) diff --git a/arch/arm64/lib/strlen.S b/arch/arm64/lib/strlen.S index ee3ed882dd79..35fbdb7d6e1a 100644 --- a/arch/arm64/lib/strlen.S +++ b/arch/arm64/lib/strlen.S @@ -1,115 +1,203 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013-2021, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/98e4d6a5c13c8e54/string/aarch64/strlen.S */ #include <linux/linkage.h> #include <asm/assembler.h> -/* - * calculate the length of a string +/* Assumptions: * - * Parameters: - * x0 - const string pointer - * Returns: - * x0 - the return length of specific string + * ARMv8-a, AArch64, unaligned accesses, min page size 4k. */ +#define L(label) .L ## label + /* Arguments and results. */ -srcin .req x0 -len .req x0 +#define srcin x0 +#define len x0 /* Locals and temporaries. */ -src .req x1 -data1 .req x2 -data2 .req x3 -data2a .req x4 -has_nul1 .req x5 -has_nul2 .req x6 -tmp1 .req x7 -tmp2 .req x8 -tmp3 .req x9 -tmp4 .req x10 -zeroones .req x11 -pos .req x12 +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 + + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. A faster check + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives + false hits for characters 129..255. */ #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 +#define MIN_PAGE_SIZE 4096 + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. In order to do an unaligned ldp + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + strlen will be repeatedly called on strings with the same length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 32 bytes per iteration + using the fast NUL check. If we encounter non-ASCII characters, + fallback to a second loop using the full NUL check. + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. */ + SYM_FUNC_START_WEAK_PI(strlen) - mov zeroones, #REP8_01 - bic src, srcin, #15 - ands tmp1, srcin, #15 - b.ne .Lmisaligned - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ - /* - * The inner loop deals with two Dwords at a time. This has a - * slightly higher start-up cost, but we should win quite quickly, - * especially on cores with a high number of issue slots per - * cycle, as we get much better parallelism out of the operations. - */ -.Lloop: - ldp data1, data2, [src], #16 -.Lrealigned: + and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. + Since we expect strings to be small and early-exit, + byte-swap the data now so has_null1/2 will be correct. */ + rev data1, data1 + rev data2, data2 +#endif sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f + orr tmp2, data1, REP8_7f sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq .Lloop + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) + + /* Enter with C = has_nul1 == 0. */ + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc + add len, len, tmp1, lsr 3 + ret + /* The inner loop processes 32 bytes per iteration and uses the fast + NUL check. If we encounter non-ASCII characters, use a second + loop with the accurate NUL check. */ + .p2align 4 +L(main_loop_entry): + bic src, srcin, 15 + sub src, src, 16 +L(main_loop): + ldp data1, data2, [src, 32]! +L(page_cross_entry): + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + bne 1f + ldp data1, data2, [src, 16] + sub tmp1, data1, zeroones + sub tmp3, data2, zeroones + orr tmp2, tmp1, tmp3 + tst tmp2, zeroones, lsl 7 + beq L(main_loop) + add src, src, 16 +1: + /* The fast check failed, so do the slower, accurate NUL check. */ + orr tmp2, data1, REP8_7f + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + + /* Enter with C = has_nul1 == 0. */ +L(tail): +#ifdef __AARCH64EB__ + /* For big-endian, carry propagation (if the final byte in the + string is 0x01) means we cannot use has_nul1/2 directly. The + easiest way to get the correct byte is to byte-swap the data + and calculate the syndrome a second time. */ + csel data1, data1, data2, cc + rev data1, data1 + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + bic has_nul1, tmp1, tmp2 +#else + csel has_nul1, has_nul1, has_nul2, cc +#endif sub len, src, srcin - cbz has_nul1, .Lnul_in_data2 -CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/ - sub len, len, #8 - mov has_nul2, has_nul1 -.Lnul_in_data2: - /* - * For big-endian, carry propagation (if the final byte in the - * string is 0x01) means we cannot use has_nul directly. The - * easiest way to get the correct byte is to byte-swap the data - * and calculate the syndrome a second time. - */ -CPU_BE( rev data2, data2 ) -CPU_BE( sub tmp1, data2, zeroones ) -CPU_BE( orr tmp2, data2, #REP8_7f ) -CPU_BE( bic has_nul2, tmp1, tmp2 ) - - sub len, len, #8 - rev has_nul2, has_nul2 - clz pos, has_nul2 - add len, len, pos, lsr #3 /* Bits to bytes. */ + rev has_nul1, has_nul1 + add tmp2, len, 8 + clz tmp1, has_nul1 + csel len, len, tmp2, cc + add len, len, tmp1, lsr 3 ret -.Lmisaligned: - cmp tmp1, #8 - neg tmp1, tmp1 - ldp data1, data2, [src], #16 - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */ - mov tmp2, #~0 - /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ +L(nonascii_loop): + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + bne L(tail) + ldp data1, data2, [src, 16]! + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(nonascii_loop) + b L(tail) + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0x7f, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. */ +L(page_cross): + bic src, srcin, 15 + ldp data1, data2, [src] + lsl tmp1, srcin, 3 + mov tmp4, -1 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. */ + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */ + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */ +#endif + orr tmp1, tmp1, REP8_80 + orn data1, data1, tmp1 + orn tmp2, data2, tmp1 + tst srcin, 8 + csel data1, data1, tmp4, eq + csel data2, data2, tmp2, eq + b L(page_cross_entry) - orr data1, data1, tmp2 - orr data2a, data2, tmp2 - csinv data1, data1, xzr, le - csel data2, data2, data2a, le - b .Lrealigned SYM_FUNC_END_PI(strlen) EXPORT_SYMBOL_NOKASAN(strlen) diff --git a/arch/arm64/lib/strncmp.S b/arch/arm64/lib/strncmp.S index 2a7ee949ed47..48d44f7fddb1 100644 --- a/arch/arm64/lib/strncmp.S +++ b/arch/arm64/lib/strncmp.S @@ -1,299 +1,261 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013-2021, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S */ #include <linux/linkage.h> #include <asm/assembler.h> -/* - * compare two strings +/* Assumptions: * - * Parameters: - * x0 - const string 1 pointer - * x1 - const string 2 pointer - * x2 - the maximal length to be compared - * Returns: - * x0 - an integer less than, equal to, or greater than zero if s1 is found, - * respectively, to be less than, to match, or be greater than s2. + * ARMv8-a, AArch64 */ +#define L(label) .L ## label + #define REP8_01 0x0101010101010101 #define REP8_7f 0x7f7f7f7f7f7f7f7f #define REP8_80 0x8080808080808080 /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -limit .req x2 -result .req x0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result x0 /* Internal variables. */ -data1 .req x3 -data1w .req w3 -data2 .req x4 -data2w .req w4 -has_nul .req x5 -diff .req x6 -syndrome .req x7 -tmp1 .req x8 -tmp2 .req x9 -tmp3 .req x10 -zeroones .req x11 -pos .req x12 -limit_wd .req x13 -mask .req x14 -endloop .req x15 +#define data1 x3 +#define data1w w3 +#define data2 x4 +#define data2w w4 +#define has_nul x5 +#define diff x6 +#define syndrome x7 +#define tmp1 x8 +#define tmp2 x9 +#define tmp3 x10 +#define zeroones x11 +#define pos x12 +#define limit_wd x13 +#define mask x14 +#define endloop x15 +#define count mask SYM_FUNC_START_WEAK_PI(strncmp) - cbz limit, .Lret0 + cbz limit, L(ret0) eor tmp1, src1, src2 mov zeroones, #REP8_01 tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align + and count, src1, #7 + b.ne L(misaligned8) + cbnz count, L(mutual_align) /* Calculate the number of full and partial words -1. */ - /* - * when limit is mulitply of 8, if not sub 1, - * the judgement of last dword will wrong. - */ - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - /* - * NUL detection works on the principle that (X - 1) & (~X) & 0x80 - * (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and - * can be done in parallel across the entire word. - */ -.Lloop_aligned: + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80 + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and + can be done in parallel across the entire word. */ + .p2align 4 +L(loop_aligned): ldr data1, [src1], #8 ldr data2, [src2], #8 -.Lstart_realigned: +L(start_realigned): subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, pl /* Last Dword or differences.*/ - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + eor diff, data1, data2 /* Non-zero if differences found. */ + csinv endloop, diff, xzr, pl /* Last Dword or differences. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ ccmp endloop, #0, #0, eq - b.eq .Lloop_aligned + b.eq L(loop_aligned) + /* End of main loop */ - /*Not reached the limit, must have found the end or a diff. */ - tbz limit_wd, #63, .Lnot_limit + /* Not reached the limit, must have found the end or a diff. */ + tbz limit_wd, #63, L(not_limit) /* Limit % 8 == 0 => all bytes significant. */ ands limit, limit, #7 - b.eq .Lnot_limit + b.eq L(not_limit) - lsl limit, limit, #3 /* Bits -> bytes. */ + lsl limit, limit, #3 /* Bits -> bytes. */ mov mask, #~0 -CPU_BE( lsr mask, mask, limit ) -CPU_LE( lsl mask, mask, limit ) +#ifdef __AARCH64EB__ + lsr mask, mask, limit +#else + lsl mask, mask, limit +#endif bic data1, data1, mask bic data2, data2, mask /* Make sure that the NUL byte is marked in the syndrome. */ orr has_nul, has_nul, mask -.Lnot_limit: +L(not_limit): orr syndrome, diff, has_nul - b .Lcal_cmpresult -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that precede the start point. - * We also need to adjust the limit calculations, but without - * overflowing if the limit is near ULONG_MAX. - */ +#ifndef __AARCH64EB__ + rev syndrome, syndrome + rev data1, data1 + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + clz pos, syndrome + rev data2, data2 + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#else + /* For big-endian we cannot use the trick with the syndrome value + as carry-propagation can corrupt the upper bits if the trailing + bytes in the string contain 0x01. */ + /* However, if there is no NUL byte in the dword, we can generate + the result directly. We can't just subtract the bytes as the + MSB might be significant. */ + cbnz has_nul, 1f + cmp data1, data2 + cset result, ne + cneg result, result, lo + ret +1: + /* Re-compute the NUL-byte detection, using a byte-reversed value. */ + rev tmp3, data1 + sub tmp1, tmp3, zeroones + orr tmp2, tmp3, #REP8_7f + bic has_nul, tmp1, tmp2 + rev has_nul, has_nul + orr syndrome, diff, has_nul + clz pos, syndrome + /* The MS-non-zero bit of the syndrome marks either the first bit + that is different, or the top bit of the first zero byte. + Shifting left now will bring the critical information into the + top bits. */ + lsl data1, data1, pos + lsl data2, data2, pos + /* But we need to zero-extend (char is unsigned) the value and then + perform a signed 32-bit subtraction. */ + lsr data1, data1, #56 + sub result, data1, data2, lsr #56 + ret +#endif + +L(mutual_align): + /* Sources are mutually aligned, but are not currently at an + alignment boundary. Round down the addresses and then mask off + the bytes that precede the start point. + We also need to adjust the limit calculations, but without + overflowing if the limit is near ULONG_MAX. */ bic src1, src1, #7 bic src2, src2, #7 ldr data1, [src1], #8 - neg tmp3, tmp1, lsl #3 /* 64 - bits(bytes beyond align). */ + neg tmp3, count, lsl #3 /* 64 - bits(bytes beyond align). */ ldr data2, [src2], #8 mov tmp2, #~0 - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ + sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ +#ifdef __AARCH64EB__ /* Big-endian. Early bytes are at MSB. */ -CPU_BE( lsl tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ + lsl tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#else /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp3 ) /* Shift (tmp1 & 63). */ - + lsr tmp2, tmp2, tmp3 /* Shift (count & 63). */ +#endif and tmp3, limit_wd, #7 lsr limit_wd, limit_wd, #3 - /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.*/ - add limit, limit, tmp1 - add tmp3, tmp3, tmp1 + /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */ + add limit, limit, count + add tmp3, tmp3, count orr data1, data1, tmp2 orr data2, data2, tmp2 add limit_wd, limit_wd, tmp3, lsr #3 - b .Lstart_realigned + b L(start_realigned) + + .p2align 4 + /* Don't bother with dwords for up to 16 bytes. */ +L(misaligned8): + cmp limit, #16 + b.hs L(try_misaligned_words) -/*when src1 offset is not equal to src2 offset...*/ -.Lmisaligned8: - cmp limit, #8 - b.lo .Ltiny8proc /*limit < 8... */ - /* - * Get the align offset length to compare per byte first. - * After this process, one string's address will be aligned.*/ - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8 - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8 - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum. */ - /* - * Here, limit is not less than 8, so directly run .Ltinycmp - * without checking the limit.*/ - sub limit, limit, pos -.Ltinycmp: +L(byte_loop): + /* Perhaps we can do better than this. */ ldrb data1w, [src1], #1 ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*find the null or unequal...*/ - cmp data1w, #1 - ccmp data1w, data2w, #0, cs - b.eq .Lstart_align /*the last bytes are equal....*/ -1: + subs limit, limit, #1 + ccmp data1w, #1, #0, hi /* NZCV = 0b0000. */ + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.eq L(byte_loop) +L(done): sub result, data1, data2 ret - -.Lstart_align: + /* Align the SRC1 to a dword by doing a bytewise compare and then do + the dword loop. */ +L(try_misaligned_words): lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - /*process more leading bytes to make str1 aligned...*/ - ands xzr, src1, #7 - b.eq .Lrecal_offset - add src1, src1, tmp3 /*tmp3 is positive in this branch.*/ - add src2, src2, tmp3 - ldr data1, [src1], #8 - ldr data2, [src2], #8 + cbz count, L(do_misaligned) - sub limit, limit, tmp3 + neg count, count + and count, count, #7 + sub limit, limit, count lsr limit_wd, limit, #3 - subs limit_wd, limit_wd, #1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - bics has_nul, tmp1, tmp2 - ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ - b.ne .Lunequal_proc - /*How far is the current str2 from the alignment boundary...*/ - and tmp3, tmp3, #7 -.Lrecal_offset: - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes from the SRC2 alignment - * boundary,then compare with the relative bytes from SRC1. - * If all 8 bytes are equal,then start the second part's comparison. - * Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, eq - cbnz endloop, .Lunequal_proc +L(page_end_loop): + ldrb data1w, [src1], #1 + ldrb data2w, [src2], #1 + cmp data1w, #1 + ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ + b.ne L(done) + subs count, count, #1 + b.hi L(page_end_loop) + +L(do_misaligned): + /* Prepare ourselves for the next page crossing. Unlike the aligned + loop, we fetch 1 less dword because we risk crossing bounds on + SRC2. */ + mov count, #8 + subs limit_wd, limit_wd, #1 + b.lo L(done_loop) +L(loop_misaligned): + and tmp2, src2, #0xff8 + eor tmp2, tmp2, #0xff8 + cbz tmp2, L(page_end_loop) - /*The second part process*/ ldr data1, [src1], #8 ldr data2, [src2], #8 - subs limit_wd, limit_wd, #1 sub tmp1, data1, zeroones orr tmp2, data1, #REP8_7f - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - bics has_nul, tmp1, tmp2 - ccmp endloop, #0, #0, eq /*has_null is ZERO: no null byte*/ - b.eq .Lloopcmp_proc - -.Lunequal_proc: - orr syndrome, diff, has_nul - cbz syndrome, .Lremain8 -.Lcal_cmpresult: - /* - * reversed the byte-order as big-endian,then CLZ can find the most - * significant zero bits. - */ -CPU_LE( rev syndrome, syndrome ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - /* - * For big-endian we cannot use the trick with the syndrome value - * as carry-propagation can corrupt the upper bits if the trailing - * bytes in the string contain 0x01. - * However, if there is no NUL byte in the dword, we can generate - * the result directly. We can't just subtract the bytes as the - * MSB might be significant. - */ -CPU_BE( cbnz has_nul, 1f ) -CPU_BE( cmp data1, data2 ) -CPU_BE( cset result, ne ) -CPU_BE( cneg result, result, lo ) -CPU_BE( ret ) -CPU_BE( 1: ) - /* Re-compute the NUL-byte detection, using a byte-reversed value.*/ -CPU_BE( rev tmp3, data1 ) -CPU_BE( sub tmp1, tmp3, zeroones ) -CPU_BE( orr tmp2, tmp3, #REP8_7f ) -CPU_BE( bic has_nul, tmp1, tmp2 ) -CPU_BE( rev has_nul, has_nul ) -CPU_BE( orr syndrome, diff, has_nul ) - /* - * The MS-non-zero bit of the syndrome marks either the first bit - * that is different, or the top bit of the first zero byte. - * Shifting left now will bring the critical information into the - * top bits. - */ - clz pos, syndrome - lsl data1, data1, pos - lsl data2, data2, pos - /* - * But we need to zero-extend (char is unsigned) the value and then - * perform a signed 32-bit subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret - -.Lremain8: - /* Limit % 8 == 0 => all bytes significant. */ - ands limit, limit, #7 - b.eq .Lret0 -.Ltiny8proc: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) + subs limit_wd, limit_wd, #1 + b.pl L(loop_misaligned) - ccmp data1w, #1, #0, ne /* NZCV = 0b0000. */ - ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */ - b.eq .Ltiny8proc - sub result, data1, data2 - ret +L(done_loop): + /* We found a difference or a NULL before the limit was reached. */ + and limit, limit, #7 + cbz limit, L(not_limit) + /* Read the last word. */ + sub src1, src1, 8 + sub src2, src2, 8 + ldr data1, [src1, limit] + ldr data2, [src2, limit] + sub tmp1, data1, zeroones + orr tmp2, data1, #REP8_7f + eor diff, data1, data2 /* Non-zero if differences found. */ + bics has_nul, tmp1, tmp2 /* Non-zero if NUL terminator. */ + ccmp diff, #0, #0, eq + b.ne L(not_limit) -.Lret0: +L(ret0): mov result, #0 ret + SYM_FUNC_END_PI(strncmp) EXPORT_SYMBOL_NOKASAN(strncmp) |