 arch/x86/lib/Makefile     |   1 +
 arch/x86/lib/memcpy_32.c  | 187 ----------------
 arch/x86/lib/memmove_32.S | 200 ++++++++++++++++
 lib/memcpy_kunit.c        |  22 ++
 4 files changed, 223 insertions(+), 187 deletions(-)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 7ba5f61d7273..4f1a40a86534 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -60,6 +60,7 @@ ifeq ($(CONFIG_X86_32),y)
         lib-y += checksum_32.o
         lib-y += strstr_32.o
         lib-y += string_32.o
+        lib-y += memmove_32.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
         lib-y += cmpxchg8b_emu.o atomic64_386_32.o
 endif
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index ef3af7ff2c8a..a29b64befb93 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -17,190 +17,3 @@ __visible void *memset(void *s, int c, size_t count)
         return __memset(s, c, count);
 }
 EXPORT_SYMBOL(memset);
-
-__visible void *memmove(void *dest, const void *src, size_t n)
-{
-        int d0,d1,d2,d3,d4,d5;
-        char *ret = dest;
-
-        __asm__ __volatile__(
-                /* Handle more 16 bytes in loop */
-                "cmp $0x10, %0\n\t"
-                "jb 1f\n\t"
-
-                /* Decide forward/backward copy mode */
-                "cmp %2, %1\n\t"
-                "jb 2f\n\t"
-
-                /*
-                 * movs instruction have many startup latency
-                 * so we handle small size by general register.
-                 */
-                "cmp $680, %0\n\t"
-                "jb 3f\n\t"
-                /*
-                 * movs instruction is only good for aligned case.
-                 */
-                "mov %1, %3\n\t"
-                "xor %2, %3\n\t"
-                "and $0xff, %3\n\t"
-                "jz 4f\n\t"
-                "3:\n\t"
-                "sub $0x10, %0\n\t"
-
-                /*
-                 * We gobble 16 bytes forward in each loop.
-                 */
-                "3:\n\t"
-                "sub $0x10, %0\n\t"
-                "mov 0*4(%1), %3\n\t"
-                "mov 1*4(%1), %4\n\t"
-                "mov %3, 0*4(%2)\n\t"
-                "mov %4, 1*4(%2)\n\t"
-                "mov 2*4(%1), %3\n\t"
-                "mov 3*4(%1), %4\n\t"
-                "mov %3, 2*4(%2)\n\t"
-                "mov %4, 3*4(%2)\n\t"
-                "lea 0x10(%1), %1\n\t"
-                "lea 0x10(%2), %2\n\t"
-                "jae 3b\n\t"
-                "add $0x10, %0\n\t"
-                "jmp 1f\n\t"
-
-                /*
-                 * Handle data forward by movs.
-                 */
-                ".p2align 4\n\t"
-                "4:\n\t"
-                "mov -4(%1, %0), %3\n\t"
-                "lea -4(%2, %0), %4\n\t"
-                "shr $2, %0\n\t"
-                "rep movsl\n\t"
-                "mov %3, (%4)\n\t"
-                "jmp 11f\n\t"
-                /*
-                 * Handle data backward by movs.
-                 */
-                ".p2align 4\n\t"
-                "6:\n\t"
-                "mov (%1), %3\n\t"
-                "mov %2, %4\n\t"
-                "lea -4(%1, %0), %1\n\t"
-                "lea -4(%2, %0), %2\n\t"
-                "shr $2, %0\n\t"
-                "std\n\t"
-                "rep movsl\n\t"
-                "mov %3,(%4)\n\t"
-                "cld\n\t"
-                "jmp 11f\n\t"
-
-                /*
-                 * Start to prepare for backward copy.
-                 */
-                ".p2align 4\n\t"
-                "2:\n\t"
-                "cmp $680, %0\n\t"
-                "jb 5f\n\t"
-                "mov %1, %3\n\t"
-                "xor %2, %3\n\t"
-                "and $0xff, %3\n\t"
-                "jz 6b\n\t"
-
-                /*
-                 * Calculate copy position to tail.
-                 */
-                "5:\n\t"
-                "add %0, %1\n\t"
-                "add %0, %2\n\t"
-                "sub $0x10, %0\n\t"
-
-                /*
-                 * We gobble 16 bytes backward in each loop.
-                 */
-                "7:\n\t"
-                "sub $0x10, %0\n\t"
-
-                "mov -1*4(%1), %3\n\t"
-                "mov -2*4(%1), %4\n\t"
-                "mov %3, -1*4(%2)\n\t"
-                "mov %4, -2*4(%2)\n\t"
-                "mov -3*4(%1), %3\n\t"
-                "mov -4*4(%1), %4\n\t"
-                "mov %3, -3*4(%2)\n\t"
-                "mov %4, -4*4(%2)\n\t"
-                "lea -0x10(%1), %1\n\t"
-                "lea -0x10(%2), %2\n\t"
-                "jae 7b\n\t"
-                /*
-                 * Calculate copy position to head.
-                 */
-                "add $0x10, %0\n\t"
-                "sub %0, %1\n\t"
-                "sub %0, %2\n\t"
-
-                /*
-                 * Move data from 8 bytes to 15 bytes.
-                 */
-                ".p2align 4\n\t"
-                "1:\n\t"
-                "cmp $8, %0\n\t"
-                "jb 8f\n\t"
-                "mov 0*4(%1), %3\n\t"
-                "mov 1*4(%1), %4\n\t"
-                "mov -2*4(%1, %0), %5\n\t"
-                "mov -1*4(%1, %0), %1\n\t"
-
-                "mov %3, 0*4(%2)\n\t"
-                "mov %4, 1*4(%2)\n\t"
-                "mov %5, -2*4(%2, %0)\n\t"
-                "mov %1, -1*4(%2, %0)\n\t"
-                "jmp 11f\n\t"
-
-                /*
-                 * Move data from 4 bytes to 7 bytes.
-                 */
-                ".p2align 4\n\t"
-                "8:\n\t"
-                "cmp $4, %0\n\t"
-                "jb 9f\n\t"
-                "mov 0*4(%1), %3\n\t"
-                "mov -1*4(%1, %0), %4\n\t"
-                "mov %3, 0*4(%2)\n\t"
-                "mov %4, -1*4(%2, %0)\n\t"
-                "jmp 11f\n\t"
-
-                /*
-                 * Move data from 2 bytes to 3 bytes.
-                 */
-                ".p2align 4\n\t"
-                "9:\n\t"
-                "cmp $2, %0\n\t"
-                "jb 10f\n\t"
-                "movw 0*2(%1), %%dx\n\t"
-                "movw -1*2(%1, %0), %%bx\n\t"
-                "movw %%dx, 0*2(%2)\n\t"
-                "movw %%bx, -1*2(%2, %0)\n\t"
-                "jmp 11f\n\t"
-
-                /*
-                 * Move data for 1 byte.
-                 */
-                ".p2align 4\n\t"
-                "10:\n\t"
-                "cmp $1, %0\n\t"
-                "jb 11f\n\t"
-                "movb (%1), %%cl\n\t"
-                "movb %%cl, (%2)\n\t"
-                ".p2align 4\n\t"
-                "11:"
-                : "=&c" (d0), "=&S" (d1), "=&D" (d2),
-                  "=r" (d3),"=r" (d4), "=r"(d5)
-                :"0" (n),
-                 "1" (src),
-                 "2" (dest)
-                :"memory");
-
-        return ret;
-
-}
-EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memmove_32.S b/arch/x86/lib/memmove_32.S
new file mode 100644
index 000000000000..0588b2c0fc95
--- /dev/null
+++ b/arch/x86/lib/memmove_32.S
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/linkage.h>
+#include <asm/export.h>
+
+SYM_FUNC_START(memmove)
+/*
+ * void *memmove(void *dest_in, const void *src_in, size_t n)
+ * -mregparm=3 passes these in registers:
+ * dest_in: %eax
+ * src_in:  %edx
+ * n:       %ecx
+ * See also: arch/x86/entry/calling.h for description of the calling convention.
+ *
+ * n can remain in %ecx, but for `rep movsl`, we'll need dest in %edi and src
+ * in %esi.
+ */
+.set dest_in, %eax
+.set dest, %edi
+.set src_in, %edx
+.set src, %esi
+.set n, %ecx
+.set tmp0, %edx
+.set tmp0w, %dx
+.set tmp1, %ebx
+.set tmp1w, %bx
+.set tmp2, %eax
+.set tmp3b, %cl
+
+/*
+ * Save all callee-saved registers, because this function is going to clobber
+ * all of them:
+ */
+        pushl %ebp
+        movl %esp, %ebp         // set standard frame pointer
+
+        pushl %ebx
+        pushl %edi
+        pushl %esi
+        pushl %eax              // save 'dest_in' parameter [eax] as the return value
+
+        movl src_in, src
+        movl dest_in, dest
+
+        /* Handle more 16 bytes in loop */
+        cmpl $0x10, n
+        jb .Lmove_16B
+
+        /* Decide forward/backward copy mode */
+        cmpl dest, src
+        jb .Lbackwards_header
+
+        /*
+         * movs instruction have many startup latency
+         * so we handle small size by general register.
+         */
+        cmpl $680, n
+        jb .Ltoo_small_forwards
+        /* movs instruction is only good for aligned case. */
+        movl src, tmp0
+        xorl dest, tmp0
+        andl $0xff, tmp0
+        jz .Lforward_movs
+.Ltoo_small_forwards:
+        subl $0x10, n
+
+        /* We gobble 16 bytes forward in each loop. */
+.Lmove_16B_forwards_loop:
+        subl $0x10, n
+        movl 0*4(src), tmp0
+        movl 1*4(src), tmp1
+        movl tmp0, 0*4(dest)
+        movl tmp1, 1*4(dest)
+        movl 2*4(src), tmp0
+        movl 3*4(src), tmp1
+        movl tmp0, 2*4(dest)
+        movl tmp1, 3*4(dest)
+        leal 0x10(src), src
+        leal 0x10(dest), dest
+        jae .Lmove_16B_forwards_loop
+        addl $0x10, n
+        jmp .Lmove_16B
+
+        /* Handle data forward by movs. */
+.p2align 4
+.Lforward_movs:
+        movl -4(src, n), tmp0
+        leal -4(dest, n), tmp1
+        shrl $2, n
+        rep movsl
+        movl tmp0, (tmp1)
+        jmp .Ldone
+
+        /* Handle data backward by movs. */
+.p2align 4
+.Lbackwards_movs:
+        movl (src), tmp0
+        movl dest, tmp1
+        leal -4(src, n), src
+        leal -4(dest, n), dest
+        shrl $2, n
+        std
+        rep movsl
+        movl tmp0,(tmp1)
+        cld
+        jmp .Ldone
+
+        /* Start to prepare for backward copy. */
+.p2align 4
+.Lbackwards_header:
+        cmpl $680, n
+        jb .Ltoo_small_backwards
+        movl src, tmp0
+        xorl dest, tmp0
+        andl $0xff, tmp0
+        jz .Lbackwards_movs
+
+        /* Calculate copy position to tail. */
+.Ltoo_small_backwards:
+        addl n, src
+        addl n, dest
+        subl $0x10, n
+
+        /* We gobble 16 bytes backward in each loop. */
+.Lmove_16B_backwards_loop:
+        subl $0x10, n
+
+        movl -1*4(src), tmp0
+        movl -2*4(src), tmp1
+        movl tmp0, -1*4(dest)
+        movl tmp1, -2*4(dest)
+        movl -3*4(src), tmp0
+        movl -4*4(src), tmp1
+        movl tmp0, -3*4(dest)
+        movl tmp1, -4*4(dest)
+        leal -0x10(src), src
+        leal -0x10(dest), dest
+        jae .Lmove_16B_backwards_loop
+        /* Calculate copy position to head. */
+        addl $0x10, n
+        subl n, src
+        subl n, dest
+
+        /* Move data from 8 bytes to 15 bytes. */
+.p2align 4
+.Lmove_16B:
+        cmpl $8, n
+        jb .Lmove_8B
+        movl 0*4(src), tmp0
+        movl 1*4(src), tmp1
+        movl -2*4(src, n), tmp2
+        movl -1*4(src, n), src
+
+        movl tmp0, 0*4(dest)
+        movl tmp1, 1*4(dest)
+        movl tmp2, -2*4(dest, n)
+        movl src, -1*4(dest, n)
+        jmp .Ldone
+
+        /* Move data from 4 bytes to 7 bytes. */
+.p2align 4
+.Lmove_8B:
+        cmpl $4, n
+        jb .Lmove_4B
+        movl 0*4(src), tmp0
+        movl -1*4(src, n), tmp1
+        movl tmp0, 0*4(dest)
+        movl tmp1, -1*4(dest, n)
+        jmp .Ldone
+
+        /* Move data from 2 bytes to 3 bytes. */
+.p2align 4
+.Lmove_4B:
+        cmpl $2, n
+        jb .Lmove_1B
+        movw 0*2(src), tmp0w
+        movw -1*2(src, n), tmp1w
+        movw tmp0w, 0*2(dest)
+        movw tmp1w, -1*2(dest, n)
+        jmp .Ldone
+
+        /* Move data for 1 byte. */
+.p2align 4
+.Lmove_1B:
+        cmpl $1, n
+        jb .Ldone
+        movb (src), tmp3b
+        movb tmp3b, (dest)
+.p2align 4
+.Ldone:
+        popl dest_in            // restore 'dest_in' [eax] as the return value
+        /* Restore all callee-saved registers: */
+        popl %esi
+        popl %edi
+        popl %ebx
+        popl %ebp
+
+        RET
+SYM_FUNC_END(memmove)
+EXPORT_SYMBOL(memmove)
diff --git a/lib/memcpy_kunit.c b/lib/memcpy_kunit.c
index 2b5cc70ac53f..7513e6d5dc90 100644
--- a/lib/memcpy_kunit.c
+++ b/lib/memcpy_kunit.c
@@ -105,6 +105,8 @@ static void memcpy_test(struct kunit *test)
 #undef TEST_OP
 }
 
+static unsigned char larger_array [2048];
+
 static void memmove_test(struct kunit *test)
 {
 #define TEST_OP "memmove"
@@ -179,6 +181,26 @@ static void memmove_test(struct kunit *test)
         ptr = &overlap.data[2];
         memmove(ptr, overlap.data, 5);
         compare("overlapping write", overlap, overlap_expected);
+
+        /* Verify larger overlapping moves. */
+        larger_array[256] = 0xAAu;
+        /*
+         * Test a backwards overlapping memmove first. 256 and 1024 are
+         * important for i386 to use rep movsl.
+         */
+        memmove(larger_array, larger_array + 256, 1024);
+        KUNIT_ASSERT_EQ(test, larger_array[0], 0xAAu);
+        KUNIT_ASSERT_EQ(test, larger_array[256], 0x00);
+        KUNIT_ASSERT_NULL(test,
+                memchr(larger_array + 1, 0xaa, ARRAY_SIZE(larger_array) - 1));
+        /* Test a forwards overlapping memmove. */
+        larger_array[0] = 0xBBu;
+        memmove(larger_array + 256, larger_array, 1024);
+        KUNIT_ASSERT_EQ(test, larger_array[0], 0xBBu);
+        KUNIT_ASSERT_EQ(test, larger_array[256], 0xBBu);
+        KUNIT_ASSERT_NULL(test, memchr(larger_array + 1, 0xBBu, 256 - 1));
+        KUNIT_ASSERT_NULL(test,
+                memchr(larger_array + 257, 0xBBu, ARRAY_SIZE(larger_array) - 257));
 #undef TEST_OP
 }
 
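For readers following the new assembly: the heart of any memmove() is the copy-direction decision for overlapping buffers, which both the removed inline-asm version and memmove_32.S make by comparing src against dest before picking the forward or backward path. The C below is an illustrative sketch only (the name memmove_sketch and the byte-wise loops are not part of the patch); the real code replaces the loops with 16-byte unrolled moves and, for copies of 680 bytes or more whose pointers' low bits match, with `rep movsl`.

    #include <stddef.h>

    /*
     * Illustrative sketch only -- not the kernel implementation. It shows the
     * direction choice that keeps an overlapping copy from clobbering source
     * bytes before they have been read.
     */
    static void *memmove_sketch(void *dest, const void *src, size_t n)
    {
            unsigned char *d = dest;
            const unsigned char *s = src;

            if (s >= d) {
                    /* dest at or below src: copying forward is always safe. */
                    while (n--)
                            *d++ = *s++;
            } else {
                    /* src below dest: copy backward, starting at the tail. */
                    while (n--)
                            d[n] = s[n];
            }
            return dest;
    }

The new memcpy_kunit.c cases exercise both directions with regions large enough (1024 bytes, offset by 256) to reach the `rep movsl` paths on i386, per the comment in the added test code.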