From 2272382879d93d37e7964554cea5b0583c94c247 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 4 Feb 2014 12:56:15 +0400 Subject: arc: add library functions These are library functions used by ARC700 architecture. Signed-off-by: Alexey Brodkin Cc: Vineet Gupta Cc: Francois Bedard Cc: Wolfgang Denk Cc: Heiko Schocher --- arch/arc/lib/Makefile | 16 ++++++ arch/arc/lib/bootm.c | 106 ++++++++++++++++++++++++++++++++++ arch/arc/lib/memcmp.S | 121 +++++++++++++++++++++++++++++++++++++++ arch/arc/lib/memcpy-700.S | 63 +++++++++++++++++++++ arch/arc/lib/memset.S | 62 ++++++++++++++++++++ arch/arc/lib/relocate.c | 72 +++++++++++++++++++++++ arch/arc/lib/sections.c | 21 +++++++ arch/arc/lib/strchr-700.S | 141 ++++++++++++++++++++++++++++++++++++++++++++++ arch/arc/lib/strcmp.S | 97 +++++++++++++++++++++++++++++++ arch/arc/lib/strcpy-700.S | 67 ++++++++++++++++++++++ arch/arc/lib/strlen.S | 80 ++++++++++++++++++++++++++ 11 files changed, 846 insertions(+) create mode 100644 arch/arc/lib/Makefile create mode 100644 arch/arc/lib/bootm.c create mode 100644 arch/arc/lib/memcmp.S create mode 100644 arch/arc/lib/memcpy-700.S create mode 100644 arch/arc/lib/memset.S create mode 100644 arch/arc/lib/relocate.c create mode 100644 arch/arc/lib/sections.c create mode 100644 arch/arc/lib/strchr-700.S create mode 100644 arch/arc/lib/strcmp.S create mode 100644 arch/arc/lib/strcpy-700.S create mode 100644 arch/arc/lib/strlen.S (limited to 'arch') diff --git a/arch/arc/lib/Makefile b/arch/arc/lib/Makefile new file mode 100644 index 00000000000..7675f855d5a --- /dev/null +++ b/arch/arc/lib/Makefile @@ -0,0 +1,16 @@ +# +# Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved. +# +# SPDX-License-Identifier: GPL-2.0+ +# + +obj-y += sections.o +obj-y += relocate.o +obj-y += strchr-700.o +obj-y += strcmp.o +obj-y += strcpy-700.o +obj-y += strlen.o +obj-y += memcmp.o +obj-y += memcpy-700.o +obj-y += memset.o +obj-$(CONFIG_CMD_BOOTM) += bootm.o diff --git a/arch/arc/lib/bootm.c b/arch/arc/lib/bootm.c new file mode 100644 index 00000000000..d185a50bd31 --- /dev/null +++ b/arch/arc/lib/bootm.c @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include + +DECLARE_GLOBAL_DATA_PTR; + +static ulong get_sp(void) +{ + ulong ret; + + asm("mov %0, sp" : "=r"(ret) : ); + return ret; +} + +void arch_lmb_reserve(struct lmb *lmb) +{ + ulong sp; + + /* + * Booting a (Linux) kernel image + * + * Allocate space for command line and board info - the + * address should be as high as possible within the reach of + * the kernel (see CONFIG_SYS_BOOTMAPSZ settings), but in unused + * memory, which means far enough below the current stack + * pointer. + */ + sp = get_sp(); + debug("## Current stack ends at 0x%08lx ", sp); + + /* adjust sp by 4K to be safe */ + sp -= 4096; + lmb_reserve(lmb, sp, (CONFIG_SYS_SDRAM_BASE + gd->ram_size - sp)); +} + +static int cleanup_before_linux(void) +{ + disable_interrupts(); + flush_dcache_all(); + invalidate_icache_all(); + + return 0; +} + +/* Subcommand: PREP */ +static void boot_prep_linux(bootm_headers_t *images) +{ + if (image_setup_linux(images)) + hang(); +} + +/* Subcommand: GO */ +static void boot_jump_linux(bootm_headers_t *images, int flag) +{ + void (*kernel_entry)(int zero, int arch, uint params); + unsigned int r0, r2; + int fake = (flag & BOOTM_STATE_OS_FAKE_GO); + + kernel_entry = (void (*)(int, int, uint))images->ep; + + debug("## Transferring control to Linux (at address %08lx)...\n", + (ulong) kernel_entry); + bootstage_mark(BOOTSTAGE_ID_RUN_OS); + + printf("\nStarting kernel ...%s\n\n", fake ? + "(fake run for tracing)" : ""); + bootstage_mark_name(BOOTSTAGE_ID_BOOTM_HANDOFF, "start_kernel"); + + cleanup_before_linux(); + + if (IMAGE_ENABLE_OF_LIBFDT && images->ft_len) { + r0 = 2; + r2 = (unsigned int)images->ft_addr; + } else { + r0 = 1; + r2 = (unsigned int)getenv("bootargs"); + } + + if (!fake) + kernel_entry(r0, 0, r2); +} + +int do_bootm_linux(int flag, int argc, char *argv[], bootm_headers_t *images) +{ + /* No need for those on ARC */ + if ((flag & BOOTM_STATE_OS_BD_T) || (flag & BOOTM_STATE_OS_CMDLINE)) + return -1; + + if (flag & BOOTM_STATE_OS_PREP) { + boot_prep_linux(images); + return 0; + } + + if (flag & (BOOTM_STATE_OS_GO | BOOTM_STATE_OS_FAKE_GO)) { + boot_jump_linux(images, flag); + return 0; + } + + boot_prep_linux(images); + boot_jump_linux(images, flag); + return 0; +} diff --git a/arch/arc/lib/memcmp.S b/arch/arc/lib/memcmp.S new file mode 100644 index 00000000000..fa5aac5f674 --- /dev/null +++ b/arch/arc/lib/memcmp.S @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#ifdef __LITTLE_ENDIAN__ +#define WORD2 r2 +#define SHIFT r3 +#else /* __BIG_ENDIAN__ */ +#define WORD2 r3 +#define SHIFT r2 +#endif /* _ENDIAN__ */ + +.global memcmp +.align 4 +memcmp: + or %r12, %r0, %r1 + asl_s %r12, %r12, 30 + sub %r3, %r2, 1 + brls %r2, %r12, .Lbytewise + ld %r4, [%r0, 0] + ld %r5, [%r1, 0] + lsr.f %lp_count, %r3, 3 + lpne .Loop_end + ld_s WORD2, [%r0, 4] + ld_s %r12, [%r1, 4] + brne %r4, %r5, .Leven + ld.a %r4, [%r0, 8] + ld.a %r5, [%r1, 8] + brne WORD2, %r12, .Lodd +.Loop_end: + asl_s SHIFT, SHIFT, 3 + bhs_s .Last_cmp + brne %r4, %r5, .Leven + ld %r4, [%r0, 4] + ld %r5, [%r1, 4] +#ifdef __LITTLE_ENDIAN__ + nop_s + /* one more load latency cycle */ +.Last_cmp: + xor %r0, %r4, %r5 + bset %r0, %r0, SHIFT + sub_s %r1, %r0, 1 + bic_s %r1, %r1, %r0 + norm %r1, %r1 + b.d .Leven_cmp + and %r1, %r1, 24 +.Leven: + xor %r0, %r4, %r5 + sub_s %r1, %r0, 1 + bic_s %r1, %r1, %r0 + norm %r1, %r1 + /* slow track insn */ + and %r1, %r1, 24 +.Leven_cmp: + asl %r2, %r4, %r1 + asl %r12, %r5, %r1 + lsr_s %r2, %r2, 1 + lsr_s %r12, %r12, 1 + j_s.d [%blink] + sub %r0, %r2, %r12 + .balign 4 +.Lodd: + xor %r0, WORD2, %r12 + sub_s %r1, %r0, 1 + bic_s %r1, %r1, %r0 + norm %r1, %r1 + /* slow track insn */ + and %r1, %r1, 24 + asl_s %r2, %r2, %r1 + asl_s %r12, %r12, %r1 + lsr_s %r2, %r2, 1 + lsr_s %r12, %r12, 1 + j_s.d [%blink] + sub %r0, %r2, %r12 +#else /* __BIG_ENDIAN__ */ +.Last_cmp: + neg_s SHIFT, SHIFT + lsr %r4, %r4, SHIFT + lsr %r5, %r5, SHIFT + /* slow track insn */ +.Leven: + sub.f %r0, %r4, %r5 + mov.ne %r0, 1 + j_s.d [%blink] + bset.cs %r0, %r0, 31 +.Lodd: + cmp_s WORD2, %r12 + + mov_s %r0, 1 + j_s.d [%blink] + bset.cs %r0, %r0, 31 +#endif /* _ENDIAN__ */ + .balign 4 +.Lbytewise: + breq %r2, 0, .Lnil + ldb %r4, [%r0, 0] + ldb %r5, [%r1, 0] + lsr.f %lp_count, %r3 + lpne .Lbyte_end + ldb_s %r3, [%r0, 1] + ldb %r12, [%r1, 1] + brne %r4, %r5, .Lbyte_even + ldb.a %r4, [%r0, 2] + ldb.a %r5, [%r1, 2] + brne %r3, %r12, .Lbyte_odd +.Lbyte_end: + bcc .Lbyte_even + brne %r4, %r5, .Lbyte_even + ldb_s %r3, [%r0, 1] + ldb_s %r12, [%r1, 1] +.Lbyte_odd: + j_s.d [%blink] + sub %r0, %r3, %r12 +.Lbyte_even: + j_s.d [%blink] + sub %r0, %r4, %r5 +.Lnil: + j_s.d [%blink] + mov %r0, 0 diff --git a/arch/arc/lib/memcpy-700.S b/arch/arc/lib/memcpy-700.S new file mode 100644 index 00000000000..51dd73ab8fd --- /dev/null +++ b/arch/arc/lib/memcpy-700.S @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +.global memcpy +.align 4 +memcpy: + or %r3, %r0, %r1 + asl_s %r3, %r3, 30 + mov_s %r5, %r0 + brls.d %r2, %r3, .Lcopy_bytewise + sub.f %r3, %r2, 1 + ld_s %r12, [%r1, 0] + asr.f %lp_count, %r3, 3 + bbit0.d %r3, 2, .Lnox4 + bmsk_s %r2, %r2, 1 + st.ab %r12, [%r5, 4] + ld.a %r12, [%r1, 4] +.Lnox4: + lppnz .Lendloop + ld_s %r3, [%r1, 4] + st.ab %r12, [%r5, 4] + ld.a %r12, [%r1, 8] + st.ab %r3, [%r5, 4] +.Lendloop: + breq %r2, 0, .Last_store + ld %r3, [%r5, 0] +#ifdef __LITTLE_ENDIAN__ + add3 %r2, -1, %r2 + /* uses long immediate */ + xor_s %r12, %r12, %r3 + bmsk %r12, %r12, %r2 + xor_s %r12, %r12, %r3 +#else /* __BIG_ENDIAN__ */ + sub3 %r2, 31, %r2 + /* uses long immediate */ + xor_s %r3, %r3, %r12 + bmsk %r3, %r3, %r2 + xor_s %r12, %r12, %r3 +#endif /* _ENDIAN__ */ +.Last_store: + j_s.d [%blink] + st %r12, [%r5, 0] + + .balign 4 +.Lcopy_bytewise: + jcs [%blink] + ldb_s %r12, [%r1, 0] + lsr.f %lp_count, %r3 + bhs_s .Lnox1 + stb.ab %r12, [%r5, 1] + ldb.a %r12, [%r1, 1] +.Lnox1: + lppnz .Lendbloop + ldb_s %r3, [%r1, 1] + stb.ab %r12, [%r5, 1] + ldb.a %r12, [%r1, 2] + stb.ab %r3, [%r5, 1] +.Lendbloop: + j_s.d [%blink] + stb %r12, [%r5, 0] diff --git a/arch/arc/lib/memset.S b/arch/arc/lib/memset.S new file mode 100644 index 00000000000..017e8af0e88 --- /dev/null +++ b/arch/arc/lib/memset.S @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#define SMALL 7 /* Must be at least 6 to deal with alignment/loop issues. */ + +.global memset +.align 4 +memset: + mov_s %r4, %r0 + or %r12, %r0, %r2 + bmsk.f %r12, %r12, 1 + extb_s %r1, %r1 + asl %r3, %r1, 8 + beq.d .Laligned + or_s %r1, %r1, %r3 + brls %r2, SMALL, .Ltiny + add %r3, %r2, %r0 + stb %r1, [%r3, -1] + bclr_s %r3, %r3, 0 + stw %r1, [%r3, -2] + bmsk.f %r12, %r0, 1 + add_s %r2, %r2, %r12 + sub.ne %r2, %r2, 4 + stb.ab %r1, [%r4, 1] + and %r4, %r4, -2 + stw.ab %r1, [%r4, 2] + and %r4, %r4, -4 + + .balign 4 +.Laligned: + asl %r3, %r1, 16 + lsr.f %lp_count, %r2, 2 + or_s %r1, %r1, %r3 + lpne .Loop_end + st.ab %r1, [%r4, 4] +.Loop_end: + j_s [%blink] + + .balign 4 +.Ltiny: + mov.f %lp_count, %r2 + lpne .Ltiny_end + stb.ab %r1, [%r4, 1] +.Ltiny_end: + j_s [%blink] + +/* + * memzero: @r0 = mem, @r1 = size_t + * memset: @r0 = mem, @r1 = char, @r2 = size_t + */ + +.global memzero +.align 4 +memzero: + /* adjust bzero args to memset args */ + mov %r2, %r1 + mov %r1, 0 + /* tail call so need to tinker with blink */ + b memset diff --git a/arch/arc/lib/relocate.c b/arch/arc/lib/relocate.c new file mode 100644 index 00000000000..956aa1494e1 --- /dev/null +++ b/arch/arc/lib/relocate.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +#include +#include +#include + +DECLARE_GLOBAL_DATA_PTR; + +/* + * Base functionality is taken from x86 version with added ARC-specifics + */ +int do_elf_reloc_fixups(void) +{ + Elf32_Rela *re_src = (Elf32_Rela *)(&__rel_dyn_start); + Elf32_Rela *re_end = (Elf32_Rela *)(&__rel_dyn_end); + + Elf32_Addr *offset_ptr_rom, *last_offset = NULL; + Elf32_Addr *offset_ptr_ram; + + do { + /* Get the location from the relocation entry */ + offset_ptr_rom = (Elf32_Addr *)re_src->r_offset; + + /* Check that the location of the relocation is in .text */ + if (offset_ptr_rom >= (Elf32_Addr *)CONFIG_SYS_TEXT_BASE && + offset_ptr_rom > last_offset) { + unsigned int val; + /* Switch to the in-RAM version */ + offset_ptr_ram = (Elf32_Addr *)((ulong)offset_ptr_rom + + gd->reloc_off); + + /* + * Use "memcpy" because target location might be + * 16-bit aligned on ARC so we may need to read + * byte-by-byte. On attempt to read entire word by + * CPU throws an exception + */ + memcpy(&val, offset_ptr_ram, sizeof(int)); + + /* If location in ".text" section swap value */ + if ((unsigned int)offset_ptr_rom < + (unsigned int)&__text_end) + val = (val << 16) | (val >> 16); + + /* Check that the target points into .text */ + if (val >= CONFIG_SYS_TEXT_BASE && val <= + (unsigned int)&__bss_end) { + val += gd->reloc_off; + /* If location in ".text" section swap value */ + if ((unsigned int)offset_ptr_rom < + (unsigned int)&__text_end) + val = (val << 16) | (val >> 16); + memcpy(offset_ptr_ram, &val, sizeof(int)); + } else { + debug(" %p: rom reloc %x, ram %p, value %x, limit %x\n", + re_src, re_src->r_offset, offset_ptr_ram, + val, (unsigned int)&__bss_end); + } + } else { + debug(" %p: rom reloc %x, last %p\n", re_src, + re_src->r_offset, last_offset); + } + last_offset = offset_ptr_rom; + + } while (++re_src < re_end); + + return 0; +} diff --git a/arch/arc/lib/sections.c b/arch/arc/lib/sections.c new file mode 100644 index 00000000000..b0b46a4e9ae --- /dev/null +++ b/arch/arc/lib/sections.c @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2013-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +/* + * For some reason linker sets linker-generated symbols to zero in PIE mode. + * A work-around is substitution of linker-generated symbols with + * compiler-generated symbols which are properly handled by linker in PAE mode. + */ + +char __bss_start[0] __attribute__((section(".__bss_start"))); +char __bss_end[0] __attribute__((section(".__bss_end"))); +char __image_copy_start[0] __attribute__((section(".__image_copy_start"))); +char __image_copy_end[0] __attribute__((section(".__image_copy_end"))); +char __rel_dyn_start[0] __attribute__((section(".__rel_dyn_start"))); +char __rel_dyn_end[0] __attribute__((section(".__rel_dyn_end"))); +char __text_start[0] __attribute__((section(".__text_start"))); +char __text_end[0] __attribute__((section(".__text_end"))); +char __init_end[0] __attribute__((section(".__init_end"))); diff --git a/arch/arc/lib/strchr-700.S b/arch/arc/lib/strchr-700.S new file mode 100644 index 00000000000..55fcc9fb00e --- /dev/null +++ b/arch/arc/lib/strchr-700.S @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +/* + * ARC700 has a relatively long pipeline and branch prediction, so we want + * to avoid branches that are hard to predict. On the other hand, the + * presence of the norm instruction makes it easier to operate on whole + * words branch-free. + */ + +.global strchr +.align 4 +strchr: + extb_s %r1, %r1 + asl %r5, %r1, 8 + bmsk %r2, %r0, 1 + or %r5, %r5, %r1 + mov_s %r3, 0x01010101 + breq.d %r2, %r0, .Laligned + asl %r4, %r5, 16 + sub_s %r0, %r0, %r2 + asl %r7, %r2, 3 + ld_s %r2, [%r0] +#ifdef __LITTLE_ENDIAN__ + asl %r7, %r3, %r7 +#else /* __BIG_ENDIAN__ */ + lsr %r7, %r3, %r7 +#endif /* _ENDIAN__ */ + or %r5, %r5, %r4 + ror %r4, %r3 + sub %r12, %r2, %r7 + bic_s %r12, %r12, %r2 + and %r12, %r12, %r4 + brne.d %r12, 0, .Lfound0_ua + xor %r6, %r2, %r5 + ld.a %r2, [%r0, 4] + sub %r12, %r6, %r7 + bic %r12, %r12, %r6 +#ifdef __LITTLE_ENDIAN__ + and %r7, %r12, %r4 + /* For speed, we want this branch to be unaligned. */ + breq %r7, 0, .Loop + /* Likewise this one */ + b .Lfound_char +#else /* __BIG_ENDIAN__ */ + and %r12, %r12, %r4 + /* For speed, we want this branch to be unaligned. */ + breq %r12, 0, .Loop + lsr_s %r12, %r12, 7 + bic %r2, %r7, %r6 + b.d .Lfound_char_b + and_s %r2, %r2, %r12 +#endif /* _ENDIAN__ */ + /* We require this code address to be unaligned for speed... */ +.Laligned: + ld_s %r2, [%r0] + or %r5, %r5, %r4 + ror %r4, %r3 + /* ... so that this code address is aligned, for itself and ... */ +.Loop: + sub %r12, %r2, %r3 + bic_s %r12, %r12, %r2 + and %r12, %r12, %r4 + brne.d %r12, 0, .Lfound0 + xor %r6, %r2, %r5 + ld.a %r2, [%r0, 4] + sub %r12, %r6, %r3 + bic %r12, %r12, %r6 + and %r7, %r12, %r4 + breq %r7, 0, .Loop + /* + *... so that this branch is unaligned. + * Found searched-for character. + * r0 has already advanced to next word. + */ +#ifdef __LITTLE_ENDIAN__ + /* + * We only need the information about the first matching byte + * (i.e. the least significant matching byte) to be exact, + * hence there is no problem with carry effects. + */ +.Lfound_char: + sub %r3, %r7, 1 + bic %r3, %r3, %r7 + norm %r2, %r3 + sub_s %r0, %r0, 1 + asr_s %r2, %r2, 3 + j.d [%blink] + sub_s %r0, %r0, %r2 + + .balign 4 +.Lfound0_ua: + mov %r3, %r7 +.Lfound0: + sub %r3, %r6, %r3 + bic %r3, %r3, %r6 + and %r2, %r3, %r4 + or_s %r12, %r12, %r2 + sub_s %r3, %r12, 1 + bic_s %r3, %r3, %r12 + norm %r3, %r3 + add_s %r0, %r0, 3 + asr_s %r12, %r3, 3 + asl.f 0, %r2, %r3 + sub_s %r0, %r0, %r12 + j_s.d [%blink] + mov.pl %r0, 0 +#else /* __BIG_ENDIAN__ */ +.Lfound_char: + lsr %r7, %r7, 7 + + bic %r2, %r7, %r6 +.Lfound_char_b: + norm %r2, %r2 + sub_s %r0, %r0, 4 + asr_s %r2, %r2, 3 + j.d [%blink] + add_s %r0, %r0, %r2 + +.Lfound0_ua: + mov_s %r3, %r7 +.Lfound0: + asl_s %r2, %r2, 7 + or %r7, %r6, %r4 + bic_s %r12, %r12, %r2 + sub %r2, %r7, %r3 + or %r2, %r2, %r6 + bic %r12, %r2, %r12 + bic.f %r3, %r4, %r12 + norm %r3, %r3 + + add.pl %r3, %r3, 1 + asr_s %r12, %r3, 3 + asl.f 0, %r2, %r3 + add_s %r0, %r0, %r12 + j_s.d [%blink] + mov.mi %r0, 0 +#endif /* _ENDIAN__ */ diff --git a/arch/arc/lib/strcmp.S b/arch/arc/lib/strcmp.S new file mode 100644 index 00000000000..8cb7d2f18c6 --- /dev/null +++ b/arch/arc/lib/strcmp.S @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +/* + * This is optimized primarily for the ARC700. + * It would be possible to speed up the loops by one cycle / word + * respective one cycle / byte by forcing double source 1 alignment, unrolling + * by a factor of two, and speculatively loading the second word / byte of + * source 1; however, that would increase the overhead for loop setup / finish, + * and strcmp might often terminate early. + */ + +.global strcmp +.align 4 +strcmp: + or %r2, %r0, %r1 + bmsk_s %r2, %r2, 1 + brne %r2, 0, .Lcharloop + mov_s %r12, 0x01010101 + ror %r5, %r12 +.Lwordloop: + ld.ab %r2, [%r0, 4] + ld.ab %r3, [%r1, 4] + nop_s + sub %r4, %r2, %r12 + bic %r4, %r4, %r2 + and %r4, %r4, %r5 + brne %r4, 0, .Lfound0 + breq %r2 ,%r3, .Lwordloop +#ifdef __LITTLE_ENDIAN__ + xor %r0, %r2, %r3 /* mask for difference */ + sub_s %r1, %r0, 1 + bic_s %r0, %r0, %r1 /* mask for least significant difference bit */ + sub %r1, %r5, %r0 + xor %r0, %r5, %r1 /* mask for least significant difference byte */ + and_s %r2, %r2, %r0 + and_s %r3, %r3, %r0 +#endif /* _ENDIAN__ */ + cmp_s %r2, %r3 + mov_s %r0, 1 + j_s.d [%blink] + bset.lo %r0, %r0, 31 + + .balign 4 +#ifdef __LITTLE_ENDIAN__ +.Lfound0: + xor %r0, %r2, %r3 /* mask for difference */ + or %r0, %r0, %r4 /* or in zero indicator */ + sub_s %r1, %r0, 1 + bic_s %r0, %r0, %r1 /* mask for least significant difference bit */ + sub %r1, %r5, %r0 + xor %r0, %r5, %r1 /* mask for least significant difference byte */ + and_s %r2, %r2, %r0 + and_s %r3, %r3, %r0 + sub.f %r0, %r2, %r3 + mov.hi %r0, 1 + j_s.d [%blink] + bset.lo %r0, %r0, 31 +#else /* __BIG_ENDIAN__ */ + /* + * The zero-detection above can mis-detect 0x01 bytes as zeroes + * because of carry-propagateion from a lower significant zero byte. + * We can compensate for this by checking that bit0 is zero. + * This compensation is not necessary in the step where we + * get a low estimate for r2, because in any affected bytes + * we already have 0x00 or 0x01, which will remain unchanged + * when bit 7 is cleared. + */ + .balign 4 +.Lfound0: + lsr %r0, %r4, 8 + lsr_s %r1, %r2 + bic_s %r2, %r2, %r0 /* get low estimate for r2 and get ... */ + bic_s %r0, %r0, %r1 /* */ + or_s %r3, %r3, %r0 /* ... high estimate r3 so that r2 > r3 will */ + cmp_s %r3, %r2 /* ... be independent of trailing garbage */ + or_s %r2, %r2, %r0 /* likewise for r3 > r2 */ + bic_s %r3, %r3, %r0 + rlc %r0, 0 /* r0 := r2 > r3 ? 1 : 0 */ + cmp_s %r2, %r3 + j_s.d [%blink] + bset.lo %r0, %r0, 31 +#endif /* _ENDIAN__ */ + + .balign 4 +.Lcharloop: + ldb.ab %r2,[%r0,1] + ldb.ab %r3,[%r1,1] + nop_s + breq %r2, 0, .Lcmpend + breq %r2, %r3, .Lcharloop +.Lcmpend: + j_s.d [%blink] + sub %r0, %r2, %r3 diff --git a/arch/arc/lib/strcpy-700.S b/arch/arc/lib/strcpy-700.S new file mode 100644 index 00000000000..41bb53e5015 --- /dev/null +++ b/arch/arc/lib/strcpy-700.S @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +/* + * If dst and src are 4 byte aligned, copy 8 bytes at a time. + * If the src is 4, but not 8 byte aligned, we first read 4 bytes to get + * it 8 byte aligned. Thus, we can do a little read-ahead, without + * dereferencing a cache line that we should not touch. + * Note that short and long instructions have been scheduled to avoid + * branch stalls. + * The beq_s to r3z could be made unaligned & long to avoid a stall + * there, but it is not likely to be taken often, and it would also be likely + * to cost an unaligned mispredict at the next call. + */ + +.global strcpy +.align 4 +strcpy: + or %r2, %r0, %r1 + bmsk_s %r2, %r2, 1 + brne.d %r2, 0, charloop + mov_s %r10, %r0 + ld_s %r3, [%r1, 0] + mov %r8, 0x01010101 + bbit0.d %r1, 2, loop_start + ror %r12, %r8 + sub %r2, %r3, %r8 + bic_s %r2, %r2, %r3 + tst_s %r2,%r12 + bne r3z + mov_s %r4,%r3 + .balign 4 +loop: + ld.a %r3, [%r1, 4] + st.ab %r4, [%r10, 4] +loop_start: + ld.a %r4, [%r1, 4] + sub %r2, %r3, %r8 + bic_s %r2, %r2, %r3 + tst_s %r2, %r12 + bne_s r3z + st.ab %r3, [%r10, 4] + sub %r2, %r4, %r8 + bic %r2, %r2, %r4 + tst %r2, %r12 + beq loop + mov_s %r3, %r4 +#ifdef __LITTLE_ENDIAN__ +r3z: bmsk.f %r1, %r3, 7 + lsr_s %r3, %r3, 8 +#else /* __BIG_ENDIAN__ */ +r3z: lsr.f %r1, %r3, 24 + asl_s %r3, %r3, 8 +#endif /* _ENDIAN__ */ + bne.d r3z + stb.ab %r1, [%r10, 1] + j_s [%blink] + + .balign 4 +charloop: + ldb.ab %r3, [%r1, 1] + brne.d %r3, 0, charloop + stb.ab %r3, [%r10, 1] + j [%blink] diff --git a/arch/arc/lib/strlen.S b/arch/arc/lib/strlen.S new file mode 100644 index 00000000000..666e22c0d54 --- /dev/null +++ b/arch/arc/lib/strlen.S @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2004, 2007-2010, 2011-2014 Synopsys, Inc. All rights reserved. + * + * SPDX-License-Identifier: GPL-2.0+ + */ + +.global strlen +.align 4 +strlen: + or %r3, %r0, 7 + ld %r2, [%r3, -7] + ld.a %r6, [%r3, -3] + mov %r4, 0x01010101 + /* uses long immediate */ +#ifdef __LITTLE_ENDIAN__ + asl_s %r1, %r0, 3 + btst_s %r0, 2 + asl %r7, %r4, %r1 + ror %r5, %r4 + sub %r1, %r2, %r7 + bic_s %r1, %r1, %r2 + mov.eq %r7, %r4 + sub %r12, %r6, %r7 + bic %r12, %r12, %r6 + or.eq %r12, %r12, %r1 + and %r12, %r12, %r5 + brne %r12, 0, .Learly_end +#else /* __BIG_ENDIAN__ */ + ror %r5, %r4 + btst_s %r0, 2 + mov_s %r1, 31 + sub3 %r7, %r1, %r0 + sub %r1, %r2, %r4 + bic_s %r1, %r1, %r2 + bmsk %r1, %r1, %r7 + sub %r12, %r6, %r4 + bic %r12, %r12, %r6 + bmsk.ne %r12, %r12, %r7 + or.eq %r12, %r12, %r1 + and %r12, %r12, %r5 + brne %r12, 0, .Learly_end +#endif /* _ENDIAN__ */ + +.Loop: + ld_s %r2, [%r3, 4] + ld.a %r6, [%r3, 8] + /* stall for load result */ + sub %r1, %r2, %r4 + bic_s %r1, %r1, %r2 + sub %r12, %r6, %r4 + bic %r12, %r12, %r6 + or %r12, %r12, %r1 + and %r12, %r12, %r5 + breq %r12, 0, .Loop +.Lend: + and.f %r1, %r1, %r5 + sub.ne %r3, %r3, 4 + mov.eq %r1, %r12 +#ifdef __LITTLE_ENDIAN__ + sub_s %r2, %r1, 1 + bic_s %r2, %r2, %r1 + norm %r1, %r2 + sub_s %r0, %r0, 3 + lsr_s %r1, %r1, 3 + sub %r0, %r3, %r0 + j_s.d [%blink] + sub %r0, %r0, %r1 +#else /* __BIG_ENDIAN__ */ + lsr_s %r1, %r1, 7 + mov.eq %r2, %r6 + bic_s %r1, %r1, %r2 + norm %r1, %r1 + sub %r0, %r3, %r0 + lsr_s %r1, %r1, 3 + j_s.d [%blink] + add %r0, %r0, %r1 +#endif /* _ENDIAN */ +.Learly_end: + b.d .Lend + sub_s.ne %r1, %r1, %r1 -- cgit v1.2.3