Merge tag 'loongarch-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson

Pull LoongArch updates from Huacai Chen: - Better backtraces for humanization - Relay BCE exceptions to userland as SIGSEGV - Provide kernel fpu functions - Optimize memory ops (memset/memcpy/memmove) - Optimize checksum and crc32(c) calculation - Add ARCH_HAS_FORTIFY_SOURCE selection - Add function error injection support - Add ftrace with direct call support - Add basic perf tools support * tag 'loongarch-6.4' of git://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson: (24 commits) tools/perf: Add basic support for LoongArch LoongArch: ftrace: Add direct call trampoline samples support LoongArch: ftrace: Add direct call support LoongArch: ftrace: Implement ftrace_find_callable_addr() to simplify code LoongArch: ftrace: Fix build error if DYNAMIC_FTRACE_WITH_REGS is not set LoongArch: ftrace: Abstract DYNAMIC_FTRACE_WITH_ARGS accesses LoongArch: Add support for function error injection LoongArch: Add ARCH_HAS_FORTIFY_SOURCE selection LoongArch: crypto: Add crc32 and crc32c hw acceleration LoongArch: Add checksum optimization for 64-bit system LoongArch: Optimize memory ops (memset/memcpy/memmove) LoongArch: Provide kernel fpu functions LoongArch: Relay BCE exceptions to userland as SIGSEGV with si_code=SEGV_BNDERR LoongArch: Tweak the BADV and CPUCFG.PRID lines in show_regs() LoongArch: Humanize the ESTAT line when showing registers LoongArch: Humanize the ECFG line when showing registers LoongArch: Humanize the EUEN line when showing registers LoongArch: Humanize the PRMD line when showing registers LoongArch: Humanize the CRMD line when showing registers LoongArch: Fix format of CSR lines during show_regs() ...
author: Linus Torvalds 2023-05-04 12:40:16 -0700
committer: Linus Torvalds 2023-05-04 12:40:16 -0700
commit: 611c9d88302cb9ac3b0f58f4a06c0ffb98345bd2 (patch)
tree: 3aeefa3eb0499df16ec08965da68d9fa38f1befa /arch
parent: a1f749de8a610096ca0dd9a40d89c8fa4098c8eb (diff)
parent: 2fa5ebe3bc4e31e07a99196455498472417842f2 (diff)
28 files changed, 1666 insertions, 295 deletions
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index f44f6ea54e46..d38b066fc931 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -10,6 +10,7 @@ config LOONGARCH
 	select ARCH_ENABLE_MEMORY_HOTPLUG
 	select ARCH_ENABLE_MEMORY_HOTREMOVE
 	select ARCH_HAS_ACPI_TABLE_UPGRADE	if ACPI
+	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	select ARCH_HAS_PTE_SPECIAL
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
@@ -93,6 +94,7 @@ config LOONGARCH
 	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
@@ -100,6 +102,7 @@ config LOONGARCH
 	select HAVE_FAST_GUP
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
+	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_GENERIC_VDSO
@@ -118,6 +121,8 @@ config LOONGARCH
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RSEQ
+	select HAVE_SAMPLE_FTRACE_DIRECT
+	select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
 	select HAVE_SETUP_PER_CPU_AREA if NUMA
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index f71edf574101..a27e264bdaa5 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -115,6 +115,8 @@ endif
 libs-y += arch/loongarch/lib/
 libs-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
 
+drivers-y		+= arch/loongarch/crypto/
+
 # suspend and hibernation support
 drivers-$(CONFIG_PM)	+= arch/loongarch/power/
 
diff --git a/arch/loongarch/crypto/Kconfig b/arch/loongarch/crypto/Kconfig
new file mode 100644
index 000000000000..200a6e8b43b1
--- /dev/null
+++ b/arch/loongarch/crypto/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (loongarch)"
+
+config CRYPTO_CRC32_LOONGARCH
+	tristate "CRC32c and CRC32"
+	select CRC32
+	select CRYPTO_HASH
+	help
+	  CRC32c and CRC32 CRC algorithms
+
+	  Architecture: LoongArch with CRC32 instructions
+
+endmenu
diff --git a/arch/loongarch/crypto/Makefile b/arch/loongarch/crypto/Makefile
new file mode 100644
index 000000000000..d22613d27ce9
--- /dev/null
+++ b/arch/loongarch/crypto/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for LoongArch crypto files..
+#
+
+obj-$(CONFIG_CRYPTO_CRC32_LOONGARCH) += crc32-loongarch.o
diff --git a/arch/loongarch/crypto/crc32-loongarch.c b/arch/loongarch/crypto/crc32-loongarch.c
new file mode 100644
index 000000000000..1f2a2c3839bc
--- /dev/null
+++ b/arch/loongarch/crypto/crc32-loongarch.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * crc32.c - CRC32 and CRC32C using LoongArch crc* instructions
+ *
+ * Module based on mips/crypto/crc32-mips.c
+ *
+ * Copyright (C) 2014 Linaro Ltd <yazen.ghannam@linaro.org>
+ * Copyright (C) 2018 MIPS Tech, LLC
+ * Copyright (C) 2020-2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/module.h>
+#include <crypto/internal/hash.h>
+
+#include <asm/cpu-features.h>
+#include <asm/unaligned.h>
+
+#define _CRC32(crc, value, size, type)			\
+do {							\
+	__asm__ __volatile__(				\
+		#type ".w." #size ".w" " %0, %1, %0\n\t"\
+		: "+r" (crc)				\
+		: "r" (value)				\
+		: "memory");				\
+} while (0)
+
+#define CRC32(crc, value, size)		_CRC32(crc, value, size, crc)
+#define CRC32C(crc, value, size)	_CRC32(crc, value, size, crcc)
+
+static u32 crc32_loongarch_hw(u32 crc_, const u8 *p, unsigned int len)
+{
+	u32 crc = crc_;
+
+	while (len >= sizeof(u64)) {
+		u64 value = get_unaligned_le64(p);
+
+		CRC32(crc, value, d);
+		p += sizeof(u64);
+		len -= sizeof(u64);
+	}
+
+	if (len & sizeof(u32)) {
+		u32 value = get_unaligned_le32(p);
+
+		CRC32(crc, value, w);
+		p += sizeof(u32);
+		len -= sizeof(u32);
+	}
+
+	if (len & sizeof(u16)) {
+		u16 value = get_unaligned_le16(p);
+
+		CRC32(crc, value, h);
+		p += sizeof(u16);
+	}
+
+	if (len & sizeof(u8)) {
+		u8 value = *p++;
+
+		CRC32(crc, value, b);
+	}
+
+	return crc;
+}
+
+static u32 crc32c_loongarch_hw(u32 crc_, const u8 *p, unsigned int len)
+{
+	u32 crc = crc_;
+
+	while (len >= sizeof(u64)) {
+		u64 value = get_unaligned_le64(p);
+
+		CRC32C(crc, value, d);
+		p += sizeof(u64);
+		len -= sizeof(u64);
+	}
+
+	if (len & sizeof(u32)) {
+		u32 value = get_unaligned_le32(p);
+
+		CRC32C(crc, value, w);
+		p += sizeof(u32);
+		len -= sizeof(u32);
+	}
+
+	if (len & sizeof(u16)) {
+		u16 value = get_unaligned_le16(p);
+
+		CRC32C(crc, value, h);
+		p += sizeof(u16);
+	}
+
+	if (len & sizeof(u8)) {
+		u8 value = *p++;
+
+		CRC32C(crc, value, b);
+	}
+
+	return crc;
+}
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+struct chksum_ctx {
+	u32 key;
+};
+
+struct chksum_desc_ctx {
+	u32 crc;
+};
+
+static int chksum_init(struct shash_desc *desc)
+{
+	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = mctx->key;
+
+	return 0;
+}
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy
+ * If your algorithm starts with ~0, then XOR with ~0 before you set the seed.
+ */
+static int chksum_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen)
+{
+	struct chksum_ctx *mctx = crypto_shash_ctx(tfm);
+
+	if (keylen != sizeof(mctx->key))
+		return -EINVAL;
+
+	mctx->key = get_unaligned_le32(key);
+
+	return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data, unsigned int length)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = crc32_loongarch_hw(ctx->crc, data, length);
+	return 0;
+}
+
+static int chksumc_update(struct shash_desc *desc, const u8 *data, unsigned int length)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	ctx->crc = crc32c_loongarch_hw(ctx->crc, data, length);
+	return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	put_unaligned_le32(ctx->crc, out);
+	return 0;
+}
+
+static int chksumc_final(struct shash_desc *desc, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	put_unaligned_le32(~ctx->crc, out);
+	return 0;
+}
+
+static int __chksum_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
+{
+	put_unaligned_le32(crc32_loongarch_hw(crc, data, len), out);
+	return 0;
+}
+
+static int __chksumc_finup(u32 crc, const u8 *data, unsigned int len, u8 *out)
+{
+	put_unaligned_le32(~crc32c_loongarch_hw(crc, data, len), out);
+	return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksumc_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out)
+{
+	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+	return __chksumc_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out)
+{
+	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+
+	return __chksum_finup(mctx->key, data, length, out);
+}
+
+static int chksumc_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out)
+{
+	struct chksum_ctx *mctx = crypto_shash_ctx(desc->tfm);
+
+	return __chksumc_finup(mctx->key, data, length, out);
+}
+
+static int chksum_cra_init(struct crypto_tfm *tfm)
+{
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
+
+	mctx->key = 0;
+	return 0;
+}
+
+static int chksumc_cra_init(struct crypto_tfm *tfm)
+{
+	struct chksum_ctx *mctx = crypto_tfm_ctx(tfm);
+
+	mctx->key = ~0;
+	return 0;
+}
+
+static struct shash_alg crc32_alg = {
+	.digestsize		=	CHKSUM_DIGEST_SIZE,
+	.setkey			=	chksum_setkey,
+	.init			=	chksum_init,
+	.update			=	chksum_update,
+	.final			=	chksum_final,
+	.finup			=	chksum_finup,
+	.digest			=	chksum_digest,
+	.descsize		=	sizeof(struct chksum_desc_ctx),
+	.base			=	{
+		.cra_name		=	"crc32",
+		.cra_driver_name	=	"crc32-loongarch",
+		.cra_priority		=	300,
+		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
+		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
+		.cra_alignmask		=	0,
+		.cra_ctxsize		=	sizeof(struct chksum_ctx),
+		.cra_module		=	THIS_MODULE,
+		.cra_init		=	chksum_cra_init,
+	}
+};
+
+static struct shash_alg crc32c_alg = {
+	.digestsize		=	CHKSUM_DIGEST_SIZE,
+	.setkey			=	chksum_setkey,
+	.init			=	chksum_init,
+	.update			=	chksumc_update,
+	.final			=	chksumc_final,
+	.finup			=	chksumc_finup,
+	.digest			=	chksumc_digest,
+	.descsize		=	sizeof(struct chksum_desc_ctx),
+	.base			=	{
+		.cra_name		=	"crc32c",
+		.cra_driver_name	=	"crc32c-loongarch",
+		.cra_priority		=	300,
+		.cra_flags		=	CRYPTO_ALG_OPTIONAL_KEY,
+		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
+		.cra_alignmask		=	0,
+		.cra_ctxsize		=	sizeof(struct chksum_ctx),
+		.cra_module		=	THIS_MODULE,
+		.cra_init		=	chksumc_cra_init,
+	}
+};
+
+static int __init crc32_mod_init(void)
+{
+	int err;
+
+	if (!cpu_has(CPU_FEATURE_CRC32))
+		return 0;
+
+	err = crypto_register_shash(&crc32_alg);
+	if (err)
+		return err;
+
+	err = crypto_register_shash(&crc32c_alg);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void __exit crc32_mod_exit(void)
+{
+	if (!cpu_has(CPU_FEATURE_CRC32))
+		return;
+
+	crypto_unregister_shash(&crc32_alg);
+	crypto_unregister_shash(&crc32c_alg);
+}
+
+module_init(crc32_mod_init);
+module_exit(crc32_mod_exit);
+
+MODULE_AUTHOR("Min Zhou <zhoumin@loongson.cn>");
+MODULE_AUTHOR("Huacai Chen <chenhuacai@loongson.cn>");
+MODULE_DESCRIPTION("CRC32 and CRC32C using LoongArch crc* instructions");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/loongarch/include/asm/checksum.h b/arch/loongarch/include/asm/checksum.h
new file mode 100644
index 000000000000..cabbf6af44c4
--- /dev/null
+++ b/arch/loongarch/include/asm/checksum.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2016 ARM Ltd.
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+#ifndef __ASM_CHECKSUM_H
+#define __ASM_CHECKSUM_H
+
+#include <linux/bitops.h>
+#include <linux/in6.h>
+
+#define _HAVE_ARCH_IPV6_CSUM
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum sum);
+
+/*
+ * turns a 32-bit partial checksum (e.g. from csum_partial) into a
+ * 1's complement 16-bit checksum.
+ */
+static inline __sum16 csum_fold(__wsum sum)
+{
+	u32 tmp = (__force u32)sum;
+
+	/*
+	 * swap the two 16-bit halves of sum
+	 * if there is a carry from adding the two 16-bit halves,
+	 * it will carry from the lower half into the upper half,
+	 * giving us the correct sum in the upper half.
+	 */
+	return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
+}
+#define csum_fold csum_fold
+
+/*
+ * This is a version of ip_compute_csum() optimized for IP headers,
+ * which always checksum on 4 octet boundaries.  ihl is the number
+ * of 32-bit words and is always >= 5.
+ */
+static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
+{
+	u64 sum;
+	__uint128_t tmp;
+	int n = ihl; /* we want it signed */
+
+	tmp = *(const __uint128_t *)iph;
+	iph += 16;
+	n -= 4;
+	tmp += ((tmp >> 64) | (tmp << 64));
+	sum = tmp >> 64;
+	do {
+		sum += *(const u32 *)iph;
+		iph += 4;
+	} while (--n > 0);
+
+	sum += ror64(sum, 32);
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+#define ip_fast_csum ip_fast_csum
+
+extern unsigned int do_csum(const unsigned char *buff, int len);
+#define do_csum do_csum
+
+#include <asm-generic/checksum.h>
+
+#endif	/* __ASM_CHECKSUM_H */
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index 358b254d9c1d..192f8e35d912 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -21,6 +21,9 @@
 
 struct sigcontext;
 
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+
 extern void _init_fpu(unsigned int);
 extern void _save_fp(struct loongarch_fpu *);
 extern void _restore_fp(struct loongarch_fpu *);
diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h
index 3418d32d4fc7..23e2ba78dcb0 100644
--- a/arch/loongarch/include/asm/ftrace.h
+++ b/arch/loongarch/include/asm/ftrace.h
@@ -54,9 +54,46 @@ static __always_inline struct pt_regs *arch_ftrace_get_regs(struct ftrace_regs *
 	return &fregs->regs;
 }
 
+static __always_inline unsigned long
+ftrace_regs_get_instruction_pointer(struct ftrace_regs *fregs)
+{
+	return instruction_pointer(&fregs->regs);
+}
+
+static __always_inline void
+ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs, unsigned long ip)
+{
+	regs_set_return_value(&fregs->regs, ip);
+}
+
+#define ftrace_regs_get_argument(fregs, n) \
+	regs_get_kernel_argument(&(fregs)->regs, n)
+#define ftrace_regs_get_stack_pointer(fregs) \
+	kernel_stack_pointer(&(fregs)->regs)
+#define ftrace_regs_return_value(fregs) \
+	regs_return_value(&(fregs)->regs)
+#define ftrace_regs_set_return_value(fregs, ret) \
+	regs_set_return_value(&(fregs)->regs, ret)
+#define ftrace_override_function_with_return(fregs) \
+	override_function_with_return(&(fregs)->regs)
+#define ftrace_regs_query_register_offset(name) \
+	regs_query_register_offset(name)
+
 #define ftrace_graph_func ftrace_graph_func
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static inline void
+__arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+{
+	regs->regs[13] = addr;	/* t1 */
+}
+
+#define arch_ftrace_set_direct_caller(fregs, addr) \
+	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h
index a04fe755d719..b09887ffcd15 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -121,6 +121,8 @@ enum reg2bstrd_op {
 };
 
 enum reg3_op {
+	asrtle_op	= 0x02,
+	asrtgt_op	= 0x03,
 	addw_op		= 0x20,
 	addd_op		= 0x21,
 	subw_op		= 0x22,
@@ -176,6 +178,30 @@ enum reg3_op {
 	amord_op	= 0x70c7,
 	amxorw_op	= 0x70c8,
 	amxord_op	= 0x70c9,
+	fldgts_op	= 0x70e8,
+	fldgtd_op	= 0x70e9,
+	fldles_op	= 0x70ea,
+	fldled_op	= 0x70eb,
+	fstgts_op	= 0x70ec,
+	fstgtd_op	= 0x70ed,
+	fstles_op	= 0x70ee,
+	fstled_op	= 0x70ef,
+	ldgtb_op	= 0x70f0,
+	ldgth_op	= 0x70f1,
+	ldgtw_op	= 0x70f2,
+	ldgtd_op	= 0x70f3,
+	ldleb_op	= 0x70f4,
+	ldleh_op	= 0x70f5,
+	ldlew_op	= 0x70f6,
+	ldled_op	= 0x70f7,
+	stgtb_op	= 0x70f8,
+	stgth_op	= 0x70f9,
+	stgtw_op	= 0x70fa,
+	stgtd_op	= 0x70fb,
+	stleb_op	= 0x70fc,
+	stleh_op	= 0x70fd,
+	stlew_op	= 0x70fe,
+	stled_op	= 0x70ff,
 };
 
 enum reg3sa2_op {
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 83da5d29e2d1..b3323ab5b78d 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -311,8 +311,8 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
 #define  CSR_ECFG_VS_WIDTH		3
 #define  CSR_ECFG_VS			(_ULCAST_(0x7) << CSR_ECFG_VS_SHIFT)
 #define  CSR_ECFG_IM_SHIFT		0
-#define  CSR_ECFG_IM_WIDTH		13
-#define  CSR_ECFG_IM			(_ULCAST_(0x1fff) << CSR_ECFG_IM_SHIFT)
+#define  CSR_ECFG_IM_WIDTH		14
+#define  CSR_ECFG_IM			(_ULCAST_(0x3fff) << CSR_ECFG_IM_SHIFT)
 
 #define LOONGARCH_CSR_ESTAT		0x5	/* Exception status */
 #define  CSR_ESTAT_ESUBCODE_SHIFT	22
@@ -322,8 +322,8 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
 #define  CSR_ESTAT_EXC_WIDTH		6
 #define  CSR_ESTAT_EXC			(_ULCAST_(0x3f) << CSR_ESTAT_EXC_SHIFT)
 #define  CSR_ESTAT_IS_SHIFT		0
-#define  CSR_ESTAT_IS_WIDTH		15
-#define  CSR_ESTAT_IS			(_ULCAST_(0x7fff) << CSR_ESTAT_IS_SHIFT)
+#define  CSR_ESTAT_IS_WIDTH		14
+#define  CSR_ESTAT_IS			(_ULCAST_(0x3fff) << CSR_ESTAT_IS_SHIFT)
 
 #define LOONGARCH_CSR_ERA		0x6	/* ERA */
 
@@ -1090,7 +1090,7 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
 #define ECFGF_IPI		(_ULCAST_(1) << ECFGB_IPI)
 #define ECFGF(hwirq)		(_ULCAST_(1) << hwirq)
 
-#define ESTATF_IP		0x00001fff
+#define ESTATF_IP		0x00003fff
 
 #define LOONGARCH_IOCSR_FEATURES	0x8
 #define  IOCSRF_TEMP			BIT_ULL(0)
@@ -1397,7 +1397,7 @@ __BUILD_CSR_OP(tlbidx)
 	#define EXSUBCODE_ADEF		0	/* Fetch Instruction */
 	#define EXSUBCODE_ADEM		1	/* Access Memory*/
 #define EXCCODE_ALE		9	/* Unalign Access */
-#define EXCCODE_OOB		10	/* Out of bounds */
+#define EXCCODE_BCE		10	/* Bounds Check Error */
 #define EXCCODE_SYS		11	/* System call */
 #define EXCCODE_BP		12	/* Breakpoint */
 #define EXCCODE_INE		13	/* Inst. Not Exist */
@@ -1408,33 +1408,38 @@ __BUILD_CSR_OP(tlbidx)
 #define EXCCODE_FPE		18	/* Floating Point Exception */
 	#define EXCSUBCODE_FPE		0	/* Floating Point Exception */
 	#define EXCSUBCODE_VFPE		1	/* Vector Exception */
-#define EXCCODE_WATCH		19	/* Watch address reference */
+#define EXCCODE_WATCH		19	/* WatchPoint Exception */
+	#define EXCSUBCODE_WPEF		0	/* ... on Instruction Fetch */
+	#define EXCSUBCODE_WPEM		1	/* ... on Memory Accesses */
 #define EXCCODE_BTDIS		20	/* Binary Trans. Disabled */
 #define EXCCODE_BTE		21	/* Binary Trans. Exception */
-#define EXCCODE_PSI		22	/* Guest Privileged Error */
-#define EXCCODE_HYP		23	/* Hypercall */
+#define EXCCODE_GSPR		22	/* Guest Privileged Error */
+#define EXCCODE_HVC		23	/* Hypercall */
 #define EXCCODE_GCM		24	/* Guest CSR modified */
 	#define EXCSUBCODE_GCSC		0	/* Software caused */
 	#define EXCSUBCODE_GCHC		1	/* Hardware caused */
 #define EXCCODE_SE		25	/* Security */
 
-#define EXCCODE_INT_START   64
-#define EXCCODE_SIP0        64
-#define EXCCODE_SIP1        65
-#define EXCCODE_IP0         66
-#define EXCCODE_IP1         67
-#define EXCCODE_IP2         68
-#define EXCCODE_IP3         69
-#define EXCCODE_IP4         70
-#define EXCCODE_IP5         71
-#define EXCCODE_IP6         72
-#define EXCCODE_IP7         73
-#define EXCCODE_PMC         74 /* Performance Counter */
-#define EXCCODE_TIMER       75
-#define EXCCODE_IPI         76
-#define EXCCODE_NMI         77
-#define EXCCODE_INT_END     78
-#define EXCCODE_INT_NUM	    (EXCCODE_INT_END - EXCCODE_INT_START)
+/* Interrupt numbers */
+#define INT_SWI0	0	/* Software Interrupts */
+#define INT_SWI1	1
+#define INT_HWI0	2	/* Hardware Interrupts */
+#define INT_HWI1	3
+#define INT_HWI2	4
+#define INT_HWI3	5
+#define INT_HWI4	6
+#define INT_HWI5	7
+#define INT_HWI6	8
+#define INT_HWI7	9
+#define INT_PCOV	10	/* Performance Counter Overflow */
+#define INT_TI		11	/* Timer */
+#define INT_IPI		12
+#define INT_NMI		13
+
+/* ExcCodes corresponding to interrupts */
+#define EXCCODE_INT_NUM		(INT_NMI + 1)
+#define EXCCODE_INT_START	64
+#define EXCCODE_INT_END		(EXCCODE_INT_START + EXCCODE_INT_NUM - 1)
 
 /* FPU register names */
 #define LOONGARCH_FCSR0	$r0
diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h
index d761db943335..35f0958163ac 100644
--- a/arch/loongarch/include/asm/ptrace.h
+++ b/arch/loongarch/include/asm/ptrace.h
@@ -154,6 +154,11 @@ static inline long regs_return_value(struct pt_regs *regs)
 	return regs->regs[4];
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long val)
+{
+	regs->regs[4] = val;
+}
+
 #define instruction_pointer(regs) ((regs)->csr_era)
 #define profile_pc(regs) instruction_pointer(regs)
 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 78d4e3384305..9a72d91cd104 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -13,7 +13,7 @@ obj-y		+= head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \
 obj-$(CONFIG_ACPI)		+= acpi.o
 obj-$(CONFIG_EFI) 		+= efi.o
 
-obj-$(CONFIG_CPU_HAS_FPU)	+= fpu.o
+obj-$(CONFIG_CPU_HAS_FPU)	+= fpu.o kfpu.o
 
 obj-$(CONFIG_ARCH_STRICT_ALIGN)	+= unaligned.o
 
diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index 4a3ef8516ccc..73858c9029cc 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -30,19 +30,12 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, bool validate)
 	return 0;
 }
 
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
-
 #ifdef CONFIG_MODULES
-static inline int __get_mod(struct module **mod, unsigned long addr)
+static bool reachable_by_bl(unsigned long addr, unsigned long pc)
 {
-	preempt_disable();
-	*mod = __module_text_address(addr);
-	preempt_enable();
+	long offset = (long)addr - (long)pc;
 
-	if (WARN_ON(!(*mod)))
-		return -EINVAL;
-
-	return 0;
+	return offset >= -SZ_128M && offset < SZ_128M;
 }
 
 static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
@@ -58,51 +51,88 @@ static struct plt_entry *get_ftrace_plt(struct module *mod, unsigned long addr)
 	return NULL;
 }
 
-static unsigned long get_plt_addr(struct module *mod, unsigned long addr)
+/*
+ * Find the address the callsite must branch to in order to reach '*addr'.
+ *
+ * Due to the limited range of 'bl' instruction, modules may be placed too far
+ * away to branch directly and we must use a PLT.
+ *
+ * Returns true when '*addr' contains a reachable target address, or has been
+ * modified to contain a PLT address. Returns false otherwise.
+ */
+static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, struct module *mod, unsigned long *addr)
 {
+	unsigned long pc = rec->ip + LOONGARCH_INSN_SIZE;
 	struct plt_entry *plt;
 
-	plt = get_ftrace_plt(mod, addr);
+	/*
+	 * If a custom trampoline is unreachable, rely on the ftrace_regs_caller
+	 * trampoline which knows how to indirectly reach that trampoline through
+	 * ops->direct_call.
+	 */
+	if (*addr != FTRACE_ADDR && *addr != FTRACE_REGS_ADDR && !reachable_by_bl(*addr, pc))
+		*addr = FTRACE_REGS_ADDR;
+
+	/*
+	 * When the target is within range of the 'bl' instruction, use 'addr'
+	 * as-is and branch to that directly.
+	 */
+	if (reachable_by_bl(*addr, pc))
+		return true;
+
+	/*
+	 * 'mod' is only set at module load time, but if we end up
+	 * dealing with an out-of-range condition, we can assume it
+	 * is due to a module being loaded far away from the kernel.
+	 *
+	 * NOTE: __module_text_address() must be called with preemption
+	 * disabled, but we can rely on ftrace_lock to ensure that 'mod'
+	 * retains its validity throughout the remainder of this code.
+	 */
+	if (!mod) {
+		preempt_disable();
+		mod = __module_text_address(pc);
+		preempt_enable();
+	}
+
+	if (WARN_ON(!mod))
+		return false;
+
+	plt = get_ftrace_plt(mod, *addr);
 	if (!plt) {
-		pr_err("ftrace: no module PLT for %ps\n", (void *)addr);
-		return -EINVAL;
+		pr_err("ftrace: no module PLT for %ps\n", (void *)*addr);
+		return false;
 	}
 
-	return (unsigned long)plt;
+	*addr = (unsigned long)plt;
+	return true;
+}
+#else /* !CONFIG_MODULES */
+static bool ftrace_find_callable_addr(struct dyn_ftrace *rec, struct module *mod, unsigned long *addr)
+{
+	return true;
 }
 #endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
 int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr)
 {
 	u32 old, new;
 	unsigned long pc;
-	long offset __maybe_unused;
 
 	pc = rec->ip + LOONGARCH_INSN_SIZE;
 
-#ifdef CONFIG_MODULES
-	offset = (long)pc - (long)addr;
-
-	if (offset < -SZ_128M || offset >= SZ_128M) {
-		int ret;
-		struct module *mod;
-
-		ret = __get_mod(&mod, pc);
-		if (ret)
-			return ret;
-
-		addr = get_plt_addr(mod, addr);
+	if (!ftrace_find_callable_addr(rec, NULL, &addr))
+		return -EINVAL;
 
-		old_addr = get_plt_addr(mod, old_addr);
-	}
-#endif
+	if (!ftrace_find_callable_addr(rec, NULL, &old_addr))
+		return -EINVAL;
 
 	new = larch_insn_gen_bl(pc, addr);
 	old = larch_insn_gen_bl(pc, old_addr);
 
 	return ftrace_modify_code(pc, old, new, true);
 }
-
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -153,24 +183,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
 	u32 old, new;
 	unsigned long pc;
-	long offset __maybe_unused;
 
 	pc = rec->ip + LOONGARCH_INSN_SIZE;
 
-#ifdef CONFIG_MODULES
-	offset = (long)pc - (long)addr;
-
-	if (offset < -SZ_128M || offset >= SZ_128M) {
-		int ret;
-		struct module *mod;
-
-		ret = __get_mod(&mod, pc);
-		if (ret)
-			return ret;
-
-		addr = get_plt_addr(mod, addr);
-	}
-#endif
+	if (!ftrace_find_callable_addr(rec, NULL, &addr))
+		return -EINVAL;
 
 	old = larch_insn_gen_nop();
 	new = larch_insn_gen_bl(pc, addr);
@@ -182,24 +199,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long ad
 {
 	u32 old, new;
 	unsigned long pc;
-	long offset __maybe_unused;
 
 	pc = rec->ip + LOONGARCH_INSN_SIZE;
 
-#ifdef CONFIG_MODULES
-	offset = (long)pc - (long)addr;
-
-	if (offset < -SZ_128M || offset >= SZ_128M) {
-		int ret;
-		struct module *mod;
-
-		ret = __get_mod(&mod, pc);
-		if (ret)
-			return ret;
-
-		addr = get_plt_addr(mod, addr);
-	}
-#endif
+	if (!ftrace_find_callable_addr(rec, NULL, &addr))
+		return -EINVAL;
 
 	new = larch_insn_gen_nop();
 	old = larch_insn_gen_bl(pc, addr);
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 44ff1ff64260..78f066384657 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -82,6 +82,7 @@ SYM_FUNC_END(except_vec_cex)
 
 	BUILD_HANDLER ade ade badv
 	BUILD_HANDLER ale ale badv
+	BUILD_HANDLER bce bce none
 	BUILD_HANDLER bp bp none
 	BUILD_HANDLER fpe fpe fcsr
 	BUILD_HANDLER fpu fpu none
diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index 0524bf1169b7..883e5066ae44 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -92,7 +92,7 @@ static int __init get_ipi_irq(void)
 	struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
 
 	if (d)
-		return irq_create_mapping(d, EXCCODE_IPI - EXCCODE_INT_START);
+		return irq_create_mapping(d, INT_IPI);
 
 	return -EINVAL;
 }
diff --git a/arch/loongarch/kernel/kfpu.c b/arch/loongarch/kernel/kfpu.c
new file mode 100644
index 000000000000..5c46ae8c6cac
--- /dev/null
+++ b/arch/loongarch/kernel/kfpu.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Loongson Technology Corporation Limited
+ */
+
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <asm/fpu.h>
+#include <asm/smp.h>
+
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+void kernel_fpu_begin(void)
+{
+	preempt_disable();
+
+	WARN_ON(this_cpu_read(in_kernel_fpu));
+
+	this_cpu_write(in_kernel_fpu, true);
+
+	if (!is_fpu_owner())
+		enable_fpu();
+	else
+		_save_fp(&current->thread.fpu);
+
+	write_fcsr(LOONGARCH_FCSR0, 0);
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	WARN_ON(!this_cpu_read(in_kernel_fpu));
+
+	if (!is_fpu_owner())
+		disable_fpu();
+	else
+		_restore_fp(&current->thread.fpu);
+
+	this_cpu_write(in_kernel_fpu, false);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
diff --git a/arch/loongarch/kernel/mcount_dyn.S b/arch/loongarch/kernel/mcount_dyn.S
index bbabf06244c2..c7d961fc72c2 100644
--- a/arch/loongarch/kernel/mcount_dyn.S
+++ b/arch/loongarch/kernel/mcount_dyn.S
@@ -42,7 +42,6 @@
 	.if \allregs
 	PTR_S		tp, sp, PT_R2
 	PTR_S		t0, sp, PT_R12
-	PTR_S		t1, sp, PT_R13
 	PTR_S		t2, sp, PT_R14
 	PTR_S		t3, sp, PT_R15
 	PTR_S		t4, sp, PT_R16
@@ -64,6 +63,8 @@
 	PTR_S		zero, sp, PT_R0
 	.endif
 	PTR_S		ra, sp, PT_ERA /* Save trace function ra at PT_ERA */
+	move		t1, zero
+	PTR_S		t1, sp, PT_R13
 	PTR_ADDI	t8, sp, PT_SIZE
 	PTR_S		t8, sp, PT_R3
 	.endm
@@ -104,8 +105,12 @@ ftrace_common_return:
 	PTR_L		a7, sp, PT_R11
 	PTR_L		fp, sp, PT_R22
 	PTR_L		t0, sp, PT_ERA
+	PTR_L		t1, sp, PT_R13
 	PTR_ADDI	sp, sp, PT_SIZE
+	bnez		t1, .Ldirect
 	jr		t0
+.Ldirect:
+	jr		t1
 SYM_CODE_END(ftrace_common)
 
 SYM_CODE_START(ftrace_caller)
@@ -147,3 +152,9 @@ SYM_CODE_START(return_to_handler)
 	jr		ra
 SYM_CODE_END(return_to_handler)
 #endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+SYM_CODE_START(ftrace_stub_direct_tramp)
+	jr		t0
+SYM_CODE_END(ftrace_stub_direct_tramp)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
diff --git a/arch/loongarch/kernel/perf_event.c b/arch/loongarch/kernel/perf_event.c
index 707bd32e5c4f..ff28f99b47d7 100644
--- a/arch/loongarch/kernel/perf_event.c
+++ b/arch/loongarch/kernel/perf_event.c
@@ -461,7 +461,7 @@ static int get_pmc_irq(void)
 	struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
 
 	if (d)
-		return irq_create_mapping(d, EXCCODE_PMC - EXCCODE_INT_START);
+		return irq_create_mapping(d, INT_PCOV);
 
 	return -EINVAL;
 }
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c
index 4351f69d9950..f377e50f3c66 100644
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -133,7 +133,7 @@ static int get_timer_irq(void)
 	struct irq_domain *d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
 
 	if (d)
-		return irq_create_mapping(d, EXCCODE_TIMER - EXCCODE_INT_START);
+		return irq_create_mapping(d, INT_TI);
 
 	return -EINVAL;
 }
diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c
index de8ebe20b666..8db26e4ca447 100644
--- a/arch/loongarch/kernel/traps.c
+++ b/arch/loongarch/kernel/traps.c
@@ -3,6 +3,7 @@
  * Author: Huacai Chen <chenhuacai@loongson.cn>
  * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
  */
+#include <linux/bitfield.h>
 #include <linux/bitops.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
@@ -35,6 +36,7 @@
 #include <asm/break.h>
 #include <asm/cpu.h>
 #include <asm/fpu.h>
+#include <asm/inst.h>
 #include <asm/loongarch.h>
 #include <asm/mmu_context.h>
 #include <asm/pgtable.h>
@@ -50,6 +52,7 @@
 
 extern asmlinkage void handle_ade(void);
 extern asmlinkage void handle_ale(void);
+extern asmlinkage void handle_bce(void);
 extern asmlinkage void handle_sys(void);
 extern asmlinkage void handle_bp(void);
 extern asmlinkage void handle_ri(void);
@@ -153,53 +156,210 @@ static void show_code(unsigned int *pc, bool user)
 	pr_cont("\n");
 }
 
-static void __show_regs(const struct pt_regs *regs)
+static void print_bool_fragment(const char *key, unsigned long val, bool first)
 {
-	const int field = 2 * sizeof(unsigned long);
-	unsigned int excsubcode;
-	unsigned int exccode;
-	int i;
+	/* e.g. "+PG", "-DA" */
+	pr_cont("%s%c%s", first ? "" : " ", val ? '+' : '-', key);
+}
 
-	show_regs_print_info(KERN_DEFAULT);
+static void print_plv_fragment(const char *key, int val)
+{
+	/* e.g. "PLV0", "PPLV3" */
+	pr_cont("%s%d", key, val);
+}
 
-	/*
-	 * Saved main processor registers
-	 */
-	for (i = 0; i < 32; ) {
-		if ((i % 4) == 0)
-			printk("$%2d   :", i);
-		pr_cont(" %0*lx", field, regs->regs[i]);
+static void print_memory_type_fragment(const char *key, unsigned long val)
+{
+	const char *humanized_type;
 
-		i++;
-		if ((i % 4) == 0)
-			pr_cont("\n");
+	switch (val) {
+	case 0:
+		humanized_type = "SUC";
+		break;
+	case 1:
+		humanized_type = "CC";
+		break;
+	case 2:
+		humanized_type = "WUC";
+		break;
+	default:
+		pr_cont(" %s=Reserved(%lu)", key, val);
+		return;
 	}
 
+	/* e.g. " DATM=WUC" */
+	pr_cont(" %s=%s", key, humanized_type);
+}
+
+static void print_intr_fragment(const char *key, unsigned long val)
+{
+	/* e.g. "LIE=0-1,3,5-7" */
+	pr_cont("%s=%*pbl", key, EXCCODE_INT_NUM, &val);
+}
+
+static void print_crmd(unsigned long x)
+{
+	printk(" CRMD: %08lx (", x);
+	print_plv_fragment("PLV", (int) FIELD_GET(CSR_CRMD_PLV, x));
+	print_bool_fragment("IE", FIELD_GET(CSR_CRMD_IE, x), false);
+	print_bool_fragment("DA", FIELD_GET(CSR_CRMD_DA, x), false);
+	print_bool_fragment("PG", FIELD_GET(CSR_CRMD_PG, x), false);
+	print_memory_type_fragment("DACF", FIELD_GET(CSR_CRMD_DACF, x));
+	print_memory_type_fragment("DACM", FIELD_GET(CSR_CRMD_DACM, x));
+	print_bool_fragment("WE", FIELD_GET(CSR_CRMD_WE, x), false);
+	pr_cont(")\n");
+}
+
+static void print_prmd(unsigned long x)
+{
+	printk(" PRMD: %08lx (", x);
+	print_plv_fragment("PPLV", (int) FIELD_GET(CSR_PRMD_PPLV, x));
+	print_bool_fragment("PIE", FIELD_GET(CSR_PRMD_PIE, x), false);
+	print_bool_fragment("PWE", FIELD_GET(CSR_PRMD_PWE, x), false);
+	pr_cont(")\n");
+}
+
+static void print_euen(unsigned long x)
+{
+	printk(" EUEN: %08lx (", x);
+	print_bool_fragment("FPE", FIELD_GET(CSR_EUEN_FPEN, x), true);
+	print_bool_fragment("SXE", FIELD_GET(CSR_EUEN_LSXEN, x), false);
+	print_bool_fragment("ASXE", FIELD_GET(CSR_EUEN_LASXEN, x), false);
+	print_bool_fragment("BTE", FIELD_GET(CSR_EUEN_LBTEN, x), false);
+	pr_cont(")\n");
+}
+
+static void print_ecfg(unsigned long x)
+{
+	printk(" ECFG: %08lx (", x);
+	print_intr_fragment("LIE", FIELD_GET(CSR_ECFG_IM, x));
+	pr_cont(" VS=%d)\n", (int) FIELD_GET(CSR_ECFG_VS, x));
+}
+
+static const char *humanize_exc_name(unsigned int ecode, unsigned int esubcode)
+{
+	/*
+	 * LoongArch users and developers are probably more familiar with
+	 * those names found in the ISA manual, so we are going to print out
+	 * the latter. This will require some mapping.
+	 */
+	switch (ecode) {
+	case EXCCODE_RSV: return "INT";
+	case EXCCODE_TLBL: return "PIL";
+	case EXCCODE_TLBS: return "PIS";
+	case EXCCODE_TLBI: return "PIF";
+	case EXCCODE_TLBM: return "PME";
+	case EXCCODE_TLBNR: return "PNR";
+	case EXCCODE_TLBNX: return "PNX";
+	case EXCCODE_TLBPE: return "PPI";
+	case EXCCODE_ADE:
+		switch (esubcode) {
+		case EXSUBCODE_ADEF: return "ADEF";
+		case EXSUBCODE_ADEM: return "ADEM";
+		}
+		break;
+	case EXCCODE_ALE: return "ALE";
+	case EXCCODE_BCE: return "BCE";
+	case EXCCODE_SYS: return "SYS";
+	case EXCCODE_BP: return "BRK";
+	case EXCCODE_INE: return "INE";
+	case EXCCODE_IPE: return "IPE";
+	case EXCCODE_FPDIS: return "FPD";
+	case EXCCODE_LSXDIS: return "SXD";
+	case EXCCODE_LASXDIS: return "ASXD";
+	case EXCCODE_FPE:
+		switch (esubcode) {
+		case EXCSUBCODE_FPE: return "FPE";
+		case EXCSUBCODE_VFPE: return "VFPE";
+		}
+		break;
+	case EXCCODE_WATCH:
+		switch (esubcode) {
+		case EXCSUBCODE_WPEF: return "WPEF";
+		case EXCSUBCODE_WPEM: return "WPEM";
+		}
+		break;
+	case EXCCODE_BTDIS: return "BTD";
+	case EXCCODE_BTE: return "BTE";
+	case EXCCODE_GSPR: return "GSPR";
+	case EXCCODE_HVC: return "HVC";
+	case EXCCODE_GCM:
+		switch (esubcode) {
+		case EXCSUBCODE_GCSC: return "GCSC";
+		case EXCSUBCODE_GCHC: return "GCHC";
+		}
+		break;
 	/*
-	 * Saved csr registers
+	 * The manual did not mention the EXCCODE_SE case, but print out it
+	 * nevertheless.
 	 */
-	printk("era   : %0*lx %pS\n", field, regs->csr_era,
-	       (void *) regs->csr_era);
-	printk("ra    : %0*lx %pS\n", field, regs->regs[1],
-	       (void *) regs->regs[1]);
+	case EXCCODE_SE: return "SE";
+	}
 
-	printk("CSR crmd: %08lx	", regs->csr_crmd);
-	printk("CSR prmd: %08lx	", regs->csr_prmd);
-	printk("CSR euen: %08lx	", regs->csr_euen);
-	printk("CSR ecfg: %08lx	", regs->csr_ecfg);
-	printk("CSR estat: %08lx	", regs->csr_estat);
+	return "???";
+}
 
-	pr_cont("\n");
+static void print_estat(unsigned long x)
+{
+	unsigned int ecode = FIELD_GET(CSR_ESTAT_EXC, x);
+	unsigned int esubcode = FIELD_GET(CSR_ESTAT_ESUBCODE, x);
+
+	printk("ESTAT: %08lx [%s] (", x, humanize_exc_name(ecode, esubcode));
+	print_intr_fragment("IS", FIELD_GET(CSR_ESTAT_IS, x));
+	pr_cont(" ECode=%d EsubCode=%d)\n", (int) ecode, (int) esubcode);
+}
+
+static void __show_regs(const struct pt_regs *regs)
+{
+	const int field = 2 * sizeof(unsigned long);
+	unsigned int exccode = FIELD_GET(CSR_ESTAT_EXC, regs->csr_estat);
+
+	show_regs_print_info(KERN_DEFAULT);
+
+	/* Print saved GPRs except $zero (substituting with PC/ERA) */
+#define GPR_FIELD(x) field, regs->regs[x]
+	printk("pc %0*lx ra %0*lx tp %0*lx sp %0*lx\n",
+	       field, regs->csr_era, GPR_FIELD(1), GPR_FIELD(2), GPR_FIELD(3));
+	printk("a0 %0*lx a1 %0*lx a2 %0*lx a3 %0*lx\n",
+	       GPR_FIELD(4), GPR_FIELD(5), GPR_FIELD(6), GPR_FIELD(7));
+	printk("a4 %0*lx a5 %0*lx a6 %0*lx a7 %0*lx\n",
+	       GPR_FIELD(8), GPR_FIELD(9), GPR_FIELD(10), GPR_FIELD(11));
+	printk("t0 %0*lx t1 %0*lx t2 %0*lx t3 %0*lx\n",
+	       GPR_FIELD(12), GPR_FIELD(13), GPR_FIELD(14), GPR_FIELD(15));
+	printk("t4 %0*lx t5 %0*lx t6 %0*lx t7 %0*lx\n",
+	       GPR_FIELD(16), GPR_FIELD(17), GPR_FIELD(18), GPR_FIELD(19));
+	printk("t8 %0*lx u0 %0*lx s9 %0*lx s0 %0*lx\n",
+	       GPR_FIELD(20), GPR_FIELD(21), GPR_FIELD(22), GPR_FIELD(23));
+	printk("s1 %0*lx s2 %0*lx s3 %0*lx s4 %0*lx\n",
+	       GPR_FIELD(24), GPR_FIELD(25), GPR_FIELD(26), GPR_FIELD(27));
+	printk("s5 %0*lx s6 %0*lx s7 %0*lx s8 %0*lx\n",
+	       GPR_FIELD(28), GPR_FIELD(29), GPR_FIELD(30), GPR_FIELD(31));
+
+	/* The slot for $zero is reused as the syscall restart flag */
+	if (regs->regs[0])
+		printk("syscall restart flag: %0*lx\n", GPR_FIELD(0));
+
+	if (user_mode(regs)) {
+		printk("   ra: %0*lx\n", GPR_FIELD(1));
+		printk("  ERA: %0*lx\n", field, regs->csr_era);
+	} else {
+		printk("   ra: %0*lx %pS\n", GPR_FIELD(1), (void *) regs->regs[1]);
+		printk("  ERA: %0*lx %pS\n", field, regs->csr_era, (void *) regs->csr_era);
+	}
+#undef GPR_FIELD
 
-	exccode = ((regs->csr_estat) & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT;
-	excsubcode = ((regs->csr_estat) & CSR_ESTAT_ESUBCODE) >> CSR_ESTAT_ESUBCODE_SHIFT;
-	printk("ExcCode : %x (SubCode %x)\n", exccode, excsubcode);
+	/* Print saved important CSRs */
+	print_crmd(regs->csr_crmd);
+	print_prmd(regs->csr_prmd);
+	print_euen(regs->csr_euen);
+	print_ecfg(regs->csr_ecfg);
+	print_estat(regs->csr_estat);
 
 	if (exccode >= EXCCODE_TLBL && exccode <= EXCCODE_ALE)
-		printk("BadVA : %0*lx\n", field, regs->csr_badvaddr);
+		printk(" BADV: %0*lx\n", field, regs->csr_badvaddr);
 
-	printk("PrId  : %08x (%s)\n", read_cpucfg(LOONGARCH_CPUCFG0),
-	       cpu_family_string());
+	printk(" PRID: %08x (%s, %s)\n", read_cpucfg(LOONGARCH_CPUCFG0),
+	       cpu_family_string(), cpu_full_name_string());
 }
 
 void show_regs(struct pt_regs *regs)
@@ -430,6 +590,95 @@ static void bug_handler(struct pt_regs *regs)
 	}
 }
 
+asmlinkage void noinstr do_bce(struct pt_regs *regs)
+{
+	bool user = user_mode(regs);
+	unsigned long era = exception_era(regs);
+	u64 badv = 0, lower = 0, upper = ULONG_MAX;
+	union loongarch_instruction insn;
+	irqentry_state_t state = irqentry_enter(regs);
+
+	if (regs->csr_prmd & CSR_PRMD_PIE)
+		local_irq_enable();
+
+	current->thread.trap_nr = read_csr_excode();
+
+	die_if_kernel("Bounds check error in kernel code", regs);
+
+	/*
+	 * Pull out the address that failed bounds checking, and the lower /
+	 * upper bound, by minimally looking at the faulting instruction word
+	 * and reading from the correct register.
+	 */
+	if (__get_inst(&insn.word, (u32 *)era, user))
+		goto bad_era;
+
+	switch (insn.reg3_format.opcode) {
+	case asrtle_op:
+		if (insn.reg3_format.rd != 0)
+			break;	/* not asrtle */
+		badv = regs->regs[insn.reg3_format.rj];
+		upper = regs->regs[insn.reg3_format.rk];
+		break;
+
+	case asrtgt_op:
+		if (insn.reg3_format.rd != 0)
+			break;	/* not asrtgt */
+		badv = regs->regs[insn.reg3_format.rj];
+		lower = regs->regs[insn.reg3_format.rk];
+		break;
+
+	case ldleb_op:
+	case ldleh_op:
+	case ldlew_op:
+	case ldled_op:
+	case stleb_op:
+	case stleh_op:
+	case stlew_op:
+	case stled_op:
+	case fldles_op:
+	case fldled_op:
+	case fstles_op:
+	case fstled_op:
+		badv = regs->regs[insn.reg3_format.rj];
+		upper = regs->regs[insn.reg3_format.rk];
+		break;
+
+	case ldgtb_op:
+	case ldgth_op:
+	case ldgtw_op:
+	case ldgtd_op:
+	case stgtb_op:
+	case stgth_op:
+	case stgtw_op:
+	case stgtd_op:
+	case fldgts_op:
+	case fldgtd_op:
+	case fstgts_op:
+	case fstgtd_op:
+		badv = regs->regs[insn.reg3_format.rj];
+		lower = regs->regs[insn.reg3_format.rk];
+		break;
+	}
+
+	force_sig_bnderr((void __user *)badv, (void __user *)lower, (void __user *)upper);
+
+out:
+	if (regs->csr_prmd & CSR_PRMD_PIE)
+		local_irq_disable();
+
+	irqentry_exit(regs, state);
+	return;
+
+bad_era:
+	/*
+	 * Cannot pull out the instruction word, hence cannot provide more
+	 * info than a regular SIGSEGV in this case.
+	 */
+	force_sig(SIGSEGV);
+	goto out;
+}
+
 asmlinkage void noinstr do_bp(struct pt_regs *regs)
 {
 	bool user = user_mode(regs);
@@ -792,11 +1041,12 @@ void __init trap_init(void)
 	long i;
 
 	/* Set interrupt vector handler */
-	for (i = EXCCODE_INT_START; i < EXCCODE_INT_END; i++)
+	for (i = EXCCODE_INT_START; i <= EXCCODE_INT_END; i++)
 		set_handler(i * VECSIZE, handle_vint, VECSIZE);
 
 	set_handler(EXCCODE_ADE * VECSIZE, handle_ade, VECSIZE);
 	set_handler(EXCCODE_ALE * VECSIZE, handle_ale, VECSIZE);
+	set_handler(EXCCODE_BCE * VECSIZE, handle_bce, VECSIZE);
 	set_handler(EXCCODE_SYS * VECSIZE, handle_sys, VECSIZE);
 	set_handler(EXCCODE_BP * VECSIZE, handle_bp, VECSIZE);
 	set_handler(EXCCODE_INE * VECSIZE, handle_ri, VECSIZE);
diff --git a/arch/loongarch/lib/Makefile b/arch/loongarch/lib/Makefile
index 40bde632900f..d60d4e096cfa 100644
--- a/arch/loongarch/lib/Makefile
+++ b/arch/loongarch/lib/Makefile
@@ -4,4 +4,6 @@
 #
 
 lib-y	+= delay.o memset.o memcpy.o memmove.o \
-	   clear_user.o copy_user.o dump_tlb.o unaligned.o
+	   clear_user.o copy_user.o csum.o dump_tlb.o unaligned.o
+
+obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 2dc48e61a2c8..fd1d62b244f2 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -13,7 +13,14 @@
 
 .irp to, 0, 1, 2, 3, 4, 5, 6, 7
 .L_fixup_handle_\to\():
-	addi.d	a0, a1, (\to) * (-8)
+	sub.d	a0, a2, a0
+	addi.d	a0, a0, (\to) * (-8)
+	jr	ra
+.endr
+
+.irp to, 0, 2, 4
+.L_fixup_handle_s\to\():
+	addi.d	a0, a1, -\to
 	jr	ra
 .endr
 
@@ -44,7 +51,7 @@ SYM_FUNC_START(__clear_user_generic)
 2:	move	a0, a1
 	jr	ra
 
-	_asm_extable 1b, .L_fixup_handle_0
+	_asm_extable 1b, .L_fixup_handle_s0
 SYM_FUNC_END(__clear_user_generic)
 
 /*
@@ -54,12 +61,21 @@ SYM_FUNC_END(__clear_user_generic)
  * a1: size
  */
 SYM_FUNC_START(__clear_user_fast)
-	beqz	a1, 10f
+	sltui	t0, a1, 9
+	bnez	t0, .Lsmall
 
-	ori	a2, zero, 64
-	blt	a1, a2, 9f
+	add.d	a2, a0, a1
+0:	st.d	zero, a0, 0
+
+	/* align up address */
+	addi.d	a0, a0, 8
+	bstrins.d	a0, zero, 2, 0
+
+	addi.d	a3, a2, -64
+	bgeu	a0, a3, .Llt64
 
 	/* set 64 bytes at a time */
+.Lloop64:
 1:	st.d	zero, a0, 0
 2:	st.d	zero, a0, 8
 3:	st.d	zero, a0, 16
@@ -68,24 +84,95 @@ SYM_FUNC_START(__clear_user_fast)
 6:	st.d	zero, a0, 40
 7:	st.d	zero, a0, 48
 8:	st.d	zero, a0, 56
-
 	addi.d	a0, a0, 64
-	addi.d	a1, a1, -64
-	bge	a1, a2, 1b
-
-	beqz	a1, 10f
+	bltu	a0, a3, .Lloop64
 
 	/* set the remaining bytes */
-9:	st.b	zero, a0, 0
-	addi.d	a0, a0, 1
-	addi.d	a1, a1, -1
-	bgt	a1, zero, 9b
+.Llt64:
+	addi.d	a3, a2, -32
+	bgeu	a0, a3, .Llt32
+9:	st.d	zero, a0, 0
+10:	st.d	zero, a0, 8
+11:	st.d	zero, a0, 16
+12:	st.d	zero, a0, 24
+	addi.d	a0, a0, 32
+
+.Llt32:
+	addi.d	a3, a2, -16
+	bgeu	a0, a3, .Llt16
+13:	st.d	zero, a0, 0
+14:	st.d	zero, a0, 8
+	addi.d	a0, a0, 16
+
+.Llt16:
+	addi.d	a3, a2, -8
+	bgeu	a0, a3, .Llt8
+15:	st.d	zero, a0, 0
+
+.Llt8:
+16:	st.d	zero, a2, -8
 
 	/* return */
-10:	move	a0, a1
+	move	a0, zero
+	jr	ra
+
+	.align	4
+.Lsmall:
+	pcaddi	t0, 4
+	slli.d	a2, a1, 4
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	4
+	move	a0, zero
+	jr	ra
+
+	.align	4
+17:	st.b	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4
+18:	st.h	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4
+19:	st.h	zero, a0, 0
+20:	st.b	zero, a0, 2
+	move	a0, zero
+	jr	ra
+
+	.align	4
+21:	st.w	zero, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	4
+22:	st.w	zero, a0, 0
+23:	st.b	zero, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	4
+24:	st.w	zero, a0, 0
+25:	st.h	zero, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	4
+26:	st.w	zero, a0, 0
+27:	st.w	zero, a0, 3
+	move	a0, zero
+	jr	ra
+
+	.align	4
+28:	st.d	zero, a0, 0
+	move	a0, zero
 	jr	ra
 
 	/* fixup and ex_table */
+	_asm_extable 0b, .L_fixup_handle_0
 	_asm_extable 1b, .L_fixup_handle_0
 	_asm_extable 2b, .L_fixup_handle_1
 	_asm_extable 3b, .L_fixup_handle_2
@@ -95,4 +182,23 @@ SYM_FUNC_START(__clear_user_fast)
 	_asm_extable 7b, .L_fixup_handle_6
 	_asm_extable 8b, .L_fixup_handle_7
 	_asm_extable 9b, .L_fixup_handle_0
+	_asm_extable 10b, .L_fixup_handle_1
+	_asm_extable 11b, .L_fixup_handle_2
+	_asm_extable 12b, .L_fixup_handle_3
+	_asm_extable 13b, .L_fixup_handle_0
+	_asm_extable 14b, .L_fixup_handle_1
+	_asm_extable 15b, .L_fixup_handle_0
+	_asm_extable 16b, .L_fixup_handle_1
+	_asm_extable 17b, .L_fixup_handle_s0
+	_asm_extable 18b, .L_fixup_handle_s0
+	_asm_extable 19b, .L_fixup_handle_s0
+	_asm_extable 20b, .L_fixup_handle_s2
+	_asm_extable 21b, .L_fixup_handle_s0
+	_asm_extable 22b, .L_fixup_handle_s0
+	_asm_extable 23b, .L_fixup_handle_s4
+	_asm_extable 24b, .L_fixup_handle_s0
+	_asm_extable 25b, .L_fixup_handle_s4
+	_asm_extable 26b, .L_fixup_handle_s0
+	_asm_extable 27b, .L_fixup_handle_s4
+	_asm_extable 28b, .L_fixup_handle_s0
 SYM_FUNC_END(__clear_user_fast)
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index 55ac6020a1ad..b21f6d5d38f5 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -13,7 +13,14 @@
 
 .irp to, 0, 1, 2, 3, 4, 5, 6, 7
 .L_fixup_handle_\to\():
-	addi.d	a0, a2, (\to) * (-8)
+	sub.d	a0, a2, a0
+	addi.d	a0, a0, (\to) * (-8)
+	jr	ra
+.endr
+
+.irp to, 0, 2, 4
+.L_fixup_handle_s\to\():
+	addi.d	a0, a2, -\to
 	jr	ra
 .endr
 
@@ -47,8 +54,8 @@ SYM_FUNC_START(__copy_user_generic)
 3:	move	a0, a2
 	jr	ra
 
-	_asm_extable 1b, .L_fixup_handle_0
-	_asm_extable 2b, .L_fixup_handle_0
+	_asm_extable 1b, .L_fixup_handle_s0
+	_asm_extable 2b, .L_fixup_handle_s0
 SYM_FUNC_END(__copy_user_generic)
 
 /*
@@ -59,65 +66,209 @@ SYM_FUNC_END(__copy_user_generic)
  * a2: n
  */
 SYM_FUNC_START(__copy_user_fast)
-	beqz	a2, 19f
+	sltui	t0, a2, 9
+	bnez	t0, .Lsmall
 
-	ori	a3, zero, 64
-	blt	a2, a3, 17f
+	add.d	a3, a1, a2
+	add.d	a2, a0, a2
+0:	ld.d	t0, a1, 0
+1:	st.d	t0, a0, 0
 
-	/* copy 64 bytes at a time */
-1:	ld.d	t0, a1, 0
-2:	ld.d	t1, a1, 8
-3:	ld.d	t2, a1, 16
-4:	ld.d	t3, a1, 24
-5:	ld.d	t4, a1, 32
-6:	ld.d	t5, a1, 40
-7:	ld.d	t6, a1, 48
-8:	ld.d	t7, a1, 56
-9:	st.d	t0, a0, 0
-10:	st.d	t1, a0, 8
-11:	st.d	t2, a0, 16
-12:	st.d	t3, a0, 24
-13:	st.d	t4, a0, 32
-14:	st.d	t5, a0, 40
-15:	st.d	t6, a0, 48
-16:	st.d	t7, a0, 56
+	/* align up destination address */
+	andi	t1, a0, 7
+	sub.d	t0, zero, t1
+	addi.d	t0, t0, 8
+	add.d	a1, a1, t0
+	add.d	a0, a0, t0
 
-	addi.d	a0, a0, 64
-	addi.d	a1, a1, 64
-	addi.d	a2, a2, -64
-	bge	a2, a3, 1b
+	addi.d	a4, a3, -64
+	bgeu	a1, a4, .Llt64
 
-	beqz	a2, 19f
+	/* copy 64 bytes at a time */
+.Lloop64:
+2:	ld.d	t0, a1, 0
+3:	ld.d	t1, a1, 8
+4:	ld.d	t2, a1, 16
+5:	ld.d	t3, a1, 24
+6:	ld.d	t4, a1, 32
+7:	ld.d	t5, a1, 40
+8:	ld.d	t6, a1, 48
+9:	ld.d	t7, a1, 56
+	addi.d	a1, a1, 64
+10:	st.d	t0, a0, 0
+11:	st.d	t1, a0, 8
+12:	st.d	t2, a0, 16
+13:	st.d	t3, a0, 24
+14:	st.d	t4, a0, 32
+15:	st.d	t5, a0, 40
+16:	st.d	t6, a0, 48
+17:	st.d	t7, a0, 56
+	addi.d	a0, a0, 64
+	bltu	a1, a4, .Lloop64
 
 	/* copy the remaining bytes */
-17:	ld.b	t0, a1, 0
-18:	st.b	t0, a0, 0
-	addi.d	a0, a0, 1
-	addi.d	a1, a1, 1
-	addi.d	a2, a2, -1
-	bgt	a2, zero, 17b
+.Llt64:
+	addi.d	a4, a3, -32
+	bgeu	a1, a4, .Llt32
+18:	ld.d	t0, a1, 0
+19:	ld.d	t1, a1, 8
+20:	ld.d	t2, a1, 16
+21:	ld.d	t3, a1, 24
+	addi.d	a1, a1, 32
+22:	st.d	t0, a0, 0
+23:	st.d	t1, a0, 8
+24:	st.d	t2, a0, 16
+25:	st.d	t3, a0, 24
+	addi.d	a0, a0, 32
+
+.Llt32:
+	addi.d	a4, a3, -16
+	bgeu	a1, a4, .Llt16
+26:	ld.d	t0, a1, 0
+27:	ld.d	t1, a1, 8
+	addi.d	a1, a1, 16
+28:	st.d	t0, a0, 0
+29:	st.d	t1, a0, 8
+	addi.d	a0, a0, 16
+
+.Llt16:
+	addi.d	a4, a3, -8
+	bgeu	a1, a4, .Llt8
+30:	ld.d	t0, a1, 0
+31:	st.d	t0, a0, 0
+
+.Llt8:
+32:	ld.d	t0, a3, -8
+33:	st.d	t0, a2, -8
 
 	/* return */
-19:	move	a0, a2
+	move	a0, zero
+	jr	ra
+
+	.align	5
+.Lsmall:
+	pcaddi	t0, 8
+	slli.d	a3, a2, 5
+	add.d	t0, t0, a3
+	jr	t0
+
+	.align	5
+	move	a0, zero
+	jr	ra
+
+	.align	5
+34:	ld.b	t0, a1, 0
+35:	st.b	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5
+36:	ld.h	t0, a1, 0
+37:	st.h	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5
+38:	ld.h	t0, a1, 0
+39:	ld.b	t1, a1, 2
+40:	st.h	t0, a0, 0
+41:	st.b	t1, a0, 2
+	move	a0, zero
+	jr	ra
+
+	.align	5
+42:	ld.w	t0, a1, 0
+43:	st.w	t0, a0, 0
+	move	a0, zero
+	jr	ra
+
+	.align	5
+44:	ld.w	t0, a1, 0
+45:	ld.b	t1, a1, 4
+46:	st.w	t0, a0, 0
+47:	st.b	t1, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	5
+48:	ld.w	t0, a1, 0
+49:	ld.h	t1, a1, 4
+50:	st.w	t0, a0, 0
+51:	st.h	t1, a0, 4
+	move	a0, zero
+	jr	ra
+
+	.align	5
+52:	ld.w	t0, a1, 0
+53:	ld.w	t1, a1, 3
+54:	st.w	t0, a0, 0
+55:	st.w	t1, a0, 3
+	move	a0, zero
+	jr	ra
+
+	.align	5
+56:	ld.d	t0, a1, 0
+57:	st.d	t0, a0, 0
+	move	a0, zero
 	jr	ra
 
 	/* fixup and ex_table */
+	_asm_extable 0b, .L_fixup_handle_0
 	_asm_extable 1b, .L_fixup_handle_0
-	_asm_extable 2b, .L_fixup_handle_1
-	_asm_extable 3b, .L_fixup_handle_2
-	_asm_extable 4b, .L_fixup_handle_3
-	_asm_extable 5b, .L_fixup_handle_4
-	_asm_extable 6b, .L_fixup_handle_5
-	_asm_extable 7b, .L_fixup_handle_6
-	_asm_extable 8b, .L_fixup_handle_7
+	_asm_extable 2b, .L_fixup_handle_0
+	_asm_extable 3b, .L_fixup_handle_0
+	_asm_extable 4b, .L_fixup_handle_0
+	_asm_extable 5b, .L_fixup_handle_0
+	_asm_extable 6b, .L_fixup_handle_0
+	_asm_extable 7b, .L_fixup_handle_0
+	_asm_extable 8b, .L_fixup_handle_0
 	_asm_extable 9b, .L_fixup_handle_0
-	_asm_extable 10b, .L_fixup_handle_1
-	_asm_extable 11b, .L_fixup_handle_2
-	_asm_extable 12b, .L_fixup_handle_3
-	_asm_extable 13b, .L_fixup_handle_4
-	_asm_extable 14b, .L_fixup_handle_5
-	_asm_extable 15b, .L_fixup_handle_6
-	_asm_extable 16b, .L_fixup_handle_7
-	_asm_extable 17b, .L_fixup_handle_0
+	_asm_extable 10b, .L_fixup_handle_0
+	_asm_extable 11b, .L_fixup_handle_1
+	_asm_extable 12b, .L_fixup_handle_2
+	_asm_extable 13b, .L_fixup_handle_3
+	_asm_extable 14b, .L_fixup_handle_4
+	_asm_extable 15b, .L_fixup_handle_5
+	_asm_extable 16b, .L_fixup_handle_6
+	_asm_extable 17b, .L_fixup_handle_7
 	_asm_extable 18b, .L_fixup_handle_0
+	_asm_extable 19b, .L_fixup_handle_0
+	_asm_extable 20b, .L_fixup_handle_0
+	_asm_extable 21b, .L_fixup_handle_0
+	_asm_extable 22b, .L_fixup_handle_0
+	_asm_extable 23b, .L_fixup_handle_1
+	_asm_extable 24b, .L_fixup_handle_2
+	_asm_extable 25b, .L_fixup_handle_3
+	_asm_extable 26b, .L_fixup_handle_0
+	_asm_extable 27b, .L_fixup_handle_0
+	_asm_extable 28b, .L_fixup_handle_0
+	_asm_extable 29b, .L_fixup_handle_1
+	_asm_extable 30b, .L_fixup_handle_0
+	_asm_extable 31b, .L_fixup_handle_0
+	_asm_extable 32b, .L_fixup_handle_0
+	_asm_extable 33b, .L_fixup_handle_1
+	_asm_extable 34b, .L_fixup_handle_s0
+	_asm_extable 35b, .L_fixup_handle_s0
+	_asm_extable 36b, .L_fixup_handle_s0
+	_asm_extable 37b, .L_fixup_handle_s0
+	_asm_extable 38b, .L_fixup_handle_s0
+	_asm_extable 39b, .L_fixup_handle_s0
+	_asm_extable 40b, .L_fixup_handle_s0
+	_asm_extable 41b, .L_fixup_handle_s2
+	_asm_extable 42b, .L_fixup_handle_s0
+	_asm_extable 43b, .L_fixup_handle_s0
+	_asm_extable 44b, .L_fixup_handle_s0
+	_asm_extable 45b, .L_fixup_handle_s0
+	_asm_extable 46b, .L_fixup_handle_s0
+	_asm_extable 47b, .L_fixup_handle_s4
+	_asm_extable 48b, .L_fixup_handle_s0
+	_asm_extable 49b, .L_fixup_handle_s0
+	_asm_extable 50b, .L_fixup_handle_s0
+	_asm_extable 51b, .L_fixup_handle_s4
+	_asm_extable 52b, .L_fixup_handle_s0
+	_asm_extable 53b, .L_fixup_handle_s0
+	_asm_extable 54b, .L_fixup_handle_s0
+	_asm_extable 55b, .L_fixup_handle_s4
+	_asm_extable 56b, .L_fixup_handle_s0
+	_asm_extable 57b, .L_fixup_handle_s0
 SYM_FUNC_END(__copy_user_fast)
diff --git a/arch/loongarch/lib/csum.c b/arch/loongarch/lib/csum.c
new file mode 100644
index 000000000000..a5e84b403c3b
--- /dev/null
+++ b/arch/loongarch/lib/csum.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2019-2020 Arm Ltd.
+
+#include <linux/compiler.h>
+#include <linux/kasan-checks.h>
+#include <linux/kernel.h>
+
+#include <net/checksum.h>
+
+static u64 accumulate(u64 sum, u64 data)
+{
+	sum += data;
+	if (sum < data)
+		sum += 1;
+	return sum;
+}
+
+/*
+ * We over-read the buffer and this makes KASAN unhappy. Instead, disable
+ * instrumentation and call kasan explicitly.
+ */
+unsigned int __no_sanitize_address do_csum(const unsigned char *buff, int len)
+{
+	unsigned int offset, shift, sum;
+	const u64 *ptr;
+	u64 data, sum64 = 0;
+
+	if (unlikely(len == 0))
+		return 0;
+
+	offset = (unsigned long)buff & 7;
+	/*
+	 * This is to all intents and purposes safe, since rounding down cannot
+	 * result in a different page or cache line being accessed, and @buff
+	 * should absolutely not be pointing to anything read-sensitive. We do,
+	 * however, have to be careful not to piss off KASAN, which means using
+	 * unchecked reads to accommodate the head and tail, for which we'll
+	 * compensate with an explicit check up-front.
+	 */
+	kasan_check_read(buff, len);
+	ptr = (u64 *)(buff - offset);
+	len = len + offset - 8;
+
+	/*
+	 * Head: zero out any excess leading bytes. Shifting back by the same
+	 * amount should be at least as fast as any other way of handling the
+	 * odd/even alignment, and means we can ignore it until the very end.
+	 */
+	shift = offset * 8;
+	data = *ptr++;
+	data = (data >> shift) << shift;
+
+	/*
+	 * Body: straightforward aligned loads from here on (the paired loads
+	 * underlying the quadword type still only need dword alignment). The
+	 * main loop strictly excludes the tail, so the second loop will always
+	 * run at least once.
+	 */
+	while (unlikely(len > 64)) {
+		__uint128_t tmp1, tmp2, tmp3, tmp4;
+
+		tmp1 = *(__uint128_t *)ptr;
+		tmp2 = *(__uint128_t *)(ptr + 2);
+		tmp3 = *(__uint128_t *)(ptr + 4);
+		tmp4 = *(__uint128_t *)(ptr + 6);
+
+		len -= 64;
+		ptr += 8;
+
+		/* This is the "don't dump the carry flag into a GPR" idiom */
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
+		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		tmp1 = ((tmp1 >> 64) << 64) | sum64;
+		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
+		sum64 = tmp1 >> 64;
+	}
+	while (len > 8) {
+		__uint128_t tmp;
+
+		sum64 = accumulate(sum64, data);
+		tmp = *(__uint128_t *)ptr;
+
+		len -= 16;
+		ptr += 2;
+
+		data = tmp >> 64;
+		sum64 = accumulate(sum64, tmp);
+	}
+	if (len > 0) {
+		sum64 = accumulate(sum64, data);
+		data = *ptr;
+		len -= 8;
+	}
+	/*
+	 * Tail: zero any over-read bytes similarly to the head, again
+	 * preserving odd/even alignment.
+	 */
+	shift = len * -8;
+	data = (data << shift) >> shift;
+	sum64 = accumulate(sum64, data);
+
+	/* Finally, folding */
+	sum64 += (sum64 >> 32) | (sum64 << 32);
+	sum = sum64 >> 32;
+	sum += (sum >> 16) | (sum << 16);
+	if (offset & 1)
+		return (u16)swab32(sum);
+
+	return sum >> 16;
+}
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+			const struct in6_addr *daddr,
+			__u32 len, __u8 proto, __wsum csum)
+{
+	__uint128_t src, dst;
+	u64 sum = (__force u64)csum;
+
+	src = *(const __uint128_t *)saddr->s6_addr;
+	dst = *(const __uint128_t *)daddr->s6_addr;
+
+	sum += (__force u32)htonl(len);
+	sum += (u32)proto << 24;
+	src += (src >> 64) | (src << 64);
+	dst += (dst >> 64) | (dst << 64);
+
+	sum = accumulate(sum, src >> 64);
+	sum = accumulate(sum, dst >> 64);
+
+	sum += ((sum >> 32) | (sum << 32));
+	return csum_fold((__force __wsum)(sum >> 32));
+}
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/loongarch/lib/error-inject.c b/arch/loongarch/lib/error-inject.c
new file mode 100644
index 000000000000..afc9e1c7c973
--- /dev/null
+++ b/arch/loongarch/lib/error-inject.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+	instruction_pointer_set(regs, regs->regs[1]);
+}
+NOKPROBE_SYMBOL(override_function_with_return);
diff --git a/arch/loongarch/lib/memcpy.S b/arch/loongarch/lib/memcpy.S
index 3b7e1dec7109..39ce6621c704 100644
--- a/arch/loongarch/lib/memcpy.S
+++ b/arch/loongarch/lib/memcpy.S
@@ -44,6 +44,66 @@ SYM_FUNC_START(__memcpy_generic)
 SYM_FUNC_END(__memcpy_generic)
 _ASM_NOKPROBE(__memcpy_generic)
 
+	.align	5
+SYM_FUNC_START_NOALIGN(__memcpy_small)
+	pcaddi	t0, 8
+	slli.d	a2, a2, 5
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	5
+0:	jr	ra
+
+	.align	5
+1:	ld.b	t0, a1, 0
+	st.b	t0, a0, 0
+	jr	ra
+
+	.align	5
+2:	ld.h	t0, a1, 0
+	st.h	t0, a0, 0
+	jr	ra
+
+	.align	5
+3:	ld.h	t0, a1, 0
+	ld.b	t1, a1, 2
+	st.h	t0, a0, 0
+	st.b	t1, a0, 2
+	jr	ra
+
+	.align	5
+4:	ld.w	t0, a1, 0
+	st.w	t0, a0, 0
+	jr	ra
+
+	.align	5
+5:	ld.w	t0, a1, 0
+	ld.b	t1, a1, 4
+	st.w	t0, a0, 0
+	st.b	t1, a0, 4
+	jr	ra
+
+	.align	5
+6:	ld.w	t0, a1, 0
+	ld.h	t1, a1, 4
+	st.w	t0, a0, 0
+	st.h	t1, a0, 4
+	jr	ra
+
+	.align	5
+7:	ld.w	t0, a1, 0
+	ld.w	t1, a1, 3
+	st.w	t0, a0, 0
+	st.w	t1, a0, 3
+	jr	ra
+
+	.align	5
+8:	ld.d	t0, a1, 0
+	st.d	t0, a0, 0
+	jr	ra
+SYM_FUNC_END(__memcpy_small)
+_ASM_NOKPROBE(__memcpy_small)
+
 /*
  * void *__memcpy_fast(void *dst, const void *src, size_t n)
  *
@@ -52,14 +112,27 @@ _ASM_NOKPROBE(__memcpy_generic)
  * a2: n
  */
 SYM_FUNC_START(__memcpy_fast)
-	move	a3, a0
-	beqz	a2, 3f
+	sltui	t0, a2, 9
+	bnez	t0, __memcpy_small
+
+	add.d	a3, a1, a2
+	add.d	a2, a0, a2
+	ld.d	a6, a1, 0
+	ld.d	a7, a3, -8
+
+	/* align up destination address */
+	andi	t1, a0, 7
+	sub.d	t0, zero, t1
+	addi.d	t0, t0, 8
+	add.d	a1, a1, t0
+	add.d	a5, a0, t0
 
-	ori	a4, zero, 64
-	blt	a2, a4, 2f
+	addi.d	a4, a3, -64
+	bgeu	a1, a4, .Llt64
 
 	/* copy 64 bytes at a time */
-1:	ld.d	t0, a1, 0
+.Lloop64:
+	ld.d	t0, a1, 0
 	ld.d	t1, a1, 8
 	ld.d	t2, a1, 16
 	ld.d	t3, a1, 24
@@ -67,32 +140,54 @@ SYM_FUNC_START(__memcpy_fast)
 	ld.d	t5, a1, 40
 	ld.d	t6, a1, 48
 	ld.d	t7, a1, 56
-	st.d	t0, a0, 0
-	st.d	t1, a0, 8
-	st.d	t2, a0, 16
-	st.d	t3, a0, 24
-	st.d	t4, a0, 32
-	st.d	t5, a0, 40
-	st.d	t6, a0, 48
-	st.d	t7, a0, 56
-
-	addi.d	a0, a0, 64
 	addi.d	a1, a1, 64
-	addi.d	a2, a2, -64
-	bge	a2, a4, 1b
-
-	beqz	a2, 3f
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	st.d	t2, a5, 16
+	st.d	t3, a5, 24
+	st.d	t4, a5, 32
+	st.d	t5, a5, 40
+	st.d	t6, a5, 48
+	st.d	t7, a5, 56
+	addi.d	a5, a5, 64
+	bltu	a1, a4, .Lloop64
 
 	/* copy the remaining bytes */
-2:	ld.b	t0, a1, 0
-	st.b	t0, a0, 0
-	addi.d	a0, a0, 1
-	addi.d	a1, a1, 1
-	addi.d	a2, a2, -1
-	bgt	a2, zero, 2b
+.Llt64:
+	addi.d	a4, a3, -32
+	bgeu	a1, a4, .Llt32
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a1, 16
+	ld.d	t3, a1, 24
+	addi.d	a1, a1, 32
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	st.d	t2, a5, 16
+	st.d	t3, a5, 24
+	addi.d	a5, a5, 32
+
+.Llt32:
+	addi.d	a4, a3, -16
+	bgeu	a1, a4, .Llt16
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	addi.d	a1, a1, 16
+	st.d	t0, a5, 0
+	st.d	t1, a5, 8
+	addi.d	a5, a5, 16
+
+.Llt16:
+	addi.d	a4, a3, -8
+	bgeu	a1, a4, .Llt8
+	ld.d	t0, a1, 0
+	st.d	t0, a5, 0
+
+.Llt8:
+	st.d	a6, a0, 0
+	st.d	a7, a2, -8
 
 	/* return */
-3:	move	a0, a3
 	jr	ra
 SYM_FUNC_END(__memcpy_fast)
 _ASM_NOKPROBE(__memcpy_fast)
diff --git a/arch/loongarch/lib/memmove.S b/arch/loongarch/lib/memmove.S
index b796c3d6da05..45b725ba7867 100644
--- a/arch/loongarch/lib/memmove.S
+++ b/arch/loongarch/lib/memmove.S
@@ -11,23 +11,9 @@
 #include <asm/regdef.h>
 
 SYM_FUNC_START(memmove)
-	blt	a0, a1, 1f	/* dst < src, memcpy */
-	blt	a1, a0, 3f	/* src < dst, rmemcpy */
+	blt	a0, a1, memcpy	/* dst < src, memcpy */
+	blt	a1, a0, rmemcpy	/* src < dst, rmemcpy */
 	jr	ra		/* dst == src, return */
-
-	/* if (src - dst) < 64, copy 1 byte at a time */
-1:	ori	a3, zero, 64
-	sub.d	t0, a1, a0
-	blt	t0, a3, 2f
-	b	memcpy
-2:	b	__memcpy_generic
-
-	/* if (dst - src) < 64, copy 1 byte at a time */
-3:	ori	a3, zero, 64
-	sub.d	t0, a0, a1
-	blt	t0, a3, 4f
-	b	rmemcpy
-4:	b	__rmemcpy_generic
 SYM_FUNC_END(memmove)
 _ASM_NOKPROBE(memmove)
 
@@ -76,50 +62,80 @@ _ASM_NOKPROBE(__rmemcpy_generic)
  * a2: n
  */
 SYM_FUNC_START(__rmemcpy_fast)
-	move	a3, a0
-	beqz	a2, 3f
+	sltui	t0, a2, 9
+	bnez	t0, __memcpy_small
 
-	add.d	a0, a0, a2
-	add.d	a1, a1, a2
+	add.d	a3, a1, a2
+	add.d	a2, a0, a2
+	ld.d	a6, a1, 0
+	ld.d	a7, a3, -8
+
+	/* align up destination address */
+	andi	t1, a2, 7
+	sub.d	a3, a3, t1
+	sub.d	a5, a2, t1
 
-	ori	a4, zero, 64
-	blt	a2, a4, 2f
+	addi.d	a4, a1, 64
+	bgeu	a4, a3, .Llt64
 
 	/* copy 64 bytes at a time */
-1:	ld.d	t0, a1, -8
-	ld.d	t1, a1, -16
-	ld.d	t2, a1, -24
-	ld.d	t3, a1, -32
-	ld.d	t4, a1, -40
-	ld.d	t5, a1, -48
-	ld.d	t6, a1, -56
-	ld.d	t7, a1, -64
-	st.d	t0, a0, -8
-	st.d	t1, a0, -16
-	st.d	t2, a0, -24
-	st.d	t3, a0, -32
-	st.d	t4, a0, -40
-	st.d	t5, a0, -48
-	st.d	t6, a0, -56
-	st.d	t7, a0, -64
-
-	addi.d	a0, a0, -64
-	addi.d	a1, a1, -64
-	addi.d	a2, a2, -64
-	bge	a2, a4, 1b
-
-	beqz	a2, 3f
+.Lloop64:
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	ld.d	t2, a3, -24
+	ld.d	t3, a3, -32
+	ld.d	t4, a3, -40
+	ld.d	t5, a3, -48
+	ld.d	t6, a3, -56
+	ld.d	t7, a3, -64
+	addi.d	a3, a3, -64
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	st.d	t2, a5, -24
+	st.d	t3, a5, -32
+	st.d	t4, a5, -40
+	st.d	t5, a5, -48
+	st.d	t6, a5, -56
+	st.d	t7, a5, -64
+	addi.d	a5, a5, -64
+	bltu	a4, a3, .Lloop64
 
 	/* copy the remaining bytes */
-2:	ld.b	t0, a1, -1
-	st.b	t0, a0, -1
-	addi.d	a0, a0, -1
-	addi.d	a1, a1, -1
-	addi.d	a2, a2, -1
-	bgt	a2, zero, 2b
+.Llt64:
+	addi.d	a4, a1, 32
+	bgeu	a4, a3, .Llt32
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	ld.d	t2, a3, -24
+	ld.d	t3, a3, -32
+	addi.d	a3, a3, -32
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	st.d	t2, a5, -24
+	st.d	t3, a5, -32
+	addi.d	a5, a5, -32
+
+.Llt32:
+	addi.d	a4, a1, 16
+	bgeu	a4, a3, .Llt16
+	ld.d	t0, a3, -8
+	ld.d	t1, a3, -16
+	addi.d	a3, a3, -16
+	st.d	t0, a5, -8
+	st.d	t1, a5, -16
+	addi.d	a5, a5, -16
+
+.Llt16:
+	addi.d	a4, a1, 8
+	bgeu	a4, a3, .Llt8
+	ld.d	t0, a3, -8
+	st.d	t0, a5, -8
+
+.Llt8:
+	st.d	a6, a0, 0
+	st.d	a7, a2, -8
 
 	/* return */
-3:	move	a0, a3
 	jr	ra
 SYM_FUNC_END(__rmemcpy_fast)
 _ASM_NOKPROBE(__rmemcpy_fast)
diff --git a/arch/loongarch/lib/memset.S b/arch/loongarch/lib/memset.S
index a9eb732ab2ad..b39c6194e3ae 100644
--- a/arch/loongarch/lib/memset.S
+++ b/arch/loongarch/lib/memset.S
@@ -56,39 +56,107 @@ _ASM_NOKPROBE(__memset_generic)
  * a2: n
  */
 SYM_FUNC_START(__memset_fast)
-	move	a3, a0
-	beqz	a2, 3f
-
-	ori	a4, zero, 64
-	blt	a2, a4, 2f
-
 	/* fill a1 to 64 bits */
 	fill_to_64 a1
 
-	/* set 64 bytes at a time */
-1:	st.d	a1, a0, 0
-	st.d	a1, a0, 8
-	st.d	a1, a0, 16
-	st.d	a1, a0, 24
-	st.d	a1, a0, 32
-	st.d	a1, a0, 40
-	st.d	a1, a0, 48
-	st.d	a1, a0, 56
+	sltui	t0, a2, 9
+	bnez	t0, .Lsmall
 
-	addi.d	a0, a0, 64
-	addi.d	a2, a2, -64
-	bge	a2, a4, 1b
+	add.d	a2, a0, a2
+	st.d	a1, a0, 0
 
-	beqz	a2, 3f
+	/* align up address */
+	addi.d	a3, a0, 8
+	bstrins.d	a3, zero, 2, 0
+
+	addi.d	a4, a2, -64
+	bgeu	a3, a4, .Llt64
+
+	/* set 64 bytes at a time */
+.Lloop64:
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	st.d	a1, a3, 16
+	st.d	a1, a3, 24
+	st.d	a1, a3, 32
+	st.d	a1, a3, 40
+	st.d	a1, a3, 48
+	st.d	a1, a3, 56
+	addi.d	a3, a3, 64
+	bltu	a3, a4, .Lloop64
 
 	/* set the remaining bytes */
-2:	st.b	a1, a0, 0
-	addi.d	a0, a0, 1
-	addi.d	a2, a2, -1
-	bgt	a2, zero, 2b
+.Llt64:
+	addi.d	a4, a2, -32
+	bgeu	a3, a4, .Llt32
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	st.d	a1, a3, 16
+	st.d	a1, a3, 24
+	addi.d	a3, a3, 32
+
+.Llt32:
+	addi.d	a4, a2, -16
+	bgeu	a3, a4, .Llt16
+	st.d	a1, a3, 0
+	st.d	a1, a3, 8
+	addi.d	a3, a3, 16
+
+.Llt16:
+	addi.d	a4, a2, -8
+	bgeu	a3, a4, .Llt8
+	st.d	a1, a3, 0
+
+.Llt8:
+	st.d	a1, a2, -8
 
 	/* return */
-3:	move	a0, a3
+	jr	ra
+
+	.align	4
+.Lsmall:
+	pcaddi	t0, 4
+	slli.d	a2, a2, 4
+	add.d	t0, t0, a2
+	jr	t0
+
+	.align	4
+0:	jr	ra
+
+	.align	4
+1:	st.b	a1, a0, 0
+	jr	ra
+
+	.align	4
+2:	st.h	a1, a0, 0
+	jr	ra
+
+	.align	4
+3:	st.h	a1, a0, 0
+	st.b	a1, a0, 2
+	jr	ra
+
+	.align	4
+4:	st.w	a1, a0, 0
+	jr	ra
+
+	.align	4
+5:	st.w	a1, a0, 0
+	st.b	a1, a0, 4
+	jr	ra
+
+	.align	4
+6:	st.w	a1, a0, 0
+	st.h	a1, a0, 4
+	jr	ra
+
+	.align	4
+7:	st.w	a1, a0, 0
+	st.w	a1, a0, 3
+	jr	ra
+
+	.align	4
+8:	st.d	a1, a0, 0
 	jr	ra
 SYM_FUNC_END(__memset_fast)
 _ASM_NOKPROBE(__memset_fast)
author	Linus Torvalds	2023-05-04 12:40:16 -0700
committer	Linus Torvalds	2023-05-04 12:40:16 -0700
commit	611c9d88302cb9ac3b0f58f4a06c0ffb98345bd2 (patch)
tree	3aeefa3eb0499df16ec08965da68d9fa38f1befa /arch
parent	a1f749de8a610096ca0dd9a40d89c8fa4098c8eb (diff)
parent	2fa5ebe3bc4e31e07a99196455498472417842f2 (diff)