From 18269c0fd4db9bec2668f895f21d742486ccb90f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 3 Dec 2008 03:07:00 -0800 Subject: sparc: prepare lib/ for unification Identical named files renamed to _32.S Refactored Makefile to prepare for unification. Linking order was altered slightly - but this is a lib.a file so it should not matter. Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/lib/Makefile | 25 +- arch/sparc/lib/atomic.S | 99 ------ arch/sparc/lib/atomic_32.S | 99 ++++++ arch/sparc/lib/checksum.S | 583 ---------------------------------- arch/sparc/lib/checksum_32.S | 583 ++++++++++++++++++++++++++++++++++ arch/sparc/lib/memcmp.S | 312 ------------------ arch/sparc/lib/memcmp_32.S | 312 ++++++++++++++++++ arch/sparc/lib/memscan.S | 133 -------- arch/sparc/lib/memscan_32.S | 133 ++++++++ arch/sparc/lib/rwsem.S | 204 ------------ arch/sparc/lib/rwsem_32.S | 204 ++++++++++++ arch/sparc/lib/strlen.S | 81 ----- arch/sparc/lib/strlen_32.S | 81 +++++ arch/sparc/lib/strlen_user.S | 109 ------- arch/sparc/lib/strlen_user_32.S | 109 +++++++ arch/sparc/lib/strncmp.S | 118 ------- arch/sparc/lib/strncmp_32.S | 118 +++++++ arch/sparc/lib/strncpy_from_user.S | 47 --- arch/sparc/lib/strncpy_from_user_32.S | 47 +++ 19 files changed, 1703 insertions(+), 1694 deletions(-) delete mode 100644 arch/sparc/lib/atomic.S create mode 100644 arch/sparc/lib/atomic_32.S delete mode 100644 arch/sparc/lib/checksum.S create mode 100644 arch/sparc/lib/checksum_32.S delete mode 100644 arch/sparc/lib/memcmp.S create mode 100644 arch/sparc/lib/memcmp_32.S delete mode 100644 arch/sparc/lib/memscan.S create mode 100644 arch/sparc/lib/memscan_32.S delete mode 100644 arch/sparc/lib/rwsem.S create mode 100644 arch/sparc/lib/rwsem_32.S delete mode 100644 arch/sparc/lib/strlen.S create mode 100644 arch/sparc/lib/strlen_32.S delete mode 100644 arch/sparc/lib/strlen_user.S create mode 100644 arch/sparc/lib/strlen_user_32.S delete mode 100644 arch/sparc/lib/strncmp.S create mode 100644 arch/sparc/lib/strncmp_32.S delete mode 100644 arch/sparc/lib/strncpy_from_user.S create mode 100644 arch/sparc/lib/strncpy_from_user_32.S (limited to 'arch') diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index 6e303e10c3b9..f2650545c774 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile @@ -1,13 +1,22 @@ # Makefile for Sparc library files.. 
# -EXTRA_AFLAGS := -ansi -DST_DIV0=0x02 +asflags-y := -ansi -DST_DIV0=0x02 +ccflags-y := -Werror -lib-y := mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o memcpy.o memset.o \ - strlen.o checksum.o blockops.o memscan.o memcmp.o strncmp.o \ - strncpy_from_user.o divdi3.o udivdi3.o strlen_user.o \ - copy_user.o locks.o atomic.o \ - lshrdi3.o ashldi3.o rwsem.o muldi3.o bitext.o \ - cmpdi2.o +lib-$(CONFIG_SPARC32) += mul.o rem.o sdiv.o udiv.o umul.o urem.o ashrdi3.o +lib-$(CONFIG_SPARC32) += memcpy.o memset.o +lib-y += strlen_$(BITS).o +lib-y += checksum_$(BITS).o +lib-$(CONFIG_SPARC32) += blockops.o +lib-y += memscan_$(BITS).o memcmp_$(BITS).o strncmp_$(BITS).o +lib-y += strncpy_from_user_$(BITS).o strlen_user_$(BITS).o +lib-$(CONFIG_SPARC32) += divdi3.o udivdi3.o +lib-$(CONFIG_SPARC32) += copy_user.o locks.o +lib-y += atomic_$(BITS).o +lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o +lib-y += rwsem_$(BITS).o +lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o -obj-y += iomap.o atomic32.o +obj-y += iomap.o +obj-$(CONFIG_SPARC32) += atomic32.o diff --git a/arch/sparc/lib/atomic.S b/arch/sparc/lib/atomic.S deleted file mode 100644 index 178cbb8ae1b9..000000000000 --- a/arch/sparc/lib/atomic.S +++ /dev/null @@ -1,99 +0,0 @@ -/* atomic.S: Move this stuff here for better ICACHE hit rates. - * - * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu) - */ - -#include -#include - - .text - .align 4 - - .globl __atomic_begin -__atomic_begin: - -#ifndef CONFIG_SMP - .globl ___xchg32_sun4c -___xchg32_sun4c: - rd %psr, %g3 - andcc %g3, PSR_PIL, %g0 - bne 1f - nop - wr %g3, PSR_PIL, %psr - nop; nop; nop -1: - andcc %g3, PSR_PIL, %g0 - ld [%g1], %g7 - bne 1f - st %g2, [%g1] - wr %g3, 0x0, %psr - nop; nop; nop -1: - mov %g7, %g2 - jmpl %o7 + 8, %g0 - mov %g4, %o7 - - .globl ___xchg32_sun4md -___xchg32_sun4md: - swap [%g1], %g2 - jmpl %o7 + 8, %g0 - mov %g4, %o7 -#endif - - /* Read asm-sparc/atomic.h carefully to understand how this works for SMP. - * Really, some things here for SMP are overly clever, go read the header. - */ - .globl ___atomic24_add -___atomic24_add: - rd %psr, %g3 ! Keep the code small, old way was stupid - nop; nop; nop; ! Let the bits set - or %g3, PSR_PIL, %g7 ! Disable interrupts - wr %g7, 0x0, %psr ! Set %psr - nop; nop; nop; ! Let the bits set -#ifdef CONFIG_SMP -1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. - orcc %g7, 0x0, %g0 ! Did we get it? - bne 1b ! Nope... - ld [%g1], %g7 ! Load locked atomic24_t - sra %g7, 8, %g7 ! Get signed 24-bit integer - add %g7, %g2, %g2 ! Add in argument - sll %g2, 8, %g7 ! Transpose back to atomic24_t - st %g7, [%g1] ! Clever: This releases the lock as well. -#else - ld [%g1], %g7 ! Load locked atomic24_t - add %g7, %g2, %g2 ! Add in argument - st %g2, [%g1] ! Store it back -#endif - wr %g3, 0x0, %psr ! Restore original PSR_PIL - nop; nop; nop; ! Let the bits set - jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h - mov %g4, %o7 ! Restore %o7 - - .globl ___atomic24_sub -___atomic24_sub: - rd %psr, %g3 ! Keep the code small, old way was stupid - nop; nop; nop; ! Let the bits set - or %g3, PSR_PIL, %g7 ! Disable interrupts - wr %g7, 0x0, %psr ! Set %psr - nop; nop; nop; ! Let the bits set -#ifdef CONFIG_SMP -1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. - orcc %g7, 0x0, %g0 ! Did we get it? - bne 1b ! Nope... - ld [%g1], %g7 ! Load locked atomic24_t - sra %g7, 8, %g7 ! Get signed 24-bit integer - sub %g7, %g2, %g2 ! Subtract argument - sll %g2, 8, %g7 ! Transpose back to atomic24_t - st %g7, [%g1] ! 
Clever: This releases the lock as well -#else - ld [%g1], %g7 ! Load locked atomic24_t - sub %g7, %g2, %g2 ! Subtract argument - st %g2, [%g1] ! Store it back -#endif - wr %g3, 0x0, %psr ! Restore original PSR_PIL - nop; nop; nop; ! Let the bits set - jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h - mov %g4, %o7 ! Restore %o7 - - .globl __atomic_end -__atomic_end: diff --git a/arch/sparc/lib/atomic_32.S b/arch/sparc/lib/atomic_32.S new file mode 100644 index 000000000000..178cbb8ae1b9 --- /dev/null +++ b/arch/sparc/lib/atomic_32.S @@ -0,0 +1,99 @@ +/* atomic.S: Move this stuff here for better ICACHE hit rates. + * + * Copyright (C) 1996 David S. Miller (davem@caipfs.rutgers.edu) + */ + +#include +#include + + .text + .align 4 + + .globl __atomic_begin +__atomic_begin: + +#ifndef CONFIG_SMP + .globl ___xchg32_sun4c +___xchg32_sun4c: + rd %psr, %g3 + andcc %g3, PSR_PIL, %g0 + bne 1f + nop + wr %g3, PSR_PIL, %psr + nop; nop; nop +1: + andcc %g3, PSR_PIL, %g0 + ld [%g1], %g7 + bne 1f + st %g2, [%g1] + wr %g3, 0x0, %psr + nop; nop; nop +1: + mov %g7, %g2 + jmpl %o7 + 8, %g0 + mov %g4, %o7 + + .globl ___xchg32_sun4md +___xchg32_sun4md: + swap [%g1], %g2 + jmpl %o7 + 8, %g0 + mov %g4, %o7 +#endif + + /* Read asm-sparc/atomic.h carefully to understand how this works for SMP. + * Really, some things here for SMP are overly clever, go read the header. + */ + .globl ___atomic24_add +___atomic24_add: + rd %psr, %g3 ! Keep the code small, old way was stupid + nop; nop; nop; ! Let the bits set + or %g3, PSR_PIL, %g7 ! Disable interrupts + wr %g7, 0x0, %psr ! Set %psr + nop; nop; nop; ! Let the bits set +#ifdef CONFIG_SMP +1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. + orcc %g7, 0x0, %g0 ! Did we get it? + bne 1b ! Nope... + ld [%g1], %g7 ! Load locked atomic24_t + sra %g7, 8, %g7 ! Get signed 24-bit integer + add %g7, %g2, %g2 ! Add in argument + sll %g2, 8, %g7 ! Transpose back to atomic24_t + st %g7, [%g1] ! Clever: This releases the lock as well. +#else + ld [%g1], %g7 ! Load locked atomic24_t + add %g7, %g2, %g2 ! Add in argument + st %g2, [%g1] ! Store it back +#endif + wr %g3, 0x0, %psr ! Restore original PSR_PIL + nop; nop; nop; ! Let the bits set + jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h + mov %g4, %o7 ! Restore %o7 + + .globl ___atomic24_sub +___atomic24_sub: + rd %psr, %g3 ! Keep the code small, old way was stupid + nop; nop; nop; ! Let the bits set + or %g3, PSR_PIL, %g7 ! Disable interrupts + wr %g7, 0x0, %psr ! Set %psr + nop; nop; nop; ! Let the bits set +#ifdef CONFIG_SMP +1: ldstub [%g1 + 3], %g7 ! Spin on the byte lock for SMP. + orcc %g7, 0x0, %g0 ! Did we get it? + bne 1b ! Nope... + ld [%g1], %g7 ! Load locked atomic24_t + sra %g7, 8, %g7 ! Get signed 24-bit integer + sub %g7, %g2, %g2 ! Subtract argument + sll %g2, 8, %g7 ! Transpose back to atomic24_t + st %g7, [%g1] ! Clever: This releases the lock as well +#else + ld [%g1], %g7 ! Load locked atomic24_t + sub %g7, %g2, %g2 ! Subtract argument + st %g2, [%g1] ! Store it back +#endif + wr %g3, 0x0, %psr ! Restore original PSR_PIL + nop; nop; nop; ! Let the bits set + jmpl %o7, %g0 ! NOTE: not + 8, see callers in atomic.h + mov %g4, %o7 ! Restore %o7 + + .globl __atomic_end +__atomic_end: diff --git a/arch/sparc/lib/checksum.S b/arch/sparc/lib/checksum.S deleted file mode 100644 index 77f228533d47..000000000000 --- a/arch/sparc/lib/checksum.S +++ /dev/null @@ -1,583 +0,0 @@ -/* checksum.S: Sparc optimized checksum code. 
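
The atomic_32.S code above implements 24-bit atomics by keeping the value in the upper three bytes of a word and, on SMP, using the low byte as a spin lock taken with ldstub and released by the full-word store of the new value, with interrupts masked via PSR_PIL around the sequence. A minimal C sketch of that packing idea, purely illustrative (the type and function names here are assumptions, not the kernel's atomic24_t API):

    #include <stdint.h>

    /* Illustrative only: value lives in the top 24 bits, low byte doubles as the lock. */
    typedef struct { volatile uint32_t word; } atomic24_demo_t;

    static int32_t atomic24_demo_add(atomic24_demo_t *v, int32_t delta)
    {
        /* The real SMP path takes the low byte with ldstub and masks interrupts
         * via PSR_PIL first; here only the value packing is shown. */
        int32_t old = (int32_t)v->word >> 8;   /* sign-extended 24-bit value      */
        int32_t new = old + delta;
        v->word = (uint32_t)new << 8;          /* low byte becomes 0: lock released */
        return new;
    }

This mirrors the assembly's sra/add/sll/st sequence, where the final store both writes the new value and clears the lock byte.
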
- * - * Copyright(C) 1995 Linus Torvalds - * Copyright(C) 1995 Miguel de Icaza - * Copyright(C) 1996 David S. Miller - * Copyright(C) 1997 Jakub Jelinek - * - * derived from: - * Linux/Alpha checksum c-code - * Linux/ix86 inline checksum assembly - * RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code) - * David Mosberger-Tang for optimized reference c-code - * BSD4.4 portable checksum routine - */ - -#include - -#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ - ldd [buf + offset + 0x00], t0; \ - ldd [buf + offset + 0x08], t2; \ - addxcc t0, sum, sum; \ - addxcc t1, sum, sum; \ - ldd [buf + offset + 0x10], t4; \ - addxcc t2, sum, sum; \ - addxcc t3, sum, sum; \ - ldd [buf + offset + 0x18], t0; \ - addxcc t4, sum, sum; \ - addxcc t5, sum, sum; \ - addxcc t0, sum, sum; \ - addxcc t1, sum, sum; - -#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ - ldd [buf - offset - 0x08], t0; \ - ldd [buf - offset - 0x00], t2; \ - addxcc t0, sum, sum; \ - addxcc t1, sum, sum; \ - addxcc t2, sum, sum; \ - addxcc t3, sum, sum; - - /* Do end cruft out of band to get better cache patterns. */ -csum_partial_end_cruft: - be 1f ! caller asks %o1 & 0x8 - andcc %o1, 4, %g0 ! nope, check for word remaining - ldd [%o0], %g2 ! load two - addcc %g2, %o2, %o2 ! add first word to sum - addxcc %g3, %o2, %o2 ! add second word as well - add %o0, 8, %o0 ! advance buf ptr - addx %g0, %o2, %o2 ! add in final carry - andcc %o1, 4, %g0 ! check again for word remaining -1: be 1f ! nope, skip this code - andcc %o1, 3, %o1 ! check for trailing bytes - ld [%o0], %g2 ! load it - addcc %g2, %o2, %o2 ! add to sum - add %o0, 4, %o0 ! advance buf ptr - addx %g0, %o2, %o2 ! add in final carry - andcc %o1, 3, %g0 ! check again for trailing bytes -1: be 1f ! no trailing bytes, return - addcc %o1, -1, %g0 ! only one byte remains? - bne 2f ! at least two bytes more - subcc %o1, 2, %o1 ! only two bytes more? - b 4f ! only one byte remains - or %g0, %g0, %o4 ! clear fake hword value -2: lduh [%o0], %o4 ! get hword - be 6f ! jmp if only hword remains - add %o0, 2, %o0 ! advance buf ptr either way - sll %o4, 16, %o4 ! create upper hword -4: ldub [%o0], %o5 ! get final byte - sll %o5, 8, %o5 ! put into place - or %o5, %o4, %o4 ! coalese with hword (if any) -6: addcc %o4, %o2, %o2 ! add to sum -1: retl ! get outta here - addx %g0, %o2, %o0 ! add final carry into retval - - /* Also do alignment out of band to get better cache patterns. */ -csum_partial_fix_alignment: - cmp %o1, 6 - bl cpte - 0x4 - andcc %o0, 0x2, %g0 - be 1f - andcc %o0, 0x4, %g0 - lduh [%o0 + 0x00], %g2 - sub %o1, 2, %o1 - add %o0, 2, %o0 - sll %g2, 16, %g2 - addcc %g2, %o2, %o2 - srl %o2, 16, %g3 - addx %g0, %g3, %g2 - sll %o2, 16, %o2 - sll %g2, 16, %g3 - srl %o2, 16, %o2 - andcc %o0, 0x4, %g0 - or %g3, %o2, %o2 -1: be cpa - andcc %o1, 0xffffff80, %o3 - ld [%o0 + 0x00], %g2 - sub %o1, 4, %o1 - addcc %g2, %o2, %o2 - add %o0, 4, %o0 - addx %g0, %o2, %o2 - b cpa - andcc %o1, 0xffffff80, %o3 - - /* The common case is to get called with a nicely aligned - * buffer of size 0x20. Follow the code path for that case. - */ - .globl csum_partial -csum_partial: /* %o0=buf, %o1=len, %o2=sum */ - andcc %o0, 0x7, %g0 ! alignment problems? - bne csum_partial_fix_alignment ! yep, handle it - sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr - andcc %o1, 0xffffff80, %o3 ! num loop iterations -cpa: be 3f ! none to do - andcc %o1, 0x70, %g1 ! 
clears carry flag too -5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) - CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) - CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) - CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) - addx %g0, %o2, %o2 ! sink in final carry - subcc %o3, 128, %o3 ! detract from loop iters - bne 5b ! more to do - add %o0, 128, %o0 ! advance buf ptr - andcc %o1, 0x70, %g1 ! clears carry flag too -3: be cpte ! nope - andcc %o1, 0xf, %g0 ! anything left at all? - srl %g1, 1, %o4 ! compute offset - sub %g7, %g1, %g7 ! adjust jmp ptr - sub %g7, %o4, %g7 ! final jmp ptr adjust - jmp %g7 + %lo(cpte - 8) ! enter the table - add %o0, %g1, %o0 ! advance buf ptr -cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) - CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) - addx %g0, %o2, %o2 ! fetch final carry - andcc %o1, 0xf, %g0 ! anything left at all? -cpte: bne csum_partial_end_cruft ! yep, handle it - andcc %o1, 8, %g0 ! check how much -cpout: retl ! get outta here - mov %o2, %o0 ! return computed csum - - .globl __csum_partial_copy_start, __csum_partial_copy_end -__csum_partial_copy_start: - -/* Work around cpp -rob */ -#define ALLOC #alloc -#define EXECINSTR #execinstr -#define EX(x,y,a,b) \ -98: x,y; \ - .section .fixup,ALLOC,EXECINSTR; \ - .align 4; \ -99: ba 30f; \ - a, b, %o3; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 99b; \ - .text; \ - .align 4 - -#define EX2(x,y) \ -98: x,y; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 30f; \ - .text; \ - .align 4 - -#define EX3(x,y) \ -98: x,y; \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word 98b, 96f; \ - .text; \ - .align 4 - -#define EXT(start,end,handler) \ - .section __ex_table,ALLOC; \ - .align 4; \ - .word start, 0, end, handler; \ - .text; \ - .align 4 - - /* This aligned version executes typically in 8.5 superscalar cycles, this - * is the best I can do. I say 8.5 because the final add will pair with - * the next ldd in the main unrolled loop. Thus the pipe is always full. - * If you change these macros (including order of instructions), - * please check the fixup code below as well. - */ -#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [src + off + 0x00], t0; \ - ldd [src + off + 0x08], t2; \ - addxcc t0, sum, sum; \ - ldd [src + off + 0x10], t4; \ - addxcc t1, sum, sum; \ - ldd [src + off + 0x18], t6; \ - addxcc t2, sum, sum; \ - std t0, [dst + off + 0x00]; \ - addxcc t3, sum, sum; \ - std t2, [dst + off + 0x08]; \ - addxcc t4, sum, sum; \ - std t4, [dst + off + 0x10]; \ - addxcc t5, sum, sum; \ - std t6, [dst + off + 0x18]; \ - addxcc t6, sum, sum; \ - addxcc t7, sum, sum; - - /* 12 superscalar cycles seems to be the limit for this case, - * because of this we thus do all the ldd's together to get - * Viking MXCC into streaming mode. Ho hum... 
- */ -#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ - ldd [src + off + 0x00], t0; \ - ldd [src + off + 0x08], t2; \ - ldd [src + off + 0x10], t4; \ - ldd [src + off + 0x18], t6; \ - st t0, [dst + off + 0x00]; \ - addxcc t0, sum, sum; \ - st t1, [dst + off + 0x04]; \ - addxcc t1, sum, sum; \ - st t2, [dst + off + 0x08]; \ - addxcc t2, sum, sum; \ - st t3, [dst + off + 0x0c]; \ - addxcc t3, sum, sum; \ - st t4, [dst + off + 0x10]; \ - addxcc t4, sum, sum; \ - st t5, [dst + off + 0x14]; \ - addxcc t5, sum, sum; \ - st t6, [dst + off + 0x18]; \ - addxcc t6, sum, sum; \ - st t7, [dst + off + 0x1c]; \ - addxcc t7, sum, sum; - - /* Yuck, 6 superscalar cycles... */ -#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ - ldd [src - off - 0x08], t0; \ - ldd [src - off - 0x00], t2; \ - addxcc t0, sum, sum; \ - st t0, [dst - off - 0x08]; \ - addxcc t1, sum, sum; \ - st t1, [dst - off - 0x04]; \ - addxcc t2, sum, sum; \ - st t2, [dst - off - 0x00]; \ - addxcc t3, sum, sum; \ - st t3, [dst - off + 0x04]; - - /* Handle the end cruft code out of band for better cache patterns. */ -cc_end_cruft: - be 1f - andcc %o3, 4, %g0 - EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf) - add %o1, 8, %o1 - addcc %g2, %g7, %g7 - add %o0, 8, %o0 - addxcc %g3, %g7, %g7 - EX2(st %g2, [%o1 - 0x08]) - addx %g0, %g7, %g7 - andcc %o3, 4, %g0 - EX2(st %g3, [%o1 - 0x04]) -1: be 1f - andcc %o3, 3, %o3 - EX(ld [%o0 + 0x00], %g2, add %o3, 4) - add %o1, 4, %o1 - addcc %g2, %g7, %g7 - EX2(st %g2, [%o1 - 0x04]) - addx %g0, %g7, %g7 - andcc %o3, 3, %g0 - add %o0, 4, %o0 -1: be 1f - addcc %o3, -1, %g0 - bne 2f - subcc %o3, 2, %o3 - b 4f - or %g0, %g0, %o4 -2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2) - add %o0, 2, %o0 - EX2(sth %o4, [%o1 + 0x00]) - be 6f - add %o1, 2, %o1 - sll %o4, 16, %o4 -4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1) - EX2(stb %o5, [%o1 + 0x00]) - sll %o5, 8, %o5 - or %o5, %o4, %o4 -6: addcc %o4, %g7, %g7 -1: retl - addx %g0, %g7, %o0 - - /* Also, handle the alignment code out of band. */ -cc_dword_align: - cmp %g1, 6 - bl,a ccte - andcc %g1, 0xf, %o3 - andcc %o0, 0x1, %g0 - bne ccslow - andcc %o0, 0x2, %g0 - be 1f - andcc %o0, 0x4, %g0 - EX(lduh [%o0 + 0x00], %g4, add %g1, 0) - sub %g1, 2, %g1 - EX2(sth %g4, [%o1 + 0x00]) - add %o0, 2, %o0 - sll %g4, 16, %g4 - addcc %g4, %g7, %g7 - add %o1, 2, %o1 - srl %g7, 16, %g3 - addx %g0, %g3, %g4 - sll %g7, 16, %g7 - sll %g4, 16, %g3 - srl %g7, 16, %g7 - andcc %o0, 0x4, %g0 - or %g3, %g7, %g7 -1: be 3f - andcc %g1, 0xffffff80, %g0 - EX(ld [%o0 + 0x00], %g4, add %g1, 0) - sub %g1, 4, %g1 - EX2(st %g4, [%o1 + 0x00]) - add %o0, 4, %o0 - addcc %g4, %g7, %g7 - add %o1, 4, %o1 - addx %g0, %g7, %g7 - b 3f - andcc %g1, 0xffffff80, %g0 - - /* Sun, you just can't beat me, you just can't. Stop trying, - * give up. I'm serious, I am going to kick the living shit - * out of you, game over, lights out. - */ - .align 8 - .globl __csum_partial_copy_sparc_generic -__csum_partial_copy_sparc_generic: - /* %o0=src, %o1=dest, %g1=len, %g7=sum */ - xor %o0, %o1, %o4 ! get changing bits - andcc %o4, 3, %g0 ! check for mismatched alignment - bne ccslow ! better this than unaligned/fixups - andcc %o0, 7, %g0 ! need to align things? - bne cc_dword_align ! yes, we check for short lengths there - andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop? -3: be 3f ! nope, less than one loop remains - andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundary? - be ccdbl + 4 ! 
8 byte aligned, kick ass -5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) -10: EXT(5b, 10b, 20f) ! note for exception handling - sub %g1, 128, %g1 ! detract from length - addx %g0, %g7, %g7 ! add in last carry bit - andcc %g1, 0xffffff80, %g0 ! more to csum? - add %o0, 128, %o0 ! advance src ptr - bne 5b ! we did not go negative, continue looping - add %o1, 128, %o1 ! advance dest ptr -3: andcc %g1, 0x70, %o2 ! can use table? -ccmerge:be ccte ! nope, go and check for end cruft - andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) - srl %o2, 1, %o4 ! begin negative offset computation - sethi %hi(12f), %o5 ! set up table ptr end - add %o0, %o2, %o0 ! advance src ptr - sub %o5, %o4, %o5 ! continue table calculation - sll %o2, 1, %g2 ! constant multiplies are fun... - sub %o5, %g2, %o5 ! some more adjustments - jmp %o5 + %lo(12f) ! jump into it, duff style, wheee... - add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw) -cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) - CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) -12: EXT(cctbl, 12b, 22f) ! note for exception table handling - addx %g0, %g7, %g7 - andcc %o3, 0xf, %g0 ! check for low bits set -ccte: bne cc_end_cruft ! something left, handle it out of band - andcc %o3, 8, %g0 ! begin checks for that code - retl ! return - mov %g7, %o0 ! give em the computed checksum -ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) - CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) -11: EXT(ccdbl, 11b, 21f) ! note for exception table handling - sub %g1, 128, %g1 ! detract from length - addx %g0, %g7, %g7 ! add in last carry bit - andcc %g1, 0xffffff80, %g0 ! more to csum? - add %o0, 128, %o0 ! advance src ptr - bne ccdbl ! we did not go negative, continue looping - add %o1, 128, %o1 ! advance dest ptr - b ccmerge ! finish it off, above - andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) - -ccslow: cmp %g1, 0 - mov 0, %g5 - bleu 4f - andcc %o0, 1, %o5 - be,a 1f - srl %g1, 1, %g4 - sub %g1, 1, %g1 - EX(ldub [%o0], %g5, add %g1, 1) - add %o0, 1, %o0 - EX2(stb %g5, [%o1]) - srl %g1, 1, %g4 - add %o1, 1, %o1 -1: cmp %g4, 0 - be,a 3f - andcc %g1, 1, %g0 - andcc %o0, 2, %g0 - be,a 1f - srl %g4, 1, %g4 - EX(lduh [%o0], %o4, add %g1, 0) - sub %g1, 2, %g1 - srl %o4, 8, %g2 - sub %g4, 1, %g4 - EX2(stb %g2, [%o1]) - add %o4, %g5, %g5 - EX2(stb %o4, [%o1 + 1]) - add %o0, 2, %o0 - srl %g4, 1, %g4 - add %o1, 2, %o1 -1: cmp %g4, 0 - be,a 2f - andcc %g1, 2, %g0 - EX3(ld [%o0], %o4) -5: srl %o4, 24, %g2 - srl %o4, 16, %g3 - EX2(stb %g2, [%o1]) - srl %o4, 8, %g2 - EX2(stb %g3, [%o1 + 1]) - add %o0, 4, %o0 - EX2(stb %g2, [%o1 + 2]) - addcc %o4, %g5, %g5 - EX2(stb %o4, [%o1 + 3]) - addx %g5, %g0, %g5 ! I am now to lazy to optimize this (question it - add %o1, 4, %o1 ! is worthy). 
Maybe some day - with the sll/srl - subcc %g4, 1, %g4 ! tricks - bne,a 5b - EX3(ld [%o0], %o4) - sll %g5, 16, %g2 - srl %g5, 16, %g5 - srl %g2, 16, %g2 - andcc %g1, 2, %g0 - add %g2, %g5, %g5 -2: be,a 3f - andcc %g1, 1, %g0 - EX(lduh [%o0], %o4, and %g1, 3) - andcc %g1, 1, %g0 - srl %o4, 8, %g2 - add %o0, 2, %o0 - EX2(stb %g2, [%o1]) - add %g5, %o4, %g5 - EX2(stb %o4, [%o1 + 1]) - add %o1, 2, %o1 -3: be,a 1f - sll %g5, 16, %o4 - EX(ldub [%o0], %g2, add %g0, 1) - sll %g2, 8, %o4 - EX2(stb %g2, [%o1]) - add %g5, %o4, %g5 - sll %g5, 16, %o4 -1: addcc %o4, %g5, %g5 - srl %g5, 16, %o4 - addx %g0, %o4, %g5 - orcc %o5, %g0, %g0 - be 4f - srl %g5, 8, %o4 - and %g5, 0xff, %g2 - and %o4, 0xff, %o4 - sll %g2, 8, %g2 - or %g2, %o4, %g5 -4: addcc %g7, %g5, %g7 - retl - addx %g0, %g7, %o0 -__csum_partial_copy_end: - -/* We do these strange calculations for the csum_*_from_user case only, ie. - * we only bother with faults on loads... */ - -/* o2 = ((g2%20)&3)*8 - * o3 = g1 - (g2/20)*32 - o2 */ -20: - cmp %g2, 20 - blu,a 1f - and %g2, 3, %o2 - sub %g1, 32, %g1 - b 20b - sub %g2, 20, %g2 -1: - sll %o2, 3, %o2 - b 31f - sub %g1, %o2, %o3 - -/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8) - * o3 = g1 - (g2/16)*32 - o2 */ -21: - andcc %g2, 15, %o3 - srl %g2, 4, %g2 - be,a 1f - clr %o2 - add %o3, 1, %o3 - and %o3, 14, %o3 - sll %o3, 3, %o2 -1: - sll %g2, 5, %g2 - sub %g1, %g2, %o3 - b 31f - sub %o3, %o2, %o3 - -/* o0 += (g2/10)*16 - 0x70 - * 01 += (g2/10)*16 - 0x70 - * o2 = (g2 % 10) ? 8 : 0 - * o3 += 0x70 - (g2/10)*16 - o2 */ -22: - cmp %g2, 10 - blu,a 1f - sub %o0, 0x70, %o0 - add %o0, 16, %o0 - add %o1, 16, %o1 - sub %o3, 16, %o3 - b 22b - sub %g2, 10, %g2 -1: - sub %o1, 0x70, %o1 - add %o3, 0x70, %o3 - clr %o2 - tst %g2 - bne,a 1f - mov 8, %o2 -1: - b 31f - sub %o3, %o2, %o3 -96: - and %g1, 3, %g1 - sll %g4, 2, %g4 - add %g1, %g4, %o3 -30: -/* %o1 is dst - * %o3 is # bytes to zero out - * %o4 is faulting address - * %o5 is %pc where fault occurred */ - clr %o2 -31: -/* %o0 is src - * %o1 is dst - * %o2 is # of bytes to copy from src to dst - * %o3 is # bytes to zero out - * %o4 is faulting address - * %o5 is %pc where fault occurred */ - save %sp, -104, %sp - mov %i5, %o0 - mov %i7, %o1 - mov %i4, %o2 - call lookup_fault - mov %g7, %i4 - cmp %o0, 2 - bne 1f - add %g0, -EFAULT, %i5 - tst %i2 - be 2f - mov %i0, %o1 - mov %i1, %o0 -5: - call __memcpy - mov %i2, %o2 - tst %o0 - bne,a 2f - add %i3, %i2, %i3 - add %i1, %i2, %i1 -2: - mov %i1, %o0 -6: - call __bzero - mov %i3, %o1 -1: - ld [%sp + 168], %o2 ! struct_ptr of parent - st %i5, [%o2] - ret - restore - - .section __ex_table,#alloc - .align 4 - .word 5b,2 - .word 6b,2 diff --git a/arch/sparc/lib/checksum_32.S b/arch/sparc/lib/checksum_32.S new file mode 100644 index 000000000000..77f228533d47 --- /dev/null +++ b/arch/sparc/lib/checksum_32.S @@ -0,0 +1,583 @@ +/* checksum.S: Sparc optimized checksum code. + * + * Copyright(C) 1995 Linus Torvalds + * Copyright(C) 1995 Miguel de Icaza + * Copyright(C) 1996 David S. Miller + * Copyright(C) 1997 Jakub Jelinek + * + * derived from: + * Linux/Alpha checksum c-code + * Linux/ix86 inline checksum assembly + * RFC1071 Computing the Internet Checksum (esp. 
Jacobsons m68k code) + * David Mosberger-Tang for optimized reference c-code + * BSD4.4 portable checksum routine + */ + +#include + +#define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \ + ldd [buf + offset + 0x00], t0; \ + ldd [buf + offset + 0x08], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + ldd [buf + offset + 0x10], t4; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; \ + ldd [buf + offset + 0x18], t0; \ + addxcc t4, sum, sum; \ + addxcc t5, sum, sum; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; + +#define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3) \ + ldd [buf - offset - 0x08], t0; \ + ldd [buf - offset - 0x00], t2; \ + addxcc t0, sum, sum; \ + addxcc t1, sum, sum; \ + addxcc t2, sum, sum; \ + addxcc t3, sum, sum; + + /* Do end cruft out of band to get better cache patterns. */ +csum_partial_end_cruft: + be 1f ! caller asks %o1 & 0x8 + andcc %o1, 4, %g0 ! nope, check for word remaining + ldd [%o0], %g2 ! load two + addcc %g2, %o2, %o2 ! add first word to sum + addxcc %g3, %o2, %o2 ! add second word as well + add %o0, 8, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 4, %g0 ! check again for word remaining +1: be 1f ! nope, skip this code + andcc %o1, 3, %o1 ! check for trailing bytes + ld [%o0], %g2 ! load it + addcc %g2, %o2, %o2 ! add to sum + add %o0, 4, %o0 ! advance buf ptr + addx %g0, %o2, %o2 ! add in final carry + andcc %o1, 3, %g0 ! check again for trailing bytes +1: be 1f ! no trailing bytes, return + addcc %o1, -1, %g0 ! only one byte remains? + bne 2f ! at least two bytes more + subcc %o1, 2, %o1 ! only two bytes more? + b 4f ! only one byte remains + or %g0, %g0, %o4 ! clear fake hword value +2: lduh [%o0], %o4 ! get hword + be 6f ! jmp if only hword remains + add %o0, 2, %o0 ! advance buf ptr either way + sll %o4, 16, %o4 ! create upper hword +4: ldub [%o0], %o5 ! get final byte + sll %o5, 8, %o5 ! put into place + or %o5, %o4, %o4 ! coalese with hword (if any) +6: addcc %o4, %o2, %o2 ! add to sum +1: retl ! get outta here + addx %g0, %o2, %o0 ! add final carry into retval + + /* Also do alignment out of band to get better cache patterns. */ +csum_partial_fix_alignment: + cmp %o1, 6 + bl cpte - 0x4 + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + lduh [%o0 + 0x00], %g2 + sub %o1, 2, %o1 + add %o0, 2, %o0 + sll %g2, 16, %g2 + addcc %g2, %o2, %o2 + srl %o2, 16, %g3 + addx %g0, %g3, %g2 + sll %o2, 16, %o2 + sll %g2, 16, %g3 + srl %o2, 16, %o2 + andcc %o0, 0x4, %g0 + or %g3, %o2, %o2 +1: be cpa + andcc %o1, 0xffffff80, %o3 + ld [%o0 + 0x00], %g2 + sub %o1, 4, %o1 + addcc %g2, %o2, %o2 + add %o0, 4, %o0 + addx %g0, %o2, %o2 + b cpa + andcc %o1, 0xffffff80, %o3 + + /* The common case is to get called with a nicely aligned + * buffer of size 0x20. Follow the code path for that case. + */ + .globl csum_partial +csum_partial: /* %o0=buf, %o1=len, %o2=sum */ + andcc %o0, 0x7, %g0 ! alignment problems? + bne csum_partial_fix_alignment ! yep, handle it + sethi %hi(cpte - 8), %g7 ! prepare table jmp ptr + andcc %o1, 0xffffff80, %o3 ! num loop iterations +cpa: be 3f ! none to do + andcc %o1, 0x70, %g1 ! clears carry flag too +5: CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! sink in final carry + subcc %o3, 128, %o3 ! detract from loop iters + bne 5b ! more to do + add %o0, 128, %o0 ! 
advance buf ptr + andcc %o1, 0x70, %g1 ! clears carry flag too +3: be cpte ! nope + andcc %o1, 0xf, %g0 ! anything left at all? + srl %g1, 1, %o4 ! compute offset + sub %g7, %g1, %g7 ! adjust jmp ptr + sub %g7, %o4, %g7 ! final jmp ptr adjust + jmp %g7 + %lo(cpte - 8) ! enter the table + add %o0, %g1, %o0 ! advance buf ptr +cptbl: CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5) + CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5) + addx %g0, %o2, %o2 ! fetch final carry + andcc %o1, 0xf, %g0 ! anything left at all? +cpte: bne csum_partial_end_cruft ! yep, handle it + andcc %o1, 8, %g0 ! check how much +cpout: retl ! get outta here + mov %o2, %o0 ! return computed csum + + .globl __csum_partial_copy_start, __csum_partial_copy_end +__csum_partial_copy_start: + +/* Work around cpp -rob */ +#define ALLOC #alloc +#define EXECINSTR #execinstr +#define EX(x,y,a,b) \ +98: x,y; \ + .section .fixup,ALLOC,EXECINSTR; \ + .align 4; \ +99: ba 30f; \ + a, b, %o3; \ + .section __ex_table,ALLOC; \ + .align 4; \ + .word 98b, 99b; \ + .text; \ + .align 4 + +#define EX2(x,y) \ +98: x,y; \ + .section __ex_table,ALLOC; \ + .align 4; \ + .word 98b, 30f; \ + .text; \ + .align 4 + +#define EX3(x,y) \ +98: x,y; \ + .section __ex_table,ALLOC; \ + .align 4; \ + .word 98b, 96f; \ + .text; \ + .align 4 + +#define EXT(start,end,handler) \ + .section __ex_table,ALLOC; \ + .align 4; \ + .word start, 0, end, handler; \ + .text; \ + .align 4 + + /* This aligned version executes typically in 8.5 superscalar cycles, this + * is the best I can do. I say 8.5 because the final add will pair with + * the next ldd in the main unrolled loop. Thus the pipe is always full. + * If you change these macros (including order of instructions), + * please check the fixup code below as well. + */ +#define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + addxcc t0, sum, sum; \ + ldd [src + off + 0x10], t4; \ + addxcc t1, sum, sum; \ + ldd [src + off + 0x18], t6; \ + addxcc t2, sum, sum; \ + std t0, [dst + off + 0x00]; \ + addxcc t3, sum, sum; \ + std t2, [dst + off + 0x08]; \ + addxcc t4, sum, sum; \ + std t4, [dst + off + 0x10]; \ + addxcc t5, sum, sum; \ + std t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + addxcc t7, sum, sum; + + /* 12 superscalar cycles seems to be the limit for this case, + * because of this we thus do all the ldd's together to get + * Viking MXCC into streaming mode. Ho hum... + */ +#define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7) \ + ldd [src + off + 0x00], t0; \ + ldd [src + off + 0x08], t2; \ + ldd [src + off + 0x10], t4; \ + ldd [src + off + 0x18], t6; \ + st t0, [dst + off + 0x00]; \ + addxcc t0, sum, sum; \ + st t1, [dst + off + 0x04]; \ + addxcc t1, sum, sum; \ + st t2, [dst + off + 0x08]; \ + addxcc t2, sum, sum; \ + st t3, [dst + off + 0x0c]; \ + addxcc t3, sum, sum; \ + st t4, [dst + off + 0x10]; \ + addxcc t4, sum, sum; \ + st t5, [dst + off + 0x14]; \ + addxcc t5, sum, sum; \ + st t6, [dst + off + 0x18]; \ + addxcc t6, sum, sum; \ + st t7, [dst + off + 0x1c]; \ + addxcc t7, sum, sum; + + /* Yuck, 6 superscalar cycles... 
*/ +#define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3) \ + ldd [src - off - 0x08], t0; \ + ldd [src - off - 0x00], t2; \ + addxcc t0, sum, sum; \ + st t0, [dst - off - 0x08]; \ + addxcc t1, sum, sum; \ + st t1, [dst - off - 0x04]; \ + addxcc t2, sum, sum; \ + st t2, [dst - off - 0x00]; \ + addxcc t3, sum, sum; \ + st t3, [dst - off + 0x04]; + + /* Handle the end cruft code out of band for better cache patterns. */ +cc_end_cruft: + be 1f + andcc %o3, 4, %g0 + EX(ldd [%o0 + 0x00], %g2, and %o3, 0xf) + add %o1, 8, %o1 + addcc %g2, %g7, %g7 + add %o0, 8, %o0 + addxcc %g3, %g7, %g7 + EX2(st %g2, [%o1 - 0x08]) + addx %g0, %g7, %g7 + andcc %o3, 4, %g0 + EX2(st %g3, [%o1 - 0x04]) +1: be 1f + andcc %o3, 3, %o3 + EX(ld [%o0 + 0x00], %g2, add %o3, 4) + add %o1, 4, %o1 + addcc %g2, %g7, %g7 + EX2(st %g2, [%o1 - 0x04]) + addx %g0, %g7, %g7 + andcc %o3, 3, %g0 + add %o0, 4, %o0 +1: be 1f + addcc %o3, -1, %g0 + bne 2f + subcc %o3, 2, %o3 + b 4f + or %g0, %g0, %o4 +2: EX(lduh [%o0 + 0x00], %o4, add %o3, 2) + add %o0, 2, %o0 + EX2(sth %o4, [%o1 + 0x00]) + be 6f + add %o1, 2, %o1 + sll %o4, 16, %o4 +4: EX(ldub [%o0 + 0x00], %o5, add %g0, 1) + EX2(stb %o5, [%o1 + 0x00]) + sll %o5, 8, %o5 + or %o5, %o4, %o4 +6: addcc %o4, %g7, %g7 +1: retl + addx %g0, %g7, %o0 + + /* Also, handle the alignment code out of band. */ +cc_dword_align: + cmp %g1, 6 + bl,a ccte + andcc %g1, 0xf, %o3 + andcc %o0, 0x1, %g0 + bne ccslow + andcc %o0, 0x2, %g0 + be 1f + andcc %o0, 0x4, %g0 + EX(lduh [%o0 + 0x00], %g4, add %g1, 0) + sub %g1, 2, %g1 + EX2(sth %g4, [%o1 + 0x00]) + add %o0, 2, %o0 + sll %g4, 16, %g4 + addcc %g4, %g7, %g7 + add %o1, 2, %o1 + srl %g7, 16, %g3 + addx %g0, %g3, %g4 + sll %g7, 16, %g7 + sll %g4, 16, %g3 + srl %g7, 16, %g7 + andcc %o0, 0x4, %g0 + or %g3, %g7, %g7 +1: be 3f + andcc %g1, 0xffffff80, %g0 + EX(ld [%o0 + 0x00], %g4, add %g1, 0) + sub %g1, 4, %g1 + EX2(st %g4, [%o1 + 0x00]) + add %o0, 4, %o0 + addcc %g4, %g7, %g7 + add %o1, 4, %o1 + addx %g0, %g7, %g7 + b 3f + andcc %g1, 0xffffff80, %g0 + + /* Sun, you just can't beat me, you just can't. Stop trying, + * give up. I'm serious, I am going to kick the living shit + * out of you, game over, lights out. + */ + .align 8 + .globl __csum_partial_copy_sparc_generic +__csum_partial_copy_sparc_generic: + /* %o0=src, %o1=dest, %g1=len, %g7=sum */ + xor %o0, %o1, %o4 ! get changing bits + andcc %o4, 3, %g0 ! check for mismatched alignment + bne ccslow ! better this than unaligned/fixups + andcc %o0, 7, %g0 ! need to align things? + bne cc_dword_align ! yes, we check for short lengths there + andcc %g1, 0xffffff80, %g0 ! can we use unrolled loop? +3: be 3f ! nope, less than one loop remains + andcc %o1, 4, %g0 ! dest aligned on 4 or 8 byte boundary? + be ccdbl + 4 ! 8 byte aligned, kick ass +5: CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +10: EXT(5b, 10b, 20f) ! note for exception handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne 5b ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr +3: andcc %g1, 0x70, %o2 ! can use table? +ccmerge:be ccte ! nope, go and check for end cruft + andcc %g1, 0xf, %o3 ! get low bits of length (clears carry btw) + srl %o2, 1, %o4 ! 
begin negative offset computation + sethi %hi(12f), %o5 ! set up table ptr end + add %o0, %o2, %o0 ! advance src ptr + sub %o5, %o4, %o5 ! continue table calculation + sll %o2, 1, %g2 ! constant multiplies are fun... + sub %o5, %g2, %o5 ! some more adjustments + jmp %o5 + %lo(12f) ! jump into it, duff style, wheee... + add %o1, %o2, %o1 ! advance dest ptr (carry is clear btw) +cctbl: CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5) + CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5) +12: EXT(cctbl, 12b, 22f) ! note for exception table handling + addx %g0, %g7, %g7 + andcc %o3, 0xf, %g0 ! check for low bits set +ccte: bne cc_end_cruft ! something left, handle it out of band + andcc %o3, 8, %g0 ! begin checks for that code + retl ! return + mov %g7, %o0 ! give em the computed checksum +ccdbl: CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) + CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3) +11: EXT(ccdbl, 11b, 21f) ! note for exception table handling + sub %g1, 128, %g1 ! detract from length + addx %g0, %g7, %g7 ! add in last carry bit + andcc %g1, 0xffffff80, %g0 ! more to csum? + add %o0, 128, %o0 ! advance src ptr + bne ccdbl ! we did not go negative, continue looping + add %o1, 128, %o1 ! advance dest ptr + b ccmerge ! finish it off, above + andcc %g1, 0x70, %o2 ! can use table? (clears carry btw) + +ccslow: cmp %g1, 0 + mov 0, %g5 + bleu 4f + andcc %o0, 1, %o5 + be,a 1f + srl %g1, 1, %g4 + sub %g1, 1, %g1 + EX(ldub [%o0], %g5, add %g1, 1) + add %o0, 1, %o0 + EX2(stb %g5, [%o1]) + srl %g1, 1, %g4 + add %o1, 1, %o1 +1: cmp %g4, 0 + be,a 3f + andcc %g1, 1, %g0 + andcc %o0, 2, %g0 + be,a 1f + srl %g4, 1, %g4 + EX(lduh [%o0], %o4, add %g1, 0) + sub %g1, 2, %g1 + srl %o4, 8, %g2 + sub %g4, 1, %g4 + EX2(stb %g2, [%o1]) + add %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 1]) + add %o0, 2, %o0 + srl %g4, 1, %g4 + add %o1, 2, %o1 +1: cmp %g4, 0 + be,a 2f + andcc %g1, 2, %g0 + EX3(ld [%o0], %o4) +5: srl %o4, 24, %g2 + srl %o4, 16, %g3 + EX2(stb %g2, [%o1]) + srl %o4, 8, %g2 + EX2(stb %g3, [%o1 + 1]) + add %o0, 4, %o0 + EX2(stb %g2, [%o1 + 2]) + addcc %o4, %g5, %g5 + EX2(stb %o4, [%o1 + 3]) + addx %g5, %g0, %g5 ! I am now to lazy to optimize this (question it + add %o1, 4, %o1 ! is worthy). Maybe some day - with the sll/srl + subcc %g4, 1, %g4 ! 
tricks + bne,a 5b + EX3(ld [%o0], %o4) + sll %g5, 16, %g2 + srl %g5, 16, %g5 + srl %g2, 16, %g2 + andcc %g1, 2, %g0 + add %g2, %g5, %g5 +2: be,a 3f + andcc %g1, 1, %g0 + EX(lduh [%o0], %o4, and %g1, 3) + andcc %g1, 1, %g0 + srl %o4, 8, %g2 + add %o0, 2, %o0 + EX2(stb %g2, [%o1]) + add %g5, %o4, %g5 + EX2(stb %o4, [%o1 + 1]) + add %o1, 2, %o1 +3: be,a 1f + sll %g5, 16, %o4 + EX(ldub [%o0], %g2, add %g0, 1) + sll %g2, 8, %o4 + EX2(stb %g2, [%o1]) + add %g5, %o4, %g5 + sll %g5, 16, %o4 +1: addcc %o4, %g5, %g5 + srl %g5, 16, %o4 + addx %g0, %o4, %g5 + orcc %o5, %g0, %g0 + be 4f + srl %g5, 8, %o4 + and %g5, 0xff, %g2 + and %o4, 0xff, %o4 + sll %g2, 8, %g2 + or %g2, %o4, %g5 +4: addcc %g7, %g5, %g7 + retl + addx %g0, %g7, %o0 +__csum_partial_copy_end: + +/* We do these strange calculations for the csum_*_from_user case only, ie. + * we only bother with faults on loads... */ + +/* o2 = ((g2%20)&3)*8 + * o3 = g1 - (g2/20)*32 - o2 */ +20: + cmp %g2, 20 + blu,a 1f + and %g2, 3, %o2 + sub %g1, 32, %g1 + b 20b + sub %g2, 20, %g2 +1: + sll %o2, 3, %o2 + b 31f + sub %g1, %o2, %o3 + +/* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8) + * o3 = g1 - (g2/16)*32 - o2 */ +21: + andcc %g2, 15, %o3 + srl %g2, 4, %g2 + be,a 1f + clr %o2 + add %o3, 1, %o3 + and %o3, 14, %o3 + sll %o3, 3, %o2 +1: + sll %g2, 5, %g2 + sub %g1, %g2, %o3 + b 31f + sub %o3, %o2, %o3 + +/* o0 += (g2/10)*16 - 0x70 + * 01 += (g2/10)*16 - 0x70 + * o2 = (g2 % 10) ? 8 : 0 + * o3 += 0x70 - (g2/10)*16 - o2 */ +22: + cmp %g2, 10 + blu,a 1f + sub %o0, 0x70, %o0 + add %o0, 16, %o0 + add %o1, 16, %o1 + sub %o3, 16, %o3 + b 22b + sub %g2, 10, %g2 +1: + sub %o1, 0x70, %o1 + add %o3, 0x70, %o3 + clr %o2 + tst %g2 + bne,a 1f + mov 8, %o2 +1: + b 31f + sub %o3, %o2, %o3 +96: + and %g1, 3, %g1 + sll %g4, 2, %g4 + add %g1, %g4, %o3 +30: +/* %o1 is dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + clr %o2 +31: +/* %o0 is src + * %o1 is dst + * %o2 is # of bytes to copy from src to dst + * %o3 is # bytes to zero out + * %o4 is faulting address + * %o5 is %pc where fault occurred */ + save %sp, -104, %sp + mov %i5, %o0 + mov %i7, %o1 + mov %i4, %o2 + call lookup_fault + mov %g7, %i4 + cmp %o0, 2 + bne 1f + add %g0, -EFAULT, %i5 + tst %i2 + be 2f + mov %i0, %o1 + mov %i1, %o0 +5: + call __memcpy + mov %i2, %o2 + tst %o0 + bne,a 2f + add %i3, %i2, %i3 + add %i1, %i2, %i1 +2: + mov %i1, %o0 +6: + call __bzero + mov %i3, %o1 +1: + ld [%sp + 168], %o2 ! 
struct_ptr of parent + st %i5, [%o2] + ret + restore + + .section __ex_table,#alloc + .align 4 + .word 5b,2 + .word 6b,2 diff --git a/arch/sparc/lib/memcmp.S b/arch/sparc/lib/memcmp.S deleted file mode 100644 index cb4bdb0cc2af..000000000000 --- a/arch/sparc/lib/memcmp.S +++ /dev/null @@ -1,312 +0,0 @@ - .text - .align 4 - .global __memcmp, memcmp -__memcmp: -memcmp: -#if 1 - cmp %o2, 0 - ble L3 - mov 0, %g3 -L5: - ldub [%o0], %g2 - ldub [%o1], %g3 - sub %g2, %g3, %g2 - mov %g2, %g3 - sll %g2, 24, %g2 - - cmp %g2, 0 - bne L3 - add %o0, 1, %o0 - - add %o2, -1, %o2 - - cmp %o2, 0 - bg L5 - add %o1, 1, %o1 -L3: - sll %g3, 24, %o0 - sra %o0, 24, %o0 - - retl - nop -#else - save %sp, -104, %sp - mov %i2, %o4 - mov %i0, %o0 - - cmp %o4, 15 - ble L72 - mov %i1, %i2 - - andcc %i2, 3, %g0 - be L161 - andcc %o0, 3, %g2 -L75: - ldub [%o0], %g3 - ldub [%i2], %g2 - add %o0,1, %o0 - - subcc %g3, %g2, %i0 - bne L156 - add %i2, 1, %i2 - - andcc %i2, 3, %g0 - bne L75 - add %o4, -1, %o4 - - andcc %o0, 3, %g2 -L161: - bne,a L78 - mov %i2, %i1 - - mov %o0, %i5 - mov %i2, %i3 - srl %o4, 2, %i4 - - cmp %i4, 0 - bge L93 - mov %i4, %g2 - - add %i4, 3, %g2 -L93: - sra %g2, 2, %g2 - sll %g2, 2, %g2 - sub %i4, %g2, %g2 - - cmp %g2, 1 - be,a L88 - add %o0, 4, %i5 - - bg L94 - cmp %g2, 2 - - cmp %g2, 0 - be,a L86 - ld [%o0], %g3 - - b L162 - ld [%i5], %g3 -L94: - be L81 - cmp %g2, 3 - - be,a L83 - add %o0, -4, %i5 - - b L162 - ld [%i5], %g3 -L81: - add %o0, -8, %i5 - ld [%o0], %g3 - add %i2, -8, %i3 - ld [%i2], %g2 - - b L82 - add %i4, 2, %i4 -L83: - ld [%o0], %g4 - add %i2, -4, %i3 - ld [%i2], %g1 - - b L84 - add %i4, 1, %i4 -L86: - b L87 - ld [%i2], %g2 -L88: - add %i2, 4, %i3 - ld [%o0], %g4 - add %i4, -1, %i4 - ld [%i2], %g1 -L95: - ld [%i5], %g3 -L162: - cmp %g4, %g1 - be L87 - ld [%i3], %g2 - - cmp %g4, %g1 -L163: - bleu L114 - mov -1, %i0 - - b L114 - mov 1, %i0 -L87: - ld [%i5 + 4], %g4 - cmp %g3, %g2 - bne L163 - ld [%i3 + 4], %g1 -L84: - ld [%i5 + 8], %g3 - - cmp %g4, %g1 - bne L163 - ld [%i3 + 8], %g2 -L82: - ld [%i5 + 12], %g4 - cmp %g3, %g2 - bne L163 - ld [%i3 + 12], %g1 - - add %i5, 16, %i5 - - addcc %i4, -4, %i4 - bne L95 - add %i3, 16, %i3 - - cmp %g4, %g1 - bne L163 - nop - - b L114 - mov 0, %i0 -L78: - srl %o4, 2, %i0 - and %o0, -4, %i3 - orcc %i0, %g0, %g3 - sll %g2, 3, %o7 - mov 32, %g2 - - bge L129 - sub %g2, %o7, %o1 - - add %i0, 3, %g3 -L129: - sra %g3, 2, %g2 - sll %g2, 2, %g2 - sub %i0, %g2, %g2 - - cmp %g2, 1 - be,a L124 - ld [%i3], %o3 - - bg L130 - cmp %g2, 2 - - cmp %g2, 0 - be,a L122 - ld [%i3], %o2 - - b L164 - sll %o3, %o7, %g3 -L130: - be L117 - cmp %g2, 3 - - be,a L119 - ld [%i3], %g1 - - b L164 - sll %o3, %o7, %g3 -L117: - ld [%i3], %g4 - add %i2, -8, %i1 - ld [%i3 + 4], %o3 - add %i0, 2, %i0 - ld [%i2], %i4 - - b L118 - add %i3, -4, %i3 -L119: - ld [%i3 + 4], %g4 - add %i2, -4, %i1 - ld [%i2], %i5 - - b L120 - add %i0, 1, %i0 -L122: - ld [%i3 + 4], %g1 - ld [%i2], %i4 - - b L123 - add %i3, 4, %i3 -L124: - add %i2, 4, %i1 - ld [%i3 + 4], %o2 - add %i0, -1, %i0 - ld [%i2], %i5 - add %i3, 8, %i3 -L131: - sll %o3, %o7, %g3 -L164: - srl %o2, %o1, %g2 - ld [%i3], %g1 - or %g3, %g2, %g3 - - cmp %g3, %i5 - bne L163 - ld [%i1], %i4 -L123: - sll %o2, %o7, %g3 - srl %g1, %o1, %g2 - ld [%i3 + 4], %g4 - or %g3, %g2, %g3 - - cmp %g3, %i4 - bne L163 - ld [%i1 + 4], %i5 -L120: - sll %g1, %o7, %g3 - srl %g4, %o1, %g2 - ld [%i3 + 8], %o3 - or %g3, %g2, %g3 - - cmp %g3, %i5 - bne L163 - ld [%i1 + 8], %i4 -L118: - sll %g4, %o7, %g3 - srl %o3, %o1, %g2 - ld [%i3 + 12], %o2 - or %g3, %g2, %g3 - - cmp 
%g3, %i4 - bne L163 - ld [%i1 + 12], %i5 - - add %i3, 16, %i3 - addcc %i0, -4, %i0 - bne L131 - add %i1, 16, %i1 - - sll %o3, %o7, %g3 - srl %o2, %o1, %g2 - or %g3, %g2, %g3 - - cmp %g3, %i5 - be,a L114 - mov 0, %i0 - - b,a L163 -L114: - cmp %i0, 0 - bne L156 - and %o4, -4, %g2 - - add %o0, %g2, %o0 - add %i2, %g2, %i2 - and %o4, 3, %o4 -L72: - cmp %o4, 0 - be L156 - mov 0, %i0 - - ldub [%o0], %g3 -L165: - ldub [%i2], %g2 - add %o0, 1, %o0 - - subcc %g3, %g2, %i0 - bne L156 - add %i2, 1, %i2 - - addcc %o4, -1, %o4 - bne,a L165 - ldub [%o0], %g3 - - mov 0, %i0 -L156: - ret - restore -#endif diff --git a/arch/sparc/lib/memcmp_32.S b/arch/sparc/lib/memcmp_32.S new file mode 100644 index 000000000000..cb4bdb0cc2af --- /dev/null +++ b/arch/sparc/lib/memcmp_32.S @@ -0,0 +1,312 @@ + .text + .align 4 + .global __memcmp, memcmp +__memcmp: +memcmp: +#if 1 + cmp %o2, 0 + ble L3 + mov 0, %g3 +L5: + ldub [%o0], %g2 + ldub [%o1], %g3 + sub %g2, %g3, %g2 + mov %g2, %g3 + sll %g2, 24, %g2 + + cmp %g2, 0 + bne L3 + add %o0, 1, %o0 + + add %o2, -1, %o2 + + cmp %o2, 0 + bg L5 + add %o1, 1, %o1 +L3: + sll %g3, 24, %o0 + sra %o0, 24, %o0 + + retl + nop +#else + save %sp, -104, %sp + mov %i2, %o4 + mov %i0, %o0 + + cmp %o4, 15 + ble L72 + mov %i1, %i2 + + andcc %i2, 3, %g0 + be L161 + andcc %o0, 3, %g2 +L75: + ldub [%o0], %g3 + ldub [%i2], %g2 + add %o0,1, %o0 + + subcc %g3, %g2, %i0 + bne L156 + add %i2, 1, %i2 + + andcc %i2, 3, %g0 + bne L75 + add %o4, -1, %o4 + + andcc %o0, 3, %g2 +L161: + bne,a L78 + mov %i2, %i1 + + mov %o0, %i5 + mov %i2, %i3 + srl %o4, 2, %i4 + + cmp %i4, 0 + bge L93 + mov %i4, %g2 + + add %i4, 3, %g2 +L93: + sra %g2, 2, %g2 + sll %g2, 2, %g2 + sub %i4, %g2, %g2 + + cmp %g2, 1 + be,a L88 + add %o0, 4, %i5 + + bg L94 + cmp %g2, 2 + + cmp %g2, 0 + be,a L86 + ld [%o0], %g3 + + b L162 + ld [%i5], %g3 +L94: + be L81 + cmp %g2, 3 + + be,a L83 + add %o0, -4, %i5 + + b L162 + ld [%i5], %g3 +L81: + add %o0, -8, %i5 + ld [%o0], %g3 + add %i2, -8, %i3 + ld [%i2], %g2 + + b L82 + add %i4, 2, %i4 +L83: + ld [%o0], %g4 + add %i2, -4, %i3 + ld [%i2], %g1 + + b L84 + add %i4, 1, %i4 +L86: + b L87 + ld [%i2], %g2 +L88: + add %i2, 4, %i3 + ld [%o0], %g4 + add %i4, -1, %i4 + ld [%i2], %g1 +L95: + ld [%i5], %g3 +L162: + cmp %g4, %g1 + be L87 + ld [%i3], %g2 + + cmp %g4, %g1 +L163: + bleu L114 + mov -1, %i0 + + b L114 + mov 1, %i0 +L87: + ld [%i5 + 4], %g4 + cmp %g3, %g2 + bne L163 + ld [%i3 + 4], %g1 +L84: + ld [%i5 + 8], %g3 + + cmp %g4, %g1 + bne L163 + ld [%i3 + 8], %g2 +L82: + ld [%i5 + 12], %g4 + cmp %g3, %g2 + bne L163 + ld [%i3 + 12], %g1 + + add %i5, 16, %i5 + + addcc %i4, -4, %i4 + bne L95 + add %i3, 16, %i3 + + cmp %g4, %g1 + bne L163 + nop + + b L114 + mov 0, %i0 +L78: + srl %o4, 2, %i0 + and %o0, -4, %i3 + orcc %i0, %g0, %g3 + sll %g2, 3, %o7 + mov 32, %g2 + + bge L129 + sub %g2, %o7, %o1 + + add %i0, 3, %g3 +L129: + sra %g3, 2, %g2 + sll %g2, 2, %g2 + sub %i0, %g2, %g2 + + cmp %g2, 1 + be,a L124 + ld [%i3], %o3 + + bg L130 + cmp %g2, 2 + + cmp %g2, 0 + be,a L122 + ld [%i3], %o2 + + b L164 + sll %o3, %o7, %g3 +L130: + be L117 + cmp %g2, 3 + + be,a L119 + ld [%i3], %g1 + + b L164 + sll %o3, %o7, %g3 +L117: + ld [%i3], %g4 + add %i2, -8, %i1 + ld [%i3 + 4], %o3 + add %i0, 2, %i0 + ld [%i2], %i4 + + b L118 + add %i3, -4, %i3 +L119: + ld [%i3 + 4], %g4 + add %i2, -4, %i1 + ld [%i2], %i5 + + b L120 + add %i0, 1, %i0 +L122: + ld [%i3 + 4], %g1 + ld [%i2], %i4 + + b L123 + add %i3, 4, %i3 +L124: + add %i2, 4, %i1 + ld [%i3 + 4], %o2 + add %i0, -1, %i0 + ld [%i2], %i5 + add %i3, 8, %i3 +L131: + sll %o3, 
%o7, %g3 +L164: + srl %o2, %o1, %g2 + ld [%i3], %g1 + or %g3, %g2, %g3 + + cmp %g3, %i5 + bne L163 + ld [%i1], %i4 +L123: + sll %o2, %o7, %g3 + srl %g1, %o1, %g2 + ld [%i3 + 4], %g4 + or %g3, %g2, %g3 + + cmp %g3, %i4 + bne L163 + ld [%i1 + 4], %i5 +L120: + sll %g1, %o7, %g3 + srl %g4, %o1, %g2 + ld [%i3 + 8], %o3 + or %g3, %g2, %g3 + + cmp %g3, %i5 + bne L163 + ld [%i1 + 8], %i4 +L118: + sll %g4, %o7, %g3 + srl %o3, %o1, %g2 + ld [%i3 + 12], %o2 + or %g3, %g2, %g3 + + cmp %g3, %i4 + bne L163 + ld [%i1 + 12], %i5 + + add %i3, 16, %i3 + addcc %i0, -4, %i0 + bne L131 + add %i1, 16, %i1 + + sll %o3, %o7, %g3 + srl %o2, %o1, %g2 + or %g3, %g2, %g3 + + cmp %g3, %i5 + be,a L114 + mov 0, %i0 + + b,a L163 +L114: + cmp %i0, 0 + bne L156 + and %o4, -4, %g2 + + add %o0, %g2, %o0 + add %i2, %g2, %i2 + and %o4, 3, %o4 +L72: + cmp %o4, 0 + be L156 + mov 0, %i0 + + ldub [%o0], %g3 +L165: + ldub [%i2], %g2 + add %o0, 1, %o0 + + subcc %g3, %g2, %i0 + bne L156 + add %i2, 1, %i2 + + addcc %o4, -1, %o4 + bne,a L165 + ldub [%o0], %g3 + + mov 0, %i0 +L156: + ret + restore +#endif diff --git a/arch/sparc/lib/memscan.S b/arch/sparc/lib/memscan.S deleted file mode 100644 index 4ff1657dfc24..000000000000 --- a/arch/sparc/lib/memscan.S +++ /dev/null @@ -1,133 +0,0 @@ -/* - * memscan.S: Optimized memscan for the Sparc. - * - * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) - */ - -/* In essence, this is just a fancy strlen. */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - - .text - .align 4 - .globl __memscan_zero, __memscan_generic - .globl memscan -__memscan_zero: - /* %o0 = addr, %o1 = size */ - cmp %o1, 0 - bne,a 1f - andcc %o0, 3, %g0 - - retl - nop - -1: - be mzero_scan_word - sethi %hi(HI_MAGIC), %g2 - - ldsb [%o0], %g3 -mzero_still_not_word_aligned: - cmp %g3, 0 - bne 1f - add %o0, 1, %o0 - - retl - sub %o0, 1, %o0 - -1: - subcc %o1, 1, %o1 - bne,a 1f - andcc %o0, 3, %g0 - - retl - nop - -1: - bne,a mzero_still_not_word_aligned - ldsb [%o0], %g3 - - sethi %hi(HI_MAGIC), %g2 -mzero_scan_word: - or %g2, %lo(HI_MAGIC), %o3 - sethi %hi(LO_MAGIC), %g3 - or %g3, %lo(LO_MAGIC), %o2 -mzero_next_word: - ld [%o0], %g2 -mzero_next_word_preloaded: - sub %g2, %o2, %g2 -mzero_next_word_preloaded_next: - andcc %g2, %o3, %g0 - bne mzero_byte_zero - add %o0, 4, %o0 - -mzero_check_out_of_fuel: - subcc %o1, 4, %o1 - bg,a 1f - ld [%o0], %g2 - - retl - nop - -1: - b mzero_next_word_preloaded_next - sub %g2, %o2, %g2 - - /* Check every byte. */ -mzero_byte_zero: - ldsb [%o0 - 4], %g2 - cmp %g2, 0 - bne mzero_byte_one - sub %o0, 4, %g3 - - retl - mov %g3, %o0 - -mzero_byte_one: - ldsb [%o0 - 3], %g2 - cmp %g2, 0 - bne,a mzero_byte_two_and_three - ldsb [%o0 - 2], %g2 - - retl - sub %o0, 3, %o0 - -mzero_byte_two_and_three: - cmp %g2, 0 - bne,a 1f - ldsb [%o0 - 1], %g2 - - retl - sub %o0, 2, %o0 - -1: - cmp %g2, 0 - bne,a mzero_next_word_preloaded - ld [%o0], %g2 - - retl - sub %o0, 1, %o0 - -mzero_found_it: - retl - sub %o0, 2, %o0 - -memscan: -__memscan_generic: - /* %o0 = addr, %o1 = c, %o2 = size */ - cmp %o2, 0 - bne,a 0f - ldub [%o0], %g2 - - b,a 2f -1: - ldub [%o0], %g2 -0: - cmp %g2, %o1 - be 2f - addcc %o2, -1, %o2 - bne 1b - add %o0, 1, %o0 -2: - retl - nop diff --git a/arch/sparc/lib/memscan_32.S b/arch/sparc/lib/memscan_32.S new file mode 100644 index 000000000000..4ff1657dfc24 --- /dev/null +++ b/arch/sparc/lib/memscan_32.S @@ -0,0 +1,133 @@ +/* + * memscan.S: Optimized memscan for the Sparc. + * + * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) + */ + +/* In essence, this is just a fancy strlen. */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + + .text + .align 4 + .globl __memscan_zero, __memscan_generic + .globl memscan +__memscan_zero: + /* %o0 = addr, %o1 = size */ + cmp %o1, 0 + bne,a 1f + andcc %o0, 3, %g0 + + retl + nop + +1: + be mzero_scan_word + sethi %hi(HI_MAGIC), %g2 + + ldsb [%o0], %g3 +mzero_still_not_word_aligned: + cmp %g3, 0 + bne 1f + add %o0, 1, %o0 + + retl + sub %o0, 1, %o0 + +1: + subcc %o1, 1, %o1 + bne,a 1f + andcc %o0, 3, %g0 + + retl + nop + +1: + bne,a mzero_still_not_word_aligned + ldsb [%o0], %g3 + + sethi %hi(HI_MAGIC), %g2 +mzero_scan_word: + or %g2, %lo(HI_MAGIC), %o3 + sethi %hi(LO_MAGIC), %g3 + or %g3, %lo(LO_MAGIC), %o2 +mzero_next_word: + ld [%o0], %g2 +mzero_next_word_preloaded: + sub %g2, %o2, %g2 +mzero_next_word_preloaded_next: + andcc %g2, %o3, %g0 + bne mzero_byte_zero + add %o0, 4, %o0 + +mzero_check_out_of_fuel: + subcc %o1, 4, %o1 + bg,a 1f + ld [%o0], %g2 + + retl + nop + +1: + b mzero_next_word_preloaded_next + sub %g2, %o2, %g2 + + /* Check every byte. */ +mzero_byte_zero: + ldsb [%o0 - 4], %g2 + cmp %g2, 0 + bne mzero_byte_one + sub %o0, 4, %g3 + + retl + mov %g3, %o0 + +mzero_byte_one: + ldsb [%o0 - 3], %g2 + cmp %g2, 0 + bne,a mzero_byte_two_and_three + ldsb [%o0 - 2], %g2 + + retl + sub %o0, 3, %o0 + +mzero_byte_two_and_three: + cmp %g2, 0 + bne,a 1f + ldsb [%o0 - 1], %g2 + + retl + sub %o0, 2, %o0 + +1: + cmp %g2, 0 + bne,a mzero_next_word_preloaded + ld [%o0], %g2 + + retl + sub %o0, 1, %o0 + +mzero_found_it: + retl + sub %o0, 2, %o0 + +memscan: +__memscan_generic: + /* %o0 = addr, %o1 = c, %o2 = size */ + cmp %o2, 0 + bne,a 0f + ldub [%o0], %g2 + + b,a 2f +1: + ldub [%o0], %g2 +0: + cmp %g2, %o1 + be 2f + addcc %o2, -1, %o2 + bne 1b + add %o0, 1, %o0 +2: + retl + nop diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S deleted file mode 100644 index 9675268e7fde..000000000000 --- a/arch/sparc/lib/rwsem.S +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Assembly part of rw semaphores. 
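
As the memscan_32.S header above says, this is essentially a fancy strlen: it scans a word at a time and only drops to byte checks when (word - LO_MAGIC) & HI_MAGIC suggests a zero byte. That test can fire without a zero byte present (for example a 0x80 byte), which is why the byte check can fall back into the word loop. A rough C sketch of the idea, not a faithful reimplementation (alignment handling and the exact tail logic of the assembly are omitted):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define LO_MAGIC 0x01010101u
    #define HI_MAGIC 0x80808080u

    /* Sketch of __memscan_zero's word-at-a-time scan; assumes addr is word aligned. */
    static void *memscan_zero_demo(const void *addr, size_t size)
    {
        const unsigned char *p = addr;

        while (size >= 4) {
            uint32_t w;
            memcpy(&w, p, 4);
            if ((w - LO_MAGIC) & HI_MAGIC) {     /* possible zero byte in this word */
                for (int i = 0; i < 4; i++)
                    if (p[i] == 0)
                        return (void *)(p + i);
                /* false positive (e.g. a 0x80 byte): keep scanning words */
            }
            p += 4;
            size -= 4;
        }
        while (size--) {
            if (*p == 0)
                return (void *)p;
            p++;
        }
        return (void *)p;                        /* not found: one past the area */
    }
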
- * - * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) - */ - -#include -#include - - .section .sched.text, "ax" - .align 4 - - .globl ___down_read -___down_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, 1, %g7 - nop - nop - subcc %g7, 1, %g7 - bneg 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_read_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_read - restore %l5, %g0, %g5 -4: call down_read_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___down_write -___down_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - sub %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - add %g7, %g2, %g7 - nop - nop - subcc %g7, %g2, %g7 - bne 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - bcs 4f - mov %g5, %l5 - call down_write_failed - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba ___down_write - restore %l5, %g0, %g5 -4: call down_write_failed_biased - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .text - .globl ___up_read -___up_read: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - nop - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, 1, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - nop - nop - nop - cmp %g7, 0 - be 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - clr %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 - - .globl ___up_write -___up_write: - rd %psr, %g3 - nop - nop - nop - or %g3, PSR_PIL, %g7 - wr %g7, 0, %psr - sethi %hi(0x01000000), %g2 - nop - nop -#ifdef CONFIG_SMP -1: ldstub [%g1 + 4], %g7 - tst %g7 - bne 1b - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] - stb %g0, [%g1 + 4] -#else - ld [%g1], %g7 - add %g7, %g2, %g7 - st %g7, [%g1] -#endif - wr %g3, 0, %psr - sub %g7, %g2, %g7 - nop - nop - addcc %g7, %g2, %g7 - bcs 3f - nop -2: jmpl %o7, %g0 - mov %g4, %o7 -3: save %sp, -64, %sp - mov %g1, %l1 - mov %g4, %l4 - mov %g5, %l5 - mov %g7, %o1 - call __rwsem_wake - mov %l1, %o0 - mov %l1, %g1 - mov %l4, %g4 - ba 2b - restore %l5, %g0, %g5 diff --git a/arch/sparc/lib/rwsem_32.S b/arch/sparc/lib/rwsem_32.S new file mode 100644 index 000000000000..9675268e7fde --- /dev/null +++ b/arch/sparc/lib/rwsem_32.S @@ -0,0 +1,204 @@ +/* + * Assembly part of rw semaphores. 
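
The rwsem fast paths in this file operate on a plain counter guarded by raising PSR_PIL and, on SMP, by a one-byte spin lock stored next to the counter; only when the adjusted count indicates contention do they branch to the C slow paths (down_read_failed and friends). A very rough C rendering of the ___down_read fast path under those assumptions; the structure, field names, and slow-path hook below are illustrative only and do not reflect the kernel's actual rwsem API:

    /* Illustrative sketch of the ___down_read fast path: not real kernel code. */
    struct rwsem_demo {
        volatile int count;           /* goes negative when readers must wait   */
        volatile unsigned char lock;  /* SMP: taken with ldstub in the assembly */
    };

    static void down_read_failed_demo(struct rwsem_demo *sem)
    {
        (void)sem;  /* the real slow path blocks until the semaphore is released */
    }

    static void down_read_demo(struct rwsem_demo *sem)
    {
        /* The assembly additionally masks interrupts via PSR_PIL and, on SMP,
         * spins on sem->lock before touching the counter. */
        int old = sem->count;
        sem->count = old - 1;
        if (old - 1 < 0)              /* counter went negative: take the slow path */
            down_read_failed_demo(sem);
    }

The write-side paths follow the same pattern with a larger bias constant (0x01000000) subtracted and added instead of 1.
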
+ * + * Copyright (C) 1999 Jakub Jelinek (jakub@redhat.com) + */ + +#include +#include + + .section .sched.text, "ax" + .align 4 + + .globl ___down_read +___down_read: + rd %psr, %g3 + nop + nop + nop + or %g3, PSR_PIL, %g7 + wr %g7, 0, %psr + nop + nop + nop +#ifdef CONFIG_SMP +1: ldstub [%g1 + 4], %g7 + tst %g7 + bne 1b + ld [%g1], %g7 + sub %g7, 1, %g7 + st %g7, [%g1] + stb %g0, [%g1 + 4] +#else + ld [%g1], %g7 + sub %g7, 1, %g7 + st %g7, [%g1] +#endif + wr %g3, 0, %psr + add %g7, 1, %g7 + nop + nop + subcc %g7, 1, %g7 + bneg 3f + nop +2: jmpl %o7, %g0 + mov %g4, %o7 +3: save %sp, -64, %sp + mov %g1, %l1 + mov %g4, %l4 + bcs 4f + mov %g5, %l5 + call down_read_failed + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba ___down_read + restore %l5, %g0, %g5 +4: call down_read_failed_biased + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba 2b + restore %l5, %g0, %g5 + + .globl ___down_write +___down_write: + rd %psr, %g3 + nop + nop + nop + or %g3, PSR_PIL, %g7 + wr %g7, 0, %psr + sethi %hi(0x01000000), %g2 + nop + nop +#ifdef CONFIG_SMP +1: ldstub [%g1 + 4], %g7 + tst %g7 + bne 1b + ld [%g1], %g7 + sub %g7, %g2, %g7 + st %g7, [%g1] + stb %g0, [%g1 + 4] +#else + ld [%g1], %g7 + sub %g7, %g2, %g7 + st %g7, [%g1] +#endif + wr %g3, 0, %psr + add %g7, %g2, %g7 + nop + nop + subcc %g7, %g2, %g7 + bne 3f + nop +2: jmpl %o7, %g0 + mov %g4, %o7 +3: save %sp, -64, %sp + mov %g1, %l1 + mov %g4, %l4 + bcs 4f + mov %g5, %l5 + call down_write_failed + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba ___down_write + restore %l5, %g0, %g5 +4: call down_write_failed_biased + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba 2b + restore %l5, %g0, %g5 + + .text + .globl ___up_read +___up_read: + rd %psr, %g3 + nop + nop + nop + or %g3, PSR_PIL, %g7 + wr %g7, 0, %psr + nop + nop + nop +#ifdef CONFIG_SMP +1: ldstub [%g1 + 4], %g7 + tst %g7 + bne 1b + ld [%g1], %g7 + add %g7, 1, %g7 + st %g7, [%g1] + stb %g0, [%g1 + 4] +#else + ld [%g1], %g7 + add %g7, 1, %g7 + st %g7, [%g1] +#endif + wr %g3, 0, %psr + nop + nop + nop + cmp %g7, 0 + be 3f + nop +2: jmpl %o7, %g0 + mov %g4, %o7 +3: save %sp, -64, %sp + mov %g1, %l1 + mov %g4, %l4 + mov %g5, %l5 + clr %o1 + call __rwsem_wake + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba 2b + restore %l5, %g0, %g5 + + .globl ___up_write +___up_write: + rd %psr, %g3 + nop + nop + nop + or %g3, PSR_PIL, %g7 + wr %g7, 0, %psr + sethi %hi(0x01000000), %g2 + nop + nop +#ifdef CONFIG_SMP +1: ldstub [%g1 + 4], %g7 + tst %g7 + bne 1b + ld [%g1], %g7 + add %g7, %g2, %g7 + st %g7, [%g1] + stb %g0, [%g1 + 4] +#else + ld [%g1], %g7 + add %g7, %g2, %g7 + st %g7, [%g1] +#endif + wr %g3, 0, %psr + sub %g7, %g2, %g7 + nop + nop + addcc %g7, %g2, %g7 + bcs 3f + nop +2: jmpl %o7, %g0 + mov %g4, %o7 +3: save %sp, -64, %sp + mov %g1, %l1 + mov %g4, %l4 + mov %g5, %l5 + mov %g7, %o1 + call __rwsem_wake + mov %l1, %o0 + mov %l1, %g1 + mov %l4, %g4 + ba 2b + restore %l5, %g0, %g5 diff --git a/arch/sparc/lib/strlen.S b/arch/sparc/lib/strlen.S deleted file mode 100644 index ed9a763368cd..000000000000 --- a/arch/sparc/lib/strlen.S +++ /dev/null @@ -1,81 +0,0 @@ -/* strlen.S: Sparc optimized strlen code - * Hand optimized from GNU libc's strlen - * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - -0: - ldub [%o0], %o5 - cmp %o5, 0 - be 1f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 4f - or %o4, %lo(HI_MAGIC), %o3 - ldub [%o0], %o5 - cmp %o5, 0 - be 2f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 5f - sethi %hi(LO_MAGIC), %o4 - ldub [%o0], %o5 - cmp %o5, 0 - be 3f - add %o0, 1, %o0 - b 8f - or %o4, %lo(LO_MAGIC), %o2 -1: - retl - mov 0, %o0 -2: - retl - mov 1, %o0 -3: - retl - mov 2, %o0 - - .align 4 - .global strlen -strlen: - mov %o0, %o1 - andcc %o0, 3, %g0 - bne 0b - sethi %hi(HI_MAGIC), %o4 - or %o4, %lo(HI_MAGIC), %o3 -4: - sethi %hi(LO_MAGIC), %o4 -5: - or %o4, %lo(LO_MAGIC), %o2 -8: - ld [%o0], %o5 -2: - sub %o5, %o2, %o4 - andcc %o4, %o3, %g0 - be 8b - add %o0, 4, %o0 - - /* Check every byte. */ - srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o0, -4, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - andcc %o5, 0xff, %g0 - bne,a 2b - ld [%o0], %o5 - add %o4, 1, %o4 -1: - retl - sub %o4, %o1, %o0 diff --git a/arch/sparc/lib/strlen_32.S b/arch/sparc/lib/strlen_32.S new file mode 100644 index 000000000000..ed9a763368cd --- /dev/null +++ b/arch/sparc/lib/strlen_32.S @@ -0,0 +1,81 @@ +/* strlen.S: Sparc optimized strlen code + * Hand optimized from GNU libc's strlen + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + +0: + ldub [%o0], %o5 + cmp %o5, 0 + be 1f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 4f + or %o4, %lo(HI_MAGIC), %o3 + ldub [%o0], %o5 + cmp %o5, 0 + be 2f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 5f + sethi %hi(LO_MAGIC), %o4 + ldub [%o0], %o5 + cmp %o5, 0 + be 3f + add %o0, 1, %o0 + b 8f + or %o4, %lo(LO_MAGIC), %o2 +1: + retl + mov 0, %o0 +2: + retl + mov 1, %o0 +3: + retl + mov 2, %o0 + + .align 4 + .global strlen +strlen: + mov %o0, %o1 + andcc %o0, 3, %g0 + bne 0b + sethi %hi(HI_MAGIC), %o4 + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 +8: + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + be 8b + add %o0, 4, %o0 + + /* Check every byte. */ + srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o0, -4, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne,a 2b + ld [%o0], %o5 + add %o4, 1, %o4 +1: + retl + sub %o4, %o1, %o0 diff --git a/arch/sparc/lib/strlen_user.S b/arch/sparc/lib/strlen_user.S deleted file mode 100644 index 8c8a371df3c9..000000000000 --- a/arch/sparc/lib/strlen_user.S +++ /dev/null @@ -1,109 +0,0 @@ -/* strlen_user.S: Sparc optimized strlen_user code - * - * Return length of string in userspace including terminating 0 - * or 0 for error - * - * Copyright (C) 1991,1996 Free Software Foundation - * Copyright (C) 1996 David S. 
Miller (davem@caip.rutgers.edu) - * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - */ - -#define LO_MAGIC 0x01010101 -#define HI_MAGIC 0x80808080 - -10: - ldub [%o0], %o5 - cmp %o5, 0 - be 1f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 4f - or %o4, %lo(HI_MAGIC), %o3 -11: - ldub [%o0], %o5 - cmp %o5, 0 - be 2f - add %o0, 1, %o0 - andcc %o0, 3, %g0 - be 5f - sethi %hi(LO_MAGIC), %o4 -12: - ldub [%o0], %o5 - cmp %o5, 0 - be 3f - add %o0, 1, %o0 - b 13f - or %o4, %lo(LO_MAGIC), %o2 -1: - retl - mov 1, %o0 -2: - retl - mov 2, %o0 -3: - retl - mov 3, %o0 - - .align 4 - .global __strlen_user, __strnlen_user -__strlen_user: - sethi %hi(32768), %o1 -__strnlen_user: - mov %o1, %g1 - mov %o0, %o1 - andcc %o0, 3, %g0 - bne 10b - sethi %hi(HI_MAGIC), %o4 - or %o4, %lo(HI_MAGIC), %o3 -4: - sethi %hi(LO_MAGIC), %o4 -5: - or %o4, %lo(LO_MAGIC), %o2 -13: - ld [%o0], %o5 -2: - sub %o5, %o2, %o4 - andcc %o4, %o3, %g0 - bne 82f - add %o0, 4, %o0 - sub %o0, %o1, %g2 -81: cmp %g2, %g1 - blu 13b - mov %o0, %o4 - ba,a 1f - - /* Check every byte. */ -82: srl %o5, 24, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o0, -3, %o4 - srl %o5, 16, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - srl %o5, 8, %g5 - andcc %g5, 0xff, %g0 - be 1f - add %o4, 1, %o4 - andcc %o5, 0xff, %g0 - bne 81b - sub %o0, %o1, %g2 - - add %o4, 1, %o4 -1: - retl - sub %o4, %o1, %o0 - - .section .fixup,#alloc,#execinstr - .align 4 -9: - retl - clr %o0 - - .section __ex_table,#alloc - .align 4 - - .word 10b, 9b - .word 11b, 9b - .word 12b, 9b - .word 13b, 9b diff --git a/arch/sparc/lib/strlen_user_32.S b/arch/sparc/lib/strlen_user_32.S new file mode 100644 index 000000000000..8c8a371df3c9 --- /dev/null +++ b/arch/sparc/lib/strlen_user_32.S @@ -0,0 +1,109 @@ +/* strlen_user.S: Sparc optimized strlen_user code + * + * Return length of string in userspace including terminating 0 + * or 0 for error + * + * Copyright (C) 1991,1996 Free Software Foundation + * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + */ + +#define LO_MAGIC 0x01010101 +#define HI_MAGIC 0x80808080 + +10: + ldub [%o0], %o5 + cmp %o5, 0 + be 1f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 4f + or %o4, %lo(HI_MAGIC), %o3 +11: + ldub [%o0], %o5 + cmp %o5, 0 + be 2f + add %o0, 1, %o0 + andcc %o0, 3, %g0 + be 5f + sethi %hi(LO_MAGIC), %o4 +12: + ldub [%o0], %o5 + cmp %o5, 0 + be 3f + add %o0, 1, %o0 + b 13f + or %o4, %lo(LO_MAGIC), %o2 +1: + retl + mov 1, %o0 +2: + retl + mov 2, %o0 +3: + retl + mov 3, %o0 + + .align 4 + .global __strlen_user, __strnlen_user +__strlen_user: + sethi %hi(32768), %o1 +__strnlen_user: + mov %o1, %g1 + mov %o0, %o1 + andcc %o0, 3, %g0 + bne 10b + sethi %hi(HI_MAGIC), %o4 + or %o4, %lo(HI_MAGIC), %o3 +4: + sethi %hi(LO_MAGIC), %o4 +5: + or %o4, %lo(LO_MAGIC), %o2 +13: + ld [%o0], %o5 +2: + sub %o5, %o2, %o4 + andcc %o4, %o3, %g0 + bne 82f + add %o0, 4, %o0 + sub %o0, %o1, %g2 +81: cmp %g2, %g1 + blu 13b + mov %o0, %o4 + ba,a 1f + + /* Check every byte. 
*/ +82: srl %o5, 24, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o0, -3, %o4 + srl %o5, 16, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + srl %o5, 8, %g5 + andcc %g5, 0xff, %g0 + be 1f + add %o4, 1, %o4 + andcc %o5, 0xff, %g0 + bne 81b + sub %o0, %o1, %g2 + + add %o4, 1, %o4 +1: + retl + sub %o4, %o1, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +9: + retl + clr %o0 + + .section __ex_table,#alloc + .align 4 + + .word 10b, 9b + .word 11b, 9b + .word 12b, 9b + .word 13b, 9b diff --git a/arch/sparc/lib/strncmp.S b/arch/sparc/lib/strncmp.S deleted file mode 100644 index 494ec664537a..000000000000 --- a/arch/sparc/lib/strncmp.S +++ /dev/null @@ -1,118 +0,0 @@ -/* - * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc - * generic strncmp routine. - */ - - .text - .align 4 - .global __strncmp, strncmp -__strncmp: -strncmp: - mov %o0, %g3 - mov 0, %o3 - - cmp %o2, 3 - ble 7f - mov 0, %g2 - - sra %o2, 2, %o4 - ldub [%g3], %o3 - -0: - ldub [%o1], %g2 - add %g3, 1, %g3 - and %o3, 0xff, %o0 - - cmp %o0, 0 - be 8f - add %o1, 1, %o1 - - cmp %o0, %g2 - be,a 1f - ldub [%g3], %o3 - - retl - sub %o0, %g2, %o0 - -1: - ldub [%o1], %g2 - add %g3,1, %g3 - and %o3, 0xff, %o0 - - cmp %o0, 0 - be 8f - add %o1, 1, %o1 - - cmp %o0, %g2 - be,a 1f - ldub [%g3], %o3 - - retl - sub %o0, %g2, %o0 - -1: - ldub [%o1], %g2 - add %g3, 1, %g3 - and %o3, 0xff, %o0 - - cmp %o0, 0 - be 8f - add %o1, 1, %o1 - - cmp %o0, %g2 - be,a 1f - ldub [%g3], %o3 - - retl - sub %o0, %g2, %o0 - -1: - ldub [%o1], %g2 - add %g3, 1, %g3 - and %o3, 0xff, %o0 - - cmp %o0, 0 - be 8f - add %o1, 1, %o1 - - cmp %o0, %g2 - be 1f - add %o4, -1, %o4 - - retl - sub %o0, %g2, %o0 - -1: - - cmp %o4, 0 - bg,a 0b - ldub [%g3], %o3 - - b 7f - and %o2, 3, %o2 - -9: - ldub [%o1], %g2 - add %g3, 1, %g3 - and %o3, 0xff, %o0 - - cmp %o0, 0 - be 8f - add %o1, 1, %o1 - - cmp %o0, %g2 - be 7f - add %o2, -1, %o2 - -8: - retl - sub %o0, %g2, %o0 - -7: - cmp %o2, 0 - bg,a 9b - ldub [%g3], %o3 - - and %g2, 0xff, %o0 - retl - sub %o3, %o0, %o0 diff --git a/arch/sparc/lib/strncmp_32.S b/arch/sparc/lib/strncmp_32.S new file mode 100644 index 000000000000..494ec664537a --- /dev/null +++ b/arch/sparc/lib/strncmp_32.S @@ -0,0 +1,118 @@ +/* + * strncmp.S: Hand optimized Sparc assembly of GCC output from GNU libc + * generic strncmp routine. 
+ */ + + .text + .align 4 + .global __strncmp, strncmp +__strncmp: +strncmp: + mov %o0, %g3 + mov 0, %o3 + + cmp %o2, 3 + ble 7f + mov 0, %g2 + + sra %o2, 2, %o4 + ldub [%g3], %o3 + +0: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3,1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be,a 1f + ldub [%g3], %o3 + + retl + sub %o0, %g2, %o0 + +1: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be 1f + add %o4, -1, %o4 + + retl + sub %o0, %g2, %o0 + +1: + + cmp %o4, 0 + bg,a 0b + ldub [%g3], %o3 + + b 7f + and %o2, 3, %o2 + +9: + ldub [%o1], %g2 + add %g3, 1, %g3 + and %o3, 0xff, %o0 + + cmp %o0, 0 + be 8f + add %o1, 1, %o1 + + cmp %o0, %g2 + be 7f + add %o2, -1, %o2 + +8: + retl + sub %o0, %g2, %o0 + +7: + cmp %o2, 0 + bg,a 9b + ldub [%g3], %o3 + + and %g2, 0xff, %o0 + retl + sub %o3, %o0, %o0 diff --git a/arch/sparc/lib/strncpy_from_user.S b/arch/sparc/lib/strncpy_from_user.S deleted file mode 100644 index d77198976a66..000000000000 --- a/arch/sparc/lib/strncpy_from_user.S +++ /dev/null @@ -1,47 +0,0 @@ -/* strncpy_from_user.S: Sparc strncpy from userspace. - * - * Copyright(C) 1996 David S. Miller - */ - -#include -#include - - .text - .align 4 - - /* Must return: - * - * -EFAULT for an exception - * count if we hit the buffer limit - * bytes copied if we hit a null byte - */ - - .globl __strncpy_from_user -__strncpy_from_user: - /* %o0=dest, %o1=src, %o2=count */ - mov %o2, %o3 -1: - subcc %o2, 1, %o2 - bneg 2f - nop -10: - ldub [%o1], %o4 - add %o0, 1, %o0 - cmp %o4, 0 - add %o1, 1, %o1 - bne 1b - stb %o4, [%o0 - 1] -2: - add %o2, 1, %o0 - retl - sub %o3, %o0, %o0 - - .section .fixup,#alloc,#execinstr - .align 4 -4: - retl - mov -EFAULT, %o0 - - .section __ex_table,#alloc - .align 4 - .word 10b, 4b diff --git a/arch/sparc/lib/strncpy_from_user_32.S b/arch/sparc/lib/strncpy_from_user_32.S new file mode 100644 index 000000000000..d77198976a66 --- /dev/null +++ b/arch/sparc/lib/strncpy_from_user_32.S @@ -0,0 +1,47 @@ +/* strncpy_from_user.S: Sparc strncpy from userspace. + * + * Copyright(C) 1996 David S. Miller + */ + +#include +#include + + .text + .align 4 + + /* Must return: + * + * -EFAULT for an exception + * count if we hit the buffer limit + * bytes copied if we hit a null byte + */ + + .globl __strncpy_from_user +__strncpy_from_user: + /* %o0=dest, %o1=src, %o2=count */ + mov %o2, %o3 +1: + subcc %o2, 1, %o2 + bneg 2f + nop +10: + ldub [%o1], %o4 + add %o0, 1, %o0 + cmp %o4, 0 + add %o1, 1, %o1 + bne 1b + stb %o4, [%o0 - 1] +2: + add %o2, 1, %o0 + retl + sub %o3, %o0, %o0 + + .section .fixup,#alloc,#execinstr + .align 4 +4: + retl + mov -EFAULT, %o0 + + .section __ex_table,#alloc + .align 4 + .word 10b, 4b -- cgit v1.2.3