From 7a1ac5264108fc3ed22d17a3cdd76212ed1666d1 Mon Sep 17 00:00:00 2001
From: David S. Miller
Date: Thu, 16 Mar 2006 02:02:32 -0800
Subject: [SPARC64]: Fix and re-enable dynamic TSB sizing.

This is good for up to %50 performance improvement of some test cases.
The problem has been the race conditions, and hopefully I've plugged
them all up here.

1) There was a serious race in switch_mm() wrt. lazy TLB
   switching to and from kernel threads.

   We could erroneously skip a tsb_context_switch() and thus
   use a stale TSB across a TSB grow event.

   There is a big comment now in that function describing
   exactly how it can happen.

2) All code paths that do something with the TSB need to be
   guarded with the mm->context.lock spinlock.  This makes
   page table flushing paths properly synchronize with both
   TSB growing and TLB context changes.

3) TSB growing events are moved to the end of successful fault
   processing.  Previously it was in update_mmu_cache() but
   that is deadlock prone.  At the end of do_sparc64_fault()
   we hold no spinlocks that could deadlock the TSB grow
   sequence.  We also have dropped the address space semaphore.

While we're here, add prefetching to the copy_tsb() routine
and put it in assembler into the tsb.S file.  This piece of
code is quite time critical.

There are some small negative side effects to this code which
can be improved upon.  In particular we grab the mm->context.lock
even for the tsb insert done by update_mmu_cache() now and that's
a bit excessive.  We can get rid of that locking, and the same
lock taking in flush_tsb_user(), by disabling PSTATE_IE around
the whole operation including the capturing of the tsb pointer
and tsb_nentries value.  That would work because anyone growing
the TSB won't free up the old TSB until all cpus respond to the
TSB change cross call.

I'm not quite so confident in that optimization to put it in
right now, but eventually we might be able to and the description
is here for reference.

This code seems very solid now.  It passes several parallel GCC
bootstrap builds, and our favorite "nut cruncher" stress test which is
a full "make -j8192" build of a "make allmodconfig" kernel.  That puts
about 256 processes on each cpu's run queue, makes lots of process cpu
migrations occur, causes lots of page table and TLB flushing activity,
incurs many context version number changes, and it swaps the machine
real far out to disk even though there is 16GB of ram on this test
system. :-)

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/tsb.S | 71 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 1 deletion(-)

(limited to 'arch/sparc64/kernel')

diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
index d738910153f6..1b154c863628 100644
--- a/arch/sparc64/kernel/tsb.S
+++ b/arch/sparc64/kernel/tsb.S
@@ -34,8 +34,9 @@ tsb_miss_itlb:
 	 ldxa		[%g4] ASI_IMMU, %g4
 
 	/* At this point we have:
-	 * %g4 --	missing virtual address
 	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g4 --	missing virtual address
 	 * %g6 --	TAG TARGET (vaddr >> 22)
 	 */
 tsb_miss_page_table_walk:
@@ -45,6 +46,12 @@ tsb_miss_page_table_walk:
 tsb_miss_page_table_walk_sun4v_fastpath:
 	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
 
+	/* At this point we have:
+	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g5 --	physical address of PTE in Linux page tables
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 */
 tsb_reload:
 	TSB_LOCK_TAG(%g1, %g2, %g7)
 
@@ -199,6 +206,7 @@ __tsb_insert:
 	wrpr	%o5, %pstate
 	retl
 	 nop
+	.size	__tsb_insert, .-__tsb_insert
 
 	/* Flush the given TSB entry if it has the matching
 	 * tag.
@@ -208,6 +216,7 @@ __tsb_insert:
 	 */
 	.align	32
 	.globl	tsb_flush
+	.type	tsb_flush,#function
 tsb_flush:
 	sethi	%hi(TSB_TAG_LOCK_HIGH), %g2
 1:	TSB_LOAD_TAG(%o0, %g1)
@@ -225,6 +234,7 @@ tsb_flush:
 	 nop
 2:	retl
 	 TSB_MEMBAR
+	.size	tsb_flush, .-tsb_flush
 
 	/* Reload MMU related context switch state at
 	 * schedule() time.
@@ -241,6 +251,7 @@ tsb_flush:
 	 */
 	.align	32
 	.globl	__tsb_context_switch
+	.type	__tsb_context_switch,#function
 __tsb_context_switch:
 	rdpr	%pstate, %o5
 	wrpr	%o5, PSTATE_IE, %pstate
@@ -302,3 +313,61 @@ __tsb_context_switch:
 
 	retl
 	 nop
+	.size	__tsb_context_switch, .-__tsb_context_switch
+
+#define TSB_PASS_BITS	((1 << TSB_TAG_LOCK_BIT) | \
+			 (1 << TSB_TAG_INVALID_BIT))
+
+	.align	32
+	.globl	copy_tsb
+	.type	copy_tsb,#function
+copy_tsb:		/* %o0=old_tsb_base, %o1=old_tsb_size
+			 * %o2=new_tsb_base, %o3=new_tsb_size
+			 */
+	sethi		%uhi(TSB_PASS_BITS), %g7
+	srlx		%o3, 4, %o3
+	add		%o0, %o1, %g1	/* end of old tsb */
+	sllx		%g7, 32, %g7
+	sub		%o3, 1, %o3	/* %o3 == new tsb hash mask */
+
+661:	prefetcha	[%o0] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o0] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+90:	andcc		%o0, (64 - 1), %g0
+	bne		1f
+	 add		%o0, 64, %o5
+
+661:	prefetcha	[%o5] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o5] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+1:	TSB_LOAD_QUAD(%o0, %g2)		/* %g2/%g3 == TSB entry */
+	andcc		%g2, %g7, %g0	/* LOCK or INVALID set? */
+	bne,pn		%xcc, 80f	/* Skip it */
+	 sllx		%g2, 22, %o4	/* TAG --> VADDR */
+
+	/* This can definitely be computed faster... */
+	srlx		%o0, 4, %o5	/* Build index */
+	and		%o5, 511, %o5	/* Mask index */
+	sllx		%o5, PAGE_SHIFT, %o5 /* Put into vaddr position */
+	or		%o4, %o5, %o4	/* Full VADDR. */
+	srlx		%o4, PAGE_SHIFT, %o4 /* Shift down to create index */
+	and		%o4, %o3, %o4	/* Mask with new_tsb_nents-1 */
+	sllx		%o4, 4, %o4	/* Shift back up into tsb ent offset */
+	TSB_STORE(%o2 + %o4, %g2)	/* Store TAG */
+	add		%o4, 0x8, %o4	/* Advance to TTE */
+	TSB_STORE(%o2 + %o4, %g3)	/* Store TTE */
+
+80:	add		%o0, 16, %o0
+	cmp		%o0, %g1
+	bne,pt		%xcc, 90b
+	 nop
+
+	retl
+	 TSB_MEMBAR
+	.size		copy_tsb, .-copy_tsb
-- 
cgit v1.2.3