aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/lib
diff options
context:
space:
mode:
authorLinus Torvalds2012-03-22 09:13:24 -0700
committerLinus Torvalds2012-03-22 09:13:24 -0700
commite17fdf5c6778ff77d93dd769910992e4073b9348 (patch)
treed1a7ca2b1faf4301b39300fbd82f9b91e605a77e /arch/x86/lib
parent95211279c5ad00a317c98221d7e4365e02f20836 (diff)
parenta240ada241dafe290e7532d1ddeb98fdf1419068 (diff)
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86/asm changes from Ingo Molnar * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86: Include probe_roms.h in probe_roms.c x86/32: Print control and debug registers for kerenel context x86: Tighten dependencies of CPU_SUP_*_32 x86/numa: Improve internode cache alignment x86: Fix the NMI nesting comments x86-64: Improve insn scheduling in SAVE_ARGS_IRQ x86-64: Fix CFI annotations for NMI nesting code bitops: Add missing parentheses to new get_order macro bitops: Optimise get_order() bitops: Adjust the comment on get_order() to describe the size==0 case x86/spinlocks: Eliminate TICKET_MASK x86-64: Handle byte-wise tail copying in memcpy() without a loop x86-64: Fix memcpy() to support sizes of 4Gb and above x86-64: Fix memset() to support sizes of 4Gb and above x86-64: Slightly shorten copy_page()
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/copy_page_64.S12
-rw-r--r--arch/x86/lib/memcpy_64.S44
-rw-r--r--arch/x86/lib/memset_64.S33
3 files changed, 39 insertions, 50 deletions
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 01c805ba5359..6b34d04d096a 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -20,14 +20,12 @@ ENDPROC(copy_page_c)
ENTRY(copy_page)
CFI_STARTPROC
- subq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET 3*8
+ subq $2*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 2*8
movq %rbx,(%rsp)
CFI_REL_OFFSET rbx, 0
movq %r12,1*8(%rsp)
CFI_REL_OFFSET r12, 1*8
- movq %r13,2*8(%rsp)
- CFI_REL_OFFSET r13, 2*8
movl $(4096/64)-5,%ecx
.p2align 4
@@ -91,10 +89,8 @@ ENTRY(copy_page)
CFI_RESTORE rbx
movq 1*8(%rsp),%r12
CFI_RESTORE r12
- movq 2*8(%rsp),%r13
- CFI_RESTORE r13
- addq $3*8,%rsp
- CFI_ADJUST_CFA_OFFSET -3*8
+ addq $2*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -2*8
ret
.Lcopy_page_end:
CFI_ENDPROC
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index efbf2a0ecdea..1c273be7c97e 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -27,9 +27,8 @@
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
movq %rdi, %rax
-
- movl %edx, %ecx
- shrl $3, %ecx
+ movq %rdx, %rcx
+ shrq $3, %rcx
andl $7, %edx
rep movsq
movl %edx, %ecx
@@ -48,8 +47,7 @@
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
movq %rdi, %rax
-
- movl %edx, %ecx
+ movq %rdx, %rcx
rep movsb
ret
.Lmemcpy_e_e:
@@ -60,10 +58,7 @@ ENTRY(memcpy)
CFI_STARTPROC
movq %rdi, %rax
- /*
- * Use 32bit CMP here to avoid long NOP padding.
- */
- cmp $0x20, %edx
+ cmpq $0x20, %rdx
jb .Lhandle_tail
/*
@@ -72,7 +67,7 @@ ENTRY(memcpy)
*/
cmp %dil, %sil
jl .Lcopy_backward
- subl $0x20, %edx
+ subq $0x20, %rdx
.Lcopy_forward_loop:
subq $0x20, %rdx
@@ -91,7 +86,7 @@ ENTRY(memcpy)
movq %r11, 3*8(%rdi)
leaq 4*8(%rdi), %rdi
jae .Lcopy_forward_loop
- addq $0x20, %rdx
+ addl $0x20, %edx
jmp .Lhandle_tail
.Lcopy_backward:
@@ -123,11 +118,11 @@ ENTRY(memcpy)
/*
* Calculate copy position to head.
*/
- addq $0x20, %rdx
+ addl $0x20, %edx
subq %rdx, %rsi
subq %rdx, %rdi
.Lhandle_tail:
- cmpq $16, %rdx
+ cmpl $16, %edx
jb .Lless_16bytes
/*
@@ -144,7 +139,7 @@ ENTRY(memcpy)
retq
.p2align 4
.Lless_16bytes:
- cmpq $8, %rdx
+ cmpl $8, %edx
jb .Lless_8bytes
/*
* Move data from 8 bytes to 15 bytes.
@@ -156,7 +151,7 @@ ENTRY(memcpy)
retq
.p2align 4
.Lless_8bytes:
- cmpq $4, %rdx
+ cmpl $4, %edx
jb .Lless_3bytes
/*
@@ -169,18 +164,19 @@ ENTRY(memcpy)
retq
.p2align 4
.Lless_3bytes:
- cmpl $0, %edx
- je .Lend
+ subl $1, %edx
+ jb .Lend
/*
* Move data from 1 bytes to 3 bytes.
*/
-.Lloop_1:
- movb (%rsi), %r8b
- movb %r8b, (%rdi)
- incq %rdi
- incq %rsi
- decl %edx
- jnz .Lloop_1
+ movzbl (%rsi), %ecx
+ jz .Lstore_1byte
+ movzbq 1(%rsi), %r8
+ movzbq (%rsi, %rdx), %r9
+ movb %r8b, 1(%rdi)
+ movb %r9b, (%rdi, %rdx)
+.Lstore_1byte:
+ movb %cl, (%rdi)
.Lend:
retq
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 79bd454b78a3..2dcb3808cbda 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -19,16 +19,15 @@
.section .altinstr_replacement, "ax", @progbits
.Lmemset_c:
movq %rdi,%r9
- movl %edx,%r8d
- andl $7,%r8d
- movl %edx,%ecx
- shrl $3,%ecx
+ movq %rdx,%rcx
+ andl $7,%edx
+ shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
- mulq %rsi /* with rax, clobbers rdx */
+ imulq %rsi,%rax
rep stosq
- movl %r8d,%ecx
+ movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
@@ -50,7 +49,7 @@
.Lmemset_c_e:
movq %rdi,%r9
movb %sil,%al
- movl %edx,%ecx
+ movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
@@ -61,12 +60,11 @@ ENTRY(memset)
ENTRY(__memset)
CFI_STARTPROC
movq %rdi,%r10
- movq %rdx,%r11
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
- mul %rcx /* with rax, clobbers rdx */
+ imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
@@ -75,13 +73,13 @@ ENTRY(__memset)
CFI_REMEMBER_STATE
.Lafter_bad_alignment:
- movl %r11d,%ecx
- shrl $6,%ecx
+ movq %rdx,%rcx
+ shrq $6,%rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
- decl %ecx
+ decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
@@ -97,7 +95,7 @@ ENTRY(__memset)
to predict jump tables. */
.p2align 4
.Lhandle_tail:
- movl %r11d,%ecx
+ movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
@@ -109,12 +107,11 @@ ENTRY(__memset)
jnz .Lloop_8
.Lhandle_7:
- movl %r11d,%ecx
- andl $7,%ecx
+ andl $7,%edx
jz .Lende
.p2align 4
.Lloop_1:
- decl %ecx
+ decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
@@ -125,13 +122,13 @@ ENTRY(__memset)
CFI_RESTORE_STATE
.Lbad_alignment:
- cmpq $7,%r11
+ cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
- subq %r8,%r11
+ subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC