1 files changed, 212 insertions, 0 deletions
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000..0f8a0d033
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,212 @@
+/*
+ * Normally compiler builtins are used, but sometimes the compiler calls out
+ * of line code. Based on asm-i386/string.h.
+ *
+ * This assembly file is re-written from memmove_64.c file.
+ *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+#undef memmove
+
+/*
+ * Implement memmove(). This can handle overlap between src and dst.
+ *
+ * Input:
+ * rdi: dest
+ * rsi: src
+ * rdx: count
+ *
+ * Output:
+ * rax: dest
+ */
+.weak memmove
+
+ENTRY(memmove)
+ENTRY(__memmove)
+	CFI_STARTPROC
+
+	/* Handle more 32 bytes in loop */
+	mov %rdi, %rax
+	cmp $0x20, %rdx
+	jb	1f
+
+	/* Decide forward/backward copy mode */
+	cmp %rdi, %rsi
+	jge .Lmemmove_begin_forward
+	mov %rsi, %r8
+	add %rdx, %r8
+	cmp %rdi, %r8
+	jg 2f
+
+.Lmemmove_begin_forward:
+	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
+	/*
+	 * movsq instruction have many startup latency
+	 * so we handle small size by general register.
+	 */
+	cmp  $680, %rdx
+	jb	3f
+	/*
+	 * movsq instruction is only good for aligned case.
+	 */
+
+	cmpb %dil, %sil
+	je 4f
+3:
+	sub $0x20, %rdx
+	/*
+	 * We gobble 32 bytes forward in each loop.
+	 */
+5:
+	sub $0x20, %rdx
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq 2*8(%rsi), %r9
+	movq 3*8(%rsi), %r8
+	leaq 4*8(%rsi), %rsi
+
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, 2*8(%rdi)
+	movq %r8, 3*8(%rdi)
+	leaq 4*8(%rdi), %rdi
+	jae 5b
+	addq $0x20, %rdx
+	jmp 1f
+	/*
+	 * Handle data forward by movsq.
+	 */
+	.p2align 4
+4:
+	movq %rdx, %rcx
+	movq -8(%rsi, %rdx), %r11
+	lea -8(%rdi, %rdx), %r10
+	shrq $3, %rcx
+	rep movsq
+	movq %r11, (%r10)
+	jmp 13f
+.Lmemmove_end_forward:
+
+	/*
+	 * Handle data backward by movsq.
+	 */
+	.p2align 4
+7:
+	movq %rdx, %rcx
+	movq (%rsi), %r11
+	movq %rdi, %r10
+	leaq -8(%rsi, %rdx), %rsi
+	leaq -8(%rdi, %rdx), %rdi
+	shrq $3, %rcx
+	std
+	rep movsq
+	cld
+	movq %r11, (%r10)
+	jmp 13f
+
+	/*
+	 * Start to prepare for backward copy.
+	 */
+	.p2align 4
+2:
+	cmp $680, %rdx
+	jb 6f
+	cmp %dil, %sil
+	je 7b
+6:
+	/*
+	 * Calculate copy position to tail.
+	 */
+	addq %rdx, %rsi
+	addq %rdx, %rdi
+	subq $0x20, %rdx
+	/*
+	 * We gobble 32 bytes backward in each loop.
+	 */
+8:
+	subq $0x20, %rdx
+	movq -1*8(%rsi), %r11
+	movq -2*8(%rsi), %r10
+	movq -3*8(%rsi), %r9
+	movq -4*8(%rsi), %r8
+	leaq -4*8(%rsi), %rsi
+
+	movq %r11, -1*8(%rdi)
+	movq %r10, -2*8(%rdi)
+	movq %r9, -3*8(%rdi)
+	movq %r8, -4*8(%rdi)
+	leaq -4*8(%rdi), %rdi
+	jae 8b
+	/*
+	 * Calculate copy position to head.
+	 */
+	addq $0x20, %rdx
+	subq %rdx, %rsi
+	subq %rdx, %rdi
+1:
+	cmpq $16, %rdx
+	jb 9f
+	/*
+	 * Move data from 16 bytes to 31 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq 1*8(%rsi), %r10
+	movq -2*8(%rsi, %rdx), %r9
+	movq -1*8(%rsi, %rdx), %r8
+	movq %r11, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r9, -2*8(%rdi, %rdx)
+	movq %r8, -1*8(%rdi, %rdx)
+	jmp 13f
+	.p2align 4
+9:
+	cmpq $8, %rdx
+	jb 10f
+	/*
+	 * Move data from 8 bytes to 15 bytes.
+	 */
+	movq 0*8(%rsi), %r11
+	movq -1*8(%rsi, %rdx), %r10
+	movq %r11, 0*8(%rdi)
+	movq %r10, -1*8(%rdi, %rdx)
+	jmp 13f
+10:
+	cmpq $4, %rdx
+	jb 11f
+	/*
+	 * Move data from 4 bytes to 7 bytes.
+	 */
+	movl (%rsi), %r11d
+	movl -4(%rsi, %rdx), %r10d
+	movl %r11d, (%rdi)
+	movl %r10d, -4(%rdi, %rdx)
+	jmp 13f
+11:
+	cmp $2, %rdx
+	jb 12f
+	/*
+	 * Move data from 2 bytes to 3 bytes.
+	 */
+	movw (%rsi), %r11w
+	movw -2(%rsi, %rdx), %r10w
+	movw %r11w, (%rdi)
+	movw %r10w, -2(%rdi, %rdx)
+	jmp 13f
+12:
+	cmp $1, %rdx
+	jb 13f
+	/*
+	 * Move data for 1 byte.
+	 */
+	movb (%rsi), %r11b
+	movb %r11b, (%rdi)
+13:
+	retq
+	CFI_ENDPROC
+ENDPROC(__memmove)
+ENDPROC(memmove)