Initial import

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2015-08-05 17:04:01 -0300
commit: 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree: 5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/sh/lib64/copy_page.S
1 files changed, 89 insertions, 0 deletions
diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S
new file mode 100644
index 000000000..0ec6fca63
--- /dev/null
+++ b/arch/sh/lib64/copy_page.S
@@ -0,0 +1,89 @@
+/*
+   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+   This file is subject to the terms and conditions of the GNU General Public
+   License.  See the file "COPYING" in the main directory of this archive
+   for more details.
+
+   Tight version of mempy for the case of just copying a page.
+   Prefetch strategy empirically optimised against RTL simulations
+   of SH5-101 cut2 eval chip with Cayman board DDR memory.
+
+   Parameters:
+   r2 : destination effective address (start of page)
+   r3 : source effective address (start of page)
+
+   Always copies 4096 bytes.
+
+   Points to review.
+   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
+     It seems like the prefetch needs to be at at least 4 lines ahead to get
+     the data into the cache in time, and the allocos contend with outstanding
+     prefetches for the same cache set, so it's better to have the numbers
+     different.
+   */
+
+	.section .text..SHmedia32,"ax"
+	.little
+
+	.balign 8
+	.global copy_page
+copy_page:
+
+	/* Copy 4096 bytes worth of data from r3 to r2.
+	   Do prefetches 4 lines ahead.
+	   Do alloco 2 lines ahead */
+
+	pta 1f, tr1
+	pta 2f, tr2
+	pta 3f, tr3
+	ptabs r18, tr0
+
+#if 0
+	/* TAKum03020 */
+	ld.q r3, 0x00, r63
+	ld.q r3, 0x20, r63
+	ld.q r3, 0x40, r63
+	ld.q r3, 0x60, r63
+#endif
+	alloco r2, 0x00
+	synco		! TAKum03020
+	alloco r2, 0x20
+	synco		! TAKum03020
+
+	movi 3968, r6
+	add  r2, r6, r6
+	addi r6, 64, r7
+	addi r7, 64, r8
+	sub r3, r2, r60
+	addi r60, 8, r61
+	addi r61, 8, r62
+	addi r62, 8, r23
+	addi r60, 0x80, r22
+
+/* Minimal code size.  The extra branches inside the loop don't cost much
+   because they overlap with the time spent waiting for prefetches to
+   complete. */
+1:
+#if 0
+	/* TAKum03020 */
+	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
+	ldx.q r2, r22, r63 ! prefetch 4 lines hence
+#endif
+2:
+	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
+	alloco r2, 0x40    ! alloc destination line 2 lines ahead
+	synco		! TAKum03020
+3:
+	ldx.q r2, r60, r36
+	ldx.q r2, r61, r37
+	ldx.q r2, r62, r38
+	ldx.q r2, r23, r39
+	st.q  r2,   0, r36
+	st.q  r2,   8, r37
+	st.q  r2,  16, r38
+	st.q  r2,  24, r39
+	addi r2, 32, r2
+	bgt/l r8, r2, tr1
+
+	blink tr0, r63	   ! return
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2015-08-05 17:04:01 -0300
commit	57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree	5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/sh/lib64/copy_page.S