From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado <emulatorman@parabola.nu>
Date: Wed, 5 Aug 2015 17:04:01 -0300
Subject: Initial import

---
 arch/sh/lib64/Makefile           |  17 +++
 arch/sh/lib64/copy_page.S        |  89 ++++++++++++++++
 arch/sh/lib64/copy_user_memcpy.S | 217 +++++++++++++++++++++++++++++++++++++++
 arch/sh/lib64/memcpy.S           | 201 ++++++++++++++++++++++++++++++++++++
 arch/sh/lib64/memset.S           |  91 ++++++++++++++++
 arch/sh/lib64/panic.c            |  15 +++
 arch/sh/lib64/sdivsi3.S          | 135 ++++++++++++++++++++++++
 arch/sh/lib64/strcpy.S           |  97 +++++++++++++++++
 arch/sh/lib64/strlen.S           |  33 ++++++
 arch/sh/lib64/udelay.c           |  49 +++++++++
 arch/sh/lib64/udivdi3.S          | 120 ++++++++++++++++++++++
 arch/sh/lib64/udivsi3.S          |  59 +++++++++++
 12 files changed, 1123 insertions(+)
 create mode 100644 arch/sh/lib64/Makefile
 create mode 100644 arch/sh/lib64/copy_page.S
 create mode 100644 arch/sh/lib64/copy_user_memcpy.S
 create mode 100644 arch/sh/lib64/memcpy.S
 create mode 100644 arch/sh/lib64/memset.S
 create mode 100644 arch/sh/lib64/panic.c
 create mode 100644 arch/sh/lib64/sdivsi3.S
 create mode 100644 arch/sh/lib64/strcpy.S
 create mode 100644 arch/sh/lib64/strlen.S
 create mode 100644 arch/sh/lib64/udelay.c
 create mode 100644 arch/sh/lib64/udivdi3.S
 create mode 100644 arch/sh/lib64/udivsi3.S

(limited to 'arch/sh/lib64')

diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
new file mode 100644
index 000000000..69779ff74
--- /dev/null
+++ b/arch/sh/lib64/Makefile
@@ -0,0 +1,17 @@
+#
+# Makefile for the SH-5 specific library files.
+#
+# Copyright (C) 2000, 2001  Paolo Alberelli
+# Copyright (C) 2003 - 2008  Paul Mundt
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+
+# Panic should really be compiled as PIC
+lib-y  := udelay.o panic.o memcpy.o memset.o \
+	  copy_user_memcpy.o copy_page.o strcpy.o strlen.o
+
+# Extracted from libgcc
+lib-y	+= udivsi3.o udivdi3.o sdivsi3.o
diff --git a/arch/sh/lib64/copy_page.S b/arch/sh/lib64/copy_page.S
new file mode 100644
index 000000000..0ec6fca63
--- /dev/null
+++ b/arch/sh/lib64/copy_page.S
@@ -0,0 +1,89 @@
+/*
+   Copyright 2003 Richard Curnow, SuperH (UK) Ltd.
+
+   This file is subject to the terms and conditions of the GNU General Public
+   License.  See the file "COPYING" in the main directory of this archive
+   for more details.
+
+   Tight version of memcpy for the case of just copying a page.
+   Prefetch strategy empirically optimised against RTL simulations
+   of SH5-101 cut2 eval chip with Cayman board DDR memory.
+
+   Parameters:
+   r2 : destination effective address (start of page)
+   r3 : source effective address (start of page)
+
+   Always copies 4096 bytes.
+
+   Points to review.
+   * Currently the prefetch is 4 lines ahead and the alloco is 2 lines ahead.
+     It seems like the prefetch needs to be at least 4 lines ahead to get
+     the data into the cache in time, and the allocos contend with outstanding
+     prefetches for the same cache set, so it's better to have the numbers
+     different.
+   */
+
+	.section .text..SHmedia32,"ax"
+	.little
+
+	.balign 8
+	.global copy_page
+copy_page:
+
+	/* Copy 4096 bytes worth of data from r3 to r2.
+	   Do prefetches 4 lines ahead.
+	   Do alloco 2 lines ahead */
+
+	pta 1f, tr1		! tr1 -> top of the copy loop
+	pta 2f, tr2		! tr2 -> alloco check; only the disabled prefetch code branches to it
+	pta 3f, tr3		! tr3 -> load/store body, skipping the alloco
+	ptabs r18, tr0		! tr0 = return target from r18, used by the final blink
+
+#if 0
+	/* TAKum03020 */
+	ld.q r3, 0x00, r63
+	ld.q r3, 0x20, r63
+	ld.q r3, 0x40, r63
+	ld.q r3, 0x60, r63
+#endif
+	alloco r2, 0x00		! allocate the first two destination lines before the loop starts
+	synco		! TAKum03020
+	alloco r2, 0x20
+	synco		! TAKum03020
+
+	movi 3968, r6
+	add  r2, r6, r6		! r6 = dst + 3968 = page end - 4 lines; only read by the disabled prefetch
+	addi r6, 64, r7		! r7 = dst + 4032 = page end - 2 lines; no alloco at or beyond this
+	addi r7, 64, r8		! r8 = dst + 4096 = end of the destination page
+	sub r3, r2, r60		! r60 = src - dst, so r2 + r60 is the matching source address
+	addi r60, 8, r61	! r61, r62, r23 = the same offset plus 8, 16 and 24
+	addi r61, 8, r62
+	addi r62, 8, r23
+	addi r60, 0x80, r22	! source offset 4 lines (128 bytes) ahead; only read by the disabled prefetch
+
+/* Minimal code size.  The extra branches inside the loop don't cost much
+   because they overlap with the time spent waiting for prefetches to
+   complete. */
+1:
+#if 0
+	/* TAKum03020 */
+	bge/u r2, r6, tr2  ! skip prefetch for last 4 lines
+	ldx.q r2, r22, r63 ! prefetch 4 lines hence
+#endif
+2:
+	bge/u r2, r7, tr3  ! skip alloco for last 2 lines
+	alloco r2, 0x40    ! alloc destination line 2 lines ahead
+	synco		! TAKum03020
+3:
+	ldx.q r2, r60, r36	! load one 32-byte line (four quadwords) from the source ...
+	ldx.q r2, r61, r37
+	ldx.q r2, r62, r38
+	ldx.q r2, r23, r39
+	st.q  r2,   0, r36	! ... and store it to the destination line at r2
+	st.q  r2,   8, r37
+	st.q  r2,  16, r38
+	st.q  r2,  24, r39
+	addi r2, 32, r2		! advance the destination pointer by one line
+	bgt/l r8, r2, tr1	! loop while r2 is still below the end of the page
+
+	blink tr0, r63	   ! return
diff --git a/arch/sh/lib64/copy_user_memcpy.S b/arch/sh/lib64/copy_user_memcpy.S
new file mode 100644
index 000000000..49aeabeba
--- /dev/null
+++ b/arch/sh/lib64/copy_user_memcpy.S
@@ -0,0 +1,217 @@
+!
+! Fast SH memcpy
+!
+! by Toshiyasu Morita (tm@netcom.com)
+! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
+! SH5 code Copyright 2002 SuperH Ltd.
+!
+! Entry: ARG0: destination pointer
+!        ARG1: source pointer
+!        ARG2: byte count
+!
+! Exit:  RESULT: destination pointer
+!        any other registers in the range r0-r7: trashed
+!
+! Notes: Usually one wants to do small reads and write a longword, but
+!        unfortunately it is difficult in some cases to concatenate bytes
+!        into a longword on the SH, so this does a longword read and small
+!        writes.
+!
+! This implementation makes two assumptions about how it is called:
+!
+! 1.: If the byte count is nonzero, the address of the last byte to be
+!     copied is unsigned greater than the address of the first byte to
+!     be copied.  This could be easily swapped for a signed comparison,
+!     but the algorithm used needs some comparison.
+!
+! 2.: When there are two or three bytes in the last word of an 11-or-more
+!     bytes memory chunk to be copied, the rest of the word can be read
+!     without side effects.
+!     This could be easily changed by increasing the minimum size of
+!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
+!     however, this would cost a few extra cycles on average.
+!     For SHmedia, the assumption is that any quadword can be read in its
+!     entirety if at least one byte is included in the copy.
+
+/* Imported into Linux kernel by Richard Curnow.  This is used to implement the
+   __copy_user function in the general case, so it has to be a distinct
+   function from intra-kernel memcpy to allow for exception fix-ups in the
+   event that the user pointer is bad somewhere in the copy (e.g. due to
+   running off the end of the vma).
+
+   Note, this algorithm will be slightly wasteful in the case where the source
+   and destination pointers are equally aligned, because the stlo/sthi pairs
+   could then be merged back into single stores.  If there are a lot of cache
+   misses, this is probably offset by the stall lengths on the preloads.
+
+*/
+
+/* NOTE : Prefetches removed and allocos guarded by synco to avoid TAKum03020
+ * erratum.  The first two prefetches are nop-ed out to avoid upsetting the
+ * instruction counts used in the jump address calculation.
+ * */
+
+	.section .text..SHmedia32,"ax"
+	.little
+	.balign 32
+	.global copy_user_memcpy
+	.global copy_user_memcpy_end
+copy_user_memcpy:
+
+#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
+#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
+#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
+#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+
+	nop ! ld.b r3,0,r63 ! TAKum03020
+	pta/l Large,tr0
+	movi 25,r0
+	bgeu/u r4,r0,tr0	! byte count >= 25: take the Large path
+	nsb r4,r0		! size class: the handler layout below implies 63 for 0, 62 for 1, 61 for 2..3, and so on
+	shlli r0,5,r0		! scale by 32, the spacing of the size-class handlers that follow L1
+	movi (L1-L0+63*32 + 1) & 0xffff,r1
+	sub r1, r0, r0		! r0 = offset from L0 to the handler slot for this byte count
+L0:	ptrel r0,tr0
+	add r2,r4,r5		! r5 = dst + count (one past the last destination byte)
+	ptabs r18,tr1		! tr1 = return target from r18
+	add r3,r4,r6		! r6 = src + count (one past the last source byte)
+	blink tr0,r63		! jump to the selected handler
+
+/* Rearranged to make cut2 safe */
+	.balign 8
+L4_7:	/* 4..7 byte memcpy cntd. */
+	stlo.l r2, 0, r0
+	or r6, r7, r6
+	sthi.l r5, -1, r6
+	stlo.l r5, -4, r6
+	blink tr1,r63
+
+	.balign 8
+L1:	/* 0 byte memcpy */
+	nop
+	blink tr1,r63
+	nop			! padding: with L2_3 below, these fill the 32-byte slot for count 0
+	nop
+	nop
+	nop
+
+L2_3:	/* 2 or 3 byte memcpy cntd. */
+	st.b r5,-1,r6		! store the last byte, at dst + count - 1
+	blink tr1,r63
+
+	/* 1 byte memcpy */
+	ld.b r3,0,r0
+	st.b r2,0,r0
+	blink tr1,r63
+
+L8_15:	/* 8..15 byte memcpy cntd. */
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+
+	/* 2 or 3 byte memcpy */
+	ld.b r3,0,r0
+	nop ! ld.b r2,0,r63 ! TAKum03020
+	ld.b r3,1,r1
+	st.b r2,0,r0
+	pta/l L2_3,tr0
+	ld.b r6,-1,r6		! last source byte (src + count - 1); stored by L2_3
+	st.b r2,1,r1
+	blink tr0, r63
+
+	/* 4 .. 7 byte memcpy */
+	LDUAL (r3, 0, r0, r1)
+	pta L4_7, tr0
+	ldlo.l r6, -4, r7
+	or r0, r1, r0
+	sthi.l r2, 3, r0
+	ldhi.l r6, -1, r6
+	blink tr0, r63
+
+	/* 8 .. 15 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	pta L8_15, tr0
+	ldlo.q r6, -8, r7
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	ldhi.q r6, -1, r6
+	blink tr0, r63
+
+	/* 16 .. 24 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	LDUAQ (r3, 8, r8, r9)
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	or r8, r9, r8
+	sthi.q r2, 15, r8
+	ldlo.q r6, -8, r7
+	ldhi.q r6, -1, r6
+	stlo.q r2, 8, r8
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+
+Large:
+	! ld.b r2, 0, r63 ! TAKum03020
+	pta/l  Loop_ua, tr1
+	ori r3, -8, r7		! r7 = (src & 7) - 8, a value in -8..-1
+	sub r2, r7, r22		! r22 = dst + 8 - (src & 7)
+	sub r3, r2, r6		! r6 = src - dst, so r22 + r6 is the next 8-byte aligned source address
+	add r2, r4, r5
+	ldlo.q r3, 0, r0
+	addi r5, -16, r5	! r5 = dst + count - 16
+	movi 64+8, r27 ! could subtract r7 from that.
+	stlo.q r2, 0, r0
+	sthi.q r2, 7, r0
+	ldx.q r22, r6, r0
+	bgtu/l r27, r4, tr1	! count < 72: skip the 32-byte line loop
+
+	addi r5, -48, r27	! r27 = dst + count - 64: limit for the line loop
+	pta/l Loop_line, tr0
+	addi r6, 64, r36	! offset for the prefetch that is commented out in Loop_line
+	addi r6, -24, r19
+	addi r6, -16, r20
+	addi r6, -8, r21
+
+Loop_line:
+	! ldx.q r22, r36, r63 ! TAKum03020
+	alloco r22, 32
+	synco
+	addi r22, 32, r22
+	ldx.q r22, r19, r23
+	sthi.q r22, -25, r0
+	ldx.q r22, r20, r24
+	ldx.q r22, r21, r25
+	stlo.q r22, -32, r0
+	ldx.q r22, r6,  r0
+	sthi.q r22, -17, r23
+	sthi.q r22,  -9, r24
+	sthi.q r22,  -1, r25
+	stlo.q r22, -24, r23
+	stlo.q r22, -16, r24
+	stlo.q r22,  -8, r25
+	bgeu r27, r22, tr0	! repeat while r22 <= r27
+
+Loop_ua:
+	addi r22, 8, r22
+	sthi.q r22, -1, r0
+	stlo.q r22, -8, r0
+	ldx.q r22, r6, r0
+	bgtu/l r5, r22, tr1	! repeat while r22 < r5
+
+	add r3, r4, r7		! r7 = src + count
+	ldlo.q r7, -8, r1	! fetch the final 8 source bytes (unaligned) ...
+	sthi.q r22, 7, r0
+	ldhi.q r7, -1, r7
+	ptabs r18,tr1		! tr1 was the loop target until now; reload it with the return target
+	stlo.q r22, 0, r0
+	or r1, r7, r1
+	sthi.q r5, 15, r1	! ... and store them at dst + count - 8 .. dst + count - 1
+	stlo.q r5, 8, r1
+	blink tr1, r63
+copy_user_memcpy_end:
+	nop
diff --git a/arch/sh/lib64/memcpy.S b/arch/sh/lib64/memcpy.S
new file mode 100644
index 000000000..5d682e0ee
--- /dev/null
+++ b/arch/sh/lib64/memcpy.S
@@ -0,0 +1,201 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+!
+! Fast SH memcpy
+!
+! by Toshiyasu Morita (tm@netcom.com)
+! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
+! SH5 code Copyright 2002 SuperH Ltd.
+!
+! Entry: ARG0: destination pointer
+!        ARG1: source pointer
+!        ARG2: byte count
+!
+! Exit:  RESULT: destination pointer
+!        any other registers in the range r0-r7: trashed
+!
+! Notes: Usually one wants to do small reads and write a longword, but
+!        unfortunately it is difficult in some cases to concatenate bytes
+!        into a longword on the SH, so this does a longword read and small
+!        writes.
+!
+! This implementation makes two assumptions about how it is called:
+!
+! 1.: If the byte count is nonzero, the address of the last byte to be
+!     copied is unsigned greater than the address of the first byte to
+!     be copied.  This could be easily swapped for a signed comparison,
+!     but the algorithm used needs some comparison.
+!
+! 2.: When there are two or three bytes in the last word of an 11-or-more
+!     bytes memory chunk to be copied, the rest of the word can be read
+!     without side effects.
+!     This could be easily changed by increasing the minimum size of
+!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
+!     however, this would cost a few extra cycles on average.
+!     For SHmedia, the assumption is that any quadword can be read in its
+!     entirety if at least one byte is included in the copy.
+!
+
+	.section .text..SHmedia32,"ax"
+	.globl	memcpy
+	.type	memcpy, @function
+	.align	5
+
+memcpy:
+
+#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
+#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
+#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
+#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
+
+	ld.b r3,0,r63		! touch the first source byte; the loaded value is discarded into r63
+	pta/l Large,tr0
+	movi 25,r0
+	bgeu/u r4,r0,tr0	! byte count >= 25: take the Large path
+	nsb r4,r0		! size class: the handler layout below implies 63 for 0, 62 for 1, 61 for 2..3, and so on
+	shlli r0,5,r0		! scale by 32, the spacing of the size-class handlers that follow L1
+	movi (L1-L0+63*32 + 1) & 0xffff,r1
+	sub r1, r0, r0		! r0 = offset from L0 to the handler slot for this byte count
+L0:	ptrel r0,tr0
+	add r2,r4,r5		! r5 = dst + count (one past the last destination byte)
+	ptabs r18,tr1		! tr1 = return target from r18
+	add r3,r4,r6		! r6 = src + count (one past the last source byte)
+	blink tr0,r63		! jump to the selected handler
+	
+/* Rearranged to make cut2 safe */
+	.balign 8
+L4_7:	/* 4..7 byte memcpy cntd. */
+	stlo.l r2, 0, r0
+	or r6, r7, r6
+	sthi.l r5, -1, r6
+	stlo.l r5, -4, r6
+	blink tr1,r63
+
+	.balign 8
+L1:	/* 0 byte memcpy */
+	nop
+	blink tr1,r63
+	nop			! padding: with L2_3 below, these fill the 32-byte slot for count 0
+	nop
+	nop
+	nop
+
+L2_3:	/* 2 or 3 byte memcpy cntd. */
+	st.b r5,-1,r6		! store the last byte, at dst + count - 1
+	blink tr1,r63
+
+	/* 1 byte memcpy */
+	ld.b r3,0,r0
+	st.b r2,0,r0
+	blink tr1,r63
+
+L8_15:	/* 8..15 byte memcpy cntd. */
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+	
+	/* 2 or 3 byte memcpy */
+	ld.b r3,0,r0
+	ld.b r2,0,r63		! touch the first destination byte; the value is discarded into r63
+	ld.b r3,1,r1
+	st.b r2,0,r0
+	pta/l L2_3,tr0
+	ld.b r6,-1,r6		! last source byte (src + count - 1); stored by L2_3
+	st.b r2,1,r1
+	blink tr0, r63
+
+	/* 4 .. 7 byte memcpy */
+	LDUAL (r3, 0, r0, r1)
+	pta L4_7, tr0
+	ldlo.l r6, -4, r7
+	or r0, r1, r0
+	sthi.l r2, 3, r0
+	ldhi.l r6, -1, r6
+	blink tr0, r63
+
+	/* 8 .. 15 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	pta L8_15, tr0
+	ldlo.q r6, -8, r7
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	ldhi.q r6, -1, r6
+	blink tr0, r63
+
+	/* 16 .. 24 byte memcpy */
+	LDUAQ (r3, 0, r0, r1)
+	LDUAQ (r3, 8, r8, r9)
+	or r0, r1, r0
+	sthi.q r2, 7, r0
+	or r8, r9, r8
+	sthi.q r2, 15, r8
+	ldlo.q r6, -8, r7
+	ldhi.q r6, -1, r6
+	stlo.q r2, 8, r8
+	stlo.q r2, 0, r0
+	or r6, r7, r6
+	sthi.q r5, -1, r6
+	stlo.q r5, -8, r6
+	blink tr1,r63
+
+Large:
+	ld.b r2, 0, r63		! touch the first destination byte; the value is discarded into r63
+	pta/l  Loop_ua, tr1
+	ori r3, -8, r7		! r7 = (src & 7) - 8, a value in -8..-1
+	sub r2, r7, r22		! r22 = dst + 8 - (src & 7)
+	sub r3, r2, r6		! r6 = src - dst, so r22 + r6 is the next 8-byte aligned source address
+	add r2, r4, r5
+	ldlo.q r3, 0, r0
+	addi r5, -16, r5	! r5 = dst + count - 16
+	movi 64+8, r27 // could subtract r7 from that.
+	stlo.q r2, 0, r0
+	sthi.q r2, 7, r0
+	ldx.q r22, r6, r0
+	bgtu/l r27, r4, tr1	! count < 72: skip the 32-byte line loop
+
+	addi r5, -48, r27	! r27 = dst + count - 64: limit for the line loop
+	pta/l Loop_line, tr0
+	addi r6, 64, r36	! r36 = source offset 64 bytes ahead, used by the prefetch in Loop_line
+	addi r6, -24, r19
+	addi r6, -16, r20
+	addi r6, -8, r21
+
+Loop_line:
+	ldx.q r22, r36, r63	! prefetch the source 64 bytes ahead; the value is discarded into r63
+	alloco r22, 32		! NOTE(review): no synco follows, unlike copy_user_memcpy.S - confirm TAKum03020 is not an issue here
+	addi r22, 32, r22
+	ldx.q r22, r19, r23
+	sthi.q r22, -25, r0
+	ldx.q r22, r20, r24
+	ldx.q r22, r21, r25
+	stlo.q r22, -32, r0
+	ldx.q r22, r6,  r0
+	sthi.q r22, -17, r23
+	sthi.q r22,  -9, r24
+	sthi.q r22,  -1, r25
+	stlo.q r22, -24, r23
+	stlo.q r22, -16, r24
+	stlo.q r22,  -8, r25
+	bgeu r27, r22, tr0	! repeat while r22 <= r27
+
+Loop_ua:
+	addi r22, 8, r22
+	sthi.q r22, -1, r0
+	stlo.q r22, -8, r0
+	ldx.q r22, r6, r0
+	bgtu/l r5, r22, tr1	! repeat while r22 < r5
+
+	add r3, r4, r7		! r7 = src + count
+	ldlo.q r7, -8, r1	! fetch the final 8 source bytes (unaligned) ...
+	sthi.q r22, 7, r0
+	ldhi.q r7, -1, r7
+	ptabs r18,tr1		! tr1 was the loop target until now; reload it with the return target
+	stlo.q r22, 0, r0
+	or r1, r7, r1
+	sthi.q r5, 15, r1	! ... and store them at dst + count - 8 .. dst + count - 1
+	stlo.q r5, 8, r1
+	blink tr1, r63
+
+	.size memcpy,.-memcpy
diff --git a/arch/sh/lib64/memset.S b/arch/sh/lib64/memset.S
new file mode 100644
index 000000000..2d37b0488
--- /dev/null
+++ b/arch/sh/lib64/memset.S
@@ -0,0 +1,91 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+!
+! Fast SH memset
+!
+! by Toshiyasu Morita (tm@netcom.com)
+!
+! SH5 code by J"orn Rennecke (joern.rennecke@superh.com)
+! Copyright 2002 SuperH Ltd.
+!
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define SHHI shlld
+#define SHLO shlrd
+#else
+#define SHHI shlrd
+#define SHLO shlld
+#endif
+
+	.section .text..SHmedia32,"ax"
+	.globl	memset
+	.type	memset, @function
+
+	.align 5
+
+memset:
+	pta/l multiquad, tr0
+	andi r2, 7, r22		// r22 = offset of dst within its 8-byte quadword
+	ptabs r18, tr2		// tr2 = return target from r18
+	mshflo.b r3,r3,r3
+	add r4, r22, r23	// r23 = that offset + byte count
+	mperm.w r3, r63, r3	// Fill pattern now in every byte of r3
+
+	movi 8, r9
+	bgtu/u r23, r9, tr0 // multiquad
+
+	beqi/u r4, 0, tr2       // Return with size 0 - ensures no mem accesses
+	ldlo.q r2, 0, r7	// existing memory contents, merged back in by the mcmv below
+	shlli r4, 2, r4		// r4 = 4 * count; applied twice below, i.e. 8 * count bits in total
+	movi -1, r8
+	SHHI r8, r4, r8
+	SHHI r8, r4, r8
+	mcmv r7, r8, r3
+	stlo.q r2, 0, r3
+	blink tr2, r63
+
+multiquad:
+	pta/l lastquad, tr0
+	stlo.q r2, 0, r3	// fill from dst up to the end of its quadword
+	shlri r23, 3, r24	// r24 = (offset + count) / 8
+	add r2, r4, r5		// r5 = dst + count (one past the last byte)
+	beqi/u r24, 1, tr0 // lastquad
+	pta/l loop, tr1
+	sub r2, r22, r25	// r25 = dst rounded down to a quadword boundary
+	andi r5, -8, r20   // calculate end address and
+	addi r20, -7*8, r8 // loop end address; This might overflow, so we need
+	                   // to use a different test before we start the loop
+	bge/u r24, r9, tr1 // loop
+	st.q r25, 8, r3		// r24 is 2..7 here: fill inwards from both ends, one pair at a time
+	st.q r20, -8, r3
+	shlri r24, 1, r24
+	beqi/u r24, 1, tr0 // lastquad
+	st.q r25, 16, r3
+	st.q r20, -16, r3
+	beqi/u r24, 2, tr0 // lastquad
+	st.q r25, 24, r3
+	st.q r20, -24, r3
+lastquad:
+	sthi.q r5, -1, r3	// fill the final partial quadword, ending at dst + count - 1
+	blink tr2,r63
+
+loop:
+!!!	alloco r25, 32	// QQQ comment out for short-term fix to SHUK #3895.
+			// QQQ commenting out is logically correct, but sub-optimal
+			// QQQ Sean McGoogan - 4th April 2003.
+	st.q r25, 8, r3
+	st.q r25, 16, r3
+	st.q r25, 24, r3
+	st.q r25, 32, r3
+	addi r25, 32, r25
+	bgeu/l r8, r25, tr1 // loop
+
+	st.q r20, -40, r3	// fill the last five whole quadwords below r20
+	st.q r20, -32, r3
+	st.q r20, -24, r3
+	st.q r20, -16, r3
+	st.q r20, -8, r3
+	sthi.q r5, -1, r3	// fill the final partial quadword, ending at dst + count - 1
+	blink tr2,r63
+
+	.size	memset,.-memset
diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c
new file mode 100644
index 000000000..38c954e04
--- /dev/null
+++ b/arch/sh/lib64/panic.c
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2003  Richard Curnow, SuperH UK Limited
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+void panic_handler(unsigned long panicPC,
+		   unsigned long panicSSR,
+		   unsigned long panicEXPEVT)
+{
+	/* Park here for good: the arguments are unused and we never return. */
+	while (1) ;
+}
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
new file mode 100644
index 000000000..1963bbd42
--- /dev/null
+++ b/arch/sh/lib64/sdivsi3.S
@@ -0,0 +1,135 @@
+	.global	__sdivsi3
+	.global	__sdivsi3_1
+	.global	__sdivsi3_2
+	.section	.text..SHmedia32,"ax"
+	.align	2
+
+	/* inputs: r4,r5 */
+	/* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
+	/* result in r0 */
+__sdivsi3:
+__sdivsi3_1:
+	ptb __div_table,tr0
+	gettr tr0,r20        /* r20 = address of __div_table */
+
+__sdivsi3_2:             /* entry that expects r20 to hold the table address already */
+	nsb r5, r1
+	shlld r5, r1, r25    /* normalize; [-2 ..1, 1..2) in s2.62 */
+	shari r25, 58, r21   /* extract 5(6) bit index (s2.4 with hole -1..1) */
+	/* bubble */
+	ldx.ub r20, r21, r19 /* u0.8 */
+	shari r25, 32, r25   /* normalize to s2.30 */
+	shlli r21, 1, r21    /* index * 2: byte offset of the 16-bit table entry */
+	muls.l r25, r19, r19 /* s2.38 */
+	ldx.w r20, r21, r21  /* s2.14 */
+	ptabs r18, tr0       /* tr0 = return target; r18 is reused as scratch below */
+	shari r19, 24, r19   /* truncate to s2.14 */
+	sub r21, r19, r19    /* some 11 bit inverse in s1.14 */
+	muls.l r19, r19, r21 /* u0.28 */
+	sub r63, r1, r1
+	addi r1, 92, r1      /* r1 = 92 - nsb(r5): shift count for the final shard */
+	muls.l r25, r21, r18 /* s2.58 */
+	shlli r19, 45, r19   /* multiply by two and convert to s2.58 */
+	/* bubble */
+	sub r19, r18, r18
+	shari r18, 28, r18   /* some 22 bit inverse in s1.30 */
+	muls.l r18, r25, r0  /* s2.60 */
+	muls.l r18, r4, r25 /* s32.30 */
+	/* bubble */
+	shari r0, 16, r19   /* s-16.44 */
+	muls.l r19, r18, r19 /* s-16.74 */
+	shari r25, 63, r0
+	shari r4, 14, r18   /* s19.-14 */
+	shari r19, 30, r19   /* s-16.44 */
+	muls.l r19, r18, r19 /* s15.30 */
+	xor r21, r0, r21    /* You could also use the constant 1 << 27. */
+	add r21, r25, r21
+	sub r21, r19, r21
+	shard r21, r1, r21
+	sub r21, r0, r0      /* result in r0 */
+	blink tr0, r63       /* return */
+	
+/* This table has been generated by divtab.c .
+Defects for bias -330:
+   Max defect: 6.081536e-07 at -1.000000e+00
+   Min defect: 2.849516e-08 at 1.030651e+00
+   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+   Defect at 1: 1.238659e-07
+   Defect at -2: 1.061708e-07 */
+
+	.balign 2
+	.type	__div_table,@object
+	.size	__div_table,128
+/* negative division constants: 16-bit entries, read by ldx.w at 2 * index */
+	.word	-16638
+	.word	-17135
+	.word	-17737
+	.word	-18433
+	.word	-19103
+	.word	-19751
+	.word	-20583
+	.word	-21383
+	.word	-22343
+	.word	-23353
+	.word	-24407
+	.word	-25582
+	.word	-26863
+	.word	-28382
+	.word	-29965
+	.word	-31800
+/* negative division factors: 8-bit entries, read by ldx.ub at index -32..-17 */
+	.byte	66
+	.byte	70
+	.byte	75
+	.byte	81
+	.byte	87
+	.byte	93
+	.byte	101
+	.byte	109
+	.byte	119
+	.byte	130
+	.byte	142
+	.byte	156
+	.byte	172
+	.byte	192
+	.byte	214
+	.byte	241
+	.skip 16	/* hole in the index range: byte offsets -16..-1 */
+	.global	__div_table
+__div_table:		/* base address; __sdivsi3 indexes it with a signed index */
+	.skip 16	/* hole in the index range: byte offsets 0..15 */
+/* positive division factors: 8-bit entries, read by ldx.ub at index 16..31 */
+	.byte	241
+	.byte	214
+	.byte	192
+	.byte	172
+	.byte	156
+	.byte	142
+	.byte	130
+	.byte	119
+	.byte	109
+	.byte	101
+	.byte	93
+	.byte	87
+	.byte	81
+	.byte	75
+	.byte	70
+	.byte	66
+/* positive division constants: 16-bit entries, read by ldx.w at 2 * index */
+	.word	31801
+	.word	29966
+	.word	28383
+	.word	26864
+	.word	25583
+	.word	24408
+	.word	23354
+	.word	22344
+	.word	21384
+	.word	20584
+	.word	19752
+	.word	19104
+	.word	18434
+	.word	17738
+	.word	17136
+	.word	16639
diff --git a/arch/sh/lib64/strcpy.S b/arch/sh/lib64/strcpy.S
new file mode 100644
index 000000000..ea7c9c533
--- /dev/null
+++ b/arch/sh/lib64/strcpy.S
@@ -0,0 +1,97 @@
+/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
+/* Modified by SuperH, Inc. September 2003 */
+! Entry: arg0: destination
+!        arg1: source
+! Exit:  result: destination
+!
+! SH5 code Copyright 2002 SuperH Ltd.
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define SHHI shlld
+#define SHLO shlrd
+#else
+#define SHHI shlrd
+#define SHLO shlld
+#endif
+
+	.section .text..SHmedia32,"ax"
+	.globl	strcpy
+	.type	strcpy, @function
+	.align 5
+
+strcpy:
+
+	pta/l shortstring,tr1
+	ldlo.q r3,0,r4
+	ptabs r18,tr4		// tr4 = return target from r18
+	shlli r3,3,r7		// r7 = 8 * src, used as a bit shift count below
+	addi r2, 8, r0
+	mcmpeq.b r4,r63,r6	// byte-wise compare of r4 against r63, i.e. look for a NUL
+	SHHI r6,r7,r6
+	bnei/u r6,0,tr1 // shortstring
+	pta/l no_lddst, tr2
+	ori r3,-8,r23		// r23 = (src & 7) - 8
+	sub r2, r23, r0
+	sub r3, r2, r21		// r21 = src - dst
+	addi r21, 8, r20	// r20 = src - dst + 8
+	ldx.q r0, r21, r5
+	pta/l loop, tr0
+	ori r2,-8,r22		// r22 = (dst & 7) - 8
+	mcmpeq.b r5, r63, r6
+	bgt/u r22, r23, tr2 // no_lddst
+
+	// r22 < r23 :  Need to do a load from the destination.
+	// r22 == r23 : Doesn't actually need to load from destination,
+	//              but still can be handled here.
+	ldlo.q r2, 0, r9
+	movi -1, r8
+	SHLO r8, r7, r8
+	mcmv r4, r8, r9
+	stlo.q r2, 0, r9
+	beqi/l r6, 0, tr0 // loop
+
+	add r5, r63, r4
+	addi r0, 8, r0
+	blink tr1, r63 // shortstring
+no_lddst:
+	// r22 > r23: note that for r22 == r23 the sthi.q would clobber
+	//            bytes before the destination region.
+	stlo.q r2, 0, r4
+	SHHI r4, r7, r4
+	sthi.q r0, -1, r4
+	beqi/l r6, 0, tr0 // loop
+
+	add r5, r63, r4
+	addi r0, 8, r0
+shortstring:
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+	pta/l shortstring2,tr1
+	byterev r4,r4
+#endif
+shortstring2:
+	st.b r0,-8,r4		// store the low byte of r4 at r0 - 8
+	andi r4,0xff,r5		// r5 = the byte just stored
+	shlri r4,8,r4		// bring the next byte down into the low position
+	addi r0,1,r0
+	bnei/l r5,0,tr1		// repeat until the byte just stored was the NUL
+	blink tr4,r63 // return
+	
+	.balign 8
+loop:
+	stlo.q r0, 0, r5
+	ldx.q r0, r20, r4
+	addi r0, 16, r0
+	sthi.q r0, -9, r5
+	mcmpeq.b r4, r63, r6
+	bnei/u r6, 0, tr1 // shortstring
+	ldx.q r0, r21, r5
+	stlo.q r0, -8, r4
+	sthi.q r0, -1, r4
+	mcmpeq.b r5, r63, r6
+	beqi/l r6, 0, tr0 // loop
+
+	add r5, r63, r4
+	addi r0, 8, r0
+	blink tr1, r63 // shortstring
+
+	.size	strcpy,.-strcpy
diff --git a/arch/sh/lib64/strlen.S b/arch/sh/lib64/strlen.S
new file mode 100644
index 000000000..cbc0d912e
--- /dev/null
+++ b/arch/sh/lib64/strlen.S
@@ -0,0 +1,33 @@
+/*
+ * Simplistic strlen() implementation for SHmedia.
+ *
+ * Copyright (C) 2003  Paul Mundt <lethal@linux-sh.org>
+ */
+
+	.section .text..SHmedia32,"ax"
+	.globl	strlen
+	.type	strlen,@function
+
+	.balign 16
+strlen:
+	ptabs	r18, tr4		! tr4 = return target from r18
+
+	/*
+	 * Note: We could easily deal with the NULL case here with a simple
+	 * sanity check, though it seems that the behavior we want is to fault
+	 * in the event that r2 == NULL, so we don't bother.
+	 */
+/*	beqi    r2, 0, tr4 */	! Sanity check
+
+	movi	-1, r0			! counter starts at -1 because the loop also counts the NUL
+	pta/l	loop, tr0
+loop:
+	ld.b	r2, 0, r1		! r1 = next byte of the string
+	addi	r2, 1, r2		! advance the string pointer
+	addi	r0, 1, r0		! one more byte seen
+	bnei/l	r1, 0, tr0		! keep going until the byte just read is NUL
+
+	or	r0, r63, r2		! hand the count back in r2
+	blink	tr4, r63		! return
+
+	.size	strlen,.-strlen
diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c
new file mode 100644
index 000000000..f215b063d
--- /dev/null
+++ b/arch/sh/lib64/udelay.c
@@ -0,0 +1,79 @@
+/*
+ * arch/sh/lib64/udelay.c
+ *
+ * Delay routines, using a pre-computed "loops_per_jiffy" value.
+ *
+ * Copyright (C) 2000, 2001  Paolo Alberelli
+ * Copyright (C) 2003, 2004  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/sched.h>
+#include <asm/param.h>
+
+/*
+ * Use only for very small delays (< 1 msec).
+ *
+ * The active part of our cycle counter is only 32-bits wide, and
+ * we're treating the difference between two marks as signed.  On
+ * a 1GHz box, that's about 2 seconds.
+ */
+
+/*
+ * Spin for @loops iterations of a decrement-and-branch loop.
+ *
+ * tr0 is copied into a scratch register on entry and reloaded from it on
+ * exit, because the loop needs a target register for its own branch.
+ * The counter is decremented before it is compared with r63, so callers
+ * must pass a non-zero @loops; 0 would wrap around instead of returning.
+ */
+void __delay(unsigned long loops)
+{
+	long long dummy;
+	__asm__ __volatile__("gettr	tr0, %1\n\t"
+			     "pta	$+4, tr0\n\t"
+			     "addi	%0, -1, %0\n\t"
+			     "bne	%0, r63, tr0\n\t"
+			     "ptabs	%1, tr0\n\t":"=r"(loops),
+			     "=r"(dummy)
+			     :"0"(loops));
+}
+
+/*
+ * Delay for @xloops / 2**32 seconds.
+ *
+ * @xloops is a fraction of a second scaled by 2**32, as produced by
+ * __udelay() and __ndelay().  HZ * loops_per_jiffy is the number of
+ * __delay() loops per second, so the loop count is the upper 32 bits of
+ * the 64-bit product of the two.  The product has to be formed in 64 bits
+ * and shifted back down; multiplying in plain unsigned long left the 2**32
+ * scale factor in the result.
+ *
+ * One extra loop is added so that rounding down cannot shorten the delay
+ * and so that __delay() is never called with 0.
+ */
+void __const_udelay(unsigned long xloops)
+{
+	unsigned long long loops_per_sec;
+	unsigned long long loops;
+
+	loops_per_sec = (unsigned long long)HZ *
+			cpu_data[raw_smp_processor_id()].loops_per_jiffy;
+	loops = (unsigned long long)xloops * loops_per_sec;
+
+	__delay((unsigned long)(loops >> 32) + 1);
+}
+
+/* Delay for @usecs microseconds; see the note on small delays above. */
+void __udelay(unsigned long usecs)
+{
+	__const_udelay(usecs * 0x000010c6);  /* 2**32 / 1000000 */
+}
+
+/* Delay for @nsecs nanoseconds; 5 is 2**32 / 1000000000 rounded up. */
+void __ndelay(unsigned long nsecs)
+{
+	__const_udelay(nsecs * 0x00000005);
+}
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 000000000..6895c0225
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
+	.section	.text..SHmedia32,"ax"
+	.align	2
+	.global	__udivdi3
+__udivdi3:	/* NOTE(review): from the code below, r2 looks like the dividend and r3 the divisor, quotient returned in r2 - confirm */
+	shlri r3,1,r4
+	nsb r4,r22
+	shlld r3,r22,r6	/* r6 = r3 shifted left by the nsb result, i.e. normalized */
+	shlri r6,49,r5
+	movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+	sub r21,r5,r1
+	mmulfx.w r1,r1,r4
+	mshflo.w r1,r63,r1
+	sub r63,r22,r20 // r63 == 64 % 64
+	mmulfx.w r5,r4,r4
+	pta large_divisor,tr0
+	addi r20,32,r9
+	msub.w r1,r4,r1
+	madd.w r1,r1,r1
+	mmulfx.w r1,r1,r4
+	shlri r6,32,r7
+	bgt/u r9,r63,tr0 // large_divisor
+	mmulfx.w r5,r4,r4
+	shlri r2,32+14,r19
+	addi r22,-31,r0
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r19,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	mulu.l r5,r3,r8
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	shlld r8,r0,r8
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r2,r8,r2
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+	shlri r2,22,r21
+	mulu.l r21,r1,r21
+	shlld r5,r0,r8
+	addi r20,30-22,r0
+	shlrd r21,r0,r21
+	mulu.l r21,r3,r5
+	add r8,r21,r8
+	mcmpgt.l r21,r63,r21 // See Note 1
+	addi r20,30,r0
+	mshfhi.l r63,r21,r21
+	sub r2,r5,r2
+	andc r2,r21,r2
+
+	/* small divisor: need a third divide step */
+	mulu.l r2,r1,r7
+	ptabs r18,tr0	/* tr0 = return target from r18 */
+	addi r2,1,r2
+	shlrd r7,r0,r7
+	mulu.l r7,r3,r5
+	add r8,r7,r8
+	sub r2,r3,r2
+	cmpgt r2,r5,r5
+	add r8,r5,r2	/* final value left in r2 */
+	/* could test r3 here to check for divide by zero.  */
+	blink tr0,r63	/* return */
+
+large_divisor:
+	mmulfx.w r5,r4,r4
+	shlrd r2,r9,r25
+	shlri r25,32,r8
+	msub.w r1,r4,r1
+
+	mulu.l r1,r7,r4
+	addi r1,-3,r5
+	mulu.l r5,r8,r5
+	sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+	shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+	                 the case may be, %0000000000000000 000.11111111111, still */
+	muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+	shlri r5,14-1,r8
+	mulu.l r8,r7,r5
+	mshalds.l r1,r21,r1
+	shari r4,26,r4
+	add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+	sub r25,r5,r25
+	/* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+	shlri r25,22,r21
+	mulu.l r21,r1,r21
+	pta no_lo_adj,tr0
+	addi r22,32,r0
+	shlri r21,40,r21
+	mulu.l r21,r7,r5
+	add r8,r21,r8
+	shlld r2,r0,r2
+	sub r25,r5,r25
+	bgtu/u r7,r25,tr0 // no_lo_adj
+	addi r8,1,r8
+	sub r25,r7,r25
+no_lo_adj:
+	mextr4 r2,r25,r2
+
+	/* large_divisor: only needs a few adjustments.  */
+	mulu.l r8,r6,r5
+	ptabs r18,tr0	/* tr0 = return target from r18 */
+	/* bubble */
+	cmpgtu r5,r2,r5
+	sub r8,r5,r2	/* final value left in r2 */
+	blink tr0,r63	/* return */
+	
+/* Note 1: To shift the result of the second divide stage so that the result
+   always fits into 32 bits, yet we still reduce the rest sufficiently
+   would require a lot of instructions to do the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S
new file mode 100644
index 000000000..e68120e4b
--- /dev/null
+++ b/arch/sh/lib64/udivsi3.S
@@ -0,0 +1,59 @@
+	.global	__udivsi3
+	.section	.text..SHmedia32,"ax"
+	.align	2
+
+/*
+   inputs: r4,r5
+   clobbered: r18,r19,r20,r21,r22,r25,tr0
+   result in r0.
+ */
+__udivsi3:
+	addz.l r5,r63,r22	/* r22 = r5 (the divisor) zero-extended from 32 bits */
+	nsb r22,r0
+	shlld r22,r0,r25	/* r25 = divisor shifted left by the nsb result, i.e. normalized */
+	shlri r25,48,r25
+	movi 0xffffffffffffbb0c,r20 /* shift count equiv 76 */
+	sub r20,r25,r21
+	mmulfx.w r21,r21,r19
+	mshflo.w r21,r63,r21
+	ptabs r18,tr0		/* tr0 = return target; r18 is reused as scratch below */
+	mmulfx.w r25,r19,r19
+	sub r20,r0,r0
+	/* bubble */
+	msub.w r21,r19,r19
+
+	/*
+	 * It would be nice for scheduling to do this add to r21 before
+	 * the msub.w, but we need a different value for r19 to keep
+	 * errors under control.
+	 */
+	addi r19,-2,r21
+	mulu.l r4,r21,r18	/* first quotient estimate: dividend (r4) times the reciprocal estimate */
+	mmulfx.w r19,r19,r19
+	shlli r21,15,r21
+	shlrd r18,r0,r18
+	mulu.l r18,r22,r20
+	mmacnfx.wl r25,r19,r21
+	/* bubble */
+	sub r4,r20,r25		/* r25 = dividend - estimate * divisor, the remaining part */
+
+	mulu.l r25,r21,r19
+	addi r0,14,r0
+	/* bubble */
+	shlrd r19,r0,r19
+	mulu.l r19,r22,r20
+	add r18,r19,r18		/* add the correction to the running quotient in r18 */
+	/* bubble */
+	sub.l r25,r20,r25
+
+	mulu.l r25,r21,r19
+	addz.l r25,r63,r25
+	sub r25,r22,r25
+	shlrd r19,r0,r19
+	mulu.l r19,r22,r20
+	addi r25,1,r25
+	add r18,r19,r18
+
+	cmpgt r25,r20,r25	/* final adjustment of 0 or 1 */
+	add.l r18,r25,r0	/* result in r0 */
+	blink tr0,r63		/* return */
-- 
cgit v1.2.3-54-g00ecf