23 files changed, 1710 insertions, 0 deletions
diff --git a/arch/blackfin/lib/Makefile b/arch/blackfin/lib/Makefile
new file mode 100644
index 000000000..42c47dc9e
--- /dev/null
+++ b/arch/blackfin/lib/Makefile
@@ -0,0 +1,11 @@
+#
+# arch/blackfin/lib/Makefile - libgcc-style arithmetic, mem*/str*, ins*/outs*
+#
+
+lib-y := \
+	ashldi3.o ashrdi3.o lshrdi3.o \
+	muldi3.o divsi3.o udivsi3.o modsi3.o umodsi3.o \
+	memcpy.o memset.o memcmp.o memchr.o memmove.o \
+	strcmp.o strcpy.o strncmp.o strncpy.o \
+	umulsi3_highpart.o smulsi3_highpart.o \
+	ins.o outs.o
diff --git a/arch/blackfin/lib/ashldi3.c b/arch/blackfin/lib/ashldi3.c
new file mode 100644
index 000000000..ab69d8768
--- /dev/null
+++ b/arch/blackfin/lib/ashldi3.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include "gcclib.h"
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+DItype __ashldi3(DItype u, word_type b)__attribute__((l1_text));
+#endif
+
+/* 64-bit left shift of u by b bits, built from 32-bit shifts. */
+DItype __ashldi3(DItype u, word_type b)
+{
+	DIunion in, out;
+	word_type spare;	/* word size minus shift count */
+
+	if (!b)
+		return u;
+
+	in.ll = u;
+	spare = (sizeof(SItype) * BITS_PER_UNIT) - b;
+	if (spare > 0) {
+		/* b < word size: high receives the bits pushed out of low */
+		USItype spill = (USItype) in.s.low >> spare;
+		out.s.high = ((USItype) in.s.high << b) | spill;
+		out.s.low = (USItype) in.s.low << b;
+	} else {
+		/* b >= word size: low is vacated, high is fed from low */
+		out.s.high = (USItype) in.s.low << -spare;
+		out.s.low = 0;
+	}
+	return out.ll;
+}
diff --git a/arch/blackfin/lib/ashrdi3.c b/arch/blackfin/lib/ashrdi3.c
new file mode 100644
index 000000000..b5b351e82
--- /dev/null
+++ b/arch/blackfin/lib/ashrdi3.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include "gcclib.h"
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+DItype __ashrdi3(DItype u, word_type b)__attribute__((l1_text));
+#endif
+
+/* 64-bit arithmetic right shift of u by b bits, built from 32-bit shifts. */
+DItype __ashrdi3(DItype u, word_type b)
+{
+	DIunion in, out;
+	word_type spare;	/* word size minus shift count */
+
+	if (!b)
+		return u;
+
+	in.ll = u;
+	spare = (sizeof(SItype) * BITS_PER_UNIT) - b;
+
+	if (spare > 0) {
+		/* b < word size: low receives the bits dropping out of high */
+		USItype spill = (USItype) in.s.high << spare;
+		out.s.low = ((USItype) in.s.low >> b) | spill;
+		out.s.high = in.s.high >> b;
+	} else {
+		/* b >= word size: high = 1..1 or 0..0 via the signed >> */
+		out.s.low = in.s.high >> -spare;
+		out.s.high = in.s.high >> (sizeof(SItype) * BITS_PER_UNIT - 1);
+	}
+	return out.ll;
+}
diff --git a/arch/blackfin/lib/divsi3.S b/arch/blackfin/lib/divsi3.S
new file mode 100644
index 000000000..ef2cd99ef
--- /dev/null
+++ b/arch/blackfin/lib/divsi3.S
@@ -0,0 +1,199 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ *
+ * 16 / 32 bit signed division.
+ *                 Special cases :
+ *                      1)  If(numerator == 0)
+ *                             return 0
+ *                      2)  If(denominator ==0)
+ *                             return positive max = 0x7fffffff
+ *                      3)  If(numerator == denominator)
+ *                             return 1
+ *                      4)  If(denominator ==1)
+ *                             return numerator
+ *                      5)  If(denominator == -1)
+ *                             return -numerator
+ *
+ *                 Operand         : R0 - Numerator   (i)
+ *                                   R1 - Denominator (i)
+ *                                   R0 - Quotient    (o)
+ *                 Registers Used : R2-R7,P0-P2
+ *
+ */
+
+.global   ___divsi3;
+.type ___divsi3, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2;
+___divsi3 :
+
+
+  R3 = R0 ^ R1;     /* bit 31 set when the operand signs differ */
+  R0 = ABS R0;
+
+  CC = V;
+
+  r3 = rot r3 by -1;
+  r1 = abs r1;      /* now both positive, r3.30 means "negate result",
+                    ** r3.31 means overflow, add one to result
+                    */
+  cc = r0 < r1;
+  if cc jump .Lret_zero;
+  r2 = r1 >> 15;
+  cc = r2;
+  if cc jump .Lidents;
+  r2 = r1 << 16;
+  cc = r2 <= r0;
+  if cc jump .Lidents;
+
+  DIVS(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+  DIVQ(R0, R1);
+
+  R0 = R0.L (Z);
+  r1 = r3 >> 31;    /* add overflow issue back in */
+  r0 = r0 + r1;
+  r1 = -r0;
+  cc = bittst(r3, 30);
+  if cc r0 = r1;
+  RTS;
+
+/* Can't use the primitives. Test common identities.
+** If the identity is true, return the value in R2.
+*/
+
+.Lidents:
+  CC = R1 == 0;                   /* check for divide by zero */
+  IF CC JUMP .Lident_return;
+
+  CC = R0 == 0;                   /* check for division of zero */
+  IF CC JUMP .Lzero_return;
+
+  CC = R0 == R1;                  /* check for identical operands */
+  IF CC JUMP .Lident_return;
+
+  CC = R1 == 1;                   /* check for divide by 1 */
+  IF CC JUMP .Lident_return;
+
+  R2.L = ONES R1;
+  R2 = R2.L (Z);
+  CC = R2 == 1;
+  IF CC JUMP .Lpower_of_two;
+
+  /* Identities haven't helped either.
+  ** Perform the full division process.
+  */
+
+  P1 = 31;                        /* Set loop counter   */
+
+  [--SP] = (R7:5);                /* Push registers R5-R7 */
+  R2 = -R1;
+  [--SP] = R2;
+  R2 = R0 << 1;                   /* R2 lsw of dividend  */
+  R6 = R0 ^ R1;                   /* Get sign */
+  R5 = R6 >> 31;                  /* Shift sign to LSB */
+
+  R0 = 0 ;                        /* Clear msw partial remainder */
+  R2 = R2 | R5;                   /* Shift quotient bit */
+  R6 = R0 ^ R1;                   /* Get new quotient bit */
+
+  LSETUP(.Llst,.Llend)  LC0 = P1;   /* Setup loop */
+.Llst:   R7 = R2 >> 31;            /* record copy of carry from R2 */
+        R2 = R2 << 1;             /* Shift 64 bit dividend up by 1 bit */
+        R0 = R0 << 1 || R5 = [SP];
+        R0 = R0 | R7;             /* and add carry */
+        CC = R6 < 0;              /* Check quotient(AQ) */
+                                  /* we might be subtracting divisor (AQ==0) */
+        IF CC R5 = R1;            /* or we might be adding divisor  (AQ==1)*/
+        R0 = R0 + R5;             /* do add or subtract, as indicated by AQ */
+        R6 = R0 ^ R1;             /* Generate next quotient bit */
+        R5 = R6 >> 31;
+                                  /* Assume AQ==1, shift in zero */
+        BITTGL(R5,0);             /* tweak AQ to be what we want to shift in */
+.Llend:  R2 = R2 + R5;             /* and then set shifted-in value to
+                                  ** tweaked AQ.
+                                  */
+  r1 = r3 >> 31;
+  r2 = r2 + r1;
+  cc = bittst(r3,30);
+  r0 = -r2;
+  if !cc r0 = r2;
+  SP += 4;                        /* discard the -R1 temp pushed before the loop */
+  (R7:5)= [SP++];                 /* Pop registers R5-R7 */
+  RTS;
+
+.Lident_return:
+  CC = R1 == 0;                   /* check for divide by zero  => 0x7fffffff */
+  R2 = -1 (X);
+  R2 >>= 1;
+  IF CC JUMP .Ltrue_ident_return;
+
+  CC = R0 == R1;                  /* check for identical operands => 1 */
+  R2 = 1 (Z);
+  IF CC JUMP .Ltrue_ident_return;
+
+  R2 = R0;                        /* assume divide by 1 => numerator */
+  /*FALLTHRU*/
+
+.Ltrue_ident_return:
+  R0 = R2;                        /* Return an identity value */
+  R2 = -R2;
+  CC = bittst(R3,30);
+  IF CC R0 = R2;
+.Lzero_return:
+  RTS;                            /* ...including zero */
+
+.Lpower_of_two:
+  /* Y has a single bit set, which means it's a power of two.
+  ** That means we can perform the division just by shifting
+  ** X to the right the appropriate number of bits
+  */
+
+  /* signbits returns the number of sign bits, minus one.
+  ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need
+  ** to shift right n-signbits spaces. It also means 0x80000000
+  ** is a special case, because that *also* gives a signbits of 0
+  */
+
+  R2 = R0 >> 31;
+  CC = R1 < 0;
+  IF CC JUMP .Ltrue_ident_return;
+
+  R1.l = SIGNBITS R1;
+  R1 = R1.L (Z);
+  R1 += -30;
+  R0 = LSHIFT R0 by R1.L;
+  r1 = r3 >> 31;
+  r0 = r0 + r1;
+  R2 = -R0;                       // negate result if necessary
+  CC = bittst(R3,30);
+  IF CC R0 = R2;
+  RTS;
+
+.Lret_zero:
+  R0 = 0;
+  RTS;
+
+.size ___divsi3, .-___divsi3
diff --git a/arch/blackfin/lib/gcclib.h b/arch/blackfin/lib/gcclib.h
new file mode 100644
index 000000000..724f07f14
--- /dev/null
+++ b/arch/blackfin/lib/gcclib.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#define BITS_PER_UNIT  8	/* bits per byte */
+#define SI_TYPE_SIZE (sizeof (SItype) * BITS_PER_UNIT)	/* bits in an SItype */
+
+typedef unsigned int UQItype __attribute__ ((mode(QI)));	/* unsigned, GCC QI mode */
+typedef int SItype __attribute__ ((mode(SI)));	/* signed, GCC SI mode */
+typedef unsigned int USItype __attribute__ ((mode(SI)));	/* unsigned, GCC SI mode */
+typedef int DItype __attribute__ ((mode(DI)));	/* signed, GCC DI mode */
+typedef int word_type __attribute__ ((mode(__word__)));	/* signed machine word */
+typedef unsigned int UDItype __attribute__ ((mode(DI)));	/* unsigned, GCC DI mode */
+
+struct DIstruct {
+	SItype low, high;	/* low half is declared (and so laid out) first */
+};
+
+typedef union {	/* lets the shift helpers address a DItype as two SItype halves */
+	struct DIstruct s;
+	DItype ll;
+} DIunion;
diff --git a/arch/blackfin/lib/ins.S b/arch/blackfin/lib/ins.S
new file mode 100644
index 000000000..d59608dec
--- /dev/null
+++ b/arch/blackfin/lib/ins.S
@@ -0,0 +1,118 @@
+/*
+ * arch/blackfin/lib/ins.S - ins{bwl} using hardware loops
+ *
+ * Copyright 2004-2008 Analog Devices Inc.
+ * Copyright (C) 2005 Bas Vermeulen, BuyWays BV <bas@buyways.nl>
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/linkage.h>
+#include <asm/blackfin.h>
+
+.align 2
+
+#ifdef CONFIG_IPIPE
+# define DO_CLI \
+	[--sp] = rets;	/* popped again at the end of DO_STI */ \
+	[--sp] = (P5:0); \
+	sp += -12; \
+	call ___ipipe_disable_root_irqs_hw; \
+	sp += 12; \
+	(P5:0) = [sp++];
+# define CLI_INNER_NOP
+#else /* !CONFIG_IPIPE */
+# define DO_CLI cli R3; /* R3 is handed back to sti in DO_STI */
+# define CLI_INNER_NOP nop; nop; nop;
+#endif /* CONFIG_IPIPE */
+
+#ifdef CONFIG_IPIPE
+# define DO_STI \
+	sp += -12; \
+	call ___ipipe_enable_root_irqs_hw; \
+	sp += 12; \
+2:	rets = [sp++];
+#else /* !CONFIG_IPIPE */
+# define DO_STI 2: sti R3;
+#endif /* CONFIG_IPIPE */
+
+#ifdef CONFIG_BFIN_INS_LOWOVERHEAD
+# define CLI_OUTER DO_CLI;
+# define STI_OUTER DO_STI;
+# define CLI_INNER 1: /* loop-start label only; CLI_OUTER already ran DO_CLI */
+# if ANOMALY_05000416
+#  define STI_INNER nop; 2: nop;
+# else
+#  define STI_INNER 2:
+# endif /* ANOMALY_05000416 */
+#else /* !CONFIG_BFIN_INS_LOWOVERHEAD */
+# define CLI_OUTER
+# define STI_OUTER
+# define CLI_INNER 1: DO_CLI; CLI_INNER_NOP;
+# define STI_INNER DO_STI;
+#endif /* CONFIG_BFIN_INS_LOWOVERHEAD */
+
+/*
+ * Reads on the Blackfin are speculative. In Blackfin terms, this means they
+ * can be interrupted at any time (even after they have been issued on to the
+ * external bus), and re-issued after the interrupt occurs.
+ *
+ * If a FIFO is sitting on the end of the read, it will see two reads,
+ * when the core only sees one. The FIFO receives the read which is cancelled,
+ * and not delivered to the core.
+ *
+ * To solve this, interrupts are turned off before reads occur to I/O space.
+ * There are 3 versions of all these functions
+ *  - turns interrupts off every read (higher overhead, but lower latency)
+ *  - turns interrupts off every loop (low overhead, but longer latency)
+ *  - DMA version, which do not suffer from this issue. DMA versions have
+ *      different name (prefixed by dma_ ), and are located in
+ *      ../kernel/bfin_dma.c
+ * Using the DMA related functions is recommended for transferring large
+ * buffers in/out of FIFOs.
+ */
+
+#define COMMON_INS(func, ops) \
+ENTRY(_ins##func) \
+	P0 = R0;	/* P0 = port */ \
+	CLI_OUTER;	/* 3 instructions before first read access */ \
+	P1 = R1;	/* P1 = address */ \
+	P2 = R2;	/* P2 = count (loop iterations) */ \
+	SSYNC; \
+ \
+	LSETUP(1f, 2f) LC0 = P2; /* labels 1: and 2: come from CLI_INNER / STI_INNER */ \
+	CLI_INNER; \
+	ops;		/* read-from-port + store sequence given by the instance */ \
+	STI_INNER; \
+ \
+	STI_OUTER; \
+	RTS; \
+ENDPROC(_ins##func)
+
+COMMON_INS(l, \
+	R0 = [P0];	/* 32-bit read from the port */ \
+	[P1++] = R0;	/* 32-bit store, buffer pointer advances */ \
+)
+
+COMMON_INS(w, \
+	R0 = W[P0];	/* 16-bit read from the port */ \
+	W[P1++] = R0;	/* 16-bit store, buffer pointer advances */ \
+)
+
+COMMON_INS(w_8, \
+	R0 = W[P0];	/* 16-bit read from the port */ \
+	B[P1++] = R0;	/* store the low byte first */ \
+	R0 = R0 >> 8; \
+	B[P1++] = R0;	/* then the high byte */ \
+)
+
+COMMON_INS(b, \
+	R0 = B[P0];	/* 8-bit read from the port */ \
+	B[P1++] = R0;	/* 8-bit store, buffer pointer advances */ \
+)
+
+COMMON_INS(l_16, \
+	R0 = [P0];	/* 32-bit read from the port */ \
+	W[P1++] = R0;	/* store the low 16 bits first */ \
+	R0 = R0 >> 16; \
+	W[P1++] = R0;	/* then the high 16 bits */ \
+)
diff --git a/arch/blackfin/lib/lshrdi3.c b/arch/blackfin/lib/lshrdi3.c
new file mode 100644
index 000000000..53f174104
--- /dev/null
+++ b/arch/blackfin/lib/lshrdi3.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include "gcclib.h"
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+DItype __lshrdi3(DItype u, word_type b)__attribute__((l1_text));
+#endif
+
+/* 64-bit logical (zero-filling) right shift of u by b bits. */
+DItype __lshrdi3(DItype u, word_type b)
+{
+	DIunion in, out;
+	word_type spare;	/* word size minus shift count */
+
+	if (!b)
+		return u;
+
+	in.ll = u;
+	spare = (sizeof(SItype) * BITS_PER_UNIT) - b;
+	if (spare > 0) {
+		/* b < word size: low receives the bits dropping out of high */
+		USItype spill = (USItype) in.s.high << spare;
+		out.s.low = ((USItype) in.s.low >> b) | spill;
+		out.s.high = (USItype) in.s.high >> b;
+	} else {
+		/* b >= word size: high is cleared, low is fed from high */
+		out.s.low = (USItype) in.s.high >> -spare;
+		out.s.high = 0;
+	}
+	return out.ll;
+}
diff --git a/arch/blackfin/lib/memchr.S b/arch/blackfin/lib/memchr.S
new file mode 100644
index 000000000..bcfc8a14c
--- /dev/null
+++ b/arch/blackfin/lib/memchr.S
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2005-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* void *memchr(const void *s, int c, size_t n);
+ * R0 = address (s)
+ * R1 = sought byte (c)
+ * R2 = count (n)
+ *
+ * Returns pointer to located character.
+ */
+
+.text
+
+.align 2
+
+ENTRY(_memchr)
+	P0 = R0;		/* P0 = address */
+	P2 = R2;		/* P2 = count */
+	R1 = R1.B(Z);		/* only the low byte of c is compared */
+	CC = R2 == 0;		/* zero count: report "not found" */
+	IF CC JUMP .Lfailed;
+
+.Lbytes:
+	LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2;
+
+.Lbyte_loop_s:
+	R3 = B[P0++](Z);
+	CC = R3 == R1;
+	IF CC JUMP .Lfound;
+.Lbyte_loop_e:
+	NOP;
+
+.Lfailed:
+	R0=0;			/* NULL result */
+	RTS;
+
+.Lfound:
+	R0 = P0;
+	R0 += -1;		/* P0 was post-incremented past the match */
+	RTS;
+
+ENDPROC(_memchr)
diff --git a/arch/blackfin/lib/memcmp.S b/arch/blackfin/lib/memcmp.S
new file mode 100644
index 000000000..2e1c9477f
--- /dev/null
+++ b/arch/blackfin/lib/memcmp.S
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* int memcmp(const void *s1, const void *s2, size_t n);
+ * R0 = First Address (s1)
+ * R1 = Second Address (s2)
+ * R2 = count (n)
+ *
+ * Favours word aligned data.
+ */
+
+.text
+
+.align 2
+
+ENTRY(_memcmp)
+	I1 = P3;			/* keep caller's P3; put back before each RTS */
+	P0 = R0;			/* P0 = s1 address */
+	P3 = R1;			/* P3 = s2 Address  */
+	P2 = R2 ;			/* P2 = count */
+	CC = R2 <= 7(IU);
+	IF CC JUMP .Ltoo_small;
+	I0 = R1;			/* s2 */
+	R1 = R1 | R0;		/* OR addresses together */
+	R1 <<= 30;		/* check bottom two bits */
+	CC =  AZ;			/* AZ set if zero. */
+	IF !CC JUMP .Lbytes ;	/* Jump if addrs not aligned. */
+
+	P1 = P2 >> 2;		/* count = n/4 */
+	R3 =  3;
+	R2 = R2 & R3;		/* remainder */
+	P2 = R2;			/* set remainder */
+
+	LSETUP (.Lquad_loop_s, .Lquad_loop_e) LC0=P1;
+.Lquad_loop_s:
+#if ANOMALY_05000202
+	R0 = [P0++];
+	R1 = [I0++];
+#else
+	MNOP || R0 = [P0++] || R1 = [I0++];
+#endif
+	CC = R0 == R1;
+	IF !CC JUMP .Lquad_different;
+.Lquad_loop_e:
+	NOP;
+
+	P3 = I0;			/* s2 */
+.Ltoo_small:
+	CC = P2 == 0;		/* Check zero count*/
+	IF CC JUMP .Lfinished;	/* very unlikely*/
+
+.Lbytes:
+	LSETUP (.Lbyte_loop_s, .Lbyte_loop_e) LC0=P2;
+.Lbyte_loop_s:
+	R1 = B[P3++](Z);	/* *s2 */
+	R0 = B[P0++](Z);	/* *s1 */
+	CC = R0 == R1;
+	IF !CC JUMP .Ldifferent;
+.Lbyte_loop_e:
+	NOP;
+
+.Ldifferent:
+	R0 = R0 - R1;		/* 0 when reached by falling out of the loop */
+	P3 = I1;
+	RTS;
+
+.Lquad_different:
+	/* We've read two quads which don't match.
+	 * Can't just compare them, because we're
+	 * a little-endian machine, so the MSBs of
+	 * the regs occur at later addresses in the
+	 * string.
+	 * Arrange to re-read those two quads again,
+	 * byte-by-byte.
+	 */
+	P0 += -4;		/* back up to the start of the */
+	P3 = I0;		/* quads, and increase the*/
+	P2 += 4;		/* remainder count*/
+	P3 += -4;
+	JUMP .Lbytes;
+
+.Lfinished:
+	R0 = 0;
+	P3 = I1;
+	RTS;
+
+ENDPROC(_memcmp)
diff --git a/arch/blackfin/lib/memcpy.S b/arch/blackfin/lib/memcpy.S
new file mode 100644
index 000000000..53cb3698a
--- /dev/null
+++ b/arch/blackfin/lib/memcpy.S
@@ -0,0 +1,124 @@
+/*
+ * internal version of memcpy(), issued by the compiler to copy blocks of
+ * data around. This is really memmove() - it has to be able to deal with
+ * possible overlaps, because that ambiguity is when the compiler gives up
+ * and calls a function. We have our own, internal version so that we get
+ * something we trust, even if the user has redefined the normal symbol.
+ *
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* void *memcpy(void *dest, const void *src, size_t n);
+ * R0 = To Address (dest) (leave unchanged to form result)
+ * R1 = From Address (src)
+ * R2 = count
+ *
+ * Note: Favours word alignment
+ */
+
+#ifdef CONFIG_MEMCPY_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2
+
+ENTRY(_memcpy)
+	CC = R2 <=  0;	/* length not positive? */
+	IF CC JUMP .L_P1L2147483647;	/* Nothing to do */
+
+	P0 = R0 ;	/* dst*/
+	P1 = R1 ;	/* src*/
+	P2 = R2 ;	/* length */
+
+	/* check for overlapping data */
+	CC = R1 < R0;	/* src < dst */
+	IF !CC JUMP .Lno_overlap;
+	R3 = R1 + R2;
+	CC = R0 < R3;	/* and dst < src+len */
+	IF CC JUMP .Lhas_overlap;
+
+.Lno_overlap:
+	/* Check for aligned data.*/
+
+	R3 = R1 | R0;
+	R1 = 0x3;
+	R3 = R3 & R1;
+	CC = R3;	/* low bits set on either address? */
+	IF CC JUMP .Lnot_aligned;
+
+	/* Both addresses are word-aligned, so we can copy
+	at least part of the data using word copies.*/
+	P2 = P2 >> 2;
+	CC = P2 <= 2;
+	IF !CC JUMP .Lmore_than_seven;
+	/* at most two whole words: copy all R2 bytes one at a time */
+	P2 = R2;
+	LSETUP(.Lthree_start, .Lthree_end) LC0=P2;
+.Lthree_start:
+	R3 = B[P1++] (X);
+.Lthree_end:
+	B[P0++] = R3;
+
+	RTS;
+
+.Lmore_than_seven:
+	/* There are at least three words (twelve bytes) to copy. */
+	P2 += -1;	/* because we unroll one iteration */
+	LSETUP(.Lword_loops, .Lword_loope) LC0=P2;
+	I1 = P1;
+	R3 = [I1++];
+#if ANOMALY_05000202
+.Lword_loops:
+	[P0++] = R3;
+.Lword_loope:
+	R3 = [I1++];
+#else
+.Lword_loops:
+.Lword_loope:
+	MNOP || [P0++] = R3 || R3 = [I1++];
+#endif
+	[P0++] = R3;
+	/* Any remaining bytes to copy? */
+	R3 = 0x3;
+	R3 = R2 & R3;
+	CC = R3 == 0;
+	P1 = I1;	/* in case there's something left, */
+	IF !CC JUMP .Lbytes_left;
+	RTS;
+.Lbytes_left:	P2 = R3;
+.Lnot_aligned:
+	/* From here, we're copying byte-by-byte. */
+	LSETUP (.Lbyte_start, .Lbyte_end) LC0=P2;
+.Lbyte_start:
+	R1 = B[P1++] (X);
+.Lbyte_end:
+	B[P0++] = R1;
+
+.L_P1L2147483647:
+	RTS;
+
+.Lhas_overlap:
+	/* Need to reverse the copying, because the
+	 * dst would clobber the src.
+	 * Don't bother to work out alignment for
+	 * the reverse case.
+	 */
+	P0 = P0 + P2;
+	P0 += -1;
+	P1 = P1 + P2;
+	P1 += -1;
+	LSETUP(.Lover_start, .Lover_end) LC0=P2;
+.Lover_start:
+	R1 = B[P1--] (X);
+.Lover_end:
+	B[P0--] = R1;
+
+	RTS;
+
+ENDPROC(_memcpy)
diff --git a/arch/blackfin/lib/memmove.S b/arch/blackfin/lib/memmove.S
new file mode 100644
index 000000000..e0b78208f
--- /dev/null
+++ b/arch/blackfin/lib/memmove.S
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2005-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+.align 2
+
+/*
+ * C Library function MEMMOVE
+ * R0 = To Address (leave unchanged to form result)
+ * R1 = From Address
+ * R2 = count
+ * Data may overlap
+ */
+
+ENTRY(_memmove)
+	I1 = P3;                  /* keep caller's P3; put back before each RTS */
+	P0 = R0;                  /* P0 = To address */
+	P3 = R1;                  /* P3 = From Address */
+	P2 = R2;                  /* P2 = count */
+	CC = P2 == 0;             /* Check zero count*/
+	IF CC JUMP .Lfinished;    /* very unlikely */
+
+	CC = R1 < R0 (IU);        /* From < To */
+	IF !CC JUMP .Lno_overlap;
+	R3 = R1 + R2;
+	CC = R0 <= R3 (IU);       /* (From+len) >= To */
+	IF CC JUMP .Loverlap;
+.Lno_overlap:
+	R3 = 11;
+	CC = R2 <= R3;            /* 11 bytes or fewer: plain byte loop */
+	IF CC JUMP .Lbytes;
+	R3 = R1 | R0;             /* OR addresses together */
+	R3 <<= 30;                /* check bottom two bits */
+	CC =  AZ;                 /* AZ set if zero.*/
+	IF !CC JUMP .Lbytes;      /* Jump if addrs not aligned.*/
+
+	I0 = P3;
+	P1 = P2 >> 2;             /* count = n/4 */
+	P1 += -1;                 /* first word is loaded before the loop */
+	R3 =  3;
+	R2 = R2 & R3;             /* remainder */
+	P2 = R2;                  /* set remainder */
+	R1 = [I0++];
+
+	LSETUP (.Lquad_loops, .Lquad_loope) LC0=P1;
+#if ANOMALY_05000202
+.Lquad_loops:
+	[P0++] = R1;
+.Lquad_loope:
+	R1 = [I0++];
+#else
+.Lquad_loops:
+.Lquad_loope:
+	 MNOP || [P0++] = R1 || R1 = [I0++];
+#endif
+	[P0++] = R1;
+
+	CC = P2 == 0;             /* any remaining bytes? */
+	P3 = I0;                  /* Amend P3 to updated ptr. */
+	IF !CC JUMP .Lbytes;
+	P3 = I1;
+	RTS;
+
+.Lbytes:     LSETUP (.Lbyte2_s, .Lbyte2_e) LC0=P2;
+.Lbyte2_s:   R1 = B[P3++](Z);
+.Lbyte2_e:   B[P0++] = R1;
+
+.Lfinished:  P3 = I1;
+	RTS;
+
+.Loverlap:
+	P2 += -1;                 /* P2 = count - 1 = offset of the last byte */
+	P0 = P0 + P2;
+	P3 = P3 + P2;
+	R1 = B[P3--] (Z);         /* copy runs backwards, last byte first */
+	CC = P2 == 0;
+	IF CC JUMP .Lno_loop;
+#if ANOMALY_05000245
+	NOP;
+	NOP;
+#endif
+	LSETUP (.Lol_s, .Lol_e) LC0 = P2;
+.Lol_s:    B[P0--] = R1;
+.Lol_e:    R1 = B[P3--] (Z);
+.Lno_loop: B[P0] = R1;
+	P3 = I1;
+	RTS;
+
+ENDPROC(_memmove)
diff --git a/arch/blackfin/lib/memset.S b/arch/blackfin/lib/memset.S
new file mode 100644
index 000000000..cdcf9148e
--- /dev/null
+++ b/arch/blackfin/lib/memset.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+.align 2
+
+#ifdef CONFIG_MEMSET_L1
+.section .l1.text
+#else
+.text
+#endif
+
+/*
+ * C Library function MEMSET
+ * R0 = address (leave unchanged to form result)
+ * R1 = filler byte
+ * R2 = count
+ * Favours word aligned data.
+ * The strncpy assumes that I0 and I1 are not used in this function
+ */
+
+ENTRY(_memset)
+	P0 = R0 ;              /* P0 = address */
+	P2 = R2 ;              /* P2 = count   */
+	R3 = R0 + R2;          /* end          */
+	CC = R2 <= 7(IU);
+	IF CC JUMP  .Ltoo_small;
+	R1 = R1.B (Z);         /* R1 = fill char */
+	R2 =  3;
+	R2 = R0 & R2;          /* addr bottom two bits */
+	CC =  R2 == 0;             /* CC set if already word aligned */
+	IF !CC JUMP  .Lforce_align ;  /* Jump if addr not aligned. */
+
+.Laligned:
+	P1 = P2 >> 2;          /* count = n/4        */
+	R2 = R1 <<  8;         /* create quad filler */
+	R2.L = R2.L + R1.L(NS);
+	R2.H = R2.L + R1.H(NS);
+	P2 = R3;
+
+	LSETUP (.Lquad_loop , .Lquad_loop) LC0=P1;
+.Lquad_loop:
+	[P0++] = R2;
+
+	CC = P0 == P2;
+	IF !CC JUMP .Lbytes_left;
+	RTS;
+
+.Lbytes_left:
+	R2 = R3;                /* end point */
+	R3 = P0;                /* current position */
+	R2 = R2 - R3;           /* bytes left */
+	P2 = R2;
+
+.Ltoo_small:
+	CC = P2 == 0;           /* Check zero count */
+	IF CC JUMP .Lfinished;    /* Unusual */
+
+.Lbytes:
+	LSETUP (.Lbyte_loop , .Lbyte_loop) LC0=P2;
+.Lbyte_loop:
+	B[P0++] = R1;
+
+.Lfinished:
+	RTS;
+
+.Lforce_align:
+	CC = BITTST (R0, 0);  /* odd byte */
+	R0 = 4;
+	R0 = R0 - R2;          /* bytes needed to reach word alignment */
+	P1 = R0;
+	R0 = P0;		    /* Recover the address to return */
+	IF !CC JUMP .Lskip1;
+	B[P0++] = R1;
+.Lskip1:
+	CC = R2 <= 2;          /* 2 bytes */
+	P2 -= P1;              /* reduce count */
+	IF !CC JUMP .Laligned;
+	B[P0++] = R1;
+	B[P0++] = R1;
+	JUMP .Laligned;
+
+ENDPROC(_memset)
diff --git a/arch/blackfin/lib/modsi3.S b/arch/blackfin/lib/modsi3.S
new file mode 100644
index 000000000..f7026ce1f
--- /dev/null
+++ b/arch/blackfin/lib/modsi3.S
@@ -0,0 +1,57 @@
+/*
+ * This program computes 32 bit signed remainder. It calls ___divsi3()
+ * for quotient estimation.
+ *   Registers in:  R0, R1 = Numerator/ Denominator
+ *   Registers out: R0     = Remainder
+ *
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+.global ___modsi3;
+.type ___modsi3, STT_FUNC;
+.extern ___divsi3;
+.type ___divsi3, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+___modsi3:
+
+	CC=R0==0;
+	IF CC JUMP .LRETURN_R0;		/* Return 0, if numerator  == 0 */
+	CC=R1==0;
+	IF CC JUMP .LRETURN_ZERO;		/* Return 0, if denominator == 0 */
+	CC=R0==R1;
+	IF CC JUMP .LRETURN_ZERO;		/* Return 0, if numerator == denominator */
+	CC = R1 == 1;
+	IF CC JUMP .LRETURN_ZERO;		/* Return 0, if denominator ==  1 */
+	CC = R1 == -1;
+	IF CC JUMP .LRETURN_ZERO;		/* Return 0, if denominator == -1 */
+
+	/* Valid input. Use __divsi3() to compute the quotient, and then
+	 * derive the remainder from that. */
+
+	[--SP] = (R7:6);		/* Push  R7 and R6 */
+	[--SP] = RETS;			/* and return address */
+	R7 = R0;			/* Copy of R0 */
+	R6 = R1;			/* Save for later */
+	SP += -12;			/* Should always provide this space */
+	CALL ___divsi3;			/* Compute signed quotient using ___divsi3()*/
+	SP += 12;
+	R0 *= R6;			/* Quotient * divisor */
+	R0 = R7 - R0;			/* Dividend - (quotient * divisor) */
+	RETS = [SP++];			/* Get back return address */
+	(R7:6) = [SP++];		/* Pop registers R7 and R6 */
+	RTS;				/* Return with the remainder in R0 */
+
+.LRETURN_ZERO:
+	R0 = 0;
+.LRETURN_R0:
+	RTS;
+
+.size ___modsi3, .-___modsi3
diff --git a/arch/blackfin/lib/muldi3.S b/arch/blackfin/lib/muldi3.S
new file mode 100644
index 000000000..abf9b2a51
--- /dev/null
+++ b/arch/blackfin/lib/muldi3.S
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2008 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+.align 2
+.global ___muldi3;
+.type ___muldi3, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+/*
+	   R1:R0 * R3:R2
+	 = R1.h:R1.l:R0.h:R0.l * R3.h:R3.l:R2.h:R2.l
+[X]	 = (R1.h * R3.h) * 2^96
+[X]	   + (R1.h * R3.l + R1.l * R3.h) * 2^80
+[X]	   + (R1.h * R2.h + R1.l * R3.l + R3.h * R0.h) * 2^64
+[T1]	   + (R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h) * 2^48
+[T2]	   + (R1.l * R2.l + R3.l * R0.l + R0.h * R2.h) * 2^32
+[T3]	   + (R0.l * R2.h + R2.l * R0.h) * 2^16
+[T4]	   + (R0.l * R2.l)
+
+	We can discard the first three lines marked "X" since we produce
+	only a 64 bit result.  So, we need ten 16-bit multiplies.
+
+	Individual mul-acc results:
+[E1]	 =  R1.h * R2.l + R3.h * R0.l + R1.l * R2.h + R3.l * R0.h
+[E2]	 =  R1.l * R2.l + R3.l * R0.l + R0.h * R2.h
+[E3]	 =  R0.l * R2.h + R2.l * R0.h
+[E4]	 =  R0.l * R2.l
+
+	We also need to add high parts from lower-level results to higher ones:
+	E[n]c = E[n] + (E[n+1]c >> 16), where E4c := E4
+
+	One interesting property is that all parts of the result that depend
+	on the sign of the multiplication are discarded.  Those would be the
+	multiplications involving R1.h and R3.h, but only the top 16 bit of
+	the 32 bit result depend on the sign, and since R1.h and R3.h only
+	occur in E1, the top half of these results is cut off.
+	So, we can just use FU mode for all of the 16-bit multiplies, and
+	ignore questions of when to use mixed mode.  */
+
+___muldi3:
+	/* [SP] technically is part of the caller's frame, but we can
+	   use it as scratch space.  */
+	A0 = R2.H * R1.L, A1 = R2.L * R1.H (FU) || R3 = [SP + 12];	/* E1 */
+	A0 += R3.H * R0.L, A1 += R3.L * R0.H (FU) || [SP] = R4;		/* E1 */
+	A0 += A1;							/* E1 */
+	R4 = A0.w;							/* R4 = E1 */
+	A0 = R0.l * R3.l (FU);						/* E2 */
+	A0 += R2.l * R1.l (FU);						/* E2 */
+
+	A1 = R2.L * R0.L (FU);						/* E4 */
+	R3 = A1.w;							/* R3 = E4 */
+	A1 = A1 >> 16;							/* E3c */
+	A0 += R2.H * R0.H, A1 += R2.L * R0.H (FU);			/* E2, E3c */
+	A1 += R0.L * R2.H (FU);						/* E3c */
+	R0 = A1.w;							/* R0 = E3c */
+	A1 = A1 >> 16;							/* E2c */
+	A0 += A1;							/* E2c */
+	R1 = A0.w;							/* R1 = E2c */
+
+	/* low(result) = low(E3c):low(E4) */
+	R0 = PACK (R0.l, R3.l);
+	/* high(result) = E2c + (E1 << 16); R4 is reloaded from the [SP] scratch slot */
+	R1.h = R1.h + R4.l (NS) || R4 = [SP];
+	RTS;
+
+.size ___muldi3, .-___muldi3
diff --git a/arch/blackfin/lib/outs.S b/arch/blackfin/lib/outs.S
new file mode 100644
index 000000000..06a5e6744
--- /dev/null
+++ b/arch/blackfin/lib/outs.S
@@ -0,0 +1,68 @@
+/*
+ * Implementation of outs{bwl} for BlackFin processors using zero overhead loops.
+ *
+ * Copyright 2005-2009 Analog Devices Inc.
+ *                2005 BuyWays BV
+ *                      Bas Vermeulen <bas@buyways.nl>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/linkage.h>
+
+.align 2
+
+ENTRY(_outsl)
+	CC = R2 == 0;	/* zero count: nothing to write */
+	IF CC JUMP 1f;
+	P0 = R0;	/* P0 = port */
+	P1 = R1;	/* P1 = address */
+	P2 = R2;	/* P2 = count */
+
+	LSETUP( .Llong_loop_s, .Llong_loop_e) LC0 = P2;
+.Llong_loop_s: R0 = [P1++];	/* next 32-bit value from the buffer */
+.Llong_loop_e: [P0] = R0;	/* port address is not advanced */
+1:	RTS;
+ENDPROC(_outsl)
+
+ENTRY(_outsw)
+	CC = R2 == 0;	/* zero count: nothing to write */
+	IF CC JUMP 1f;
+	P0 = R0;	/* P0 = port */
+	P1 = R1;	/* P1 = address */
+	P2 = R2;	/* P2 = count */
+
+	LSETUP( .Lword_loop_s, .Lword_loop_e) LC0 = P2;
+.Lword_loop_s: R0 = W[P1++];	/* next 16-bit value from the buffer */
+.Lword_loop_e: W[P0] = R0;	/* port address is not advanced */
+1:	RTS;
+ENDPROC(_outsw)
+
+ENTRY(_outsb)
+	CC = R2 == 0;	/* zero count: nothing to write */
+	IF CC JUMP 1f;
+	P0 = R0;	/* P0 = port */
+	P1 = R1;	/* P1 = address */
+	P2 = R2;	/* P2 = count */
+
+	LSETUP( .Lbyte_loop_s, .Lbyte_loop_e) LC0 = P2;
+.Lbyte_loop_s: R0 = B[P1++];	/* next byte from the buffer */
+.Lbyte_loop_e: B[P0] = R0;	/* port address is not advanced */
+1:	RTS;
+ENDPROC(_outsb)
+
+ENTRY(_outsw_8)
+	CC = R2 == 0;	/* zero count: nothing to write */
+	IF CC JUMP 1f;
+	P0 = R0;	/* P0 = port */
+	P1 = R1;	/* P1 = address */
+	P2 = R2;	/* P2 = count */
+
+	LSETUP( .Lword8_loop_s, .Lword8_loop_e) LC0 = P2;
+.Lword8_loop_s: R1 = B[P1++];	/* first byte -> low half of the word */
+		R0 = B[P1++];	/* second byte -> high half */
+		R0 = R0 << 8;
+		R0 = R0 + R1;
+.Lword8_loop_e: W[P0] = R0;	/* one 16-bit write per two buffer bytes */
+1:	RTS;
+ENDPROC(_outsw_8)
diff --git a/arch/blackfin/lib/smulsi3_highpart.S b/arch/blackfin/lib/smulsi3_highpart.S
new file mode 100644
index 000000000..e50d6c4ac
--- /dev/null
+++ b/arch/blackfin/lib/smulsi3_highpart.S
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2007 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+.align 2
+.global ___smulsi3_highpart;
+.type ___smulsi3_highpart, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+___smulsi3_highpart:
+	R2 = R1.L * R0.L (FU);		/* low(R1) x low(R0) */
+	R3 = R1.H * R0.L (IS,M);	/* high(R1) x low(R0) cross product */
+	R0 = R0.H * R1.H, R1 = R0.H * R1.L (IS,M);	/* high x high, other cross product */
+
+	R1.L = R2.H + R1.L;
+	cc = ac0;			/* carry out of the 16-bit add */
+	R2 = cc;
+
+	R1.L = R1.L + R3.L;
+	cc = ac0;			/* carry out of the second 16-bit add */
+	R1 >>>= 16;
+	R3 >>>= 16;
+	R1 = R1 + R3;
+	R1 = R1 + R2;			/* first carry */
+	R2 = cc;
+	R1 = R1 + R2;			/* second carry */
+
+	R0 = R0 + R1;			/* result returned in R0 */
+	RTS;
+
+.size ___smulsi3_highpart, .-___smulsi3_highpart
diff --git a/arch/blackfin/lib/strcmp.S b/arch/blackfin/lib/strcmp.S
new file mode 100644
index 000000000..9c8b98637
--- /dev/null
+++ b/arch/blackfin/lib/strcmp.S
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2005-2010 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* int strcmp(const char *s1, const char *s2);
+ * R0 = address (s1)
+ * R1 = address (s2)
+ *
+ * Returns an integer less than, equal to, or greater than zero if s1
+ *  is found, respectively, to be less than, to match, or be greater
+ *  than s2.
+ */
+
+#ifdef CONFIG_STRCMP_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2
+
+ENTRY(_strcmp)
+	P0 = R0 ;       /* s1 */
+	P1 = R1 ;       /* s2 */
+
+1:
+	R0 = B[P0++] (Z);      /* get *s1 */
+	R1 = B[P1++] (Z);      /* get *s2 */
+	CC = R0 == R1;         /* compare a byte */
+	if ! cc jump 2f;       /* not equal, break out */
+	CC = R0;               /* at end of s1? */
+	if cc jump 1b (bp);    /* no, keep going */
+	jump.s 3f;             /* strings are equal; R0 is already 0 */
+2:
+	R0 = R0 - R1;          /* *s1 - *s2 */
+3:
+	RTS;
+
+ENDPROC(_strcmp)
diff --git a/arch/blackfin/lib/strcpy.S b/arch/blackfin/lib/strcpy.S
new file mode 100644
index 000000000..9495aa77c
--- /dev/null
+++ b/arch/blackfin/lib/strcpy.S
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2005-2010 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* char *strcpy(char *dest, const char *src);
+ * R0 = address (dest)
+ * R1 = address (src)
+ *
+ * Returns a pointer to the destination string dest
+ */
+
+#ifdef CONFIG_STRCPY_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2
+
+ENTRY(_strcpy)
+	P0 = R0 ;       /* dst*/
+	P1 = R1 ;       /* src*/
+
+1:
+	R1 = B [P1++] (Z);
+	B [P0++] = R1;		/* stored before the test, so the NUL is copied too */
+	CC = R1;		/* CC clear once the NUL has been copied */
+	if cc jump 1b (bp);
+	RTS;			/* R0 still holds dest */
+
+ENDPROC(_strcpy)
diff --git a/arch/blackfin/lib/strncmp.S b/arch/blackfin/lib/strncmp.S
new file mode 100644
index 000000000..3bfaedce8
--- /dev/null
+++ b/arch/blackfin/lib/strncmp.S
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2005-2010 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+/* int strncmp(const char *s1, const char *s2, size_t n);
+ * R0 = address (s1)
+ * R1 = address (s2)
+ * R2 = size (n)
+ * Returns <0, 0 or >0 as the first n bytes of s1 compare <, == or > s2
+ */
+
+#ifdef CONFIG_STRNCMP_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2
+
+ENTRY(_strncmp)
+	CC = R2 == 0;          /* n == 0: nothing to compare */
+	if CC JUMP 5f;         /* ... return 0 */
+
+	P0 = R0 ;       /* P0 walks s1 */
+	P1 = R1 ;       /* P1 walks s2 */
+1:
+	R0 = B[P0++] (Z);      /* get *s1 (zero-extended), advance */
+	R1 = B[P1++] (Z);      /* get *s2 (zero-extended), advance */
+	CC = R0 == R1;         /* compare a byte */
+	if ! cc jump 3f;       /* not equal, break out */
+	CC = R0;               /* CC = (byte != 0): more of s1 left? */
+	if ! cc jump 4f;       /* no: both hit NUL, R0 is already 0 */
+	R2 += -1;              /* one more byte compared, adjust count */
+	CC = R2 == 0;
+	if ! cc jump 1b (bp);  /* more to do, keep going */
+2:
+	R0 = 0;                /* count exhausted without a mismatch: equal */
+	jump.s 4f;
+3:
+	R0 = R0 - R1;          /* result = *s1 - *s2 of the differing bytes */
+4:
+	RTS;                   /* result is in R0 */
+
+5:
+	R0 = 0;                /* n == 0 compares equal */
+	RTS;
+
+ENDPROC(_strncmp)
diff --git a/arch/blackfin/lib/strncpy.S b/arch/blackfin/lib/strncpy.S
new file mode 100644
index 000000000..92fd1823b
--- /dev/null
+++ b/arch/blackfin/lib/strncpy.S
@@ -0,0 +1,85 @@
+/*
+ * Copyright 2005-2010 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+#include <asm/context.S>
+
+/* char *strncpy(char *dest, const char *src, size_t n);
+ * R0 = address (dest)
+ * R1 = address (src)
+ * R2 = size
+ * Returns a pointer (R0) to the destination string dest
+ *  we do this by not changing R0
+ */
+
+#ifdef CONFIG_STRNCPY_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.align 2
+
+ENTRY(_strncpy)
+	CC = R2 == 0;	/* n == 0: nothing to copy */
+	if CC JUMP 6f;
+
+	P2 = R2 ;       /* size */
+	P0 = R0 ;       /* dst cursor (R0 keeps the original dest) */
+	P1 = R1 ;       /* src cursor */
+
+	LSETUP (1f, 2f) LC0 = P2;	/* hardware loop over at most n bytes */
+1:
+	R1 = B [P1++] (Z);
+	B [P0++] = R1;
+	CC = R1 == 0;	/* just copied the terminating NUL? */
+2:
+	if CC jump 3f;	/* yes: leave the loop early and go pad */
+
+	RTS;	/* n bytes copied without meeting a NUL */
+
+	/* if src is shorter than n, we need to null pad bytes in dest
+	 * but, we can get here when the last byte is zero, and we don't
+	 * want to copy an extra byte at the end, so we need to check
+	 */
+3:
+	R2 = LC0;	/* loop counter as left by the early exit */
+	CC = R2;
+	if ! CC jump 6f;
+
+	/* if the required null padded portion is small (< 0x20), do it here,
+	 * rather than handling the overhead of memset (OK when things are big).
+	 */
+	R3 = 0x20;
+	CC = R2 < R3;
+	IF CC jump 4f;
+
+	R2 += -1;
+
+	/* Set things up for memset
+	 * R0 = address (current dst cursor, copied from P0 below)
+	 * R1 = filler byte (this case it's zero, set above)
+	 * R2 = count (LC0 - 1, set above)
+	 */
+	/* I1 saves dest and I0 saves RETS; assumes _memset leaves both intact */
+	I1 = R0;
+	R0 = RETS;
+	I0 = R0;
+	R0 = P0;
+	pseudo_long_call _memset, p0;
+	R0 = I0;
+	RETS = R0;
+	R0 = I1;	/* return the original dest */
+	RTS;
+	/* NOTE(review): pads LC0 bytes here vs LC0 - 1 via memset - confirm */
+4:
+	LSETUP(5f, 5f) LC0;	/* one-instruction loop, reusing the current LC0 */
+5:
+	B [P0++] = R1;	/* R1 is zero here */
+6:
+	RTS;
+
+ENDPROC(_strncpy)
diff --git a/arch/blackfin/lib/udivsi3.S b/arch/blackfin/lib/udivsi3.S
new file mode 100644
index 000000000..748a6a2e8
--- /dev/null
+++ b/arch/blackfin/lib/udivsi3.S
@@ -0,0 +1,277 @@
+/*
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#include <linux/linkage.h>
+
+#define CARRY AC0
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+
+ENTRY(___udivsi3)
+/* R0 = X (dividend), R1 = Y (divisor), both unsigned; the quotient is returned in R0. */
+  CC = R0 < R1 (IU);    /* If X < Y, always return 0 */
+  IF CC JUMP .Lreturn_ident;
+
+  R2 = R1 << 16;
+  CC = R2 <= R0 (IU);   /* (Y << 16) <= X: skip DIVQ, use identities/manual method */
+  IF CC JUMP .Lidents;
+
+  R2 = R0 >> 31;       /* if X is a 31-bit number */
+  R3 = R1 >> 15;       /* and Y is a 15-bit number */
+  R2 = R2 | R3;        /* then it's okay to use the DIVQ builtins (fallthrough to fast)*/
+  CC = R2;
+  IF CC JUMP .Ly_16bit;
+
+/* METHOD 1: FAST DIVQ
+   We know we have a 31-bit dividend, and 15-bit divisor so we can use the
+   simple divq approach (first setting AQ to 0 - implying unsigned division,
+   then 16 DIVQ's).
+*/
+
+  AQ = CC;             /* Clear AQ (CC==0) */
+
+/* ISR States: When dividing two integers (32.0/16.0) using divide primitives,
+   we need to shift the dividend one bit to the left.
+   We have already checked that we have a 31-bit number so we are safe to do
+   that.
+*/
+  R0 <<= 1;
+  DIVQ(R0, R1); // 1
+  DIVQ(R0, R1); // 2
+  DIVQ(R0, R1); // 3
+  DIVQ(R0, R1); // 4
+  DIVQ(R0, R1); // 5
+  DIVQ(R0, R1); // 6
+  DIVQ(R0, R1); // 7
+  DIVQ(R0, R1); // 8
+  DIVQ(R0, R1); // 9
+  DIVQ(R0, R1); // 10
+  DIVQ(R0, R1); // 11
+  DIVQ(R0, R1); // 12
+  DIVQ(R0, R1); // 13
+  DIVQ(R0, R1); // 14
+  DIVQ(R0, R1); // 15
+  DIVQ(R0, R1); // 16
+  R0 = R0.L (Z);       /* quotient is the zero-extended low half */
+  RTS;
+
+.Ly_16bit:
+  /* We know that the upper 17 bits of Y might have bits set,
+  ** or that the sign bit of X might have a bit. If Y is a
+  ** 16-bit number, but not bigger, then we can use the builtins
+  ** with a post-divide correction.
+  ** R3 currently holds Y>>15, which means R3's LSB is the
+  ** bit we're interested in.
+  */
+
+  /* According to the ISR, to use the Divide primitives for
+  ** unsigned integer divide, the useable range is 31 bits
+  */
+  CC = ! BITTST(R0, 31);
+
+  /* IF condition is true we can scale our inputs and use the divide primitives,
+  ** with some post-adjustment
+  */
+  R3 += -1;		/* if so, Y is 0x00008nnn */
+  CC &= AZ;
+
+  /* If condition is true we can scale our inputs and use the divide primitives,
+  ** with some post-adjustment
+  */
+  R3 = R1 >> 1;		/* Pre-scaled divisor for primitive case */
+  R2 = R0 >> 16;
+
+  R2 = R3 - R2;		/* shifted divisor < upper 16 bits of dividend */
+  CC &= CARRY;
+  IF CC JUMP .Lshift_and_correct;
+
+  /* Fall through to the identities */
+
+/* METHOD 2: identities and manual calculation
+   We are not able to use the divide primitives, but may still catch some special
+   cases.
+*/
+.Lidents:
+  /* Test for common identities. Value to be returned is placed in R2. */
+  CC = R0 == 0;        /* 0/Y => 0 */
+  IF CC JUMP .Lreturn_r0;
+  CC = R0 == R1;       /* X==Y => 1 */
+  IF CC JUMP .Lreturn_ident;
+  CC = R1 == 1;        /* X/1 => X */
+  IF CC JUMP .Lreturn_ident;
+
+  R2.L = ONES R1;      /* count the set bits in Y */
+  R2 = R2.L (Z);
+  CC = R2 == 1;        /* exactly one bit set => power of two */
+  IF CC JUMP .Lpower_of_two;
+
+  [--SP] = (R7:5);                /* Push registers R5-R7 */
+
+  /* Idents don't match. Go for the full operation. */
+
+
+  R6 = 2;                         /* assume we'll shift two */
+  R3 = 1;
+
+  P2 = R1;                        /* Keep the unshifted divisor */
+                                  /* If either R0 or R1 have sign set, */
+                                  /* divide them by two, and note it's */
+                                  /* been done. */
+  CC = R1 < 0;
+  R2 = R1 >> 1;
+  IF CC R1 = R2;                  /* Possibly-shifted R1 */
+  IF !CC R6 = R3;                 /* R1 doesn't, so at most 1 shifted */
+
+  P0 = 0;
+  R3 = -R1;
+  [--SP] = R3;                    /* Save -divisor for the loop below */
+  R2 = R0 >> 1;
+  R2 = R0 >> 1;                   /* (repeats the previous instruction; redundant) */
+  CC = R0 < 0;
+  IF CC P0 = R6;                  /* Number of values divided */
+  IF !CC R2 = R0;                 /* Shifted R0 */
+
+                                  /* P0 is 0, 1 (NR/=2) or 2 (NR/=2, DR/=2) */
+
+                                  /* r2 holds Copy dividend  */
+  R3 = 0;                         /* Clear partial remainder */
+  R7 = 0;                         /* Initialise quotient bit */
+
+  P1 = 32;                        /* Set loop counter */
+  LSETUP(.Lulst, .Lulend) LC0 = P1; /* Hardware loop, P1 (32) passes */
+.Lulst:  R6 = R2 >> 31;             /* R6 = sign bit of R2, for carry */
+       R2 = R2 << 1;              /* Shift 64 bit dividend up by 1 bit */
+       R3 = R3 << 1 || R5 = [SP]; /* R5 = -divisor saved on the stack */
+       R3 = R3 | R6;              /* Include any carry */
+       CC = R7 < 0;               /* Check quotient(AQ) */
+                                  /* If AQ==0, we'll sub divisor */
+       IF CC R5 = R1;             /* and if AQ==1, we'll add it. */
+       R3 = R3 + R5;              /* Add/sub divisor to partial remainder */
+       R7 = R3 ^ R1;              /* Generate next quotient bit */
+
+       R5 = R7 >> 31;             /* Get AQ */
+       BITTGL(R5, 0);             /* Invert it, to get what we'll shift */
+.Lulend: R2 = R2 + R5;              /* and "shift" it in. */
+
+  CC = P0 == 0;                   /* Check how many inputs we shifted */
+  IF CC JUMP .Lno_mult;            /* if none... */
+  R6 = R2 << 1;
+  CC = P0 == 1;
+  IF CC R2 = R6;                  /* if 1, Q = Q*2 */
+  IF !CC R1 = P2;                 /* if 2, restore stored divisor */
+
+  R3 = R2;                        /* Copy of R2 */
+  R3 *= R1;                       /* Q * divisor */
+  R5 = R0 - R3;                   /* Z = (dividend - Q * divisor) */
+  CC = R1 <= R5 (IU);             /* Check if divisor <= Z? */
+  R6 = CC;                        /* if yes, R6 = 1 */
+  R2 = R2 + R6;                   /* if yes, add one to quotient(Q) */
+.Lno_mult:
+  SP += 4;                        /* Drop the saved -divisor word */
+  (R7:5) = [SP++];                /* Pop registers R5-R7 */
+  R0 = R2;                        /* Store quotient */
+  RTS;
+
+.Lreturn_ident:
+  CC = R0 < R1 (IU);    /* If X < Y, always return 0 */
+  R2 = 0;
+  IF CC JUMP .Ltrue_return_ident;
+  R2 = -1 (X);         /* X/0 => 0xFFFFFFFF */
+  CC = R1 == 0;
+  IF CC JUMP .Ltrue_return_ident;
+  R2 = -R2;            /* R2 now 1 */
+  CC = R0 == R1;       /* X==Y => 1 */
+  IF CC JUMP .Ltrue_return_ident;
+  R2 = R0;             /* X/1 => X */
+  /*FALLTHRU*/
+
+.Ltrue_return_ident:
+  R0 = R2;             /* Return the value prepared in R2 */
+.Lreturn_r0:
+  RTS;
+
+.Lpower_of_two:
+  /* Y has a single bit set, which means it's a power of two.
+  ** That means we can perform the division just by shifting
+  ** X to the right the appropriate number of bits
+  */
+
+  /* signbits returns the number of sign bits, minus one.
+  ** 1=>30, 2=>29, ..., 0x40000000=>0. Which means we need
+  ** to shift right n-signbits spaces. It also means 0x80000000
+  ** is a special case, because that *also* gives a signbits of 0
+  */
+
+  R2 = R0 >> 31;       /* Y == 0x80000000: quotient is just the top bit of X */
+  CC = R1 < 0;
+  IF CC JUMP .Ltrue_return_ident;
+
+  R1.l = SIGNBITS R1;
+  R1 = R1.L (Z);
+  R1 += -30;           /* R1 = signbits - 30, i.e. zero or negative */
+  R0 = LSHIFT R0 by R1.L; /* presumably a negative count shifts right, per the note above */
+  RTS;
+
+/* METHOD 3: PRESCALE AND USE THE DIVIDE PRIMITIVES WITH SOME POST-CORRECTION
+  Two scaling operations are required to use the divide primitives with a
+  divisor > 0x7FFF.
+  Firstly (as in method 1) we need to shift the dividend 1 to the left for
+  integer division.
+  Secondly we need to shift both the divisor and dividend 1 to the right so
+  both are in range for the primitives.
+  The left/right shift of the dividend does nothing so we can skip it.
+*/
+.Lshift_and_correct:
+  R2 = R0;
+  // R3 is already R1 >> 1
+  CC=!CC;
+  AQ = CC;                        /* Clear AQ: CC was 1 on entry, inverted to 0 above */
+  DIVQ(R2, R3); // 1
+  DIVQ(R2, R3); // 2
+  DIVQ(R2, R3); // 3
+  DIVQ(R2, R3); // 4
+  DIVQ(R2, R3); // 5
+  DIVQ(R2, R3); // 6
+  DIVQ(R2, R3); // 7
+  DIVQ(R2, R3); // 8
+  DIVQ(R2, R3); // 9
+  DIVQ(R2, R3); // 10
+  DIVQ(R2, R3); // 11
+  DIVQ(R2, R3); // 12
+  DIVQ(R2, R3); // 13
+  DIVQ(R2, R3); // 14
+  DIVQ(R2, R3); // 15
+  DIVQ(R2, R3); // 16
+
+  /* According to the Instruction Set Reference:
+     To divide by a divisor > 0x7FFF,
+     1. prescale and perform divide to obtain quotient (Q) (done above),
+     2. multiply quotient by unscaled divisor (result M)
+     3. subtract the product from the dividend to get an error (E = X - M)
+     4. if E < divisor (Y) subtract 1, if E > divisor (Y) add 1, else return quotient (Q)
+   */
+  R3 = R2.L (Z);		/* Q = X' / Y' */
+  R2 = R3;		/* Preserve Q */
+  R2 *= R1;		/* M = Q * Y */
+  R2 = R0 - R2;		/* E = X - M */
+  R0 = R3;		/* Copy Q into result reg */
+
+/* Correction: If result of the multiply is negative, we overflowed
+   and need to correct the result by subtracting 1 from the result.*/
+  R3 = 0xFFFF (Z);
+  R2 = R2 >> 16;		/* E >> 16 */
+  CC = R2 == R3;		/* upper half of E all ones? */
+  R3 = 1 ;
+  R1 = R0 - R3;		/* Q - 1 */
+  IF CC R0 = R1;		/* if so, return Q - 1 instead of Q */
+  RTS;
+
+ENDPROC(___udivsi3)
diff --git a/arch/blackfin/lib/umodsi3.S b/arch/blackfin/lib/umodsi3.S
new file mode 100644
index 000000000..3794c00d8
--- /dev/null
+++ b/arch/blackfin/lib/umodsi3.S
@@ -0,0 +1,49 @@
+/*
+ * libgcc1 routines for Blackfin 5xx
+ *
+ * Copyright 2004-2009 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+
+.extern ___udivsi3;
+.type ___udivsi3, STT_FUNC;
+.globl	___umodsi3
+.type ___umodsi3, STT_FUNC;
+___umodsi3:
+	/* R0 = NR (dividend), R1 = DR (divisor); the remainder is returned in R0 */
+	CC=R0==0;
+	IF CC JUMP .LRETURN_R0;		/* Return 0, if NR == 0 */
+	CC= R1==0;
+	IF CC JUMP .LRETURN_ZERO_VAL;	/* Return 0, if DR == 0 */
+	CC=R0==R1;
+	IF CC JUMP .LRETURN_ZERO_VAL;	/* Return 0, if NR == DR */
+	CC = R1 == 1;
+	IF CC JUMP .LRETURN_ZERO_VAL;	/* Return 0, if  DR == 1 */
+	CC = R0<R1 (IU);
+	IF CC JUMP .LRETURN_R0;		/* Return dividend (R0),IF NR<DR */
+
+	[--SP] = (R7:6);		/* Push registers and */
+	[--SP] = RETS;			/* Return address */
+	R7 = R0;			/* Copy of R0 */
+	R6 = R1;			/* Copy of R1; R6/R7 are read again after the call */
+	SP += -12;			/* Should always provide this space */
+	CALL ___udivsi3;		/* Compute unsigned quotient using ___udivsi3 */
+	SP += 12;			/* Release the space reserved above */
+	R0 *= R6;			/* Quotient * divisor */
+	R0 = R7 - R0;			/* Dividend - (quotient * divisor) */
+	RETS = [SP++];			/* Pop return address */
+	( R7:6) = [SP++];		/* And registers */
+	RTS;				/* Return remainder */
+.LRETURN_ZERO_VAL:
+	R0 = 0;
+.LRETURN_R0:
+	RTS;
+
+.size ___umodsi3, .-___umodsi3
diff --git a/arch/blackfin/lib/umulsi3_highpart.S b/arch/blackfin/lib/umulsi3_highpart.S
new file mode 100644
index 000000000..0dcace96e
--- /dev/null
+++ b/arch/blackfin/lib/umulsi3_highpart.S
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2007 Analog Devices Inc.
+ *
+ * Licensed under the Clear BSD license or the GPL-2 (or later)
+ */
+
+.align 2	/* NOTE(review): precedes the section selection below - confirm it aligns the intended section */
+.global ___umulsi3_highpart;
+.type ___umulsi3_highpart, STT_FUNC;
+
+#ifdef CONFIG_ARITHMETIC_OPS_L1
+.section .l1.text
+#else
+.text
+#endif
+/* R0, R1 = operands; R0 returns (presumably, per the name) the upper 32 bits of their unsigned product */
+___umulsi3_highpart:
+	R2 = R1.H * R0.H, R3 = R1.L * R0.H (FU);	/* R2 = hi*hi, R3 = R1.lo*R0.hi */
+	R0 = R1.L * R0.L, R1 = R1.H * R0.L (FU);	/* R0 = lo*lo, R1 = R1.hi*R0.lo */
+	R0 >>= 16;	/* keep only the upper half of lo*lo */
+	/* Unsigned multiplication has the nice property that we can
+	   ignore carry on this first addition.  */
+	R0 = R0 + R3;
+	R0 = R0 + R1;	/* second cross term; the carry out of this add is used below */
+	cc = ac0;	/* capture that carry */
+	R1 = cc;
+	R1 = PACK(R1.l,R0.h);	/* R1 = carry in the upper halfword, R0.h in the lower */
+	R0 = R1 + R2;	/* add hi*hi to form the result */
+	RTS;
+
+.size ___umulsi3_highpart, .-___umulsi3_highpart