summaryrefslogtreecommitdiff
path: root/arch/ia64/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/ia64/lib')
-rw-r--r--arch/ia64/lib/Makefile50
-rw-r--r--arch/ia64/lib/carta_random.S54
-rw-r--r--arch/ia64/lib/checksum.c101
-rw-r--r--arch/ia64/lib/clear_page.S76
-rw-r--r--arch/ia64/lib/clear_user.S209
-rw-r--r--arch/ia64/lib/copy_page.S98
-rw-r--r--arch/ia64/lib/copy_page_mck.S185
-rw-r--r--arch/ia64/lib/copy_user.S610
-rw-r--r--arch/ia64/lib/csum_partial_copy.c140
-rw-r--r--arch/ia64/lib/do_csum.S323
-rw-r--r--arch/ia64/lib/flush.S117
-rw-r--r--arch/ia64/lib/idiv32.S83
-rw-r--r--arch/ia64/lib/idiv64.S80
-rw-r--r--arch/ia64/lib/io.c164
-rw-r--r--arch/ia64/lib/ip_fast_csum.S144
-rw-r--r--arch/ia64/lib/memcpy.S301
-rw-r--r--arch/ia64/lib/memcpy_mck.S666
-rw-r--r--arch/ia64/lib/memset.S362
-rw-r--r--arch/ia64/lib/strlen.S192
-rw-r--r--arch/ia64/lib/strlen_user.S198
-rw-r--r--arch/ia64/lib/strncpy_from_user.S44
-rw-r--r--arch/ia64/lib/strnlen_user.S45
-rw-r--r--arch/ia64/lib/xor.S184
23 files changed, 4426 insertions, 0 deletions
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
new file mode 100644
index 000000000..98771e2a7
--- /dev/null
+++ b/arch/ia64/lib/Makefile
@@ -0,0 +1,50 @@
+#
+# Makefile for ia64-specific library routines..
+#
+
+obj-y := io.o
+
+lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
+ __divdi3.o __udivdi3.o __moddi3.o __umoddi3.o \
+ checksum.o clear_page.o csum_partial_copy.o \
+ clear_user.o strncpy_from_user.o strlen_user.o strnlen_user.o \
+ flush.o ip_fast_csum.o do_csum.o \
+ memset.o strlen.o xor.o
+
+obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o
+obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o
+lib-$(CONFIG_PERFMON) += carta_random.o
+
+AFLAGS___divdi3.o =
+AFLAGS___udivdi3.o = -DUNSIGNED
+AFLAGS___moddi3.o = -DMODULO
+AFLAGS___umoddi3.o = -DUNSIGNED -DMODULO
+
+AFLAGS___divsi3.o =
+AFLAGS___udivsi3.o = -DUNSIGNED
+AFLAGS___modsi3.o = -DMODULO
+AFLAGS___umodsi3.o = -DUNSIGNED -DMODULO
+
+$(obj)/__divdi3.o: $(src)/idiv64.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__udivdi3.o: $(src)/idiv64.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__moddi3.o: $(src)/idiv64.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__umoddi3.o: $(src)/idiv64.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__divsi3.o: $(src)/idiv32.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__udivsi3.o: $(src)/idiv32.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__modsi3.o: $(src)/idiv32.S FORCE
+ $(call if_changed_dep,as_o_S)
+
+$(obj)/__umodsi3.o: $(src)/idiv32.S FORCE
+ $(call if_changed_dep,as_o_S)
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S
new file mode 100644
index 000000000..d0674c360
--- /dev/null
+++ b/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
+/*
+ * Fast, simple, yet decent quality random number generator based on
+ * a paper by David G. Carta ("Two Fast Implementations of the
+ * `Minimal Standard' Random Number Generator," Communications of the
+ * ACM, January, 1990).
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+#define a r2
+#define m r3
+#define lo r8
+#define hi r9
+#define t0 r16
+#define t1 r17
+#define seed r32
+
+GLOBAL_ENTRY(carta_random32)
+ movl a = (16807 << 16) | 16807
+ ;;
+ pmpyshr2.u t0 = a, seed, 0
+ pmpyshr2.u t1 = a, seed, 16
+ ;;
+ unpack2.l t0 = t1, t0
+ dep m = -1, r0, 0, 31
+ ;;
+ zxt4 lo = t0
+ shr.u hi = t0, 32
+ ;;
+ dep t0 = 0, hi, 15, 49 // t0 = (hi & 0x7fff)
+ ;;
+ shl t0 = t0, 16 // t0 = (hi & 0x7fff) << 16
+ shr t1 = hi, 15 // t1 = (hi >> 15)
+ ;;
+ add lo = lo, t0
+ ;;
+ cmp.gtu p6, p0 = lo, m
+ ;;
+(p6) and lo = lo, m
+ ;;
+(p6) add lo = 1, lo
+ ;;
+ add lo = lo, t1
+ ;;
+ cmp.gtu p6, p0 = lo, m
+ ;;
+(p6) and lo = lo, m
+ ;;
+(p6) add lo = 1, lo
+ br.ret.sptk.many rp
+END(carta_random32)
diff --git a/arch/ia64/lib/checksum.c b/arch/ia64/lib/checksum.c
new file mode 100644
index 000000000..9fc955026
--- /dev/null
+++ b/arch/ia64/lib/checksum.c
@@ -0,0 +1,101 @@
+/*
+ * Network checksum routines
+ *
+ * Copyright (C) 1999, 2003 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code coming from arch/alpha/lib/checksum.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed..
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/byteorder.h>
+
+static inline unsigned short
+from64to16 (unsigned long x)
+{
+ /* add up 32-bit words for 33 bits */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 17-bit words for 17+c bits */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up 16-bit and 2-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+/*
+ * computes the checksum of the TCP/UDP pseudo-header
+ * returns a 16-bit checksum, already complemented.
+ */
+__sum16
+csum_tcpudp_magic (__be32 saddr, __be32 daddr, unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ return (__force __sum16)~from64to16(
+ (__force u64)saddr + (__force u64)daddr +
+ (__force u64)sum + ((len + proto) << 8));
+}
+
+EXPORT_SYMBOL(csum_tcpudp_magic);
+
+__wsum
+csum_tcpudp_nofold (__be32 saddr, __be32 daddr, unsigned short len,
+ unsigned short proto, __wsum sum)
+{
+ unsigned long result;
+
+ result = (__force u64)saddr + (__force u64)daddr +
+ (__force u64)sum + ((len + proto) << 8);
+
+ /* Fold down to 32-bits so we don't lose in the typedef-less network stack. */
+ /* 64 to 33 */
+ result = (result & 0xffffffff) + (result >> 32);
+ /* 33 to 32 */
+ result = (result & 0xffffffff) + (result >> 32);
+ return (__force __wsum)result;
+}
+EXPORT_SYMBOL(csum_tcpudp_nofold);
+
+extern unsigned long do_csum (const unsigned char *, long);
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 32-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+ u64 result = do_csum(buff, len);
+
+ /* add in old sum, and carry.. */
+ result += (__force u32)sum;
+ /* 32+c bits -> 32 bits */
+ result = (result & 0xffffffff) + (result >> 32);
+ return (__force __wsum)result;
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum (const void *buff, int len)
+{
+ return (__force __sum16)~do_csum(buff,len);
+}
+
+EXPORT_SYMBOL(ip_compute_csum);
diff --git a/arch/ia64/lib/clear_page.S b/arch/ia64/lib/clear_page.S
new file mode 100644
index 000000000..2d814e7ed
--- /dev/null
+++ b/arch/ia64/lib/clear_page.S
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 1999-2002 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ *
+ * 1/06/01 davidm Tuned for Itanium.
+ * 2/12/02 kchen Tuned for both Itanium and McKinley
+ * 3/08/02 davidm Some more tweaking
+ */
+
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#ifdef CONFIG_ITANIUM
+# define L3_LINE_SIZE 64 // Itanium L3 line size
+# define PREFETCH_LINES 9 // magic number
+#else
+# define L3_LINE_SIZE 128 // McKinley L3 line size
+# define PREFETCH_LINES 12 // magic number
+#endif
+
+#define saved_lc r2
+#define dst_fetch r3
+#define dst1 r8
+#define dst2 r9
+#define dst3 r10
+#define dst4 r11
+
+#define dst_last r31
+
+GLOBAL_ENTRY(clear_page)
+ .prologue
+ .regstk 1,0,0,0
+ mov r16 = PAGE_SIZE/L3_LINE_SIZE-1 // main loop count, -1=repeat/until
+ .save ar.lc, saved_lc
+ mov saved_lc = ar.lc
+
+ .body
+ mov ar.lc = (PREFETCH_LINES - 1)
+ mov dst_fetch = in0
+ adds dst1 = 16, in0
+ adds dst2 = 32, in0
+ ;;
+.fetch: stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+ adds dst3 = 48, in0 // executing this multiple times is harmless
+ br.cloop.sptk.few .fetch
+ ;;
+ addl dst_last = (PAGE_SIZE - PREFETCH_LINES*L3_LINE_SIZE), dst_fetch
+ mov ar.lc = r16 // one L3 line per iteration
+ adds dst4 = 64, in0
+ ;;
+#ifdef CONFIG_ITANIUM
+ // Optimized for Itanium
+1: stf.spill.nta [dst1] = f0, 64
+ stf.spill.nta [dst2] = f0, 64
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+#else
+ // Optimized for McKinley
+1: stf.spill.nta [dst1] = f0, 64
+ stf.spill.nta [dst2] = f0, 64
+ stf.spill.nta [dst3] = f0, 64
+ stf.spill.nta [dst4] = f0, 128
+ cmp.lt p8,p0=dst_fetch, dst_last
+ ;;
+ stf.spill.nta [dst1] = f0, 64
+ stf.spill.nta [dst2] = f0, 64
+#endif
+ stf.spill.nta [dst3] = f0, 64
+(p8) stf.spill.nta [dst_fetch] = f0, L3_LINE_SIZE
+ br.cloop.sptk.few 1b
+ ;;
+ mov ar.lc = saved_lc // restore lc
+ br.ret.sptk.many rp
+END(clear_page)
diff --git a/arch/ia64/lib/clear_user.S b/arch/ia64/lib/clear_user.S
new file mode 100644
index 000000000..eecd8577b
--- /dev/null
+++ b/arch/ia64/lib/clear_user.S
@@ -0,0 +1,209 @@
+/*
+ * This routine clears to zero a linear memory buffer in user space.
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: length of buffer in bytes
+ * Outputs:
+ * r8: number of bytes that didn't get cleared due to a fault
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// arguments
+//
+#define buf r32
+#define len r33
+
+//
+// local registers
+//
+#define cnt r16
+#define buf2 r17
+#define saved_lc r18
+#define saved_pfs r19
+#define tmp r20
+#define len2 r21
+#define len3 r22
+
+//
+// Theory of operations:
+// - we check whether or not the buffer is small, i.e., less than 17
+// in which case we do the byte by byte loop.
+//
+// - Otherwise we go progressively from 1 byte store to 8byte store in
+// the head part, the body is a 16byte store loop and we finish we the
+// tail for the last 15 bytes.
+// The good point about this breakdown is that the long buffer handling
+// contains only 2 branches.
+//
+// The reason for not using shifting & masking for both the head and the
+// tail is to stay semantically correct. This routine is not supposed
+// to write bytes outside of the buffer. While most of the time this would
+// be ok, we can't tolerate a mistake. A classical example is the case
+// of multithreaded code were to the extra bytes touched is actually owned
+// by another thread which runs concurrently to ours. Another, less likely,
+// example is with device drivers where reading an I/O mapped location may
+// have side effects (same thing for writing).
+//
+
+GLOBAL_ENTRY(__do_clear_user)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,2,0,0,0
+ cmp.eq p6,p0=r0,len // check for zero length
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc // preserve ar.lc (slow)
+ .body
+ ;; // avoid WAW on CFM
+ adds tmp=-1,len // br.ctop is repeat/until
+ mov ret0=len // return value is length at this point
+(p6) br.ret.spnt.many rp
+ ;;
+ cmp.lt p6,p0=16,len // if len > 16 then long memset
+ mov ar.lc=tmp // initialize lc for small count
+(p6) br.cond.dptk .long_do_clear
+ ;; // WAR on ar.lc
+ //
+ // worst case 16 iterations, avg 8 iterations
+ //
+ // We could have played with the predicates to use the extra
+ // M slot for 2 stores/iteration but the cost the initialization
+ // the various counters compared to how long the loop is supposed
+ // to last on average does not make this solution viable.
+ //
+1:
+ EX( .Lexit1, st1 [buf]=r0,1 )
+ adds len=-1,len // countdown length using len
+ br.cloop.dptk 1b
+ ;; // avoid RAW on ar.lc
+ //
+ // .Lexit4: comes from byte by byte loop
+ // len contains bytes left
+.Lexit1:
+ mov ret0=len // faster than using ar.lc
+ mov ar.lc=saved_lc
+ br.ret.sptk.many rp // end of short clear_user
+
+
+ //
+ // At this point we know we have more than 16 bytes to copy
+ // so we focus on alignment (no branches required)
+ //
+ // The use of len/len2 for countdown of the number of bytes left
+ // instead of ret0 is due to the fact that the exception code
+ // changes the values of r8.
+ //
+.long_do_clear:
+ tbit.nz p6,p0=buf,0 // odd alignment (for long_do_clear)
+ ;;
+ EX( .Lexit3, (p6) st1 [buf]=r0,1 ) // 1-byte aligned
+(p6) adds len=-1,len;; // sync because buf is modified
+ tbit.nz p6,p0=buf,1
+ ;;
+ EX( .Lexit3, (p6) st2 [buf]=r0,2 ) // 2-byte aligned
+(p6) adds len=-2,len;;
+ tbit.nz p6,p0=buf,2
+ ;;
+ EX( .Lexit3, (p6) st4 [buf]=r0,4 ) // 4-byte aligned
+(p6) adds len=-4,len;;
+ tbit.nz p6,p0=buf,3
+ ;;
+ EX( .Lexit3, (p6) st8 [buf]=r0,8 ) // 8-byte aligned
+(p6) adds len=-8,len;;
+ shr.u cnt=len,4 // number of 128-bit (2x64bit) words
+ ;;
+ cmp.eq p6,p0=r0,cnt
+ adds tmp=-1,cnt
+(p6) br.cond.dpnt .dotail // we have less than 16 bytes left
+ ;;
+ adds buf2=8,buf // setup second base pointer
+ mov ar.lc=tmp
+ ;;
+
+ //
+ // 16bytes/iteration core loop
+ //
+ // The second store can never generate a fault because
+ // we come into the loop only when we are 16-byte aligned.
+ // This means that if we cross a page then it will always be
+ // in the first store and never in the second.
+ //
+ //
+ // We need to keep track of the remaining length. A possible (optimistic)
+ // way would be to use ar.lc and derive how many byte were left by
+ // doing : left= 16*ar.lc + 16. this would avoid the addition at
+ // every iteration.
+ // However we need to keep the synchronization point. A template
+ // M;;MB does not exist and thus we can keep the addition at no
+ // extra cycle cost (use a nop slot anyway). It also simplifies the
+ // (unlikely) error recovery code
+ //
+
+2: EX(.Lexit3, st8 [buf]=r0,16 )
+ ;; // needed to get len correct when error
+ st8 [buf2]=r0,16
+ adds len=-16,len
+ br.cloop.dptk 2b
+ ;;
+ mov ar.lc=saved_lc
+ //
+ // tail correction based on len only
+ //
+ // We alternate the use of len3,len2 to allow parallelism and correct
+ // error handling. We also reuse p6/p7 to return correct value.
+ // The addition of len2/len3 does not cost anything more compared to
+ // the regular memset as we had empty slots.
+ //
+.dotail:
+ mov len2=len // for parallelization of error handling
+ mov len3=len
+ tbit.nz p6,p0=len,3
+ ;;
+ EX( .Lexit2, (p6) st8 [buf]=r0,8 ) // at least 8 bytes
+(p6) adds len3=-8,len2
+ tbit.nz p7,p6=len,2
+ ;;
+ EX( .Lexit2, (p7) st4 [buf]=r0,4 ) // at least 4 bytes
+(p7) adds len2=-4,len3
+ tbit.nz p6,p7=len,1
+ ;;
+ EX( .Lexit2, (p6) st2 [buf]=r0,2 ) // at least 2 bytes
+(p6) adds len3=-2,len2
+ tbit.nz p7,p6=len,0
+ ;;
+ EX( .Lexit2, (p7) st1 [buf]=r0 ) // only 1 byte left
+ mov ret0=r0 // success
+ br.ret.sptk.many rp // end of most likely path
+
+ //
+ // Outlined error handling code
+ //
+
+ //
+ // .Lexit3: comes from core loop, need restore pr/lc
+ // len contains bytes left
+ //
+ //
+ // .Lexit2:
+ // if p6 -> coming from st8 or st2 : len2 contains what's left
+ // if p7 -> coming from st4 or st1 : len3 contains what's left
+ // We must restore lc/pr even though might not have been used.
+.Lexit2:
+ .pred.rel "mutex", p6, p7
+(p6) mov len=len2
+(p7) mov len=len3
+ ;;
+ //
+ // .Lexit4: comes from head, need not restore pr/lc
+ // len contains bytes left
+ //
+.Lexit3:
+ mov ret0=len
+ mov ar.lc=saved_lc
+ br.ret.sptk.many rp
+END(__do_clear_user)
diff --git a/arch/ia64/lib/copy_page.S b/arch/ia64/lib/copy_page.S
new file mode 100644
index 000000000..127d1d050
--- /dev/null
+++ b/arch/ia64/lib/copy_page.S
@@ -0,0 +1,98 @@
+/*
+ *
+ * Optimized version of the standard copy_page() function
+ *
+ * Inputs:
+ * in0: address of target page
+ * in1: address of source page
+ * Output:
+ * no return value
+ *
+ * Copyright (C) 1999, 2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger <davidm@hpl.hp.com>
+ *
+ * 4/06/01 davidm Tuned to make it perform well both for cached and uncached copies.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PIPE_DEPTH 3
+#define EPI p[PIPE_DEPTH-1]
+
+#define lcount r16
+#define saved_pr r17
+#define saved_lc r18
+#define saved_pfs r19
+#define src1 r20
+#define src2 r21
+#define tgt1 r22
+#define tgt2 r23
+#define srcf r24
+#define tgtf r25
+#define tgt_last r26
+
+#define Nrot ((8*PIPE_DEPTH+7)&~7)
+
+GLOBAL_ENTRY(copy_page)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
+
+ .rotr t1[PIPE_DEPTH], t2[PIPE_DEPTH], t3[PIPE_DEPTH], t4[PIPE_DEPTH], \
+ t5[PIPE_DEPTH], t6[PIPE_DEPTH], t7[PIPE_DEPTH], t8[PIPE_DEPTH]
+ .rotp p[PIPE_DEPTH]
+
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc
+ mov ar.ec=PIPE_DEPTH
+
+ mov lcount=PAGE_SIZE/64-1
+ .save pr, saved_pr
+ mov saved_pr=pr
+ mov pr.rot=1<<16
+
+ .body
+
+ mov src1=in1
+ adds src2=8,in1
+ mov tgt_last = PAGE_SIZE
+ ;;
+ adds tgt2=8,in0
+ add srcf=512,in1
+ mov ar.lc=lcount
+ mov tgt1=in0
+ add tgtf=512,in0
+ add tgt_last = tgt_last, in0
+ ;;
+1:
+(p[0]) ld8 t1[0]=[src1],16
+(EPI) st8 [tgt1]=t1[PIPE_DEPTH-1],16
+(p[0]) ld8 t2[0]=[src2],16
+(EPI) st8 [tgt2]=t2[PIPE_DEPTH-1],16
+ cmp.ltu p6,p0 = tgtf, tgt_last
+ ;;
+(p[0]) ld8 t3[0]=[src1],16
+(EPI) st8 [tgt1]=t3[PIPE_DEPTH-1],16
+(p[0]) ld8 t4[0]=[src2],16
+(EPI) st8 [tgt2]=t4[PIPE_DEPTH-1],16
+ ;;
+(p[0]) ld8 t5[0]=[src1],16
+(EPI) st8 [tgt1]=t5[PIPE_DEPTH-1],16
+(p[0]) ld8 t6[0]=[src2],16
+(EPI) st8 [tgt2]=t6[PIPE_DEPTH-1],16
+ ;;
+(p[0]) ld8 t7[0]=[src1],16
+(EPI) st8 [tgt1]=t7[PIPE_DEPTH-1],16
+(p[0]) ld8 t8[0]=[src2],16
+(EPI) st8 [tgt2]=t8[PIPE_DEPTH-1],16
+
+(p6) lfetch [srcf], 64
+(p6) lfetch [tgtf], 64
+ br.ctop.sptk.few 1b
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000 // restore predicates
+ mov ar.pfs=saved_pfs
+ mov ar.lc=saved_lc
+ br.ret.sptk.many rp
+END(copy_page)
diff --git a/arch/ia64/lib/copy_page_mck.S b/arch/ia64/lib/copy_page_mck.S
new file mode 100644
index 000000000..3c45d60a8
--- /dev/null
+++ b/arch/ia64/lib/copy_page_mck.S
@@ -0,0 +1,185 @@
+/*
+ * McKinley-optimized version of copy_page().
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ * David Mosberger <davidm@hpl.hp.com>
+ *
+ * Inputs:
+ * in0: address of target page
+ * in1: address of source page
+ * Output:
+ * no return value
+ *
+ * General idea:
+ * - use regular loads and stores to prefetch data to avoid consuming M-slot just for
+ * lfetches => good for in-cache performance
+ * - avoid l2 bank-conflicts by not storing into the same 16-byte bank within a single
+ * cycle
+ *
+ * Principle of operation:
+ * First, note that L1 has a line-size of 64 bytes and L2 a line-size of 128 bytes.
+ * To avoid secondary misses in L2, we prefetch both source and destination with a line-size
+ * of 128 bytes. When both of these lines are in the L2 and the first half of the
+ * source line is in L1, we start copying the remaining words. The second half of the
+ * source line is prefetched in an earlier iteration, so that by the time we start
+ * accessing it, it's also present in the L1.
+ *
+ * We use a software-pipelined loop to control the overall operation. The pipeline
+ * has 2*PREFETCH_DIST+K stages. The first PREFETCH_DIST stages are used for prefetching
+ * source cache-lines. The second PREFETCH_DIST stages are used for prefetching destination
+ * cache-lines, the last K stages are used to copy the cache-line words not copied by
+ * the prefetches. The four relevant points in the pipelined are called A, B, C, D:
+ * p[A] is TRUE if a source-line should be prefetched, p[B] is TRUE if a destination-line
+ * should be prefetched, p[C] is TRUE if the second half of an L2 line should be brought
+ * into L1D and p[D] is TRUE if a cacheline needs to be copied.
+ *
+ * This all sounds very complicated, but thanks to the modulo-scheduled loop support,
+ * the resulting code is very regular and quite easy to follow (once you get the idea).
+ *
+ * As a secondary optimization, the first 2*PREFETCH_DIST iterations are implemented
+ * as the separate .prefetch_loop. Logically, this loop performs exactly like the
+ * main-loop (.line_copy), but has all known-to-be-predicated-off instructions removed,
+ * so that each loop iteration is faster (again, good for cached case).
+ *
+ * When reading the code, it helps to keep the following picture in mind:
+ *
+ * word 0 word 1
+ * +------+------+---
+ * | v[x] | t1 | ^
+ * | t2 | t3 | |
+ * | t4 | t5 | |
+ * | t6 | t7 | | 128 bytes
+ * | n[y] | t9 | | (L2 cache line)
+ * | t10 | t11 | |
+ * | t12 | t13 | |
+ * | t14 | t15 | v
+ * +------+------+---
+ *
+ * Here, v[x] is copied by the (memory) prefetch. n[y] is loaded at p[C]
+ * to fetch the second-half of the L2 cache line into L1, and the tX words are copied in
+ * an order that avoids bank conflicts.
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define PREFETCH_DIST 8 // McKinley sustains 16 outstanding L2 misses (8 ld, 8 st)
+
+#define src0 r2
+#define src1 r3
+#define dst0 r9
+#define dst1 r10
+#define src_pre_mem r11
+#define dst_pre_mem r14
+#define src_pre_l2 r15
+#define dst_pre_l2 r16
+#define t1 r17
+#define t2 r18
+#define t3 r19
+#define t4 r20
+#define t5 t1 // alias!
+#define t6 t2 // alias!
+#define t7 t3 // alias!
+#define t9 t5 // alias!
+#define t10 t4 // alias!
+#define t11 t7 // alias!
+#define t12 t6 // alias!
+#define t14 t10 // alias!
+#define t13 r21
+#define t15 r22
+
+#define saved_lc r23
+#define saved_pr r24
+
+#define A 0
+#define B (PREFETCH_DIST)
+#define C (B + PREFETCH_DIST)
+#define D (C + 3)
+#define N (D + 1)
+#define Nrot ((N + 7) & ~7)
+
+GLOBAL_ENTRY(copy_page)
+ .prologue
+ alloc r8 = ar.pfs, 2, Nrot-2, 0, Nrot
+
+ .rotr v[2*PREFETCH_DIST], n[D-C+1]
+ .rotp p[N]
+
+ .save ar.lc, saved_lc
+ mov saved_lc = ar.lc
+ .save pr, saved_pr
+ mov saved_pr = pr
+ .body
+
+ mov src_pre_mem = in1
+ mov pr.rot = 0x10000
+ mov ar.ec = 1 // special unrolled loop
+
+ mov dst_pre_mem = in0
+ mov ar.lc = 2*PREFETCH_DIST - 1
+
+ add src_pre_l2 = 8*8, in1
+ add dst_pre_l2 = 8*8, in0
+ add src0 = 8, in1 // first t1 src
+ add src1 = 3*8, in1 // first t3 src
+ add dst0 = 8, in0 // first t1 dst
+ add dst1 = 3*8, in0 // first t3 dst
+ mov t1 = (PAGE_SIZE/128) - (2*PREFETCH_DIST) - 1
+ nop.m 0
+ nop.i 0
+ ;;
+ // same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0
+(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2
+ br.ctop.sptk .prefetch_loop
+ ;;
+ cmp.eq p16, p0 = r0, r0 // reset p16 to 1 (br.ctop cleared it to zero)
+ mov ar.lc = t1 // with 64KB pages, t1 is too big to fit in 8 bits!
+ mov ar.ec = N // # of stages in pipeline
+ ;;
+.line_copy:
+(p[D]) ld8 t2 = [src0], 3*8 // M0
+(p[D]) ld8 t4 = [src1], 3*8 // M1
+(p[B]) st8 [dst_pre_mem] = v[B], 128 // M2 prefetch dst from memory
+(p[D]) st8 [dst_pre_l2] = n[D-C], 128 // M3 prefetch dst from L2
+ ;;
+(p[A]) ld8 v[A] = [src_pre_mem], 128 // M0 prefetch src from memory
+(p[C]) ld8 n[0] = [src_pre_l2], 128 // M1 prefetch src from L2
+(p[D]) st8 [dst0] = t1, 8 // M2
+(p[D]) st8 [dst1] = t3, 8 // M3
+ ;;
+(p[D]) ld8 t5 = [src0], 8
+(p[D]) ld8 t7 = [src1], 3*8
+(p[D]) st8 [dst0] = t2, 3*8
+(p[D]) st8 [dst1] = t4, 3*8
+ ;;
+(p[D]) ld8 t6 = [src0], 3*8
+(p[D]) ld8 t10 = [src1], 8
+(p[D]) st8 [dst0] = t5, 8
+(p[D]) st8 [dst1] = t7, 3*8
+ ;;
+(p[D]) ld8 t9 = [src0], 3*8
+(p[D]) ld8 t11 = [src1], 3*8
+(p[D]) st8 [dst0] = t6, 3*8
+(p[D]) st8 [dst1] = t10, 8
+ ;;
+(p[D]) ld8 t12 = [src0], 8
+(p[D]) ld8 t14 = [src1], 8
+(p[D]) st8 [dst0] = t9, 3*8
+(p[D]) st8 [dst1] = t11, 3*8
+ ;;
+(p[D]) ld8 t13 = [src0], 4*8
+(p[D]) ld8 t15 = [src1], 4*8
+(p[D]) st8 [dst0] = t12, 8
+(p[D]) st8 [dst1] = t14, 8
+ ;;
+(p[D-1])ld8 t1 = [src0], 8
+(p[D-1])ld8 t3 = [src1], 8
+(p[D]) st8 [dst0] = t13, 4*8
+(p[D]) st8 [dst1] = t15, 4*8
+ br.ctop.sptk .line_copy
+ ;;
+ mov ar.lc = saved_lc
+ mov pr = saved_pr, -1
+ br.ret.sptk.many rp
+END(copy_page)
diff --git a/arch/ia64/lib/copy_user.S b/arch/ia64/lib/copy_user.S
new file mode 100644
index 000000000..c952bdc6a
--- /dev/null
+++ b/arch/ia64/lib/copy_user.S
@@ -0,0 +1,610 @@
+/*
+ *
+ * Optimized version of the copy_user() routine.
+ * It is used to copy date across the kernel/user boundary.
+ *
+ * The source and destination are always on opposite side of
+ * the boundary. When reading from user space we must catch
+ * faults on loads. When writing to user space we must catch
+ * errors on stores. Note that because of the nature of the copy
+ * we don't need to worry about overlapping regions.
+ *
+ *
+ * Inputs:
+ * in0 address of source buffer
+ * in1 address of destination buffer
+ * in2 number of bytes to copy
+ *
+ * Outputs:
+ * ret0 0 in case of success. The number of bytes NOT copied in
+ * case of error.
+ *
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Fixme:
+ * - handle the case where we have more than 16 bytes and the alignment
+ * are different.
+ * - more benchmarking
+ * - fix extraneous stop bit introduced by the EX() macro.
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// Tuneable parameters
+//
+#define COPY_BREAK 16 // we do byte copy below (must be >=16)
+#define PIPE_DEPTH 21 // pipe depth
+
+#define EPI p[PIPE_DEPTH-1]
+
+//
+// arguments
+//
+#define dst in0
+#define src in1
+#define len in2
+
+//
+// local registers
+//
+#define t1 r2 // rshift in bytes
+#define t2 r3 // lshift in bytes
+#define rshift r14 // right shift in bits
+#define lshift r15 // left shift in bits
+#define word1 r16
+#define word2 r17
+#define cnt r18
+#define len2 r19
+#define saved_lc r20
+#define saved_pr r21
+#define tmp r22
+#define val r23
+#define src1 r24
+#define dst1 r25
+#define src2 r26
+#define dst2 r27
+#define len1 r28
+#define enddst r29
+#define endsrc r30
+#define saved_pfs r31
+
+GLOBAL_ENTRY(__copy_user)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
+
+ .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
+ .rotp p[PIPE_DEPTH]
+
+ adds len2=-1,len // br.ctop is repeat/until
+ mov ret0=r0
+
+ ;; // RAW of cfm when len=0
+ cmp.eq p8,p0=r0,len // check for zero length
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc // preserve ar.lc (slow)
+(p8) br.ret.spnt.many rp // empty mempcy()
+ ;;
+ add enddst=dst,len // first byte after end of source
+ add endsrc=src,len // first byte after end of destination
+ .save pr, saved_pr
+ mov saved_pr=pr // preserve predicates
+
+ .body
+
+ mov dst1=dst // copy because of rotation
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+
+ mov src1=src // copy because of rotation
+ mov ar.lc=len2 // initialize lc for small count
+ cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
+
+ xor tmp=src,dst // same alignment test prepare
+(p10) br.cond.dptk .long_copy_user
+ ;; // RAW pr.rot/p16 ?
+ //
+ // Now we do the byte by byte loop with software pipeline
+ //
+ // p7 is necessarily false by now
+1:
+ EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+ EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+ br.ctop.dptk.few 1b
+ ;;
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs // restore ar.ec
+ br.ret.sptk.many rp // end of short memcpy
+
+ //
+ // Not 8-byte aligned
+ //
+.diff_align_copy_user:
+ // At this point we know we have more than 16 bytes to copy
+ // and also that src and dest do _not_ have the same alignment.
+ and src2=0x7,src1 // src offset
+ and dst2=0x7,dst1 // dst offset
+ ;;
+ // The basic idea is that we copy byte-by-byte at the head so
+ // that we can reach 8-byte alignment for both src1 and dst1.
+ // Then copy the body using software pipelined 8-byte copy,
+ // shifting the two back-to-back words right and left, then copy
+ // the tail by copying byte-by-byte.
+ //
+ // Fault handling. If the byte-by-byte at the head fails on the
+ // load, then restart and finish the pipleline by copying zeros
+ // to the dst1. Then copy zeros for the rest of dst1.
+ // If 8-byte software pipeline fails on the load, do the same as
+ // failure_in3 does. If the byte-by-byte at the tail fails, it is
+ // handled simply by failure_in_pipe1.
+ //
+ // The case p14 represents the source has more bytes in the
+ // the first word (by the shifted part), whereas the p15 needs to
+ // copy some bytes from the 2nd word of the source that has the
+ // tail of the 1st of the destination.
+ //
+
+ //
+ // Optimization. If dst1 is 8-byte aligned (quite common), we don't need
+ // to copy the head to dst1, to start 8-byte copy software pipeline.
+ // We know src1 is not 8-byte aligned in this case.
+ //
+ cmp.eq p14,p15=r0,dst2
+(p15) br.cond.spnt 1f
+ ;;
+ sub t1=8,src2
+ mov t2=src2
+ ;;
+ shl rshift=t2,3
+ sub len1=len,t1 // set len1
+ ;;
+ sub lshift=64,rshift
+ ;;
+ br.cond.spnt .word_copy_user
+ ;;
+1:
+ cmp.leu p14,p15=src2,dst2
+ sub t1=dst2,src2
+ ;;
+ .pred.rel "mutex", p14, p15
+(p14) sub word1=8,src2 // (8 - src offset)
+(p15) sub t1=r0,t1 // absolute value
+(p15) sub word1=8,dst2 // (8 - dst offset)
+ ;;
+ // For the case p14, we don't need to copy the shifted part to
+ // the 1st word of destination.
+ sub t2=8,t1
+(p14) sub word1=word1,t1
+ ;;
+ sub len1=len,word1 // resulting len
+(p15) shl rshift=t1,3 // in bits
+(p14) shl rshift=t2,3
+ ;;
+(p14) sub len1=len1,t1
+ adds cnt=-1,word1
+ ;;
+ sub lshift=64,rshift
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ mov ar.lc=cnt
+ ;;
+2:
+ EX(.failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
+ EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+ br.ctop.dptk.few 2b
+ ;;
+ clrrrb
+ ;;
+.word_copy_user:
+ cmp.gtu p9,p0=16,len1
+(p9) br.cond.spnt 4f // if (16 > len1) skip 8-byte copy
+ ;;
+ shr.u cnt=len1,3 // number of 64-bit words
+ ;;
+ adds cnt=-1,cnt
+ ;;
+ .pred.rel "mutex", p14, p15
+(p14) sub src1=src1,t2
+(p15) sub src1=src1,t1
+ //
+ // Now both src1 and dst1 point to an 8-byte aligned address. And
+ // we have more than 8 bytes to copy.
+ //
+ mov ar.lc=cnt
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ ;;
+3:
+ //
+ // The pipleline consists of 3 stages:
+ // 1 (p16): Load a word from src1
+ // 2 (EPI_1): Shift right pair, saving to tmp
+ // 3 (EPI): Store tmp to dst1
+ //
+ // To make it simple, use at least 2 (p16) loops to set up val1[n]
+ // because we need 2 back-to-back val1[] to get tmp.
+ // Note that this implies EPI_2 must be p18 or greater.
+ //
+
+#define EPI_1 p[PIPE_DEPTH-2]
+#define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
+#define CASE(pred, shift) \
+ (pred) br.cond.spnt .copy_user_bit##shift
+#define BODY(rshift) \
+.copy_user_bit##rshift: \
+1: \
+ EX(.failure_out,(EPI) st8 [dst1]=tmp,8); \
+(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
+ EX(3f,(p16) ld8 val1[1]=[src1],8); \
+(p16) mov val1[0]=r0; \
+ br.ctop.dptk 1b; \
+ ;; \
+ br.cond.sptk.many .diff_align_do_tail; \
+2: \
+(EPI) st8 [dst1]=tmp,8; \
+(EPI_1) shrp tmp=val1[PIPE_DEPTH-2],val1[PIPE_DEPTH-1],rshift; \
+3: \
+(p16) mov val1[1]=r0; \
+(p16) mov val1[0]=r0; \
+ br.ctop.dptk 2b; \
+ ;; \
+ br.cond.sptk.many .failure_in2
+
+ //
+ // Since the instruction 'shrp' requires a fixed 128-bit value
+ // specifying the bits to shift, we need to provide 7 cases
+ // below.
+ //
+ SWITCH(p6, 8)
+ SWITCH(p7, 16)
+ SWITCH(p8, 24)
+ SWITCH(p9, 32)
+ SWITCH(p10, 40)
+ SWITCH(p11, 48)
+ SWITCH(p12, 56)
+ ;;
+ CASE(p6, 8)
+ CASE(p7, 16)
+ CASE(p8, 24)
+ CASE(p9, 32)
+ CASE(p10, 40)
+ CASE(p11, 48)
+ CASE(p12, 56)
+ ;;
+ BODY(8)
+ BODY(16)
+ BODY(24)
+ BODY(32)
+ BODY(40)
+ BODY(48)
+ BODY(56)
+ ;;
+.diff_align_do_tail:
+ .pred.rel "mutex", p14, p15
+(p14) sub src1=src1,t1
+(p14) adds dst1=-8,dst1
+(p15) sub dst1=dst1,t1
+ ;;
+4:
+ // Tail correction.
+ //
+ // The problem with this piplelined loop is that the last word is not
+ // loaded and thus parf of the last word written is not correct.
+ // To fix that, we simply copy the tail byte by byte.
+
+ sub len1=endsrc,src1,1
+ clrrrb
+ ;;
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ mov ar.lc=len1
+ ;;
+5:
+ EX(.failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+ EX(.failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+ br.ctop.dptk.few 5b
+ ;;
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ //
+ // Beginning of long mempcy (i.e. > 16 bytes)
+ //
+.long_copy_user:
+ tbit.nz p6,p7=src1,0 // odd alignment
+ and tmp=7,tmp
+ ;;
+ cmp.eq p10,p8=r0,tmp
+ mov len1=len // copy because of rotation
+(p8) br.cond.dpnt .diff_align_copy_user
+ ;;
+ // At this point we know we have more than 16 bytes to copy
+ // and also that both src and dest have the same alignment
+ // which may not be the one we want. So for now we must move
+ // forward slowly until we reach 16byte alignment: no need to
+ // worry about reaching the end of buffer.
+ //
+ EX(.failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
+(p6) adds len1=-1,len1;;
+ tbit.nz p7,p0=src1,1
+ ;;
+ EX(.failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
+(p7) adds len1=-2,len1;;
+ tbit.nz p8,p0=src1,2
+ ;;
+ //
+ // Stop bit not required after ld4 because if we fail on ld4
+ // we have never executed the ld1, therefore st1 is not executed.
+ //
+ EX(.failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
+ ;;
+ EX(.failure_out,(p6) st1 [dst1]=val1[0],1)
+ tbit.nz p9,p0=src1,3
+ ;;
+ //
+ // Stop bit not required after ld8 because if we fail on ld8
+ // we have never executed the ld2, therefore st2 is not executed.
+ //
+ EX(.failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
+ EX(.failure_out,(p7) st2 [dst1]=val1[1],2)
+(p8) adds len1=-4,len1
+ ;;
+ EX(.failure_out, (p8) st4 [dst1]=val2[0],4)
+(p9) adds len1=-8,len1;;
+ shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
+ ;;
+ EX(.failure_out, (p9) st8 [dst1]=val2[1],8)
+ tbit.nz p6,p0=len1,3
+ cmp.eq p7,p0=r0,cnt
+ adds tmp=-1,cnt // br.ctop is repeat/until
+(p7) br.cond.dpnt .dotail // we have less than 16 bytes left
+ ;;
+ adds src2=8,src1
+ adds dst2=8,dst1
+ mov ar.lc=tmp
+ ;;
+ //
+ // 16bytes/iteration
+ //
+2:
+ EX(.failure_in3,(p16) ld8 val1[0]=[src1],16)
+(p16) ld8 val2[0]=[src2],16
+
+ EX(.failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
+(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
+ br.ctop.dptk 2b
+ ;; // RAW on src1 when fall through from loop
+ //
+ // Tail correction based on len only
+ //
+ // No matter where we come from (loop or test) the src1 pointer
+ // is 16 byte aligned AND we have less than 16 bytes to copy.
+ //
+.dotail:
+ EX(.failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
+ tbit.nz p7,p0=len1,2
+ ;;
+ EX(.failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
+ tbit.nz p8,p0=len1,1
+ ;;
+ EX(.failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
+ tbit.nz p9,p0=len1,0
+ ;;
+ EX(.failure_out, (p6) st8 [dst1]=val1[0],8)
+ ;;
+ EX(.failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
+ mov ar.lc=saved_lc
+ ;;
+ EX(.failure_out,(p7) st4 [dst1]=val1[1],4)
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ EX(.failure_out, (p8) st2 [dst1]=val2[0],2)
+ mov ar.pfs=saved_pfs
+ ;;
+ EX(.failure_out, (p9) st1 [dst1]=val2[1])
+ br.ret.sptk.many rp
+
+
+ //
+ // Here we handle the case where the byte by byte copy fails
+ // on the load.
+ // Several factors make the zeroing of the rest of the buffer kind of
+ // tricky:
+ // - the pipeline: loads/stores are not in sync (pipeline)
+ //
+ // In the same loop iteration, the dst1 pointer does not directly
+ // reflect where the faulty load was.
+ //
+ // - pipeline effect
+ // When you get a fault on load, you may have valid data from
+ // previous loads not yet store in transit. Such data must be
+ // store normally before moving onto zeroing the rest.
+ //
+ // - single/multi dispersal independence.
+ //
+ // solution:
+ // - we don't disrupt the pipeline, i.e. data in transit in
+ // the software pipeline will be eventually move to memory.
+ // We simply replace the load with a simple mov and keep the
+ // pipeline going. We can't really do this inline because
+ // p16 is always reset to 1 when lc > 0.
+ //
+.failure_in_pipe1:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
+1:
+(p16) mov val1[0]=r0
+(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
+ br.ctop.dptk 1b
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ //
+ // This is the case where the byte by byte copy fails on the load
+ // when we copy the head. We need to finish the pipeline and copy
+ // zeros for the rest of the destination. Since this happens
+ // at the top we still need to fill the body and tail.
+.failure_in_pipe2:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
+2:
+(p16) mov val1[0]=r0
+(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
+ br.ctop.dptk 2b
+ ;;
+ sub len=enddst,dst1,1 // precompute len
+ br.cond.dptk.many .failure_in1bis
+ ;;
+
+ //
+ // Here we handle the head & tail part when we check for alignment.
+ // The following code handles only the load failures. The
+ // main diffculty comes from the fact that loads/stores are
+ // scheduled. So when you fail on a load, the stores corresponding
+ // to previous successful loads must be executed.
+ //
+ // However some simplifications are possible given the way
+ // things work.
+ //
+ // 1) HEAD
+ // Theory of operation:
+ //
+ // Page A | Page B
+ // ---------|-----
+ // 1|8 x
+ // 1 2|8 x
+ // 4|8 x
+ // 1 4|8 x
+ // 2 4|8 x
+ // 1 2 4|8 x
+ // |1
+ // |2 x
+ // |4 x
+ //
+ // page_size >= 4k (2^12). (x means 4, 2, 1)
+ // Here we suppose Page A exists and Page B does not.
+ //
+ // As we move towards eight byte alignment we may encounter faults.
+ // The numbers on each page show the size of the load (current alignment).
+ //
+ // Key point:
+ // - if you fail on 1, 2, 4 then you have never executed any smaller
+ // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
+ // before.
+ //
+ // This allows us to simplify the cleanup code, because basically you
+ // only have to worry about "pending" stores in the case of a failing
+ // ld8(). Given the way the code is written today, this means only
+ // worry about st2, st4. There we can use the information encapsulated
+ // into the predicates.
+ //
+ // Other key point:
+ // - if you fail on the ld8 in the head, it means you went straight
+ // to it, i.e. 8byte alignment within an unexisting page.
+ // Again this comes from the fact that if you crossed just for the ld8 then
+ // you are 8byte aligned but also 16byte align, therefore you would
+ // either go for the 16byte copy loop OR the ld8 in the tail part.
+ // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
+ // because it would mean you had 15bytes to copy in which case you
+ // would have defaulted to the byte by byte copy.
+ //
+ //
+ // 2) TAIL
+ // Here we now we have less than 16 bytes AND we are either 8 or 16 byte
+ // aligned.
+ //
+ // Key point:
+ // This means that we either:
+ // - are right on a page boundary
+ // OR
+ // - are at more than 16 bytes from a page boundary with
+ // at most 15 bytes to copy: no chance of crossing.
+ //
+ // This allows us to assume that if we fail on a load we haven't possibly
+ // executed any of the previous (tail) ones, so we don't need to do
+ // any stores. For instance, if we fail on ld2, this means we had
+ // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
+ //
+ // This means that we are in a situation similar the a fault in the
+ // head part. That's nice!
+ //
+.failure_in1:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
+ sub len=endsrc,src1,1
+ //
+ // we know that ret0 can never be zero at this point
+ // because we failed why trying to do a load, i.e. there is still
+ // some work to do.
+ // The failure_in1bis and length problem is taken care of at the
+ // calling side.
+ //
+ ;;
+.failure_in1bis: // from (.failure_in3)
+ mov ar.lc=len // Continue with a stupid byte store.
+ ;;
+5:
+ st1 [dst1]=r0,1
+ br.cloop.dptk 5b
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ //
+ // Here we simply restart the loop but instead
+ // of doing loads we fill the pipeline with zeroes
+ // We can't simply store r0 because we may have valid
+ // data in transit in the pipeline.
+ // ar.lc and ar.ec are setup correctly at this point
+ //
+ // we MUST use src1/endsrc here and not dst1/enddst because
+ // of the pipeline effect.
+ //
+.failure_in3:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
+ ;;
+2:
+(p16) mov val1[0]=r0
+(p16) mov val2[0]=r0
+(EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
+(EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
+ br.ctop.dptk 2b
+ ;;
+ cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
+ sub len=enddst,dst1,1 // precompute len
+(p6) br.cond.dptk .failure_in1bis
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+.failure_in2:
+ sub ret0=endsrc,src1
+ cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
+ sub len=enddst,dst1,1 // precompute len
+(p6) br.cond.dptk .failure_in1bis
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ //
+ // handling of failures on stores: that's the easy part
+ //
+.failure_out:
+ sub ret0=enddst,dst1
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+END(__copy_user)
diff --git a/arch/ia64/lib/csum_partial_copy.c b/arch/ia64/lib/csum_partial_copy.c
new file mode 100644
index 000000000..118daf5a0
--- /dev/null
+++ b/arch/ia64/lib/csum_partial_copy.c
@@ -0,0 +1,140 @@
+/*
+ * Network Checksum & Copy routine
+ *
+ * Copyright (C) 1999, 2003-2004 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * Most of the code has been imported from Linux/Alpha
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+
+#include <asm/uaccess.h>
+
+/*
+ * XXX Fixme: those 2 inlines are meant for debugging and will go away
+ */
+static inline unsigned
+short from64to16(unsigned long x)
+{
+ /* add up 32-bit words for 33 bits */
+ x = (x & 0xffffffff) + (x >> 32);
+ /* add up 16-bit and 17-bit words for 17+c bits */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up 16-bit and 2-bit for 16+c bit */
+ x = (x & 0xffff) + (x >> 16);
+ /* add up carry.. */
+ x = (x & 0xffff) + (x >> 16);
+ return x;
+}
+
+static inline
+unsigned long do_csum_c(const unsigned char * buff, int len, unsigned int psum)
+{
+ int odd, count;
+ unsigned long result = (unsigned long)psum;
+
+ if (len <= 0)
+ goto out;
+ odd = 1 & (unsigned long) buff;
+ if (odd) {
+ result = *buff << 8;
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) {
+ result += *(unsigned short *) buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ if (4 & (unsigned long) buff) {
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+ if (count) {
+ unsigned long carry = 0;
+ do {
+ unsigned long w = *(unsigned long *) buff;
+ count--;
+ buff += 8;
+ result += carry;
+ result += w;
+ carry = (w > result);
+ } while (count);
+ result += carry;
+ result = (result & 0xffffffff) + (result >> 32);
+ }
+ if (len & 4) {
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) {
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1)
+ result += *buff;
+
+ result = from64to16(result);
+
+ if (odd)
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
+
+out:
+ return result;
+}
+
+/*
+ * XXX Fixme
+ *
+ * This is very ugly but temporary. THIS NEEDS SERIOUS ENHANCEMENTS.
+ * But it's very tricky to get right even in C.
+ */
+extern unsigned long do_csum(const unsigned char *, long);
+
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum psum, int *errp)
+{
+ unsigned long result;
+
+ /* XXX Fixme
+ * for now we separate the copy from checksum for obvious
+ * alignment difficulties. Look at the Alpha code and you'll be
+ * scared.
+ */
+
+ if (__copy_from_user(dst, src, len) != 0 && errp)
+ *errp = -EFAULT;
+
+ result = do_csum(dst, len);
+
+ /* add in old sum, and carry.. */
+ result += (__force u32)psum;
+ /* 32+c bits -> 32 bits */
+ result = (result & 0xffffffff) + (result >> 32);
+ return (__force __wsum)result;
+}
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{
+ return csum_partial_copy_from_user((__force const void __user *)src,
+ dst, len, sum, NULL);
+}
+
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
diff --git a/arch/ia64/lib/do_csum.S b/arch/ia64/lib/do_csum.S
new file mode 100644
index 000000000..1a431a5cf
--- /dev/null
+++ b/arch/ia64/lib/do_csum.S
@@ -0,0 +1,323 @@
+/*
+ *
+ * Optmized version of the standard do_csum() function
+ *
+ * Return: a 64bit quantity containing the 16bit Internet checksum
+ *
+ * Inputs:
+ * in0: address of buffer to checksum (char *)
+ * in1: length of the buffer (int)
+ *
+ * Copyright (C) 1999, 2001-2002 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 02/04/22 Ken Chen <kenneth.w.chen@intel.com>
+ * Data locality study on the checksum buffer.
+ * More optimization cleanup - remove excessive stop bits.
+ * 02/04/08 David Mosberger <davidm@hpl.hp.com>
+ * More cleanup and tuning.
+ * 01/04/18 Jun Nakajima <jun.nakajima@intel.com>
+ * Clean up and optimize and the software pipeline, loading two
+ * back-to-back 8-byte words per loop. Clean up the initialization
+ * for the loop. Support the cases where load latency = 1 or 2.
+ * Set CONFIG_IA64_LOAD_LATENCY to 1 or 2 (default).
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// Theory of operations:
+// The goal is to go as quickly as possible to the point where
+// we can checksum 16 bytes/loop. Before reaching that point we must
+// take care of incorrect alignment of first byte.
+//
+// The code hereafter also takes care of the "tail" part of the buffer
+// before entering the core loop, if any. The checksum is a sum so it
+// allows us to commute operations. So we do the "head" and "tail"
+// first to finish at full speed in the body. Once we get the head and
+// tail values, we feed them into the pipeline, very handy initialization.
+//
+// Of course we deal with the special case where the whole buffer fits
+// into one 8 byte word. In this case we have only one entry in the pipeline.
+//
+// We use a (LOAD_LATENCY+2)-stage pipeline in the loop to account for
+// possible load latency and also to accommodate for head and tail.
+//
+// The end of the function deals with folding the checksum from 64bits
+// down to 16bits taking care of the carry.
+//
+// This version avoids synchronization in the core loop by also using a
+// pipeline for the accumulation of the checksum in resultx[] (x=1,2).
+//
+// wordx[] (x=1,2)
+// |---|
+// | | 0 : new value loaded in pipeline
+// |---|
+// | | - : in transit data
+// |---|
+// | | LOAD_LATENCY : current value to add to checksum
+// |---|
+// | | LOAD_LATENCY+1 : previous value added to checksum
+// |---| (previous iteration)
+//
+// resultx[] (x=1,2)
+// |---|
+// | | 0 : initial value
+// |---|
+// | | LOAD_LATENCY-1 : new checksum
+// |---|
+// | | LOAD_LATENCY : previous value of checksum
+// |---|
+// | | LOAD_LATENCY+1 : final checksum when out of the loop
+// |---|
+//
+//
+// See RFC1071 "Computing the Internet Checksum" for various techniques for
+// calculating the Internet checksum.
+//
+// NOT YET DONE:
+// - Maybe another algorithm which would take care of the folding at the
+// end in a different manner
+// - Work with people more knowledgeable than me on the network stack
+// to figure out if we could not split the function depending on the
+// type of packet or alignment we get. Like the ip_fast_csum() routine
+// where we know we have at least 20bytes worth of data to checksum.
+// - Do a better job of handling small packets.
+// - Note on prefetching: it was found that under various load, i.e. ftp read/write,
+// nfs read/write, the L1 cache hit rate is at 60% and L2 cache hit rate is at 99.8%
+// on the data that buffer points to (partly because the checksum is often preceded by
+// a copy_from_user()). This finding indiate that lfetch will not be beneficial since
+// the data is already in the cache.
+//
+
+#define saved_pfs r11
+#define hmask r16
+#define tmask r17
+#define first1 r18
+#define firstval r19
+#define firstoff r20
+#define last r21
+#define lastval r22
+#define lastoff r23
+#define saved_lc r24
+#define saved_pr r25
+#define tmp1 r26
+#define tmp2 r27
+#define tmp3 r28
+#define carry1 r29
+#define carry2 r30
+#define first2 r31
+
+#define buf in0
+#define len in1
+
+#define LOAD_LATENCY 2 // XXX fix me
+
+#if (LOAD_LATENCY != 1) && (LOAD_LATENCY != 2)
+# error "Only 1 or 2 is supported/tested for LOAD_LATENCY."
+#endif
+
+#define PIPE_DEPTH (LOAD_LATENCY+2)
+#define ELD p[LOAD_LATENCY] // end of load
+#define ELD_1 p[LOAD_LATENCY+1] // and next stage
+
+// unsigned long do_csum(unsigned char *buf,long len)
+
+GLOBAL_ENTRY(do_csum)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,2,16,0,16
+ .rotr word1[4], word2[4],result1[LOAD_LATENCY+2],result2[LOAD_LATENCY+2]
+ .rotp p[PIPE_DEPTH], pC1[2], pC2[2]
+ mov ret0=r0 // in case we have zero length
+ cmp.lt p0,p6=r0,len // check for zero length or negative (32bit len)
+ ;;
+ add tmp1=buf,len // last byte's address
+ .save pr, saved_pr
+ mov saved_pr=pr // preserve predicates (rotation)
+(p6) br.ret.spnt.many rp // return if zero or negative length
+
+ mov hmask=-1 // initialize head mask
+ tbit.nz p15,p0=buf,0 // is buf an odd address?
+ and first1=-8,buf // 8-byte align down address of first1 element
+
+ and firstoff=7,buf // how many bytes off for first1 element
+ mov tmask=-1 // initialize tail mask
+
+ ;;
+ adds tmp2=-1,tmp1 // last-1
+ and lastoff=7,tmp1 // how many bytes off for last element
+ ;;
+ sub tmp1=8,lastoff // complement to lastoff
+ and last=-8,tmp2 // address of word containing last byte
+ ;;
+ sub tmp3=last,first1 // tmp3=distance from first1 to last
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc // save lc
+ cmp.eq p8,p9=last,first1 // everything fits in one word ?
+
+ ld8 firstval=[first1],8 // load, ahead of time, "first1" word
+ and tmp1=7, tmp1 // make sure that if tmp1==8 -> tmp1=0
+ shl tmp2=firstoff,3 // number of bits
+ ;;
+(p9) ld8 lastval=[last] // load, ahead of time, "last" word, if needed
+ shl tmp1=tmp1,3 // number of bits
+(p9) adds tmp3=-8,tmp3 // effectively loaded
+ ;;
+(p8) mov lastval=r0 // we don't need lastval if first1==last
+ shl hmask=hmask,tmp2 // build head mask, mask off [0,first1off[
+ shr.u tmask=tmask,tmp1 // build tail mask, mask off ]8,lastoff]
+ ;;
+ .body
+#define count tmp3
+
+(p8) and hmask=hmask,tmask // apply tail mask to head mask if 1 word only
+(p9) and word2[0]=lastval,tmask // mask last it as appropriate
+ shr.u count=count,3 // how many 8-byte?
+ ;;
+ // If count is odd, finish this 8-byte word so that we can
+ // load two back-to-back 8-byte words per loop thereafter.
+ and word1[0]=firstval,hmask // and mask it as appropriate
+ tbit.nz p10,p11=count,0 // if (count is odd)
+ ;;
+(p8) mov result1[0]=word1[0]
+(p9) add result1[0]=word1[0],word2[0]
+ ;;
+ cmp.ltu p6,p0=result1[0],word1[0] // check the carry
+ cmp.eq.or.andcm p8,p0=0,count // exit if zero 8-byte
+ ;;
+(p6) adds result1[0]=1,result1[0]
+(p8) br.cond.dptk .do_csum_exit // if (within an 8-byte word)
+(p11) br.cond.dptk .do_csum16 // if (count is even)
+
+ // Here count is odd.
+ ld8 word1[1]=[first1],8 // load an 8-byte word
+ cmp.eq p9,p10=1,count // if (count == 1)
+ adds count=-1,count // loaded an 8-byte word
+ ;;
+ add result1[0]=result1[0],word1[1]
+ ;;
+ cmp.ltu p6,p0=result1[0],word1[1]
+ ;;
+(p6) adds result1[0]=1,result1[0]
+(p9) br.cond.sptk .do_csum_exit // if (count == 1) exit
+ // Fall through to calculate the checksum, feeding result1[0] as
+ // the initial value in result1[0].
+ //
+ // Calculate the checksum loading two 8-byte words per loop.
+ //
+.do_csum16:
+ add first2=8,first1
+ shr.u count=count,1 // we do 16 bytes per loop
+ ;;
+ adds count=-1,count
+ mov carry1=r0
+ mov carry2=r0
+ brp.loop.imp 1f,2f
+ ;;
+ mov ar.ec=PIPE_DEPTH
+ mov ar.lc=count // set lc
+ mov pr.rot=1<<16
+ // result1[0] must be initialized in advance.
+ mov result2[0]=r0
+ ;;
+ .align 32
+1:
+(ELD_1) cmp.ltu pC1[0],p0=result1[LOAD_LATENCY],word1[LOAD_LATENCY+1]
+(pC1[1])adds carry1=1,carry1
+(ELD_1) cmp.ltu pC2[0],p0=result2[LOAD_LATENCY],word2[LOAD_LATENCY+1]
+(pC2[1])adds carry2=1,carry2
+(ELD) add result1[LOAD_LATENCY-1]=result1[LOAD_LATENCY],word1[LOAD_LATENCY]
+(ELD) add result2[LOAD_LATENCY-1]=result2[LOAD_LATENCY],word2[LOAD_LATENCY]
+2:
+(p[0]) ld8 word1[0]=[first1],16
+(p[0]) ld8 word2[0]=[first2],16
+ br.ctop.sptk 1b
+ ;;
+ // Since len is a 32-bit value, carry cannot be larger than a 64-bit value.
+(pC1[1])adds carry1=1,carry1 // since we miss the last one
+(pC2[1])adds carry2=1,carry2
+ ;;
+ add result1[LOAD_LATENCY+1]=result1[LOAD_LATENCY+1],carry1
+ add result2[LOAD_LATENCY+1]=result2[LOAD_LATENCY+1],carry2
+ ;;
+ cmp.ltu p6,p0=result1[LOAD_LATENCY+1],carry1
+ cmp.ltu p7,p0=result2[LOAD_LATENCY+1],carry2
+ ;;
+(p6) adds result1[LOAD_LATENCY+1]=1,result1[LOAD_LATENCY+1]
+(p7) adds result2[LOAD_LATENCY+1]=1,result2[LOAD_LATENCY+1]
+ ;;
+ add result1[0]=result1[LOAD_LATENCY+1],result2[LOAD_LATENCY+1]
+ ;;
+ cmp.ltu p6,p0=result1[0],result2[LOAD_LATENCY+1]
+ ;;
+(p6) adds result1[0]=1,result1[0]
+ ;;
+.do_csum_exit:
+ //
+ // now fold 64 into 16 bits taking care of carry
+ // that's not very good because it has lots of sequentiality
+ //
+ mov tmp3=0xffff
+ zxt4 tmp1=result1[0]
+ shr.u tmp2=result1[0],32
+ ;;
+ add result1[0]=tmp1,tmp2
+ ;;
+ and tmp1=result1[0],tmp3
+ shr.u tmp2=result1[0],16
+ ;;
+ add result1[0]=tmp1,tmp2
+ ;;
+ and tmp1=result1[0],tmp3
+ shr.u tmp2=result1[0],16
+ ;;
+ add result1[0]=tmp1,tmp2
+ ;;
+ and tmp1=result1[0],tmp3
+ shr.u tmp2=result1[0],16
+ ;;
+ add ret0=tmp1,tmp2
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ // if buf was odd then swap bytes
+ mov ar.pfs=saved_pfs // restore ar.ec
+(p15) mux1 ret0=ret0,@rev // reverse word
+ ;;
+ mov ar.lc=saved_lc
+(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
+ br.ret.sptk.many rp
+
+// I (Jun Nakajima) wrote an equivalent code (see below), but it was
+// not much better than the original. So keep the original there so that
+// someone else can challenge.
+//
+// shr.u word1[0]=result1[0],32
+// zxt4 result1[0]=result1[0]
+// ;;
+// add result1[0]=result1[0],word1[0]
+// ;;
+// zxt2 result2[0]=result1[0]
+// extr.u word1[0]=result1[0],16,16
+// shr.u carry1=result1[0],32
+// ;;
+// add result2[0]=result2[0],word1[0]
+// ;;
+// add result2[0]=result2[0],carry1
+// ;;
+// extr.u ret0=result2[0],16,16
+// ;;
+// add ret0=ret0,result2[0]
+// ;;
+// zxt2 ret0=ret0
+// mov ar.pfs=saved_pfs // restore ar.ec
+// mov pr=saved_pr,0xffffffffffff0000
+// ;;
+// // if buf was odd then swap bytes
+// mov ar.lc=saved_lc
+//(p15) mux1 ret0=ret0,@rev // reverse word
+// ;;
+//(p15) shr.u ret0=ret0,64-16 // + shift back to position = swap bytes
+// br.ret.sptk.many rp
+
+END(do_csum)
diff --git a/arch/ia64/lib/flush.S b/arch/ia64/lib/flush.S
new file mode 100644
index 000000000..1d8c88860
--- /dev/null
+++ b/arch/ia64/lib/flush.S
@@ -0,0 +1,117 @@
+/*
+ * Cache flushing routines.
+ *
+ * Copyright (C) 1999-2001, 2005 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 05/28/05 Zoltan Menyhart Dynamic stride size
+ */
+
+#include <asm/asmmacro.h>
+
+
+ /*
+ * flush_icache_range(start,end)
+ *
+ * Make i-cache(s) coherent with d-caches.
+ *
+ * Must deal with range from start to end-1 but nothing else (need to
+ * be careful not to touch addresses that may be unmapped).
+ *
+ * Note: "in0" and "in1" are preserved for debugging purposes.
+ */
+ .section .kprobes.text,"ax"
+GLOBAL_ENTRY(flush_icache_range)
+
+ .prologue
+ alloc r2=ar.pfs,2,0,0,0
+ movl r3=ia64_i_cache_stride_shift
+ mov r21=1
+ ;;
+ ld8 r20=[r3] // r20: stride shift
+ sub r22=in1,r0,1 // last byte address
+ ;;
+ shr.u r23=in0,r20 // start / (stride size)
+ shr.u r22=r22,r20 // (last byte address) / (stride size)
+ shl r21=r21,r20 // r21: stride size of the i-cache(s)
+ ;;
+ sub r8=r22,r23 // number of strides - 1
+ shl r24=r23,r20 // r24: addresses for "fc.i" =
+ // "start" rounded down to stride boundary
+ .save ar.lc,r3
+ mov r3=ar.lc // save ar.lc
+ ;;
+
+ .body
+ mov ar.lc=r8
+ ;;
+ /*
+ * 32 byte aligned loop, even number of (actually 2) bundles
+ */
+.Loop: fc.i r24 // issuable on M0 only
+ add r24=r21,r24 // we flush "stride size" bytes per iteration
+ nop.i 0
+ br.cloop.sptk.few .Loop
+ ;;
+ sync.i
+ ;;
+ srlz.i
+ ;;
+ mov ar.lc=r3 // restore ar.lc
+ br.ret.sptk.many rp
+END(flush_icache_range)
+
+ /*
+ * clflush_cache_range(start,size)
+ *
+ * Flush cache lines from start to start+size-1.
+ *
+ * Must deal with range from start to start+size-1 but nothing else
+ * (need to be careful not to touch addresses that may be
+ * unmapped).
+ *
+ * Note: "in0" and "in1" are preserved for debugging purposes.
+ */
+ .section .kprobes.text,"ax"
+GLOBAL_ENTRY(clflush_cache_range)
+
+ .prologue
+ alloc r2=ar.pfs,2,0,0,0
+ movl r3=ia64_cache_stride_shift
+ mov r21=1
+ add r22=in1,in0
+ ;;
+ ld8 r20=[r3] // r20: stride shift
+ sub r22=r22,r0,1 // last byte address
+ ;;
+ shr.u r23=in0,r20 // start / (stride size)
+ shr.u r22=r22,r20 // (last byte address) / (stride size)
+ shl r21=r21,r20 // r21: stride size of the i-cache(s)
+ ;;
+ sub r8=r22,r23 // number of strides - 1
+ shl r24=r23,r20 // r24: addresses for "fc" =
+ // "start" rounded down to stride
+ // boundary
+ .save ar.lc,r3
+ mov r3=ar.lc // save ar.lc
+ ;;
+
+ .body
+ mov ar.lc=r8
+ ;;
+ /*
+ * 32 byte aligned loop, even number of (actually 2) bundles
+ */
+.Loop_fc:
+ fc r24 // issuable on M0 only
+ add r24=r21,r24 // we flush "stride size" bytes per iteration
+ nop.i 0
+ br.cloop.sptk.few .Loop_fc
+ ;;
+ sync.i
+ ;;
+ srlz.i
+ ;;
+ mov ar.lc=r3 // restore ar.lc
+ br.ret.sptk.many rp
+END(clflush_cache_range)
diff --git a/arch/ia64/lib/idiv32.S b/arch/ia64/lib/idiv32.S
new file mode 100644
index 000000000..2ac28bf0a
--- /dev/null
+++ b/arch/ia64/lib/idiv32.S
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2000 Hewlett-Packard Co
+ * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 32-bit integer division.
+ *
+ * This code is based on the application note entitled "Divide, Square Root
+ * and Remainder Algorithms for the IA-64 Architecture". This document
+ * is available as Intel document number 248725-002 or via the web at
+ * http://developer.intel.com/software/opensource/numerics/
+ *
+ * For more details on the theory behind these algorithms, see "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+
+#include <asm/asmmacro.h>
+
+#ifdef MODULO
+# define OP mod
+#else
+# define OP div
+#endif
+
+#ifdef UNSIGNED
+# define SGN u
+# define EXTEND zxt4
+# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
+#else
+# define SGN
+# define EXTEND sxt4
+# define INT_TO_FP(a,b) fcvt.xf a=b
+# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
+#endif
+
+#define PASTE1(a,b) a##b
+#define PASTE(a,b) PASTE1(a,b)
+#define NAME PASTE(PASTE(__,SGN),PASTE(OP,si3))
+
+GLOBAL_ENTRY(NAME)
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ mov r2 = 0xffdd // r2 = -34 + 65535 (fp reg format bias)
+ EXTEND in0 = in0 // in0 = a
+ EXTEND in1 = in1 // in1 = b
+ ;;
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+#ifdef MODULO
+ sub in1 = r0, in1 // in1 = -b
+#endif
+ ;;
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ INT_TO_FP(f8, f8)
+ INT_TO_FP(f9, f9)
+ ;;
+ setf.exp f7 = r2 // f7 = 2^-34
+ frcpa.s1 f6, p6 = f8, f9 // y0 = frcpa(b)
+ ;;
+(p6) fmpy.s1 f8 = f8, f6 // q0 = a*y0
+(p6) fnma.s1 f6 = f9, f6, f1 // e0 = -b*y0 + 1
+ ;;
+#ifdef MODULO
+ setf.sig f9 = in1 // f9 = -b
+#endif
+(p6) fma.s1 f8 = f6, f8, f8 // q1 = e0*q0 + q0
+(p6) fma.s1 f6 = f6, f6, f7 // e1 = e0*e0 + 2^-34
+ ;;
+#ifdef MODULO
+ setf.sig f7 = in0
+#endif
+(p6) fma.s1 f6 = f6, f8, f8 // q2 = e1*q1 + q1
+ ;;
+ FP_TO_INT(f6, f6) // q = trunc(q2)
+ ;;
+#ifdef MODULO
+ xma.l f6 = f6, f9, f7 // r = q*(-b) + a
+ ;;
+#endif
+ getf.sig r8 = f6 // transfer result to result register
+ br.ret.sptk.many rp
+END(NAME)
diff --git a/arch/ia64/lib/idiv64.S b/arch/ia64/lib/idiv64.S
new file mode 100644
index 000000000..f69bd2b09
--- /dev/null
+++ b/arch/ia64/lib/idiv64.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 1999-2000 Hewlett-Packard Co
+ * Copyright (C) 1999-2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 64-bit integer division.
+ *
+ * This code is based on the application note entitled "Divide, Square Root
+ * and Remainder Algorithms for the IA-64 Architecture". This document
+ * is available as Intel document number 248725-002 or via the web at
+ * http://developer.intel.com/software/opensource/numerics/
+ *
+ * For more details on the theory behind these algorithms, see "IA-64
+ * and Elementary Functions" by Peter Markstein; HP Professional Books
+ * (http://www.hp.com/go/retailbooks/)
+ */
+
+#include <asm/asmmacro.h>
+
+#ifdef MODULO
+# define OP mod
+#else
+# define OP div
+#endif
+
+#ifdef UNSIGNED
+# define SGN u
+# define INT_TO_FP(a,b) fcvt.xuf.s1 a=b
+# define FP_TO_INT(a,b) fcvt.fxu.trunc.s1 a=b
+#else
+# define SGN
+# define INT_TO_FP(a,b) fcvt.xf a=b
+# define FP_TO_INT(a,b) fcvt.fx.trunc.s1 a=b
+#endif
+
+#define PASTE1(a,b) a##b
+#define PASTE(a,b) PASTE1(a,b)
+#define NAME PASTE(PASTE(__,SGN),PASTE(OP,di3))
+
+GLOBAL_ENTRY(NAME)
+ .regstk 2,0,0,0
+ // Transfer inputs to FP registers.
+ setf.sig f8 = in0
+ setf.sig f9 = in1
+ ;;
+ // Convert the inputs to FP, to avoid FP software-assist faults.
+ INT_TO_FP(f8, f8)
+ INT_TO_FP(f9, f9)
+ ;;
+ frcpa.s1 f11, p6 = f8, f9 // y0 = frcpa(b)
+ ;;
+(p6) fmpy.s1 f7 = f8, f11 // q0 = a*y0
+(p6) fnma.s1 f6 = f9, f11, f1 // e0 = -b*y0 + 1
+ ;;
+(p6) fma.s1 f10 = f7, f6, f7 // q1 = q0*e0 + q0
+(p6) fmpy.s1 f7 = f6, f6 // e1 = e0*e0
+ ;;
+#ifdef MODULO
+ sub in1 = r0, in1 // in1 = -b
+#endif
+(p6) fma.s1 f10 = f10, f7, f10 // q2 = q1*e1 + q1
+(p6) fma.s1 f6 = f11, f6, f11 // y1 = y0*e0 + y0
+ ;;
+(p6) fma.s1 f6 = f6, f7, f6 // y2 = y1*e1 + y1
+(p6) fnma.s1 f7 = f9, f10, f8 // r = -b*q2 + a
+ ;;
+#ifdef MODULO
+ setf.sig f8 = in0 // f8 = a
+ setf.sig f9 = in1 // f9 = -b
+#endif
+(p6) fma.s1 f11 = f7, f6, f10 // q3 = r*y2 + q2
+ ;;
+ FP_TO_INT(f11, f11) // q = trunc(q3)
+ ;;
+#ifdef MODULO
+ xma.l f11 = f11, f9, f8 // r = q*(-b) + a
+ ;;
+#endif
+ getf.sig r8 = f11 // transfer result to result register
+ br.ret.sptk.many rp
+END(NAME)
diff --git a/arch/ia64/lib/io.c b/arch/ia64/lib/io.c
new file mode 100644
index 000000000..bcd16f8ad
--- /dev/null
+++ b/arch/ia64/lib/io.c
@@ -0,0 +1,164 @@
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <asm/io.h>
+
+/*
+ * Copy data from IO memory space to "real" memory space.
+ * This needs to be optimized.
+ */
+void memcpy_fromio(void *to, const volatile void __iomem *from, long count)
+{
+ char *dst = to;
+
+ while (count) {
+ count--;
+ *dst++ = readb(from++);
+ }
+}
+EXPORT_SYMBOL(memcpy_fromio);
+
+/*
+ * Copy data from "real" memory space to IO memory space.
+ * This needs to be optimized.
+ */
+void memcpy_toio(volatile void __iomem *to, const void *from, long count)
+{
+ const char *src = from;
+
+ while (count) {
+ count--;
+ writeb(*src++, to++);
+ }
+}
+EXPORT_SYMBOL(memcpy_toio);
+
+/*
+ * "memset" on IO memory space.
+ * This needs to be optimized.
+ */
+void memset_io(volatile void __iomem *dst, int c, long count)
+{
+ unsigned char ch = (char)(c & 0xff);
+
+ while (count) {
+ count--;
+ writeb(ch, dst);
+ dst++;
+ }
+}
+EXPORT_SYMBOL(memset_io);
+
+#ifdef CONFIG_IA64_GENERIC
+
+#undef __ia64_inb
+#undef __ia64_inw
+#undef __ia64_inl
+#undef __ia64_outb
+#undef __ia64_outw
+#undef __ia64_outl
+#undef __ia64_readb
+#undef __ia64_readw
+#undef __ia64_readl
+#undef __ia64_readq
+#undef __ia64_readb_relaxed
+#undef __ia64_readw_relaxed
+#undef __ia64_readl_relaxed
+#undef __ia64_readq_relaxed
+#undef __ia64_writeb
+#undef __ia64_writew
+#undef __ia64_writel
+#undef __ia64_writeq
+#undef __ia64_mmiowb
+
+unsigned int
+__ia64_inb (unsigned long port)
+{
+ return ___ia64_inb(port);
+}
+
+unsigned int
+__ia64_inw (unsigned long port)
+{
+ return ___ia64_inw(port);
+}
+
+unsigned int
+__ia64_inl (unsigned long port)
+{
+ return ___ia64_inl(port);
+}
+
+void
+__ia64_outb (unsigned char val, unsigned long port)
+{
+ ___ia64_outb(val, port);
+}
+
+void
+__ia64_outw (unsigned short val, unsigned long port)
+{
+ ___ia64_outw(val, port);
+}
+
+void
+__ia64_outl (unsigned int val, unsigned long port)
+{
+ ___ia64_outl(val, port);
+}
+
+unsigned char
+__ia64_readb (void __iomem *addr)
+{
+ return ___ia64_readb (addr);
+}
+
+unsigned short
+__ia64_readw (void __iomem *addr)
+{
+ return ___ia64_readw (addr);
+}
+
+unsigned int
+__ia64_readl (void __iomem *addr)
+{
+ return ___ia64_readl (addr);
+}
+
+unsigned long
+__ia64_readq (void __iomem *addr)
+{
+ return ___ia64_readq (addr);
+}
+
+unsigned char
+__ia64_readb_relaxed (void __iomem *addr)
+{
+ return ___ia64_readb (addr);
+}
+
+unsigned short
+__ia64_readw_relaxed (void __iomem *addr)
+{
+ return ___ia64_readw (addr);
+}
+
+unsigned int
+__ia64_readl_relaxed (void __iomem *addr)
+{
+ return ___ia64_readl (addr);
+}
+
+unsigned long
+__ia64_readq_relaxed (void __iomem *addr)
+{
+ return ___ia64_readq (addr);
+}
+
+void
+__ia64_mmiowb(void)
+{
+ ___ia64_mmiowb();
+}
+
+#endif /* CONFIG_IA64_GENERIC */
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S
new file mode 100644
index 000000000..620d9dc52
--- /dev/null
+++ b/arch/ia64/lib/ip_fast_csum.S
@@ -0,0 +1,144 @@
+/*
+ * Optmized version of the ip_fast_csum() function
+ * Used for calculating IP header checksum
+ *
+ * Return: 16bit checksum, complemented
+ *
+ * Inputs:
+ * in0: address of buffer to checksum (char *)
+ * in1: length of the buffer (int)
+ *
+ * Copyright (C) 2002, 2006 Intel Corp.
+ * Copyright (C) 2002, 2006 Ken Chen <kenneth.w.chen@intel.com>
+ */
+
+#include <asm/asmmacro.h>
+
+/*
+ * Since we know that most likely this function is called with buf aligned
+ * on 4-byte boundary and 20 bytes in length, we can execution rather quickly
+ * versus calling generic version of do_csum, which has lots of overhead in
+ * handling various alignments and sizes. However, due to lack of constrains
+ * put on the function input argument, cases with alignment not on 4-byte or
+ * size not equal to 20 bytes will be handled by the generic do_csum function.
+ */
+
+#define in0 r32
+#define in1 r33
+#define in2 r34
+#define in3 r35
+#define in4 r36
+#define ret0 r8
+
+GLOBAL_ENTRY(ip_fast_csum)
+ .prologue
+ .body
+ cmp.ne p6,p7=5,in1 // size other than 20 byte?
+ and r14=3,in0 // is it aligned on 4-byte?
+ add r15=4,in0 // second source pointer
+ ;;
+ cmp.ne.or.andcm p6,p7=r14,r0
+ ;;
+(p7) ld4 r20=[in0],8
+(p7) ld4 r21=[r15],8
+(p6) br.spnt .generic
+ ;;
+ ld4 r22=[in0],8
+ ld4 r23=[r15],8
+ ;;
+ ld4 r24=[in0]
+ add r20=r20,r21
+ add r22=r22,r23
+ ;;
+ add r20=r20,r22
+ ;;
+ add r20=r20,r24
+ ;;
+ shr.u ret0=r20,16 // now need to add the carry
+ zxt2 r20=r20
+ ;;
+ add r20=ret0,r20
+ ;;
+ shr.u ret0=r20,16 // add carry again
+ zxt2 r20=r20
+ ;;
+ add r20=ret0,r20
+ ;;
+ shr.u ret0=r20,16
+ zxt2 r20=r20
+ ;;
+ add r20=ret0,r20
+ mov r9=0xffff
+ ;;
+ andcm ret0=r9,r20
+ .restore sp // reset frame state
+ br.ret.sptk.many b0
+ ;;
+
+.generic:
+ .prologue
+ .save ar.pfs, r35
+ alloc r35=ar.pfs,2,2,2,0
+ .save rp, r34
+ mov r34=b0
+ .body
+ dep.z out1=in1,2,30
+ mov out0=in0
+ ;;
+ br.call.sptk.many b0=do_csum
+ ;;
+ andcm ret0=-1,ret0
+ mov ar.pfs=r35
+ mov b0=r34
+ br.ret.sptk.many b0
+END(ip_fast_csum)
+
+GLOBAL_ENTRY(csum_ipv6_magic)
+ ld4 r20=[in0],4
+ ld4 r21=[in1],4
+ zxt4 in2=in2
+ ;;
+ ld4 r22=[in0],4
+ ld4 r23=[in1],4
+ dep r15=in3,in2,32,16
+ ;;
+ ld4 r24=[in0],4
+ ld4 r25=[in1],4
+ mux1 r15=r15,@rev
+ add r16=r20,r21
+ add r17=r22,r23
+ zxt4 in4=in4
+ ;;
+ ld4 r26=[in0],4
+ ld4 r27=[in1],4
+ shr.u r15=r15,16
+ add r18=r24,r25
+ add r8=r16,r17
+ ;;
+ add r19=r26,r27
+ add r8=r8,r18
+ ;;
+ add r8=r8,r19
+ add r15=r15,in4
+ ;;
+ add r8=r8,r15
+ ;;
+ shr.u r10=r8,32 // now fold sum into short
+ zxt4 r11=r8
+ ;;
+ add r8=r10,r11
+ ;;
+ shr.u r10=r8,16 // yeah, keep it rolling
+ zxt2 r11=r8
+ ;;
+ add r8=r10,r11
+ ;;
+ shr.u r10=r8,16 // three times lucky
+ zxt2 r11=r8
+ ;;
+ add r8=r10,r11
+ mov r9=0xffff
+ ;;
+ andcm r8=r9,r8
+ br.ret.sptk.many b0
+END(csum_ipv6_magic)
diff --git a/arch/ia64/lib/memcpy.S b/arch/ia64/lib/memcpy.S
new file mode 100644
index 000000000..448908d80
--- /dev/null
+++ b/arch/ia64/lib/memcpy.S
@@ -0,0 +1,301 @@
+/*
+ *
+ * Optimized version of the standard memcpy() function
+ *
+ * Inputs:
+ * in0: destination address
+ * in1: source address
+ * in2: number of bytes to copy
+ * Output:
+ * no return value
+ *
+ * Copyright (C) 2000-2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(memcpy)
+
+# define MEM_LAT 21 /* latency to memory */
+
+# define dst r2
+# define src r3
+# define retval r8
+# define saved_pfs r9
+# define saved_lc r10
+# define saved_pr r11
+# define cnt r16
+# define src2 r17
+# define t0 r18
+# define t1 r19
+# define t2 r20
+# define t3 r21
+# define t4 r22
+# define src_end r23
+
+# define N (MEM_LAT + 4)
+# define Nrot ((N + 7) & ~7)
+
+ /*
+ * First, check if everything (src, dst, len) is a multiple of eight. If
+ * so, we handle everything with no taken branches (other than the loop
+ * itself) and a small icache footprint. Otherwise, we jump off to
+ * the more general copy routine handling arbitrary
+ * sizes/alignment etc.
+ */
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,3,Nrot,0,Nrot
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc
+ or t0=in0,in1
+ ;;
+
+ or t0=t0,in2
+ .save pr, saved_pr
+ mov saved_pr=pr
+
+ .body
+
+ cmp.eq p6,p0=in2,r0 // zero length?
+ mov retval=in0 // return dst
+(p6) br.ret.spnt.many rp // zero length, return immediately
+ ;;
+
+ mov dst=in0 // copy because of rotation
+ shr.u cnt=in2,3 // number of 8-byte words to copy
+ mov pr.rot=1<<16
+ ;;
+
+ adds cnt=-1,cnt // br.ctop is repeat/until
+ cmp.gtu p7,p0=16,in2 // copying less than 16 bytes?
+ mov ar.ec=N
+ ;;
+
+ and t0=0x7,t0
+ mov ar.lc=cnt
+ ;;
+ cmp.ne p6,p0=t0,r0
+
+ mov src=in1 // copy because of rotation
+(p7) br.cond.spnt.few .memcpy_short
+(p6) br.cond.spnt.few .memcpy_long
+ ;;
+ nop.m 0
+ ;;
+ nop.m 0
+ nop.i 0
+ ;;
+ nop.m 0
+ ;;
+ .rotr val[N]
+ .rotp p[N]
+ .align 32
+1: { .mib
+(p[0]) ld8 val[0]=[src],8
+ nop.i 0
+ brp.loop.imp 1b, 2f
+}
+2: { .mfb
+(p[N-1])st8 [dst]=val[N-1],8
+ nop.f 0
+ br.ctop.dptk.few 1b
+}
+ ;;
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,-1
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ /*
+ * Small (<16 bytes) unaligned copying is done via a simple byte-at-the-time
+ * copy loop. This performs relatively poorly on Itanium, but it doesn't
+ * get used very often (gcc inlines small copies) and due to atomicity
+ * issues, we want to avoid read-modify-write of entire words.
+ */
+ .align 32
+.memcpy_short:
+ adds cnt=-1,in2 // br.ctop is repeat/until
+ mov ar.ec=MEM_LAT
+ brp.loop.imp 1f, 2f
+ ;;
+ mov ar.lc=cnt
+ ;;
+ nop.m 0
+ ;;
+ nop.m 0
+ nop.i 0
+ ;;
+ nop.m 0
+ ;;
+ nop.m 0
+ ;;
+ /*
+ * It is faster to put a stop bit in the loop here because it makes
+ * the pipeline shorter (and latency is what matters on short copies).
+ */
+ .align 32
+1: { .mib
+(p[0]) ld1 val[0]=[src],1
+ nop.i 0
+ brp.loop.imp 1b, 2f
+} ;;
+2: { .mfb
+(p[MEM_LAT-1])st1 [dst]=val[MEM_LAT-1],1
+ nop.f 0
+ br.ctop.dptk.few 1b
+} ;;
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,-1
+ mov ar.pfs=saved_pfs
+ br.ret.sptk.many rp
+
+ /*
+ * Large (>= 16 bytes) copying is done in a fancy way. Latency isn't
+ * an overriding concern here, but throughput is. We first do
+ * sub-word copying until the destination is aligned, then we check
+ * if the source is also aligned. If so, we do a simple load/store-loop
+ * until there are less than 8 bytes left over and then we do the tail,
+ * by storing the last few bytes using sub-word copying. If the source
+ * is not aligned, we branch off to the non-congruent loop.
+ *
+ * stage: op:
+ * 0 ld
+ * :
+ * MEM_LAT+3 shrp
+ * MEM_LAT+4 st
+ *
+ * On Itanium, the pipeline itself runs without stalls. However, br.ctop
+ * seems to introduce an unavoidable bubble in the pipeline so the overall
+ * latency is 2 cycles/iteration. This gives us a _copy_ throughput
+ * of 4 byte/cycle. Still not bad.
+ */
+# undef N
+# undef Nrot
+# define N (MEM_LAT + 5) /* number of stages */
+# define Nrot ((N+1 + 2 + 7) & ~7) /* number of rotating regs */
+
+#define LOG_LOOP_SIZE 6
+
+.memcpy_long:
+ alloc t3=ar.pfs,3,Nrot,0,Nrot // resize register frame
+ and t0=-8,src // t0 = src & ~7
+ and t2=7,src // t2 = src & 7
+ ;;
+ ld8 t0=[t0] // t0 = 1st source word
+ adds src2=7,src // src2 = (src + 7)
+ sub t4=r0,dst // t4 = -dst
+ ;;
+ and src2=-8,src2 // src2 = (src + 7) & ~7
+ shl t2=t2,3 // t2 = 8*(src & 7)
+ shl t4=t4,3 // t4 = 8*(dst & 7)
+ ;;
+ ld8 t1=[src2] // t1 = 1st source word if src is 8-byte aligned, 2nd otherwise
+ sub t3=64,t2 // t3 = 64-8*(src & 7)
+ shr.u t0=t0,t2
+ ;;
+ add src_end=src,in2
+ shl t1=t1,t3
+ mov pr=t4,0x38 // (p5,p4,p3)=(dst & 7)
+ ;;
+ or t0=t0,t1
+ mov cnt=r0
+ adds src_end=-1,src_end
+ ;;
+(p3) st1 [dst]=t0,1
+(p3) shr.u t0=t0,8
+(p3) adds cnt=1,cnt
+ ;;
+(p4) st2 [dst]=t0,2
+(p4) shr.u t0=t0,16
+(p4) adds cnt=2,cnt
+ ;;
+(p5) st4 [dst]=t0,4
+(p5) adds cnt=4,cnt
+ and src_end=-8,src_end // src_end = last word of source buffer
+ ;;
+
+ // At this point, dst is aligned to 8 bytes and there at least 16-7=9 bytes left to copy:
+
+1:{ add src=cnt,src // make src point to remainder of source buffer
+ sub cnt=in2,cnt // cnt = number of bytes left to copy
+ mov t4=ip
+ } ;;
+ and src2=-8,src // align source pointer
+ adds t4=.memcpy_loops-1b,t4
+ mov ar.ec=N
+
+ and t0=7,src // t0 = src & 7
+ shr.u t2=cnt,3 // t2 = number of 8-byte words left to copy
+ shl cnt=cnt,3 // move bits 0-2 to 3-5
+ ;;
+
+ .rotr val[N+1], w[2]
+ .rotp p[N]
+
+ cmp.ne p6,p0=t0,r0 // is src aligned, too?
+ shl t0=t0,LOG_LOOP_SIZE // t0 = 8*(src & 7)
+ adds t2=-1,t2 // br.ctop is repeat/until
+ ;;
+ add t4=t0,t4
+ mov pr=cnt,0x38 // set (p5,p4,p3) to # of bytes last-word bytes to copy
+ mov ar.lc=t2
+ ;;
+ nop.m 0
+ ;;
+ nop.m 0
+ nop.i 0
+ ;;
+ nop.m 0
+ ;;
+(p6) ld8 val[1]=[src2],8 // prime the pump...
+ mov b6=t4
+ br.sptk.few b6
+ ;;
+
+.memcpy_tail:
+ // At this point, (p5,p4,p3) are set to the number of bytes left to copy (which is
+ // less than 8) and t0 contains the last few bytes of the src buffer:
+(p5) st4 [dst]=t0,4
+(p5) shr.u t0=t0,32
+ mov ar.lc=saved_lc
+ ;;
+(p4) st2 [dst]=t0,2
+(p4) shr.u t0=t0,16
+ mov ar.pfs=saved_pfs
+ ;;
+(p3) st1 [dst]=t0
+ mov pr=saved_pr,-1
+ br.ret.sptk.many rp
+
+///////////////////////////////////////////////////////
+ .align 64
+
+#define COPY(shift,index) \
+ 1: { .mib \
+ (p[0]) ld8 val[0]=[src2],8; \
+ (p[MEM_LAT+3]) shrp w[0]=val[MEM_LAT+3],val[MEM_LAT+4-index],shift; \
+ brp.loop.imp 1b, 2f \
+ }; \
+ 2: { .mfb \
+ (p[MEM_LAT+4]) st8 [dst]=w[1],8; \
+ nop.f 0; \
+ br.ctop.dptk.few 1b; \
+ }; \
+ ;; \
+ ld8 val[N-1]=[src_end]; /* load last word (may be same as val[N]) */ \
+ ;; \
+ shrp t0=val[N-1],val[N-index],shift; \
+ br .memcpy_tail
+.memcpy_loops:
+ COPY(0, 1) /* no point special casing this---it doesn't go any faster without shrp */
+ COPY(8, 0)
+ COPY(16, 0)
+ COPY(24, 0)
+ COPY(32, 0)
+ COPY(40, 0)
+ COPY(48, 0)
+ COPY(56, 0)
+
+END(memcpy)
diff --git a/arch/ia64/lib/memcpy_mck.S b/arch/ia64/lib/memcpy_mck.S
new file mode 100644
index 000000000..ab0f87639
--- /dev/null
+++ b/arch/ia64/lib/memcpy_mck.S
@@ -0,0 +1,666 @@
+/*
+ * Itanium 2-optimized version of memcpy and copy_user function
+ *
+ * Inputs:
+ * in0: destination address
+ * in1: source address
+ * in2: number of bytes to copy
+ * Output:
+ * for memcpy: return dest
+ * for copy_user: return 0 if success,
+ * or number of byte NOT copied if error occurred.
+ *
+ * Copyright (C) 2002 Intel Corp.
+ * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com>
+ */
+#include <asm/asmmacro.h>
+#include <asm/page.h>
+
+#define EK(y...) EX(y)
+
+/* McKinley specific optimization */
+
+#define retval r8
+#define saved_pfs r31
+#define saved_lc r10
+#define saved_pr r11
+#define saved_in0 r14
+#define saved_in1 r15
+#define saved_in2 r16
+
+#define src0 r2
+#define src1 r3
+#define dst0 r17
+#define dst1 r18
+#define cnt r9
+
+/* r19-r30 are temp for each code section */
+#define PREFETCH_DIST 8
+#define src_pre_mem r19
+#define dst_pre_mem r20
+#define src_pre_l2 r21
+#define dst_pre_l2 r22
+#define t1 r23
+#define t2 r24
+#define t3 r25
+#define t4 r26
+#define t5 t1 // alias!
+#define t6 t2 // alias!
+#define t7 t3 // alias!
+#define n8 r27
+#define t9 t5 // alias!
+#define t10 t4 // alias!
+#define t11 t7 // alias!
+#define t12 t6 // alias!
+#define t14 t10 // alias!
+#define t13 r28
+#define t15 r29
+#define tmp r30
+
+/* defines for long_copy block */
+#define A 0
+#define B (PREFETCH_DIST)
+#define C (B + PREFETCH_DIST)
+#define D (C + 1)
+#define N (D + 1)
+#define Nrot ((N + 7) & ~7)
+
+/* alias */
+#define in0 r32
+#define in1 r33
+#define in2 r34
+
+GLOBAL_ENTRY(memcpy)
+ and r28=0x7,in0
+ and r29=0x7,in1
+ mov f6=f0
+ mov retval=in0
+ br.cond.sptk .common_code
+ ;;
+END(memcpy)
+GLOBAL_ENTRY(__copy_user)
+ .prologue
+// check dest alignment
+ and r28=0x7,in0
+ and r29=0x7,in1
+ mov f6=f1
+ mov saved_in0=in0 // save dest pointer
+ mov saved_in1=in1 // save src pointer
+ mov retval=r0 // initialize return value
+ ;;
+.common_code:
+ cmp.gt p15,p0=8,in2 // check for small size
+ cmp.ne p13,p0=0,r28 // check dest alignment
+ cmp.ne p14,p0=0,r29 // check src alignment
+ add src0=0,in1
+ sub r30=8,r28 // for .align_dest
+ mov saved_in2=in2 // save len
+ ;;
+ add dst0=0,in0
+ add dst1=1,in0 // dest odd index
+ cmp.le p6,p0 = 1,r30 // for .align_dest
+(p15) br.cond.dpnt .memcpy_short
+(p13) br.cond.dpnt .align_dest
+(p14) br.cond.dpnt .unaligned_src
+ ;;
+
+// both dest and src are aligned on 8-byte boundary
+.aligned_src:
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot
+ .save pr, saved_pr
+ mov saved_pr=pr
+
+ shr.u cnt=in2,7 // this much cache line
+ ;;
+ cmp.lt p6,p0=2*PREFETCH_DIST,cnt
+ cmp.lt p7,p8=1,cnt
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc
+ .body
+ add cnt=-1,cnt
+ add src_pre_mem=0,in1 // prefetch src pointer
+ add dst_pre_mem=0,in0 // prefetch dest pointer
+ ;;
+(p7) mov ar.lc=cnt // prefetch count
+(p8) mov ar.lc=r0
+(p6) br.cond.dpnt .long_copy
+ ;;
+
+.prefetch:
+ lfetch.fault [src_pre_mem], 128
+ lfetch.fault.excl [dst_pre_mem], 128
+ br.cloop.dptk.few .prefetch
+ ;;
+
+.medium_copy:
+ and tmp=31,in2 // copy length after iteration
+ shr.u r29=in2,5 // number of 32-byte iteration
+ add dst1=8,dst0 // 2nd dest pointer
+ ;;
+ add cnt=-1,r29 // ctop iteration adjustment
+ cmp.eq p10,p0=r29,r0 // do we really need to loop?
+ add src1=8,src0 // 2nd src pointer
+ cmp.le p6,p0=8,tmp
+ ;;
+ cmp.le p7,p0=16,tmp
+ mov ar.lc=cnt // loop setup
+ cmp.eq p16,p17 = r0,r0
+ mov ar.ec=2
+(p10) br.dpnt.few .aligned_src_tail
+ ;;
+ TEXT_ALIGN(32)
+1:
+EX(.ex_handler, (p16) ld8 r34=[src0],16)
+EK(.ex_handler, (p16) ld8 r38=[src1],16)
+EX(.ex_handler, (p17) st8 [dst0]=r33,16)
+EK(.ex_handler, (p17) st8 [dst1]=r37,16)
+ ;;
+EX(.ex_handler, (p16) ld8 r32=[src0],16)
+EK(.ex_handler, (p16) ld8 r36=[src1],16)
+EX(.ex_handler, (p16) st8 [dst0]=r34,16)
+EK(.ex_handler, (p16) st8 [dst1]=r38,16)
+ br.ctop.dptk.few 1b
+ ;;
+
+.aligned_src_tail:
+EX(.ex_handler, (p6) ld8 t1=[src0])
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+EX(.ex_hndlr_s, (p7) ld8 t2=[src1],8)
+ cmp.le p8,p0=24,tmp
+ and r21=-8,tmp
+ ;;
+EX(.ex_hndlr_s, (p8) ld8 t3=[src1])
+EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1
+ and in2=7,tmp // remaining length
+EX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2
+ add src0=src0,r21 // setting up src pointer
+ add dst0=dst0,r21 // setting up dest pointer
+ ;;
+EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3
+ mov pr=saved_pr,-1
+ br.dptk.many .memcpy_short
+ ;;
+
+/* code taken from copy_page_mck */
+.long_copy:
+ .rotr v[2*PREFETCH_DIST]
+ .rotp p[N]
+
+ mov src_pre_mem = src0
+ mov pr.rot = 0x10000
+ mov ar.ec = 1 // special unrolled loop
+
+ mov dst_pre_mem = dst0
+
+ add src_pre_l2 = 8*8, src0
+ add dst_pre_l2 = 8*8, dst0
+ ;;
+ add src0 = 8, src_pre_mem // first t1 src
+ mov ar.lc = 2*PREFETCH_DIST - 1
+ shr.u cnt=in2,7 // number of lines
+ add src1 = 3*8, src_pre_mem // first t3 src
+ add dst0 = 8, dst_pre_mem // first t1 dst
+ add dst1 = 3*8, dst_pre_mem // first t3 dst
+ ;;
+ and tmp=127,in2 // remaining bytes after this block
+ add cnt = -(2*PREFETCH_DIST) - 1, cnt
+ // same as .line_copy loop, but with all predicated-off instructions removed:
+.prefetch_loop:
+EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0
+EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2
+ br.ctop.sptk .prefetch_loop
+ ;;
+ cmp.eq p16, p0 = r0, r0 // reset p16 to 1
+ mov ar.lc = cnt
+ mov ar.ec = N // # of stages in pipeline
+ ;;
+.line_copy:
+EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0
+EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1
+EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memory
+EK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2
+ ;;
+EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memory
+EK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2
+EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2
+EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3
+ ;;
+EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)
+EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8)
+ ;;
+EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)
+EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8)
+ ;;
+EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)
+EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t6, 3*8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8)
+ ;;
+EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)
+EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8)
+ ;;
+EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)
+EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8)
+ ;;
+EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)
+EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)
+EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)
+EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8)
+ br.ctop.sptk .line_copy
+ ;;
+
+ add dst0=-8,dst0
+ add src0=-8,src0
+ mov in2=tmp
+ .restore sp
+ br.sptk.many .medium_copy
+ ;;
+
+#define BLOCK_SIZE 128*32
+#define blocksize r23
+#define curlen r24
+
+// dest is on 8-byte boundary, src is not. We need to do
+// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle.
+.unaligned_src:
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,3,5,0,8
+ .save ar.lc, saved_lc
+ mov saved_lc=ar.lc
+ .save pr, saved_pr
+ mov saved_pr=pr
+ .body
+.4k_block:
+ mov saved_in0=dst0 // need to save all input arguments
+ mov saved_in2=in2
+ mov blocksize=BLOCK_SIZE
+ ;;
+ cmp.lt p6,p7=blocksize,in2
+ mov saved_in1=src0
+ ;;
+(p6) mov in2=blocksize
+ ;;
+ shr.u r21=in2,7 // this much cache line
+ shr.u r22=in2,4 // number of 16-byte iteration
+ and curlen=15,in2 // copy length after iteration
+ and r30=7,src0 // source alignment
+ ;;
+ cmp.lt p7,p8=1,r21
+ add cnt=-1,r21
+ ;;
+
+ add src_pre_mem=0,src0 // prefetch src pointer
+ add dst_pre_mem=0,dst0 // prefetch dest pointer
+ and src0=-8,src0 // 1st src pointer
+(p7) mov ar.lc = cnt
+(p8) mov ar.lc = r0
+ ;;
+ TEXT_ALIGN(32)
+1: lfetch.fault [src_pre_mem], 128
+ lfetch.fault.excl [dst_pre_mem], 128
+ br.cloop.dptk.few 1b
+ ;;
+
+ shladd dst1=r22,3,dst0 // 2nd dest pointer
+ shladd src1=r22,3,src0 // 2nd src pointer
+ cmp.eq p8,p9=r22,r0 // do we really need to loop?
+ cmp.le p6,p7=8,curlen; // have at least 8 byte remaining?
+ add cnt=-1,r22 // ctop iteration adjustment
+ ;;
+EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primer
+EK(.ex_handler, (p9) ld8 r37=[src1],8)
+(p8) br.dpnt.few .noloop
+ ;;
+
+// The jump address is calculated based on src alignment. The COPYU
+// macro below need to confine its size to power of two, so an entry
+// can be caulated using shl instead of an expensive multiply. The
+// size is then hard coded by the following #define to match the
+// actual size. This make it somewhat tedious when COPYU macro gets
+// changed and this need to be adjusted to match.
+#define LOOP_SIZE 6
+1:
+ mov r29=ip // jmp_table thread
+ mov ar.lc=cnt
+ ;;
+ add r29=.jump_table - 1b - (.jmp1-.jump_table), r29
+ shl r28=r30, LOOP_SIZE // jmp_table thread
+ mov ar.ec=2 // loop setup
+ ;;
+ add r29=r29,r28 // jmp_table thread
+ cmp.eq p16,p17=r0,r0
+ ;;
+ mov b6=r29 // jmp_table thread
+ ;;
+ br.cond.sptk.few b6
+
+// for 8-15 byte case
+// We will skip the loop, but need to replicate the side effect
+// that the loop produces.
+.noloop:
+EX(.ex_handler, (p6) ld8 r37=[src1],8)
+ add src0=8,src0
+(p6) shl r25=r30,3
+ ;;
+EX(.ex_handler, (p6) ld8 r27=[src1])
+(p6) shr.u r28=r37,r25
+(p6) sub r26=64,r25
+ ;;
+(p6) shl r27=r27,r26
+ ;;
+(p6) or r21=r28,r27
+
+.unaligned_src_tail:
+/* check if we have more than blocksize to copy, if so go back */
+ cmp.gt p8,p0=saved_in2,blocksize
+ ;;
+(p8) add dst0=saved_in0,blocksize
+(p8) add src0=saved_in1,blocksize
+(p8) sub in2=saved_in2,blocksize
+(p8) br.dpnt .4k_block
+ ;;
+
+/* we have up to 15 byte to copy in the tail.
+ * part of work is already done in the jump table code
+ * we are at the following state.
+ * src side:
+ *
+ * xxxxxx xx <----- r21 has xxxxxxxx already
+ * -------- -------- --------
+ * 0 8 16
+ * ^
+ * |
+ * src1
+ *
+ * dst
+ * -------- -------- --------
+ * ^
+ * |
+ * dst1
+ */
+EX(.ex_handler, (p6) st8 [dst1]=r21,8) // more than 8 byte to copy
+(p6) add curlen=-8,curlen // update length
+ mov ar.pfs=saved_pfs
+ ;;
+ mov ar.lc=saved_lc
+ mov pr=saved_pr,-1
+ mov in2=curlen // remaining length
+ mov dst0=dst1 // dest pointer
+ add src0=src1,r30 // forward by src alignment
+ ;;
+
+// 7 byte or smaller.
+.memcpy_short:
+ cmp.le p8,p9 = 1,in2
+ cmp.le p10,p11 = 2,in2
+ cmp.le p12,p13 = 3,in2
+ cmp.le p14,p15 = 4,in2
+ add src1=1,src0 // second src pointer
+ add dst1=1,dst0 // second dest pointer
+ ;;
+
+EX(.ex_handler_short, (p8) ld1 t1=[src0],2)
+EK(.ex_handler_short, (p10) ld1 t2=[src1],2)
+(p9) br.ret.dpnt rp // 0 byte copy
+ ;;
+
+EX(.ex_handler_short, (p8) st1 [dst0]=t1,2)
+EK(.ex_handler_short, (p10) st1 [dst1]=t2,2)
+(p11) br.ret.dpnt rp // 1 byte copy
+
+EX(.ex_handler_short, (p12) ld1 t3=[src0],2)
+EK(.ex_handler_short, (p14) ld1 t4=[src1],2)
+(p13) br.ret.dpnt rp // 2 byte copy
+ ;;
+
+ cmp.le p6,p7 = 5,in2
+ cmp.le p8,p9 = 6,in2
+ cmp.le p10,p11 = 7,in2
+
+EX(.ex_handler_short, (p12) st1 [dst0]=t3,2)
+EK(.ex_handler_short, (p14) st1 [dst1]=t4,2)
+(p15) br.ret.dpnt rp // 3 byte copy
+ ;;
+
+EX(.ex_handler_short, (p6) ld1 t5=[src0],2)
+EK(.ex_handler_short, (p8) ld1 t6=[src1],2)
+(p7) br.ret.dpnt rp // 4 byte copy
+ ;;
+
+EX(.ex_handler_short, (p6) st1 [dst0]=t5,2)
+EK(.ex_handler_short, (p8) st1 [dst1]=t6,2)
+(p9) br.ret.dptk rp // 5 byte copy
+
+EX(.ex_handler_short, (p10) ld1 t7=[src0],2)
+(p11) br.ret.dptk rp // 6 byte copy
+ ;;
+
+EX(.ex_handler_short, (p10) st1 [dst0]=t7,2)
+ br.ret.dptk rp // done all cases
+
+
+/* Align dest to nearest 8-byte boundary. We know we have at
+ * least 7 bytes to copy, enough to crawl to 8-byte boundary.
+ * Actual number of byte to crawl depend on the dest alignment.
+ * 7 byte or less is taken care at .memcpy_short
+
+ * src0 - source even index
+ * src1 - source odd index
+ * dst0 - dest even index
+ * dst1 - dest odd index
+ * r30 - distance to 8-byte boundary
+ */
+
+.align_dest:
+ add src1=1,in1 // source odd index
+ cmp.le p7,p0 = 2,r30 // for .align_dest
+ cmp.le p8,p0 = 3,r30 // for .align_dest
+EX(.ex_handler_short, (p6) ld1 t1=[src0],2)
+ cmp.le p9,p0 = 4,r30 // for .align_dest
+ cmp.le p10,p0 = 5,r30
+ ;;
+EX(.ex_handler_short, (p7) ld1 t2=[src1],2)
+EK(.ex_handler_short, (p8) ld1 t3=[src0],2)
+ cmp.le p11,p0 = 6,r30
+EX(.ex_handler_short, (p6) st1 [dst0] = t1,2)
+ cmp.le p12,p0 = 7,r30
+ ;;
+EX(.ex_handler_short, (p9) ld1 t4=[src1],2)
+EK(.ex_handler_short, (p10) ld1 t5=[src0],2)
+EX(.ex_handler_short, (p7) st1 [dst1] = t2,2)
+EK(.ex_handler_short, (p8) st1 [dst0] = t3,2)
+ ;;
+EX(.ex_handler_short, (p11) ld1 t6=[src1],2)
+EK(.ex_handler_short, (p12) ld1 t7=[src0],2)
+ cmp.eq p6,p7=r28,r29
+EX(.ex_handler_short, (p9) st1 [dst1] = t4,2)
+EK(.ex_handler_short, (p10) st1 [dst0] = t5,2)
+ sub in2=in2,r30
+ ;;
+EX(.ex_handler_short, (p11) st1 [dst1] = t6,2)
+EK(.ex_handler_short, (p12) st1 [dst0] = t7)
+ add dst0=in0,r30 // setup arguments
+ add src0=in1,r30
+(p6) br.cond.dptk .aligned_src
+(p7) br.cond.dpnt .unaligned_src
+ ;;
+
+/* main loop body in jump table format */
+#define COPYU(shift) \
+1: \
+EX(.ex_handler, (p16) ld8 r32=[src0],8); /* 1 */ \
+EK(.ex_handler, (p16) ld8 r36=[src1],8); \
+ (p17) shrp r35=r33,r34,shift;; /* 1 */ \
+EX(.ex_handler, (p6) ld8 r22=[src1]); /* common, prime for tail section */ \
+ nop.m 0; \
+ (p16) shrp r38=r36,r37,shift; \
+EX(.ex_handler, (p17) st8 [dst0]=r35,8); /* 1 */ \
+EK(.ex_handler, (p17) st8 [dst1]=r39,8); \
+ br.ctop.dptk.few 1b;; \
+ (p7) add src1=-8,src1; /* back out for <8 byte case */ \
+ shrp r21=r22,r38,shift; /* speculative work */ \
+ br.sptk.few .unaligned_src_tail /* branch out of jump table */ \
+ ;;
+ TEXT_ALIGN(32)
+.jump_table:
+ COPYU(8) // unaligned cases
+.jmp1:
+ COPYU(16)
+ COPYU(24)
+ COPYU(32)
+ COPYU(40)
+ COPYU(48)
+ COPYU(56)
+
+#undef A
+#undef B
+#undef C
+#undef D
+
+/*
+ * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
+ * instruction failed in the bundle. The exception algorithm is that we
+ * first figure out the faulting address, then detect if there is any
+ * progress made on the copy, if so, redo the copy from last known copied
+ * location up to the faulting address (exclusive). In the copy_from_user
+ * case, remaining byte in kernel buffer will be zeroed.
+ *
+ * Take copy_from_user as an example, in the code there are multiple loads
+ * in a bundle and those multiple loads could span over two pages, the
+ * faulting address is calculated as page_round_down(max(src0, src1)).
+ * This is based on knowledge that if we can access one byte in a page, we
+ * can access any byte in that page.
+ *
+ * predicate used in the exception handler:
+ * p6-p7: direction
+ * p10-p11: src faulting addr calculation
+ * p12-p13: dst faulting addr calculation
+ */
+
+#define A r19
+#define B r20
+#define C r21
+#define D r22
+#define F r28
+
+#define memset_arg0 r32
+#define memset_arg2 r33
+
+#define saved_retval loc0
+#define saved_rtlink loc1
+#define saved_pfs_stack loc2
+
+.ex_hndlr_s:
+ add src0=8,src0
+ br.sptk .ex_handler
+ ;;
+.ex_hndlr_d:
+ add dst0=8,dst0
+ br.sptk .ex_handler
+ ;;
+.ex_hndlr_lcpy_1:
+ mov src1=src_pre_mem
+ mov dst1=dst_pre_mem
+ cmp.gtu p10,p11=src_pre_mem,saved_in1
+ cmp.gtu p12,p13=dst_pre_mem,saved_in0
+ ;;
+(p10) add src0=8,saved_in1
+(p11) mov src0=saved_in1
+(p12) add dst0=8,saved_in0
+(p13) mov dst0=saved_in0
+ br.sptk .ex_handler
+.ex_handler_lcpy:
+ // in line_copy block, the preload addresses should always ahead
+ // of the other two src/dst pointers. Furthermore, src1/dst1 should
+ // always ahead of src0/dst0.
+ mov src1=src_pre_mem
+ mov dst1=dst_pre_mem
+.ex_handler:
+ mov pr=saved_pr,-1 // first restore pr, lc, and pfs
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ ;;
+.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
+ cmp.ltu p6,p7=saved_in0, saved_in1 // get the copy direction
+ cmp.ltu p10,p11=src0,src1
+ cmp.ltu p12,p13=dst0,dst1
+ fcmp.eq p8,p0=f6,f0 // is it memcpy?
+ mov tmp = dst0
+ ;;
+(p11) mov src1 = src0 // pick the larger of the two
+(p13) mov dst0 = dst1 // make dst0 the smaller one
+(p13) mov dst1 = tmp // and dst1 the larger one
+ ;;
+(p6) dep F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
+(p7) dep F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
+ ;;
+(p6) cmp.le p14,p0=dst0,saved_in0 // no progress has been made on store
+(p7) cmp.le p14,p0=src0,saved_in1 // no progress has been made on load
+ mov retval=saved_in2
+(p8) ld1 tmp=[src1] // force an oops for memcpy call
+(p8) st1 [dst1]=r0 // force an oops for memcpy call
+(p14) br.ret.sptk.many rp
+
+/*
+ * The remaining byte to copy is calculated as:
+ *
+ * A = (faulting_addr - orig_src) -> len to faulting ld address
+ * or
+ * (faulting_addr - orig_dst) -> len to faulting st address
+ * B = (cur_dst - orig_dst) -> len copied so far
+ * C = A - B -> len need to be copied
+ * D = orig_len - A -> len need to be zeroed
+ */
+(p6) sub A = F, saved_in0
+(p7) sub A = F, saved_in1
+ clrrrb
+ ;;
+ alloc saved_pfs_stack=ar.pfs,3,3,3,0
+ cmp.lt p8,p0=A,r0
+ sub B = dst0, saved_in0 // how many byte copied so far
+ ;;
+(p8) mov A = 0; // A shouldn't be negative, cap it
+ ;;
+ sub C = A, B
+ sub D = saved_in2, A
+ ;;
+ cmp.gt p8,p0=C,r0 // more than 1 byte?
+ add memset_arg0=saved_in0, A
+(p6) mov memset_arg2=0 // copy_to_user should not call memset
+(p7) mov memset_arg2=D // copy_from_user need to have kbuf zeroed
+ mov r8=0
+ mov saved_retval = D
+ mov saved_rtlink = b0
+
+ add out0=saved_in0, B
+ add out1=saved_in1, B
+ mov out2=C
+(p8) br.call.sptk.few b0=__copy_user // recursive call
+ ;;
+
+ add saved_retval=saved_retval,r8 // above might return non-zero value
+ cmp.gt p8,p0=memset_arg2,r0 // more than 1 byte?
+ mov out0=memset_arg0 // *s
+ mov out1=r0 // c
+ mov out2=memset_arg2 // n
+(p8) br.call.sptk.few b0=memset
+ ;;
+
+ mov retval=saved_retval
+ mov ar.pfs=saved_pfs_stack
+ mov b0=saved_rtlink
+ br.ret.sptk.many rp
+
+/* end of McKinley specific optimization */
+END(__copy_user)
diff --git a/arch/ia64/lib/memset.S b/arch/ia64/lib/memset.S
new file mode 100644
index 000000000..f26c16aef
--- /dev/null
+++ b/arch/ia64/lib/memset.S
@@ -0,0 +1,362 @@
+/* Optimized version of the standard memset() function.
+
+ Copyright (c) 2002 Hewlett-Packard Co/CERN
+ Sverre Jarp <Sverre.Jarp@cern.ch>
+
+ Return: dest
+
+ Inputs:
+ in0: dest
+ in1: value
+ in2: count
+
+ The algorithm is fairly straightforward: set byte by byte until we
+ we get to a 16B-aligned address, then loop on 128 B chunks using an
+ early store as prefetching, then loop on 32B chucks, then clear remaining
+ words, finally clear remaining bytes.
+ Since a stf.spill f0 can store 16B in one go, we use this instruction
+ to get peak speed when value = 0. */
+
+#include <asm/asmmacro.h>
+#undef ret
+
+#define dest in0
+#define value in1
+#define cnt in2
+
+#define tmp r31
+#define save_lc r30
+#define ptr0 r29
+#define ptr1 r28
+#define ptr2 r27
+#define ptr3 r26
+#define ptr9 r24
+#define loopcnt r23
+#define linecnt r22
+#define bytecnt r21
+
+#define fvalue f6
+
+// This routine uses only scratch predicate registers (p6 - p15)
+#define p_scr p6 // default register for same-cycle branches
+#define p_nz p7
+#define p_zr p8
+#define p_unalgn p9
+#define p_y p11
+#define p_n p12
+#define p_yy p13
+#define p_nn p14
+
+#define MIN1 15
+#define MIN1P1HALF 8
+#define LINE_SIZE 128
+#define LSIZE_SH 7 // shift amount
+#define PREF_AHEAD 8
+
+GLOBAL_ENTRY(memset)
+{ .mmi
+ .prologue
+ alloc tmp = ar.pfs, 3, 0, 0, 0
+ lfetch.nt1 [dest] //
+ .save ar.lc, save_lc
+ mov.i save_lc = ar.lc
+ .body
+} { .mmi
+ mov ret0 = dest // return value
+ cmp.ne p_nz, p_zr = value, r0 // use stf.spill if value is zero
+ cmp.eq p_scr, p0 = cnt, r0
+;; }
+{ .mmi
+ and ptr2 = -(MIN1+1), dest // aligned address
+ and tmp = MIN1, dest // prepare to check for correct alignment
+ tbit.nz p_y, p_n = dest, 0 // Do we have an odd address? (M_B_U)
+} { .mib
+ mov ptr1 = dest
+ mux1 value = value, @brcst // create 8 identical bytes in word
+(p_scr) br.ret.dpnt.many rp // return immediately if count = 0
+;; }
+{ .mib
+ cmp.ne p_unalgn, p0 = tmp, r0 //
+} { .mib
+ sub bytecnt = (MIN1+1), tmp // NB: # of bytes to move is 1 higher than loopcnt
+ cmp.gt p_scr, p0 = 16, cnt // is it a minimalistic task?
+(p_scr) br.cond.dptk.many .move_bytes_unaligned // go move just a few (M_B_U)
+;; }
+{ .mmi
+(p_unalgn) add ptr1 = (MIN1+1), ptr2 // after alignment
+(p_unalgn) add ptr2 = MIN1P1HALF, ptr2 // after alignment
+(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 3 // should we do a st8 ?
+;; }
+{ .mib
+(p_y) add cnt = -8, cnt //
+(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 2 // should we do a st4 ?
+} { .mib
+(p_y) st8 [ptr2] = value,-4 //
+(p_n) add ptr2 = 4, ptr2 //
+;; }
+{ .mib
+(p_yy) add cnt = -4, cnt //
+(p_unalgn) tbit.nz.unc p_y, p_n = bytecnt, 1 // should we do a st2 ?
+} { .mib
+(p_yy) st4 [ptr2] = value,-2 //
+(p_nn) add ptr2 = 2, ptr2 //
+;; }
+{ .mmi
+ mov tmp = LINE_SIZE+1 // for compare
+(p_y) add cnt = -2, cnt //
+(p_unalgn) tbit.nz.unc p_yy, p_nn = bytecnt, 0 // should we do a st1 ?
+} { .mmi
+ setf.sig fvalue=value // transfer value to FLP side
+(p_y) st2 [ptr2] = value,-1 //
+(p_n) add ptr2 = 1, ptr2 //
+;; }
+
+{ .mmi
+(p_yy) st1 [ptr2] = value //
+ cmp.gt p_scr, p0 = tmp, cnt // is it a minimalistic task?
+} { .mbb
+(p_yy) add cnt = -1, cnt //
+(p_scr) br.cond.dpnt.many .fraction_of_line // go move just a few
+;; }
+
+{ .mib
+ nop.m 0
+ shr.u linecnt = cnt, LSIZE_SH
+(p_zr) br.cond.dptk.many .l1b // Jump to use stf.spill
+;; }
+
+ TEXT_ALIGN(32) // --------------------- // L1A: store ahead into cache lines; fill later
+{ .mmi
+ and tmp = -(LINE_SIZE), cnt // compute end of range
+ mov ptr9 = ptr1 // used for prefetching
+ and cnt = (LINE_SIZE-1), cnt // remainder
+} { .mmi
+ mov loopcnt = PREF_AHEAD-1 // default prefetch loop
+ cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
+;; }
+{ .mmi
+(p_scr) add loopcnt = -1, linecnt //
+ add ptr2 = 8, ptr1 // start of stores (beyond prefetch stores)
+ add ptr1 = tmp, ptr1 // first address beyond total range
+;; }
+{ .mmi
+ add tmp = -1, linecnt // next loop count
+ mov.i ar.lc = loopcnt //
+;; }
+.pref_l1a:
+{ .mib
+ stf8 [ptr9] = fvalue, 128 // Do stores one cache line apart
+ nop.i 0
+ br.cloop.dptk.few .pref_l1a
+;; }
+{ .mmi
+ add ptr0 = 16, ptr2 // Two stores in parallel
+ mov.i ar.lc = tmp //
+;; }
+.l1ax:
+ { .mmi
+ stf8 [ptr2] = fvalue, 8
+ stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 24
+ stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 8
+ stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 24
+ stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 8
+ stf8 [ptr0] = fvalue, 8
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 24
+ stf8 [ptr0] = fvalue, 24
+ ;; }
+ { .mmi
+ stf8 [ptr2] = fvalue, 8
+ stf8 [ptr0] = fvalue, 32
+ cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
+ ;; }
+{ .mmb
+ stf8 [ptr2] = fvalue, 24
+(p_scr) stf8 [ptr9] = fvalue, 128
+ br.cloop.dptk.few .l1ax
+;; }
+{ .mbb
+ cmp.le p_scr, p0 = 8, cnt // just a few bytes left ?
+(p_scr) br.cond.dpnt.many .fraction_of_line // Branch no. 2
+ br.cond.dpnt.many .move_bytes_from_alignment // Branch no. 3
+;; }
+
+ TEXT_ALIGN(32)
+.l1b: // ------------------------------------ // L1B: store ahead into cache lines; fill later
+{ .mmi
+ and tmp = -(LINE_SIZE), cnt // compute end of range
+ mov ptr9 = ptr1 // used for prefetching
+ and cnt = (LINE_SIZE-1), cnt // remainder
+} { .mmi
+ mov loopcnt = PREF_AHEAD-1 // default prefetch loop
+ cmp.gt p_scr, p0 = PREF_AHEAD, linecnt // check against actual value
+;; }
+{ .mmi
+(p_scr) add loopcnt = -1, linecnt
+ add ptr2 = 16, ptr1 // start of stores (beyond prefetch stores)
+ add ptr1 = tmp, ptr1 // first address beyond total range
+;; }
+{ .mmi
+ add tmp = -1, linecnt // next loop count
+ mov.i ar.lc = loopcnt
+;; }
+.pref_l1b:
+{ .mib
+ stf.spill [ptr9] = f0, 128 // Do stores one cache line apart
+ nop.i 0
+ br.cloop.dptk.few .pref_l1b
+;; }
+{ .mmi
+ add ptr0 = 16, ptr2 // Two stores in parallel
+ mov.i ar.lc = tmp
+;; }
+.l1bx:
+ { .mmi
+ stf.spill [ptr2] = f0, 32
+ stf.spill [ptr0] = f0, 32
+ ;; }
+ { .mmi
+ stf.spill [ptr2] = f0, 32
+ stf.spill [ptr0] = f0, 32
+ ;; }
+ { .mmi
+ stf.spill [ptr2] = f0, 32
+ stf.spill [ptr0] = f0, 64
+ cmp.lt p_scr, p0 = ptr9, ptr1 // do we need more prefetching?
+ ;; }
+{ .mmb
+ stf.spill [ptr2] = f0, 32
+(p_scr) stf.spill [ptr9] = f0, 128
+ br.cloop.dptk.few .l1bx
+;; }
+{ .mib
+ cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
+(p_scr) br.cond.dpnt.many .move_bytes_from_alignment //
+;; }
+
+.fraction_of_line:
+{ .mib
+ add ptr2 = 16, ptr1
+ shr.u loopcnt = cnt, 5 // loopcnt = cnt / 32
+;; }
+{ .mib
+ cmp.eq p_scr, p0 = loopcnt, r0
+ add loopcnt = -1, loopcnt
+(p_scr) br.cond.dpnt.many .store_words
+;; }
+{ .mib
+ and cnt = 0x1f, cnt // compute the remaining cnt
+ mov.i ar.lc = loopcnt
+;; }
+ TEXT_ALIGN(32)
+.l2: // ------------------------------------ // L2A: store 32B in 2 cycles
+{ .mmb
+ stf8 [ptr1] = fvalue, 8
+ stf8 [ptr2] = fvalue, 8
+;; } { .mmb
+ stf8 [ptr1] = fvalue, 24
+ stf8 [ptr2] = fvalue, 24
+ br.cloop.dptk.many .l2
+;; }
+.store_words:
+{ .mib
+ cmp.gt p_scr, p0 = 8, cnt // just a few bytes left ?
+(p_scr) br.cond.dpnt.many .move_bytes_from_alignment // Branch
+;; }
+
+{ .mmi
+ stf8 [ptr1] = fvalue, 8 // store
+ cmp.le p_y, p_n = 16, cnt
+ add cnt = -8, cnt // subtract
+;; }
+{ .mmi
+(p_y) stf8 [ptr1] = fvalue, 8 // store
+(p_y) cmp.le.unc p_yy, p_nn = 16, cnt
+(p_y) add cnt = -8, cnt // subtract
+;; }
+{ .mmi // store
+(p_yy) stf8 [ptr1] = fvalue, 8
+(p_yy) add cnt = -8, cnt // subtract
+;; }
+
+.move_bytes_from_alignment:
+{ .mib
+ cmp.eq p_scr, p0 = cnt, r0
+ tbit.nz.unc p_y, p0 = cnt, 2 // should we terminate with a st4 ?
+(p_scr) br.cond.dpnt.few .restore_and_exit
+;; }
+{ .mib
+(p_y) st4 [ptr1] = value,4
+ tbit.nz.unc p_yy, p0 = cnt, 1 // should we terminate with a st2 ?
+;; }
+{ .mib
+(p_yy) st2 [ptr1] = value,2
+ tbit.nz.unc p_y, p0 = cnt, 0 // should we terminate with a st1 ?
+;; }
+
+{ .mib
+(p_y) st1 [ptr1] = value
+;; }
+.restore_and_exit:
+{ .mib
+ nop.m 0
+ mov.i ar.lc = save_lc
+ br.ret.sptk.many rp
+;; }
+
+.move_bytes_unaligned:
+{ .mmi
+ .pred.rel "mutex",p_y, p_n
+ .pred.rel "mutex",p_yy, p_nn
+(p_n) cmp.le p_yy, p_nn = 4, cnt
+(p_y) cmp.le p_yy, p_nn = 5, cnt
+(p_n) add ptr2 = 2, ptr1
+} { .mmi
+(p_y) add ptr2 = 3, ptr1
+(p_y) st1 [ptr1] = value, 1 // fill 1 (odd-aligned) byte [15, 14 (or less) left]
+(p_y) add cnt = -1, cnt
+;; }
+{ .mmi
+(p_yy) cmp.le.unc p_y, p0 = 8, cnt
+ add ptr3 = ptr1, cnt // prepare last store
+ mov.i ar.lc = save_lc
+} { .mmi
+(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
+(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [11, 10 (o less) left]
+(p_yy) add cnt = -4, cnt
+;; }
+{ .mmi
+(p_y) cmp.le.unc p_yy, p0 = 8, cnt
+ add ptr3 = -1, ptr3 // last store
+ tbit.nz p_scr, p0 = cnt, 1 // will there be a st2 at the end ?
+} { .mmi
+(p_y) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
+(p_y) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [7, 6 (or less) left]
+(p_y) add cnt = -4, cnt
+;; }
+{ .mmi
+(p_yy) st2 [ptr1] = value, 4 // fill 2 (aligned) bytes
+(p_yy) st2 [ptr2] = value, 4 // fill 2 (aligned) bytes [3, 2 (or less) left]
+ tbit.nz p_y, p0 = cnt, 0 // will there be a st1 at the end ?
+} { .mmi
+(p_yy) add cnt = -4, cnt
+;; }
+{ .mmb
+(p_scr) st2 [ptr1] = value // fill 2 (aligned) bytes
+(p_y) st1 [ptr3] = value // fill last byte (using ptr3)
+ br.ret.sptk.many rp
+}
+END(memset)
diff --git a/arch/ia64/lib/strlen.S b/arch/ia64/lib/strlen.S
new file mode 100644
index 000000000..e0cdac0a8
--- /dev/null
+++ b/arch/ia64/lib/strlen.S
@@ -0,0 +1,192 @@
+/*
+ *
+ * Optimized version of the standard strlen() function
+ *
+ *
+ * Inputs:
+ * in0 address of string
+ *
+ * Outputs:
+ * ret0 the number of characters in the string (0 if empty string)
+ * does not count the \0
+ *
+ * Copyright (C) 1999, 2001 Hewlett-Packard Co
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 09/24/99 S.Eranian add speculation recovery code
+ */
+
+#include <asm/asmmacro.h>
+
+//
+//
+// This is an enhanced version of the basic strlen. it includes a combination
+// of compute zero index (czx), parallel comparisons, speculative loads and
+// loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+// The goal is to look at the string in chunks of 8 bytes.
+// so we need to do a few extra checks at the beginning because the
+// string may not be 8-byte aligned. In this case we load the 8byte
+// quantity which includes the start of the string and mask the unused
+// bytes with 0xff to avoid confusing czx.
+// We use speculative loads and software pipelining to hide memory
+// latency and do read ahead safely. This way we defer any exception.
+//
+// Because we don't want the kernel to be relying on particular
+// settings of the DCR register, we provide recovery code in case
+// speculation fails. The recovery code is going to "redo" the work using
+// only normal loads. If we still get a fault then we generate a
+// kernel panic. Otherwise we return the strlen as usual.
+//
+// The fact that speculation may fail can be caused, for instance, by
+// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+// a NaT bit will be set if the translation is not present. The normal
+// load, on the other hand, will cause the translation to be inserted
+// if the mapping exists.
+//
+// It should be noted that we execute recovery code only when we need
+// to use the data that has been speculatively loaded: we don't execute
+// recovery code on pure read ahead data.
+//
+// Remarks:
+// - the cmp r0,r0 is used as a fast way to initialize a predicate
+// register to 1. This is required to make sure that we get the parallel
+// compare correct.
+//
+// - we don't use the epilogue counter to exit the loop but we need to set
+// it to zero beforehand.
+//
+// - after the loop we must test for Nat values because neither the
+// czx nor cmp instruction raise a NaT consumption fault. We must be
+// careful not to look too far for a Nat for which we don't care.
+// For instance we don't need to look at a NaT in val2 if the zero byte
+// was in val1.
+//
+// - Clearly performance tuning is required.
+//
+//
+//
+#define saved_pfs r11
+#define tmp r10
+#define base r16
+#define orig r17
+#define saved_pr r18
+#define src r19
+#define mask r20
+#define val r21
+#define val1 r22
+#define val2 r23
+
+GLOBAL_ENTRY(strlen)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,11,0,0,8 // rotating must be multiple of 8
+
+ .rotr v[2], w[2] // declares our 4 aliases
+
+ extr.u tmp=in0,0,3 // tmp=least significant 3 bits
+ mov orig=in0 // keep trackof initial byte address
+ dep src=0,in0,0,3 // src=8byte-aligned in0 address
+ .save pr, saved_pr
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+
+ .body
+
+ ld8 v[1]=[src],8 // must not speculate: can fail here
+ shl tmp=tmp,3 // multiply by 8bits/byte
+ mov mask=-1 // our mask
+ ;;
+ ld8.s w[1]=[src],8 // speculatively load next
+ cmp.eq p6,p0=r0,r0 // sets p6 to true for cmp.and
+ sub tmp=64,tmp // how many bits to shift our mask on the right
+ ;;
+ shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
+ mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
+ ;;
+ add base=-16,src // keep track of aligned base
+ or v[1]=v[1],mask // now we have a safe initial byte pattern
+ ;;
+1:
+ ld8.s v[0]=[src],8 // speculatively load next
+ czx1.r val1=v[1] // search 0 byte from right
+ czx1.r val2=w[1] // search 0 byte from right following 8bytes
+ ;;
+ ld8.s w[0]=[src],8 // speculatively load next to next
+ cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
+ cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
+(p6) br.wtop.dptk 1b // loop until p6 == 0
+ ;;
+ //
+ // We must return try the recovery code iff
+ // val1_is_nat || (val1==8 && val2_is_nat)
+ //
+ // XXX Fixme
+ // - there must be a better way of doing the test
+ //
+ cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate)
+ tnat.nz p6,p7=val1 // test NaT on val1
+(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
+ ;;
+ //
+ // if we come here p7 is true, i.e., initialized for // cmp
+ //
+ cmp.eq.and p7,p0=8,val1// val1==8?
+ tnat.nz.and p7,p0=val2 // test NaT if val2
+(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
+ ;;
+(p8) mov val1=val2 // the other test got us out of the loop
+(p8) adds src=-16,src // correct position when 3 ahead
+(p9) adds src=-24,src // correct position when 4 ahead
+ ;;
+ sub ret0=src,orig // distance from base
+ sub tmp=8,val1 // which byte in word
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // adjust
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.many rp // end of normal execution
+
+ //
+ // Outlined recovery code when speculation failed
+ //
+ // This time we don't use speculation and rely on the normal exception
+ // mechanism. that's why the loop is not as good as the previous one
+ // because read ahead is not possible
+ //
+ // IMPORTANT:
+ // Please note that in the case of strlen() as opposed to strlen_user()
+ // we don't use the exception mechanism, as this function is not
+ // supposed to fail. If that happens it means we have a bug and the
+ // code will cause of kernel fault.
+ //
+ // XXX Fixme
+ // - today we restart from the beginning of the string instead
+ // of trying to continue where we left off.
+ //
+.recover:
+ ld8 val=[base],8 // will fail if unrecoverable fault
+ ;;
+ or val=val,mask // remask first bytes
+ cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
+ ;;
+ //
+ // ar.ec is still zero here
+ //
+2:
+(p6) ld8 val=[base],8 // will fail if unrecoverable fault
+ ;;
+ czx1.r val1=val // search 0 byte from right
+ ;;
+ cmp.eq p6,p0=8,val1 // val1==8 ?
+(p6) br.wtop.dptk 2b // loop until p6 == 0
+ ;; // (avoid WAW on p63)
+ sub ret0=base,orig // distance from base
+ sub tmp=8,val1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.many rp // end of successful recovery code
+END(strlen)
diff --git a/arch/ia64/lib/strlen_user.S b/arch/ia64/lib/strlen_user.S
new file mode 100644
index 000000000..c71eded42
--- /dev/null
+++ b/arch/ia64/lib/strlen_user.S
@@ -0,0 +1,198 @@
+/*
+ * Optimized version of the strlen_user() function
+ *
+ * Inputs:
+ * in0 address of buffer
+ *
+ * Outputs:
+ * ret0 0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1998, 1999, 2001 Hewlett-Packard Co
+ * David Mosberger-Tang <davidm@hpl.hp.com>
+ * Stephane Eranian <eranian@hpl.hp.com>
+ *
+ * 01/19/99 S.Eranian heavily enhanced version (see details below)
+ * 09/24/99 S.Eranian added speculation recovery code
+ */
+
+#include <asm/asmmacro.h>
+
+//
+// int strlen_user(char *)
+// ------------------------
+// Returns:
+// - length of string + 1
+// - 0 in case an exception is raised
+//
+// This is an enhanced version of the basic strlen_user. it includes a
+// combination of compute zero index (czx), parallel comparisons, speculative
+// loads and loop unroll using rotating registers.
+//
+// General Ideas about the algorithm:
+// The goal is to look at the string in chunks of 8 bytes.
+// so we need to do a few extra checks at the beginning because the
+// string may not be 8-byte aligned. In this case we load the 8byte
+// quantity which includes the start of the string and mask the unused
+// bytes with 0xff to avoid confusing czx.
+// We use speculative loads and software pipelining to hide memory
+// latency and do read ahead safely. This way we defer any exception.
+//
+// Because we don't want the kernel to be relying on particular
+// settings of the DCR register, we provide recovery code in case
+// speculation fails. The recovery code is going to "redo" the work using
+// only normal loads. If we still get a fault then we return an
+// error (ret0=0). Otherwise we return the strlen+1 as usual.
+// The fact that speculation may fail can be caused, for instance, by
+// the DCR.dm bit being set. In this case TLB misses are deferred, i.e.,
+// a NaT bit will be set if the translation is not present. The normal
+// load, on the other hand, will cause the translation to be inserted
+// if the mapping exists.
+//
+// It should be noted that we execute recovery code only when we need
+// to use the data that has been speculatively loaded: we don't execute
+// recovery code on pure read ahead data.
+//
+// Remarks:
+// - the cmp r0,r0 is used as a fast way to initialize a predicate
+// register to 1. This is required to make sure that we get the parallel
+// compare correct.
+//
+// - we don't use the epilogue counter to exit the loop but we need to set
+// it to zero beforehand.
+//
+// - after the loop we must test for Nat values because neither the
+// czx nor cmp instruction raise a NaT consumption fault. We must be
+// careful not to look too far for a Nat for which we don't care.
+// For instance we don't need to look at a NaT in val2 if the zero byte
+// was in val1.
+//
+// - Clearly performance tuning is required.
+//
+
+#define saved_pfs r11
+#define tmp r10
+#define base r16
+#define orig r17
+#define saved_pr r18
+#define src r19
+#define mask r20
+#define val r21
+#define val1 r22
+#define val2 r23
+
+GLOBAL_ENTRY(__strlen_user)
+ .prologue
+ .save ar.pfs, saved_pfs
+ alloc saved_pfs=ar.pfs,11,0,0,8
+
+ .rotr v[2], w[2] // declares our 4 aliases
+
+ extr.u tmp=in0,0,3 // tmp=least significant 3 bits
+ mov orig=in0 // keep trackof initial byte address
+ dep src=0,in0,0,3 // src=8byte-aligned in0 address
+ .save pr, saved_pr
+ mov saved_pr=pr // preserve predicates (rotation)
+ ;;
+
+ .body
+
+ ld8.s v[1]=[src],8 // load the initial 8bytes (must speculate)
+ shl tmp=tmp,3 // multiply by 8bits/byte
+ mov mask=-1 // our mask
+ ;;
+ ld8.s w[1]=[src],8 // load next 8 bytes in 2nd pipeline
+ cmp.eq p6,p0=r0,r0 // sets p6 (required because of // cmp.and)
+ sub tmp=64,tmp // how many bits to shift our mask on the right
+ ;;
+ shr.u mask=mask,tmp // zero enough bits to hold v[1] valuable part
+ mov ar.ec=r0 // clear epilogue counter (saved in ar.pfs)
+ ;;
+ add base=-16,src // keep track of aligned base
+ chk.s v[1], .recover // if already NaT, then directly skip to recover
+ or v[1]=v[1],mask // now we have a safe initial byte pattern
+ ;;
+1:
+ ld8.s v[0]=[src],8 // speculatively load next
+ czx1.r val1=v[1] // search 0 byte from right
+ czx1.r val2=w[1] // search 0 byte from right following 8bytes
+ ;;
+ ld8.s w[0]=[src],8 // speculatively load next to next
+ cmp.eq.and p6,p0=8,val1 // p6 = p6 and val1==8
+ cmp.eq.and p6,p0=8,val2 // p6 = p6 and mask==8
+(p6) br.wtop.dptk.few 1b // loop until p6 == 0
+ ;;
+ //
+ // We must return try the recovery code iff
+ // val1_is_nat || (val1==8 && val2_is_nat)
+ //
+ // XXX Fixme
+ // - there must be a better way of doing the test
+ //
+ cmp.eq p8,p9=8,val1 // p6 = val1 had zero (disambiguate)
+ tnat.nz p6,p7=val1 // test NaT on val1
+(p6) br.cond.spnt .recover // jump to recovery if val1 is NaT
+ ;;
+ //
+ // if we come here p7 is true, i.e., initialized for // cmp
+ //
+ cmp.eq.and p7,p0=8,val1// val1==8?
+ tnat.nz.and p7,p0=val2 // test NaT if val2
+(p7) br.cond.spnt .recover // jump to recovery if val2 is NaT
+ ;;
+(p8) mov val1=val2 // val2 contains the value
+(p8) adds src=-16,src // correct position when 3 ahead
+(p9) adds src=-24,src // correct position when 4 ahead
+ ;;
+ sub ret0=src,orig // distance from origin
+ sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.many rp // end of normal execution
+
+ //
+ // Outlined recovery code when speculation failed
+ //
+ // This time we don't use speculation and rely on the normal exception
+ // mechanism. that's why the loop is not as good as the previous one
+ // because read ahead is not possible
+ //
+ // XXX Fixme
+ // - today we restart from the beginning of the string instead
+ // of trying to continue where we left off.
+ //
+.recover:
+ EX(.Lexit1, ld8 val=[base],8) // load the initial bytes
+ ;;
+ or val=val,mask // remask first bytes
+ cmp.eq p0,p6=r0,r0 // nullify first ld8 in loop
+ ;;
+ //
+ // ar.ec is still zero here
+ //
+2:
+ EX(.Lexit1, (p6) ld8 val=[base],8)
+ ;;
+ czx1.r val1=val // search 0 byte from right
+ ;;
+ cmp.eq p6,p0=8,val1 // val1==8 ?
+(p6) br.wtop.dptk.few 2b // loop until p6 == 0
+ ;;
+ sub ret0=base,orig // distance from base
+ sub tmp=7,val1 // 7=8-1 because this strlen returns strlen+1
+ mov pr=saved_pr,0xffffffffffff0000
+ ;;
+ sub ret0=ret0,tmp // length=now - back -1
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.many rp // end of successful recovery code
+
+ //
+ // We failed even on the normal load (called from exception handler)
+ //
+.Lexit1:
+ mov ret0=0
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs // because of ar.ec, restore no matter what
+ br.ret.sptk.many rp
+END(__strlen_user)
diff --git a/arch/ia64/lib/strncpy_from_user.S b/arch/ia64/lib/strncpy_from_user.S
new file mode 100644
index 000000000..a504381f3
--- /dev/null
+++ b/arch/ia64/lib/strncpy_from_user.S
@@ -0,0 +1,44 @@
+/*
+ * Just like strncpy() except that if a fault occurs during copying,
+ * -EFAULT is returned.
+ *
+ * Inputs:
+ * in0: address of destination buffer
+ * in1: address of string to be copied
+ * in2: length of buffer in bytes
+ * Outputs:
+ * r8: -EFAULT in case of fault or number of bytes copied if no fault
+ *
+ * Copyright (C) 1998-2001 Hewlett-Packard Co
+ * Copyright (C) 1998-2001 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ * 00/03/06 D. Mosberger Fixed to return proper return value (bug found by
+ * by Andreas Schwab <schwab@suse.de>).
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(__strncpy_from_user)
+ alloc r2=ar.pfs,3,0,0,0
+ mov r8=0
+ mov r9=in1
+ ;;
+ add r10=in1,in2
+ cmp.eq p6,p0=r0,in2
+(p6) br.ret.spnt.many rp
+
+ // XXX braindead copy loop---this needs to be optimized
+.Loop1:
+ EX(.Lexit, ld1 r8=[in1],1)
+ ;;
+ EX(.Lexit, st1 [in0]=r8,1)
+ cmp.ne p6,p7=r8,r0
+ ;;
+(p6) cmp.ne.unc p8,p0=in1,r10
+(p8) br.cond.dpnt.few .Loop1
+ ;;
+(p6) mov r8=in2 // buffer filled up---return buffer length
+(p7) sub r8=in1,r9,1 // return string length (excluding NUL character)
+[.Lexit:]
+ br.ret.sptk.many rp
+END(__strncpy_from_user)
diff --git a/arch/ia64/lib/strnlen_user.S b/arch/ia64/lib/strnlen_user.S
new file mode 100644
index 000000000..d09066b1e
--- /dev/null
+++ b/arch/ia64/lib/strnlen_user.S
@@ -0,0 +1,45 @@
+/*
+ * Returns 0 if exception before NUL or reaching the supplied limit (N),
+ * a value greater than N if the string is longer than the limit, else
+ * strlen.
+ *
+ * Inputs:
+ * in0: address of buffer
+ * in1: string length limit N
+ * Outputs:
+ * r8: 0 in case of fault, strlen(buffer)+1 otherwise
+ *
+ * Copyright (C) 1999, 2001 David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(__strnlen_user)
+ .prologue
+ alloc r2=ar.pfs,2,0,0,0
+ .save ar.lc, r16
+ mov r16=ar.lc // preserve ar.lc
+
+ .body
+
+ add r3=-1,in1
+ ;;
+ mov ar.lc=r3
+ mov r9=0
+ ;;
+ // XXX braindead strlen loop---this needs to be optimized
+.Loop1:
+ EXCLR(.Lexit, ld1 r8=[in0],1)
+ add r9=1,r9
+ ;;
+ cmp.eq p6,p0=r8,r0
+(p6) br.cond.dpnt .Lexit
+ br.cloop.dptk.few .Loop1
+
+ add r9=1,in1 // NUL not found---return N+1
+ ;;
+.Lexit:
+ mov r8=r9
+ mov ar.lc=r16 // restore ar.lc
+ br.ret.sptk.many rp
+END(__strnlen_user)
diff --git a/arch/ia64/lib/xor.S b/arch/ia64/lib/xor.S
new file mode 100644
index 000000000..54e3f7eab
--- /dev/null
+++ b/arch/ia64/lib/xor.S
@@ -0,0 +1,184 @@
+/*
+ * arch/ia64/lib/xor.S
+ *
+ * Optimized RAID-5 checksumming functions for IA-64.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * You should have received a copy of the GNU General Public License
+ * (for example /usr/src/linux/COPYING); if not, write to the Free
+ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <asm/asmmacro.h>
+
+GLOBAL_ENTRY(xor_ia64_2)
+ .prologue
+ .fframe 0
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 3, 0, 13, 16
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ .body
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ ;;
+ .rotr s1[6+1], s2[6+1], d[2]
+ .rotp p[6+2]
+0:
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+(p[6+1])st8.nta [r8] = d[1], 8
+ nop.f 0
+ br.ctop.dptk.few 0b
+ ;;
+ mov ar.lc = r30
+ mov pr = r29, -1
+ br.ret.sptk.few rp
+END(xor_ia64_2)
+
+GLOBAL_ENTRY(xor_ia64_3)
+ .prologue
+ .fframe 0
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 4, 0, 20, 24
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ .body
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ ;;
+ .rotr s1[6+1], s2[6+1], s3[6+1], d[2]
+ .rotp p[6+2]
+0:
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+ ;;
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[6+1])st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], s3[6]
+ br.ctop.dptk.few 0b
+ ;;
+ mov ar.lc = r30
+ mov pr = r29, -1
+ br.ret.sptk.few rp
+END(xor_ia64_3)
+
+GLOBAL_ENTRY(xor_ia64_4)
+ .prologue
+ .fframe 0
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 5, 0, 27, 32
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ .body
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ mov r19 = in4
+ ;;
+ .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], d[2]
+ .rotp p[6+2]
+0:
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[0]) ld8.nta s4[0] = [r19], 8
+(p[6]) xor r20 = s3[6], s4[6]
+ ;;
+(p[6+1])st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], r20
+ br.ctop.dptk.few 0b
+ ;;
+ mov ar.lc = r30
+ mov pr = r29, -1
+ br.ret.sptk.few rp
+END(xor_ia64_4)
+
+GLOBAL_ENTRY(xor_ia64_5)
+ .prologue
+ .fframe 0
+ .save ar.pfs, r31
+ alloc r31 = ar.pfs, 6, 0, 34, 40
+ .save ar.lc, r30
+ mov r30 = ar.lc
+ .save pr, r29
+ mov r29 = pr
+ ;;
+ .body
+ mov r8 = in1
+ mov ar.ec = 6 + 2
+ shr in0 = in0, 3
+ ;;
+ adds in0 = -1, in0
+ mov r16 = in1
+ mov r17 = in2
+ ;;
+ mov r18 = in3
+ mov ar.lc = in0
+ mov pr.rot = 1 << 16
+ mov r19 = in4
+ mov r20 = in5
+ ;;
+ .rotr s1[6+1], s2[6+1], s3[6+1], s4[6+1], s5[6+1], d[2]
+ .rotp p[6+2]
+0:
+(p[0]) ld8.nta s1[0] = [r16], 8
+(p[0]) ld8.nta s2[0] = [r17], 8
+(p[6]) xor d[0] = s1[6], s2[6]
+(p[0]) ld8.nta s3[0] = [r18], 8
+(p[0]) ld8.nta s4[0] = [r19], 8
+(p[6]) xor r21 = s3[6], s4[6]
+ ;;
+(p[0]) ld8.nta s5[0] = [r20], 8
+(p[6+1])st8.nta [r8] = d[1], 8
+(p[6]) xor d[0] = d[0], r21
+ ;;
+(p[6]) xor d[0] = d[0], s5[6]
+ nop.f 0
+ br.ctop.dptk.few 0b
+ ;;
+ mov ar.lc = r30
+ mov pr = r29, -1
+ br.ret.sptk.few rp
+END(xor_ia64_5)