summaryrefslogtreecommitdiff
path: root/arch/powerpc/lib
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-12-15 14:52:16 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-12-15 14:52:16 -0300
commit8d91c1e411f55d7ea91b1183a2e9f8088fb4d5be (patch)
treee9891aa6c295060d065adffd610c4f49ecf884f3 /arch/powerpc/lib
parenta71852147516bc1cb5b0b3cbd13639bfd4022dc8 (diff)
Linux-libre 4.3.2-gnu
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--arch/powerpc/lib/checksum_32.S16
-rw-r--r--arch/powerpc/lib/checksum_64.S21
-rw-r--r--arch/powerpc/lib/copy_32.S120
3 files changed, 119 insertions, 38 deletions
diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S
index 7874e8a80..6d67e057f 100644
--- a/arch/powerpc/lib/checksum_32.S
+++ b/arch/powerpc/lib/checksum_32.S
@@ -41,22 +41,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(saddr, daddr, len, proto, sum)
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- addze r0,r0 /* add in final carry */
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit)
*
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 57a072065..f3ef35436 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -45,27 +45,6 @@ _GLOBAL(ip_fast_csum)
blr
/*
- * Compute checksum of TCP or UDP pseudo-header:
- * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
- * No real gain trying to do this specially for 64 bit, but
- * the 32 bit addition may spill into the upper bits of
- * the doubleword so we still must fold it down from 64.
- */
-_GLOBAL(csum_tcpudp_magic)
- rlwimi r5,r6,16,0,15 /* put proto in upper half of len */
- addc r0,r3,r4 /* add 4 32-bit words together */
- adde r0,r0,r5
- adde r0,r0,r7
- rldicl r4,r0,32,0 /* fold 64 bit value */
- add r0,r4,r0
- srdi r0,r0,32
- rlwinm r3,r0,16,0,31 /* fold two halves together */
- add r3,r0,r3
- not r3,r3
- srwi r3,r3,16
- blr
-
-/*
* Computes the checksum of a memory block at buff, length len,
* and adds in "sum" (32-bit).
*
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 6813f80d1..c44df2dbe 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -69,9 +69,19 @@ CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable. -- paulus
+ *
+ * During early init, cache might not be active yet, so dcbz cannot be used.
+ * We therefore skip the optimised bloc that uses dcbz. This jump is
+ * replaced by a nop once cache is active. This is done in machine_init()
+ */
_GLOBAL(memset)
rlwimi r4,r4,8,16,23
rlwimi r4,r4,16,0,15
+
addi r6,r3,-4
cmplwi 0,r5,4
blt 7f
@@ -80,7 +90,31 @@ _GLOBAL(memset)
andi. r0,r6,3
add r5,r0,r5
subf r6,r0,r6
- srwi r0,r5,2
+ cmplwi 0,r4,0
+ bne 2f /* Use normal procedure if r4 is not zero */
+_GLOBAL(memset_nocache_branch)
+ b 2f /* Skip optimised bloc until cache is enabled */
+
+ clrlwi r7,r6,32-LG_CACHELINE_BYTES
+ add r8,r7,r5
+ srwi r9,r8,LG_CACHELINE_BYTES
+ addic. r9,r9,-1 /* total number of complete cachelines */
+ ble 2f
+ xori r0,r7,CACHELINE_MASK & ~3
+ srwi. r0,r0,2
+ beq 3f
+ mtctr r0
+4: stwu r4,4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7,4
+10: dcbz r7,r6
+ addi r6,r6,CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r5,r8,32-LG_CACHELINE_BYTES
+ addi r5,r5,4
+
+2: srwi r0,r5,2
mtctr r0
bdz 6f
1: stwu r4,4(r6)
@@ -94,12 +128,96 @@ _GLOBAL(memset)
bdnz 8b
blr
+/*
+ * This version uses dcbz on the complete cache lines in the
+ * destination area to reduce memory traffic. This requires that
+ * the destination area is cacheable.
+ * We only use this version if the source and dest don't overlap.
+ * -- paulus.
+ *
+ * During early init, cache might not be active yet, so dcbz cannot be used.
+ * We therefore jump to generic_memcpy which doesn't use dcbz. This jump is
+ * replaced by a nop once cache is active. This is done in machine_init()
+ */
_GLOBAL(memmove)
cmplw 0,r3,r4
bgt backwards_memcpy
/* fall through */
_GLOBAL(memcpy)
+ b generic_memcpy
+ add r7,r3,r5 /* test if the src & dst overlap */
+ add r8,r4,r5
+ cmplw 0,r4,r7
+ cmplw 1,r3,r8
+ crand 0,0,4 /* cr0.lt &= cr1.lt */
+ blt generic_memcpy /* if regions overlap */
+
+ addi r4,r4,-4
+ addi r6,r3,-4
+ neg r0,r3
+ andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */
+ beq 58f
+
+ cmplw 0,r5,r0 /* is this more than total to do? */
+ blt 63f /* if not much to do */
+ andi. r8,r0,3 /* get it word-aligned first */
+ subf r5,r0,r5
+ mtctr r8
+ beq+ 61f
+70: lbz r9,4(r4) /* do some bytes */
+ addi r4,r4,1
+ addi r6,r6,1
+ stb r9,3(r6)
+ bdnz 70b
+61: srwi. r0,r0,2
+ mtctr r0
+ beq 58f
+72: lwzu r9,4(r4) /* do some words */
+ stwu r9,4(r6)
+ bdnz 72b
+
+58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
+ clrlwi r5,r5,32-LG_CACHELINE_BYTES
+ li r11,4
+ mtctr r0
+ beq 63f
+53:
+ dcbz r11,r6
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 32
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 64
+ COPY_16_BYTES
+ COPY_16_BYTES
+#if L1_CACHE_BYTES >= 128
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+ COPY_16_BYTES
+#endif
+#endif
+#endif
+ bdnz 53b
+
+63: srwi. r0,r5,2
+ mtctr r0
+ beq 64f
+30: lwzu r0,4(r4)
+ stwu r0,4(r6)
+ bdnz 30b
+
+64: andi. r0,r5,3
+ mtctr r0
+ beq+ 65f
+ addi r4,r4,3
+ addi r6,r6,3
+40: lbzu r0,1(r4)
+ stbu r0,1(r6)
+ bdnz 40b
+65: blr
+
+_GLOBAL(generic_memcpy)
srwi. r7,r5,3
addi r6,r3,-4
addi r4,r4,-4