From d635711daa98be86d4c7fd01499c34f566b54ccb Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Fri, 10 Jun 2016 05:30:17 -0300 Subject: Linux-libre 4.6.2-gnu --- arch/powerpc/lib/checksum_32.S | 398 +++++++++++++++++++++++++---------------- 1 file changed, 243 insertions(+), 155 deletions(-) (limited to 'arch/powerpc/lib/checksum_32.S') diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S index 6d67e057f..d90870a66 100644 --- a/arch/powerpc/lib/checksum_32.S +++ b/arch/powerpc/lib/checksum_32.S @@ -14,68 +14,59 @@ #include #include +#include #include #include .text -/* - * ip_fast_csum(buf, len) -- Optimized for IP header - * len is in words and is always >= 5. - */ -_GLOBAL(ip_fast_csum) - lwz r0,0(r3) - lwzu r5,4(r3) - addic. r4,r4,-2 - addc r0,r0,r5 - mtctr r4 - blelr- -1: lwzu r4,4(r3) - adde r0,r0,r4 - bdnz 1b - addze r0,r0 /* add in final carry */ - rlwinm r3,r0,16,0,31 /* fold two halves together */ - add r3,r0,r3 - not r3,r3 - srwi r3,r3,16 - blr - /* * computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit) * - * csum_partial(buff, len, sum) + * __csum_partial(buff, len, sum) */ -_GLOBAL(csum_partial) - addic r0,r5,0 +_GLOBAL(__csum_partial) subi r3,r3,4 - srwi. r6,r4,2 + srawi. r6,r4,2 /* Divide len by 4 and also clear carry */ beq 3f /* if we're doing < 4 bytes */ - andi. r5,r3,2 /* Align buffer to longword boundary */ + andi. r0,r3,2 /* Align buffer to longword boundary */ beq+ 1f - lhz r5,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 + lhz r0,4(r3) /* do 2 bytes to get aligned */ subi r4,r4,2 - addc r0,r0,r5 + addi r3,r3,2 srwi. r6,r4,2 /* # words to do */ + adde r5,r5,r0 beq 3f -1: mtctr r6 -2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ - adde r0,r0,r5 /* be unnecessary to unroll this loop */ +1: andi. r6,r6,3 /* Prepare to handle words 4 by 4 */ + beq 21f + mtctr r6 +2: lwzu r0,4(r3) + adde r5,r5,r0 bdnz 2b - andi. r4,r4,3 -3: cmpwi 0,r4,2 - blt+ 4f - lhz r5,4(r3) +21: srwi. r6,r4,4 /* # blocks of 4 words to do */ + beq 3f + mtctr r6 +22: lwz r0,4(r3) + lwz r6,8(r3) + lwz r7,12(r3) + lwzu r8,16(r3) + adde r5,r5,r0 + adde r5,r5,r6 + adde r5,r5,r7 + adde r5,r5,r8 + bdnz 22b +3: andi. r0,r4,2 + beq+ 4f + lhz r0,4(r3) addi r3,r3,2 - subi r4,r4,2 - adde r0,r0,r5 -4: cmpwi 0,r4,1 - bne+ 5f - lbz r5,4(r3) - slwi r5,r5,8 /* Upper byte of word */ - adde r0,r0,r5 -5: addze r3,r0 /* add in final carry */ + adde r5,r5,r0 +4: andi. r0,r4,1 + beq+ 5f + lbz r0,4(r3) + slwi r0,r0,8 /* Upper byte of word */ + adde r5,r5,r0 +5: addze r3,r5 /* add in final carry */ blr /* @@ -87,123 +78,220 @@ _GLOBAL(csum_partial) * * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) */ +#define CSUM_COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ + adde r12,r12,r7; \ +8 ## n ## 5: \ + stw r8,8(r6); \ + adde r12,r12,r8; \ +8 ## n ## 6: \ + stw r9,12(r6); \ + adde r12,r12,r9; \ +8 ## n ## 7: \ + stwu r10,16(r6); \ + adde r12,r12,r10 + +#define CSUM_COPY_16_BYTES_EXCODE(n) \ +.section __ex_table,"a"; \ + .align 2; \ + .long 8 ## n ## 0b,src_error; \ + .long 8 ## n ## 1b,src_error; \ + .long 8 ## n ## 2b,src_error; \ + .long 8 ## n ## 3b,src_error; \ + .long 8 ## n ## 4b,dst_error; \ + .long 8 ## n ## 5b,dst_error; \ + .long 8 ## n ## 6b,dst_error; \ + .long 8 ## n ## 7b,dst_error; \ + .text + + .text + .stabs "arch/powerpc/lib/",N_SO,0,0,0f + .stabs "checksum_32.S",N_SO,0,0,0f +0: + +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + _GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: srwi. r6,r5,4 /* # groups of 4 words to do */ - beq 10f - mtctr r6 -71: lwz r6,4(r3) -72: lwz r9,8(r3) -73: lwz r10,12(r3) -74: lwzu r11,16(r3) - adde r0,r0,r6 -75: stw r6,4(r4) - adde r0,r0,r9 -76: stw r9,8(r4) - adde r0,r0,r10 -77: stw r10,12(r4) - adde r0,r0,r11 -78: stwu r11,16(r4) - bdnz 71b -10: rlwinm. r6,r5,30,30,31 /* # words left to do */ - beq 13f - mtctr r6 -82: lwzu r9,4(r3) -92: stwu r9,4(r4) - adde r0,r0,r9 - bdnz 82b -13: andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) - addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) + stwu r1,-16(r1) + stw r7,12(r1) + stw r8,8(r1) + + andi. r0,r4,1 /* is destination address even ? */ + cmplwi cr7,r0,0 + addic r12,r6,0 + addi r6,r4,-4 + neg r0,r4 + addi r4,r3,-4 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f + li r3,0 +70: lbz r9,4(r4) /* do some bytes */ + addi r4,r4,1 + slwi r3,r3,8 + rlwimi r3,r9,0,24,31 +71: stb r9,4(r6) + addi r6,r6,1 + bdnz 70b + adde r12,r12,r3 +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ + adde r12,r12,r9 +73: stwu r9,4(r6) + bdnz 72b + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 +/* the main body of the cacheline loop */ + CSUM_COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_BYTES >= 32 + CSUM_COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_BYTES >= 64 + CSUM_COPY_16_BYTES_WITHEX(2) + CSUM_COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_BYTES >= 128 + CSUM_COPY_16_BYTES_WITHEX(4) + CSUM_COPY_16_BYTES_WITHEX(5) + CSUM_COPY_16_BYTES_WITHEX(6) + CSUM_COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) + adde r12,r12,r0 +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,2 + beq+ 65f +40: lhz r0,4(r4) addi r4,r4,2 - adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ - adde r0,r0,r6 -5: addze r3,r0 /* add in final carry */ +41: sth r0,4(r6) + adde r12,r12,r0 + addi r6,r6,2 +65: andi. r0,r5,1 + beq+ 66f +50: lbz r0,4(r4) +51: stb r0,4(r6) + slwi r0,r0,8 + adde r12,r12,r0 +66: addze r3,r12 + addi r1,r1,16 + beqlr+ cr7 + rlwinm r3,r3,8,0,31 /* swap bytes for odd destination */ blr -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ - -src_error_4: - mfctr r6 /* update # bytes remaining from ctr */ - rlwimi r5,r6,4,0,27 - b 79f -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 -79: srwi. r6,r5,2 - beq 3f - mtctr r6 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b +/* read fault */ src_error: - cmpwi 0,r7,0 - beq 1f - li r6,-EFAULT - stw r6,0(r7) -1: addze r3,r0 + lwz r7,12(r1) + addi r1,r1,16 + cmpwi cr0,r7,0 + beqlr + li r0,-EFAULT + stw r0,0(r7) blr - +/* write fault */ dst_error: - cmpwi 0,r8,0 - beq 1f - li r6,-EFAULT - stw r6,0(r8) -1: addze r3,r0 + lwz r8,8(r1) + addi r1,r1,16 + cmpwi cr0,r8,0 + beqlr + li r0,-EFAULT + stw r0,0(r8) blr -.section __ex_table,"a" - .long 81b,src_error_1 - .long 91b,dst_error - .long 71b,src_error_4 - .long 72b,src_error_4 - .long 73b,src_error_4 - .long 74b,src_error_4 - .long 75b,dst_error - .long 76b,dst_error - .long 77b,dst_error - .long 78b,dst_error - .long 82b,src_error_2 - .long 92b,dst_error - .long 83b,src_error_3 - .long 93b,dst_error - .long 84b,src_error_3 - .long 94b,dst_error - .long 95b,dst_error - .long 96b,dst_error - .long 97b,dst_error + .section __ex_table,"a" + .align 2 + .long 70b,src_error + .long 71b,dst_error + .long 72b,src_error + .long 73b,dst_error + .long 54b,dst_error + .text + +/* + * this stuff handles faults in the cacheline loop and branches to either + * src_error (if in read part) or dst_error (if in write part) + */ + CSUM_COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_BYTES >= 32 + CSUM_COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_BYTES >= 64 + CSUM_COPY_16_BYTES_EXCODE(2) + CSUM_COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_BYTES >= 128 + CSUM_COPY_16_BYTES_EXCODE(4) + CSUM_COPY_16_BYTES_EXCODE(5) + CSUM_COPY_16_BYTES_EXCODE(6) + CSUM_COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + + .section __ex_table,"a" + .align 2 + .long 30b,src_error + .long 31b,dst_error + .long 40b,src_error + .long 41b,dst_error + .long 50b,src_error + .long 51b,dst_error -- cgit v1.2.3-54-g00ecf