diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
commit | 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch) | |
tree | 5e910f0e82173f4ef4f51111366a3f1299037a7b /arch/powerpc/lib |
Initial import
Diffstat (limited to 'arch/powerpc/lib')
30 files changed, 10403 insertions, 0 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile new file mode 100644 index 000000000..7902802a1 --- /dev/null +++ b/arch/powerpc/lib/Makefile @@ -0,0 +1,38 @@ +# +# Makefile for ppc-specific library files.. +# + +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) + +CFLAGS_REMOVE_code-patching.o = -pg +CFLAGS_REMOVE_feature-fixups.o = -pg + +obj-y += string.o alloc.o crtsavres.o ppc_ksyms.o code-patching.o \ + feature-fixups.o + +obj-$(CONFIG_PPC32) += div64.o copy_32.o + +obj64-y += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \ + copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ + memcpy_64.o memcmp_64.o + +obj64-$(CONFIG_SMP) += locks.o +obj64-$(CONFIG_ALTIVEC) += vmx-helper.o + +ifeq ($(CONFIG_GENERIC_CSUM),) +obj-y += checksum_$(CONFIG_WORD_SIZE).o +obj-$(CONFIG_PPC64) += checksum_wrappers_64.o +endif + +obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o + +obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o + +obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o + +obj-$(CONFIG_ALTIVEC) += xor_vmx.o +CFLAGS_xor_vmx.o += -maltivec -mabi=altivec + +obj-$(CONFIG_PPC64) += $(obj64-y) diff --git a/arch/powerpc/lib/alloc.c b/arch/powerpc/lib/alloc.c new file mode 100644 index 000000000..60b0b3fc8 --- /dev/null +++ b/arch/powerpc/lib/alloc.c @@ -0,0 +1,19 @@ +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/string.h> +#include <asm/setup.h> + + +void * __init_refok zalloc_maybe_bootmem(size_t size, gfp_t mask) +{ + void *p; + + if (slab_is_available()) + p = kzalloc(size, mask); + else { + p = memblock_virt_alloc(size, 0); + } + return p; +} diff --git a/arch/powerpc/lib/checksum_32.S b/arch/powerpc/lib/checksum_32.S new file mode 100644 index 000000000..7874e8a80 --- /dev/null +++ b/arch/powerpc/lib/checksum_32.S @@ -0,0 +1,225 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + + .text + +/* + * ip_fast_csum(buf, len) -- Optimized for IP header + * len is in words and is always >= 5. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(saddr, daddr, len, proto, sum) + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + addze r0,r0 /* add in final carry */ + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * csum_partial(buff, len, sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 + subi r3,r3,4 + srwi. r6,r4,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r5,r3,2 /* Align buffer to longword boundary */ + beq+ 1f + lhz r5,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r4,r4,2 + addc r0,r0,r5 + srwi. r6,r4,2 /* # words to do */ + beq 3f +1: mtctr r6 +2: lwzu r5,4(r3) /* the bdnz has zero overhead, so it should */ + adde r0,r0,r5 /* be unnecessary to unroll this loop */ + bdnz 2b + andi. r4,r4,3 +3: cmpwi 0,r4,2 + blt+ 4f + lhz r5,4(r3) + addi r3,r3,2 + subi r4,r4,2 + adde r0,r0,r5 +4: cmpwi 0,r4,1 + bne+ 5f + lbz r5,4(r3) + slwi r5,r5,8 /* Upper byte of word */ + adde r0,r0,r5 +5: addze r3,r0 /* add in final carry */ + blr + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively, and (for an error on + * src) zeroes the rest of dst. + * + * csum_partial_copy_generic(src, dst, len, sum, src_err, dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 + subi r3,r3,4 + subi r4,r4,4 + srwi. r6,r5,2 + beq 3f /* if we're doing < 4 bytes */ + andi. r9,r4,2 /* Align dst to longword boundary */ + beq+ 1f +81: lhz r6,4(r3) /* do 2 bytes to get aligned */ + addi r3,r3,2 + subi r5,r5,2 +91: sth r6,4(r4) + addi r4,r4,2 + addc r0,r0,r6 + srwi. r6,r5,2 /* # words to do */ + beq 3f +1: srwi. r6,r5,4 /* # groups of 4 words to do */ + beq 10f + mtctr r6 +71: lwz r6,4(r3) +72: lwz r9,8(r3) +73: lwz r10,12(r3) +74: lwzu r11,16(r3) + adde r0,r0,r6 +75: stw r6,4(r4) + adde r0,r0,r9 +76: stw r9,8(r4) + adde r0,r0,r10 +77: stw r10,12(r4) + adde r0,r0,r11 +78: stwu r11,16(r4) + bdnz 71b +10: rlwinm. r6,r5,30,30,31 /* # words left to do */ + beq 13f + mtctr r6 +82: lwzu r9,4(r3) +92: stwu r9,4(r4) + adde r0,r0,r9 + bdnz 82b +13: andi. r5,r5,3 +3: cmpwi 0,r5,2 + blt+ 4f +83: lhz r6,4(r3) + addi r3,r3,2 + subi r5,r5,2 +93: sth r6,4(r4) + addi r4,r4,2 + adde r0,r0,r6 +4: cmpwi 0,r5,1 + bne+ 5f +84: lbz r6,4(r3) +94: stb r6,4(r4) + slwi r6,r6,8 /* Upper byte of word */ + adde r0,r0,r6 +5: addze r3,r0 /* add in final carry */ + blr + +/* These shouldn't go in the fixup section, since that would + cause the ex_table addresses to get out of order. */ + +src_error_4: + mfctr r6 /* update # bytes remaining from ctr */ + rlwimi r5,r6,4,0,27 + b 79f +src_error_1: + li r6,0 + subi r5,r5,2 +95: sth r6,4(r4) + addi r4,r4,2 +79: srwi. r6,r5,2 + beq 3f + mtctr r6 +src_error_2: + li r6,0 +96: stwu r6,4(r4) + bdnz 96b +3: andi. r5,r5,3 + beq src_error +src_error_3: + li r6,0 + mtctr r5 + addi r4,r4,3 +97: stbu r6,1(r4) + bdnz 97b +src_error: + cmpwi 0,r7,0 + beq 1f + li r6,-EFAULT + stw r6,0(r7) +1: addze r3,r0 + blr + +dst_error: + cmpwi 0,r8,0 + beq 1f + li r6,-EFAULT + stw r6,0(r8) +1: addze r3,r0 + blr + +.section __ex_table,"a" + .long 81b,src_error_1 + .long 91b,dst_error + .long 71b,src_error_4 + .long 72b,src_error_4 + .long 73b,src_error_4 + .long 74b,src_error_4 + .long 75b,dst_error + .long 76b,dst_error + .long 77b,dst_error + .long 78b,dst_error + .long 82b,src_error_2 + .long 92b,dst_error + .long 83b,src_error_3 + .long 93b,dst_error + .long 84b,src_error_3 + .long 94b,dst_error + .long 95b,dst_error + .long 96b,dst_error + .long 97b,dst_error diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S new file mode 100644 index 000000000..57a072065 --- /dev/null +++ b/arch/powerpc/lib/checksum_64.S @@ -0,0 +1,480 @@ +/* + * This file contains assembly-language implementations + * of IP-style 1's complement checksum routines. + * + * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au). + */ + +#include <linux/sys.h> +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +/* + * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header + * len is in words and is always >= 5. + * + * In practice len == 5, but this is not guaranteed. So this code does not + * attempt to use doubleword instructions. + */ +_GLOBAL(ip_fast_csum) + lwz r0,0(r3) + lwzu r5,4(r3) + addic. r4,r4,-2 + addc r0,r0,r5 + mtctr r4 + blelr- +1: lwzu r4,4(r3) + adde r0,r0,r4 + bdnz 1b + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32-bit halves together */ + add r0,r0,r4 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Compute checksum of TCP or UDP pseudo-header: + * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum) + * No real gain trying to do this specially for 64 bit, but + * the 32 bit addition may spill into the upper bits of + * the doubleword so we still must fold it down from 64. + */ +_GLOBAL(csum_tcpudp_magic) + rlwimi r5,r6,16,0,15 /* put proto in upper half of len */ + addc r0,r3,r4 /* add 4 32-bit words together */ + adde r0,r0,r5 + adde r0,r0,r7 + rldicl r4,r0,32,0 /* fold 64 bit value */ + add r0,r4,r0 + srdi r0,r0,32 + rlwinm r3,r0,16,0,31 /* fold two halves together */ + add r3,r0,r3 + not r3,r3 + srwi r3,r3,16 + blr + +/* + * Computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit). + * + * csum_partial(r3=buff, r4=len, r5=sum) + */ +_GLOBAL(csum_partial) + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 + subi r4,r4,4 + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + + + .macro srcnr +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Lsrc_error_nr + .previous + .endm + + .macro source +150: + .section __ex_table,"a" + .align 3 + .llong 150b,.Lsrc_error + .previous + .endm + + .macro dstnr +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldest_error_nr + .previous + .endm + + .macro dest +250: + .section __ex_table,"a" + .align 3 + .llong 250b,.Ldest_error + .previous + .endm + +/* + * Computes the checksum of a memory block at src, length len, + * and adds in "sum" (32-bit), while copying the block to dst. + * If an access exception occurs on src or dst, it stores -EFAULT + * to *src_err or *dst_err respectively. The caller must take any action + * required in this case (zeroing memory, recalculating partial checksum etc). + * + * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) + */ +_GLOBAL(csum_partial_copy_generic) + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcopy_aligned + + li r9,4 + sub r6,r9,r6 + mtctr r6 + +1: +srcnr; lhz r6,0(r3) /* align to doubleword */ + subi r5,r5,2 + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word + + mtctr r6 +3: +srcnr; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dstnr; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +srcnr; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dstnr; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +srcnr; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dstnr; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. r6,r5,1 + beq .Lcopy_finish + +srcnr; lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +dstnr; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lsrc_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Lsrc_error_nr: + cmpdi 0,r7,0 + beqlr + li r6,-EFAULT + stw r6,0(r7) + blr + +.Ldest_error: + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + addi r1,r1,STACKFRAMESIZE +.Ldest_error_nr: + cmpdi 0,r8,0 + beqlr + li r6,-EFAULT + stw r6,0(r8) + blr diff --git a/arch/powerpc/lib/checksum_wrappers_64.c b/arch/powerpc/lib/checksum_wrappers_64.c new file mode 100644 index 000000000..08e3a3356 --- /dev/null +++ b/arch/powerpc/lib/checksum_wrappers_64.c @@ -0,0 +1,102 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/export.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <asm/checksum.h> +#include <asm/uaccess.h> + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + *err_ptr = -EFAULT; + csum = (__force unsigned int)sum; + goto out; + } + + csum = csum_partial_copy_generic((void __force *)src, dst, + len, sum, err_ptr, NULL); + + if (unlikely(*err_ptr)) { + int missing = __copy_from_user(dst, src, len); + + if (missing) { + memset(dst + len - missing, 0, missing); + *err_ptr = -EFAULT; + } else { + *err_ptr = 0; + } + + csum = csum_partial(dst, len, sum); + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_from_user); + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, + __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + goto out; + } + + csum = csum_partial_copy_generic(src, (void __force *)dst, + len, sum, NULL, err_ptr); + + if (unlikely(*err_ptr)) { + csum = csum_partial(src, len, sum); + + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + } + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_to_user); diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c new file mode 100644 index 000000000..d5edbeb8e --- /dev/null +++ b/arch/powerpc/lib/code-patching.c @@ -0,0 +1,470 @@ +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <asm/code-patching.h> +#include <asm/uaccess.h> + + +int patch_instruction(unsigned int *addr, unsigned int instr) +{ + int err; + + __put_user_size(instr, addr, 4, err); + if (err) + return err; + asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr)); + return 0; +} + +int patch_branch(unsigned int *addr, unsigned long target, int flags) +{ + return patch_instruction(addr, create_branch(addr, target, flags)); +} + +unsigned int create_branch(const unsigned int *addr, + unsigned long target, int flags) +{ + unsigned int instruction; + long offset; + + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (offset < -0x2000000 || offset > 0x1fffffc || offset & 0x3) + return 0; + + /* Mask out the flags and target, so they don't step on each other. */ + instruction = 0x48000000 | (flags & 0x3) | (offset & 0x03FFFFFC); + + return instruction; +} + +unsigned int create_cond_branch(const unsigned int *addr, + unsigned long target, int flags) +{ + unsigned int instruction; + long offset; + + offset = target; + if (! (flags & BRANCH_ABSOLUTE)) + offset = offset - (unsigned long)addr; + + /* Check we can represent the target in the instruction format */ + if (offset < -0x8000 || offset > 0x7FFF || offset & 0x3) + return 0; + + /* Mask out the flags and target, so they don't step on each other. */ + instruction = 0x40000000 | (flags & 0x3FF0003) | (offset & 0xFFFC); + + return instruction; +} + +static unsigned int branch_opcode(unsigned int instr) +{ + return (instr >> 26) & 0x3F; +} + +static int instr_is_branch_iform(unsigned int instr) +{ + return branch_opcode(instr) == 18; +} + +static int instr_is_branch_bform(unsigned int instr) +{ + return branch_opcode(instr) == 16; +} + +int instr_is_relative_branch(unsigned int instr) +{ + if (instr & BRANCH_ABSOLUTE) + return 0; + + return instr_is_branch_iform(instr) || instr_is_branch_bform(instr); +} + +static unsigned long branch_iform_target(const unsigned int *instr) +{ + signed long imm; + + imm = *instr & 0x3FFFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x2000000) + imm -= 0x4000000; + + if ((*instr & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +static unsigned long branch_bform_target(const unsigned int *instr) +{ + signed long imm; + + imm = *instr & 0xFFFC; + + /* If the top bit of the immediate value is set this is negative */ + if (imm & 0x8000) + imm -= 0x10000; + + if ((*instr & BRANCH_ABSOLUTE) == 0) + imm += (unsigned long)instr; + + return (unsigned long)imm; +} + +unsigned long branch_target(const unsigned int *instr) +{ + if (instr_is_branch_iform(*instr)) + return branch_iform_target(instr); + else if (instr_is_branch_bform(*instr)) + return branch_bform_target(instr); + + return 0; +} + +int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr) +{ + if (instr_is_branch_iform(*instr) || instr_is_branch_bform(*instr)) + return branch_target(instr) == addr; + + return 0; +} + +unsigned int translate_branch(const unsigned int *dest, const unsigned int *src) +{ + unsigned long target; + + target = branch_target(src); + + if (instr_is_branch_iform(*src)) + return create_branch(dest, target, *src); + else if (instr_is_branch_bform(*src)) + return create_cond_branch(dest, target, *src); + + return 0; +} + +#ifdef CONFIG_PPC_BOOK3E_64 +void __patch_exception(int exc, unsigned long addr) +{ + extern unsigned int interrupt_base_book3e; + unsigned int *ibase = &interrupt_base_book3e; + + /* Our exceptions vectors start with a NOP and -then- a branch + * to deal with single stepping from userspace which stops on + * the second instruction. Thus we need to patch the second + * instruction of the exception, not the first one + */ + + patch_branch(ibase + (exc / 4) + 1, addr, 0); +} +#endif + +#ifdef CONFIG_CODE_PATCHING_SELFTEST + +static void __init test_trampoline(void) +{ + asm ("nop;\n"); +} + +#define check(x) \ + if (!(x)) printk("code-patching: test failed at line %d\n", __LINE__); + +static void __init test_branch_iform(void) +{ + unsigned int instr; + unsigned long addr; + + addr = (unsigned long)&instr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_iform(0x48000000)); + /* All bits of target set, and flags */ + check(instr_is_branch_iform(0x4bffffff)); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_iform(0xcbffffff)); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_iform(0x7bffffff)); + + /* Simplest case, branch to self with link */ + check(instr_is_branch_iform(0x48000001)); + /* All bits of targets set */ + check(instr_is_branch_iform(0x4bfffffd)); + /* Some bits of targets set */ + check(instr_is_branch_iform(0x4bff00fd)); + /* Must be a valid branch to start with */ + check(!instr_is_branch_iform(0x7bfffffd)); + + /* Absolute branch to 0x100 */ + instr = 0x48000103; + check(instr_is_branch_to_addr(&instr, 0x100)); + /* Absolute branch to 0x420fc */ + instr = 0x480420ff; + check(instr_is_branch_to_addr(&instr, 0x420fc)); + /* Maximum positive relative branch, + 20MB - 4B */ + instr = 0x49fffffc; + check(instr_is_branch_to_addr(&instr, addr + 0x1FFFFFC)); + /* Smallest negative relative branch, - 4B */ + instr = 0x4bfffffc; + check(instr_is_branch_to_addr(&instr, addr - 4)); + /* Largest negative relative branch, - 32 MB */ + instr = 0x4a000000; + check(instr_is_branch_to_addr(&instr, addr - 0x2000000)); + + /* Branch to self, with link */ + instr = create_branch(&instr, addr, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr)); + + /* Branch to self - 0x100, with link */ + instr = create_branch(&instr, addr - 0x100, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr - 0x100)); + + /* Branch to self + 0x100, no link */ + instr = create_branch(&instr, addr + 0x100, 0); + check(instr_is_branch_to_addr(&instr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 MB */ + instr = create_branch(&instr, addr - 0x2000000, BRANCH_SET_LINK); + check(instr_is_branch_to_addr(&instr, addr - 0x2000000)); + + /* Out of range relative negative offset, - 32 MB + 4*/ + instr = create_branch(&instr, addr - 0x2000004, BRANCH_SET_LINK); + check(instr == 0); + + /* Out of range relative positive offset, + 32 MB */ + instr = create_branch(&instr, addr + 0x2000000, BRANCH_SET_LINK); + check(instr == 0); + + /* Unaligned target */ + instr = create_branch(&instr, addr + 3, BRANCH_SET_LINK); + check(instr == 0); + + /* Check flags are masked correctly */ + instr = create_branch(&instr, addr, 0xFFFFFFFC); + check(instr_is_branch_to_addr(&instr, addr)); + check(instr == 0x48000000); +} + +static void __init test_create_function_call(void) +{ + unsigned int *iptr; + unsigned long dest; + + /* Check we can create a function call */ + iptr = (unsigned int *)ppc_function_entry(test_trampoline); + dest = ppc_function_entry(test_create_function_call); + patch_instruction(iptr, create_branch(iptr, dest, BRANCH_SET_LINK)); + check(instr_is_branch_to_addr(iptr, dest)); +} + +static void __init test_branch_bform(void) +{ + unsigned long addr; + unsigned int *iptr, instr, flags; + + iptr = &instr; + addr = (unsigned long)iptr; + + /* The simplest case, branch to self, no flags */ + check(instr_is_branch_bform(0x40000000)); + /* All bits of target set, and flags */ + check(instr_is_branch_bform(0x43ffffff)); + /* High bit of opcode set, which is wrong */ + check(!instr_is_branch_bform(0xc3ffffff)); + /* Middle bits of opcode set, which is wrong */ + check(!instr_is_branch_bform(0x7bffffff)); + + /* Absolute conditional branch to 0x100 */ + instr = 0x43ff0103; + check(instr_is_branch_to_addr(&instr, 0x100)); + /* Absolute conditional branch to 0x20fc */ + instr = 0x43ff20ff; + check(instr_is_branch_to_addr(&instr, 0x20fc)); + /* Maximum positive relative conditional branch, + 32 KB - 4B */ + instr = 0x43ff7ffc; + check(instr_is_branch_to_addr(&instr, addr + 0x7FFC)); + /* Smallest negative relative conditional branch, - 4B */ + instr = 0x43fffffc; + check(instr_is_branch_to_addr(&instr, addr - 4)); + /* Largest negative relative conditional branch, - 32 KB */ + instr = 0x43ff8000; + check(instr_is_branch_to_addr(&instr, addr - 0x8000)); + + /* All condition code bits set & link */ + flags = 0x3ff000 | BRANCH_SET_LINK; + + /* Branch to self */ + instr = create_cond_branch(iptr, addr, flags); + check(instr_is_branch_to_addr(&instr, addr)); + + /* Branch to self - 0x100 */ + instr = create_cond_branch(iptr, addr - 0x100, flags); + check(instr_is_branch_to_addr(&instr, addr - 0x100)); + + /* Branch to self + 0x100 */ + instr = create_cond_branch(iptr, addr + 0x100, flags); + check(instr_is_branch_to_addr(&instr, addr + 0x100)); + + /* Maximum relative negative offset, - 32 KB */ + instr = create_cond_branch(iptr, addr - 0x8000, flags); + check(instr_is_branch_to_addr(&instr, addr - 0x8000)); + + /* Out of range relative negative offset, - 32 KB + 4*/ + instr = create_cond_branch(iptr, addr - 0x8004, flags); + check(instr == 0); + + /* Out of range relative positive offset, + 32 KB */ + instr = create_cond_branch(iptr, addr + 0x8000, flags); + check(instr == 0); + + /* Unaligned target */ + instr = create_cond_branch(iptr, addr + 3, flags); + check(instr == 0); + + /* Check flags are masked correctly */ + instr = create_cond_branch(iptr, addr, 0xFFFFFFFC); + check(instr_is_branch_to_addr(&instr, addr)); + check(instr == 0x43FF0000); +} + +static void __init test_translate_branch(void) +{ + unsigned long addr; + unsigned int *p, *q; + void *buf; + + buf = vmalloc(PAGE_ALIGN(0x2000000 + 1)); + check(buf); + if (!buf) + return; + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + check(instr_is_branch_to_addr(p, addr)); + q = p + 1; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 MB */ + p = buf; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 0x2000000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x4a000000); + + /* Maximum positive case, move x to x - 32 MB + 4 */ + p = buf + 0x2000000; + addr = (unsigned long)p; + patch_branch(p, addr, 0); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x49fffffc); + + /* Jump to x + 16 MB moved to x + 20 MB */ + p = buf; + addr = 0x1000000 + (unsigned long)buf; + patch_branch(p, addr, BRANCH_SET_LINK); + q = buf + 0x1400000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 16 MB moved to x - 16 MB + 4 */ + p = buf + 0x1000000; + addr = 0x2000000 + (unsigned long)buf; + patch_branch(p, addr, 0); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + + /* Conditional branch tests */ + + /* Simple case, branch to self moved a little */ + p = buf; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0)); + check(instr_is_branch_to_addr(p, addr)); + q = p + 1; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(q, addr)); + + /* Maximum negative case, move b . to addr + 32 KB */ + p = buf; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC)); + q = buf + 0x8000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x43ff8000); + + /* Maximum positive case, move x to x - 32 KB + 4 */ + p = buf + 0x8000; + addr = (unsigned long)p; + patch_instruction(p, create_cond_branch(p, addr, 0xFFFFFFFC)); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + check(*q == 0x43ff7ffc); + + /* Jump to x + 12 KB moved to x + 20 KB */ + p = buf; + addr = 0x3000 + (unsigned long)buf; + patch_instruction(p, create_cond_branch(p, addr, BRANCH_SET_LINK)); + q = buf + 0x5000; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Jump to x + 8 KB moved to x - 8 KB + 4 */ + p = buf + 0x2000; + addr = 0x4000 + (unsigned long)buf; + patch_instruction(p, create_cond_branch(p, addr, 0)); + q = buf + 4; + patch_instruction(q, translate_branch(q, p)); + check(instr_is_branch_to_addr(p, addr)); + check(instr_is_branch_to_addr(q, addr)); + + /* Free the buffer we were using */ + vfree(buf); +} + +static int __init test_code_patching(void) +{ + printk(KERN_DEBUG "Running code patching self-tests ...\n"); + + test_branch_iform(); + test_branch_bform(); + test_create_function_call(); + test_translate_branch(); + + return 0; +} +late_initcall(test_code_patching); + +#endif /* CONFIG_CODE_PATCHING_SELFTEST */ diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S new file mode 100644 index 000000000..6813f80d1 --- /dev/null +++ b/arch/powerpc/lib/copy_32.S @@ -0,0 +1,391 @@ +/* + * Memory copy functions for 32-bit PowerPC. + * + * Copyright (C) 1996-2005 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/cache.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +#define COPY_16_BYTES \ + lwz r7,4(r4); \ + lwz r8,8(r4); \ + lwz r9,12(r4); \ + lwzu r10,16(r4); \ + stw r7,4(r6); \ + stw r8,8(r6); \ + stw r9,12(r6); \ + stwu r10,16(r6) + +#define COPY_16_BYTES_WITHEX(n) \ +8 ## n ## 0: \ + lwz r7,4(r4); \ +8 ## n ## 1: \ + lwz r8,8(r4); \ +8 ## n ## 2: \ + lwz r9,12(r4); \ +8 ## n ## 3: \ + lwzu r10,16(r4); \ +8 ## n ## 4: \ + stw r7,4(r6); \ +8 ## n ## 5: \ + stw r8,8(r6); \ +8 ## n ## 6: \ + stw r9,12(r6); \ +8 ## n ## 7: \ + stwu r10,16(r6) + +#define COPY_16_BYTES_EXCODE(n) \ +9 ## n ## 0: \ + addi r5,r5,-(16 * n); \ + b 104f; \ +9 ## n ## 1: \ + addi r5,r5,-(16 * n); \ + b 105f; \ +.section __ex_table,"a"; \ + .align 2; \ + .long 8 ## n ## 0b,9 ## n ## 0b; \ + .long 8 ## n ## 1b,9 ## n ## 0b; \ + .long 8 ## n ## 2b,9 ## n ## 0b; \ + .long 8 ## n ## 3b,9 ## n ## 0b; \ + .long 8 ## n ## 4b,9 ## n ## 1b; \ + .long 8 ## n ## 5b,9 ## n ## 1b; \ + .long 8 ## n ## 6b,9 ## n ## 1b; \ + .long 8 ## n ## 7b,9 ## n ## 1b; \ + .text + + .text + .stabs "arch/powerpc/lib/",N_SO,0,0,0f + .stabs "copy_32.S",N_SO,0,0,0f +0: + +CACHELINE_BYTES = L1_CACHE_BYTES +LG_CACHELINE_BYTES = L1_CACHE_SHIFT +CACHELINE_MASK = (L1_CACHE_BYTES-1) + +_GLOBAL(memset) + rlwimi r4,r4,8,16,23 + rlwimi r4,r4,16,0,15 + addi r6,r3,-4 + cmplwi 0,r5,4 + blt 7f + stwu r4,4(r6) + beqlr + andi. r0,r6,3 + add r5,r0,r5 + subf r6,r0,r6 + srwi r0,r5,2 + mtctr r0 + bdz 6f +1: stwu r4,4(r6) + bdnz 1b +6: andi. r5,r5,3 +7: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r6,r6,3 +8: stbu r4,1(r6) + bdnz 8b + blr + +_GLOBAL(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + /* fall through */ + +_GLOBAL(memcpy) + srwi. r7,r5,3 + addi r6,r3,-4 + addi r4,r4,-4 + beq 2f /* if less than 8 bytes to do */ + andi. r0,r6,3 /* get dest word aligned */ + mtctr r7 + bne 5f +1: lwz r7,4(r4) + lwzu r8,8(r4) + stw r7,4(r6) + stwu r8,8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,4(r4) + addi r5,r5,-4 + stwu r0,4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 + addi r4,r4,3 + addi r6,r6,3 +4: lbzu r0,1(r4) + stbu r0,1(r6) + bdnz 4b + blr +5: subfic r0,r0,4 + mtctr r0 +6: lbz r7,4(r4) + addi r4,r4,1 + stb r7,4(r6) + addi r6,r6,1 + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b + +_GLOBAL(__copy_tofrom_user) + addi r4,r4,-4 + addi r6,r3,-4 + neg r0,r3 + andi. r0,r0,CACHELINE_MASK /* # bytes to start of cache line */ + beq 58f + + cmplw 0,r5,r0 /* is this more than total to do? */ + blt 63f /* if not much to do */ + andi. r8,r0,3 /* get it word-aligned first */ + mtctr r8 + beq+ 61f +70: lbz r9,4(r4) /* do some bytes */ +71: stb r9,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 70b +61: subf r5,r0,r5 + srwi. r0,r0,2 + mtctr r0 + beq 58f +72: lwzu r9,4(r4) /* do some words */ +73: stwu r9,4(r6) + bdnz 72b + + .section __ex_table,"a" + .align 2 + .long 70b,100f + .long 71b,101f + .long 72b,102f + .long 73b,103f + .text + +58: srwi. r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */ + clrlwi r5,r5,32-LG_CACHELINE_BYTES + li r11,4 + beq 63f + + /* Here we decide how far ahead to prefetch the source */ + li r3,4 + cmpwi r0,1 + li r7,0 + ble 114f + li r7,1 +#if MAX_COPY_PREFETCH > 1 + /* Heuristically, for large transfers we prefetch + MAX_COPY_PREFETCH cachelines ahead. For small transfers + we prefetch 1 cacheline ahead. */ + cmpwi r0,MAX_COPY_PREFETCH + ble 112f + li r7,MAX_COPY_PREFETCH +112: mtctr r7 +111: dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES + bdnz 111b +#else + dcbt r3,r4 + addi r3,r3,CACHELINE_BYTES +#endif /* MAX_COPY_PREFETCH > 1 */ + +114: subf r8,r7,r0 + mr r0,r7 + mtctr r8 + +53: dcbt r3,r4 +54: dcbz r11,r6 + .section __ex_table,"a" + .align 2 + .long 54b,105f + .text +/* the main body of the cacheline loop */ + COPY_16_BYTES_WITHEX(0) +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES_WITHEX(1) +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES_WITHEX(2) + COPY_16_BYTES_WITHEX(3) +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES_WITHEX(4) + COPY_16_BYTES_WITHEX(5) + COPY_16_BYTES_WITHEX(6) + COPY_16_BYTES_WITHEX(7) +#endif +#endif +#endif + bdnz 53b + cmpwi r0,0 + li r3,4 + li r7,0 + bne 114b + +63: srwi. r0,r5,2 + mtctr r0 + beq 64f +30: lwzu r0,4(r4) +31: stwu r0,4(r6) + bdnz 30b + +64: andi. r0,r5,3 + mtctr r0 + beq+ 65f +40: lbz r0,4(r4) +41: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 40b +65: li r3,0 + blr + +/* read fault, initial single-byte copy */ +100: li r9,0 + b 90f +/* write fault, initial single-byte copy */ +101: li r9,1 +90: subf r5,r8,r5 + li r3,0 + b 99f +/* read fault, initial word copy */ +102: li r9,0 + b 91f +/* write fault, initial word copy */ +103: li r9,1 +91: li r3,2 + b 99f + +/* + * this stuff handles faults in the cacheline loop and branches to either + * 104f (if in read part) or 105f (if in write part), after updating r5 + */ + COPY_16_BYTES_EXCODE(0) +#if L1_CACHE_BYTES >= 32 + COPY_16_BYTES_EXCODE(1) +#if L1_CACHE_BYTES >= 64 + COPY_16_BYTES_EXCODE(2) + COPY_16_BYTES_EXCODE(3) +#if L1_CACHE_BYTES >= 128 + COPY_16_BYTES_EXCODE(4) + COPY_16_BYTES_EXCODE(5) + COPY_16_BYTES_EXCODE(6) + COPY_16_BYTES_EXCODE(7) +#endif +#endif +#endif + +/* read fault in cacheline loop */ +104: li r9,0 + b 92f +/* fault on dcbz (effectively a write fault) */ +/* or write fault in cacheline loop */ +105: li r9,1 +92: li r3,LG_CACHELINE_BYTES + mfctr r8 + add r0,r0,r8 + b 106f +/* read fault in final word loop */ +108: li r9,0 + b 93f +/* write fault in final word loop */ +109: li r9,1 +93: andi. r5,r5,3 + li r3,2 + b 99f +/* read fault in final byte loop */ +110: li r9,0 + b 94f +/* write fault in final byte loop */ +111: li r9,1 +94: li r5,0 + li r3,0 +/* + * At this stage the number of bytes not copied is + * r5 + (ctr << r3), and r9 is 0 for read or 1 for write. + */ +99: mfctr r0 +106: slw r3,r0,r3 + add. r3,r3,r5 + beq 120f /* shouldn't happen */ + cmpwi 0,r9,0 + bne 120f +/* for a read fault, first try to continue the copy one byte at a time */ + mtctr r3 +130: lbz r0,4(r4) +131: stb r0,4(r6) + addi r4,r4,1 + addi r6,r6,1 + bdnz 130b +/* then clear out the destination: r3 bytes starting at 4(r6) */ +132: mfctr r3 + srwi. r0,r3,2 + li r9,0 + mtctr r0 + beq 113f +112: stwu r9,4(r6) + bdnz 112b +113: andi. r0,r3,3 + mtctr r0 + beq 120f +114: stb r9,4(r6) + addi r6,r6,1 + bdnz 114b +120: blr + + .section __ex_table,"a" + .align 2 + .long 30b,108b + .long 31b,109b + .long 40b,110b + .long 41b,111b + .long 130b,132b + .long 131b,120b + .long 112b,120b + .long 114b,120b + .text diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S new file mode 100644 index 000000000..a3c4dc4de --- /dev/null +++ b/arch/powerpc/lib/copypage_64.S @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2008 Mark Nelson, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/page.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" + +_GLOBAL_TOC(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE + b copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l +BEGIN_FTR_SECTION + ld r10,PPC64_CACHES@toc(r2) + lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ + lwz r12,DCACHEL1LINESIZE(r10) /* get cache line size */ + li r9,0 + srd r8,r5,r11 + + mtctr r8 +.Lsetup: + dcbt r9,r4 + dcbz r9,r3 + add r9,r9,r12 + bdnz .Lsetup +END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ) + addi r3,r3,-8 + srdi r8,r5,7 /* page is copied in 128 byte strides */ + addi r8,r8,-1 /* one stride copied outside loop */ + + mtctr r8 + + ld r5,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ldu r8,24(r4) +1: std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + ld r5,104(r4) + ld r6,112(r4) + std r11,120(r3) + stdu r12,128(r3) + ld r7,120(r4) + ldu r8,128(r4) + bdnz 1b + + std r5,8(r3) + std r6,16(r3) + ld r9,8(r4) + ld r10,16(r4) + std r7,24(r3) + std r8,32(r3) + ld r11,24(r4) + ld r12,32(r4) + std r9,40(r3) + std r10,48(r3) + ld r5,40(r4) + ld r6,48(r4) + std r11,56(r3) + std r12,64(r3) + ld r7,56(r4) + ld r8,64(r4) + std r5,72(r3) + std r6,80(r3) + ld r9,72(r4) + ld r10,80(r4) + std r7,88(r3) + std r8,96(r3) + ld r11,88(r4) + ld r12,96(r4) + std r9,104(r3) + std r10,112(r3) + std r11,120(r3) + std r12,128(r3) + blr diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S new file mode 100644 index 000000000..a84d333ec --- /dev/null +++ b/arch/powerpc/lib/copypage_power7.S @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 => to */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7 + * units/cachelines=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units/cachelines=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r4,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx v7,r0,r4 + lvx v6,r4,r6 + lvx v5,r4,r7 + lvx v4,r4,r8 + lvx v3,r4,r9 + lvx v2,r4,r10 + lvx v1,r4,r11 + lvx v0,r4,r12 + addi r4,r4,128 + stvx v7,r0,r3 + stvx v6,r3,r6 + stvx v5,r3,r7 + stvx v4,r3,r8 + stvx v3,r3,r9 + stvx v2,r3,r10 + stvx v1,r3,r11 + stvx v0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b exit_vmx_copy /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S new file mode 100644 index 000000000..f09899e35 --- /dev/null +++ b/arch/powerpc/lib/copyuser_64.S @@ -0,0 +1,673 @@ +/* + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define sLd sld /* Shift towards low-numbered address. */ +#define sHd srd /* Shift towards high-numbered address. */ +#else +#define sLd srd /* Shift towards low-numbered address. */ +#define sHd sld /* Shift towards high-numbered address. */ +#endif + + .align 7 +_GLOBAL_TOC(__copy_tofrom_user) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base) + /* first check for a whole page copy on a page boundary */ + cmpldi cr1,r5,16 + cmpdi cr6,r5,4096 + or r0,r3,r4 + neg r6,r3 /* LS 3 bits = # bytes to 8-byte dest bdry */ + andi. r0,r0,4095 + std r3,-24(r1) + crand cr0*4+2,cr0*4+2,cr6*4+2 + std r4,-16(r1) + std r5,-8(r1) + dcbt 0,r4 + beq .Lcopy_page_4K + andi. r6,r6,7 + PPC_MTOCRF(0x01,r5) + blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + * cleared. + * At the time of writing the only CPU that has this combination of bits + * set is Power6. + */ +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) +.Ldst_aligned: + addi r3,r3,-16 +BEGIN_FTR_SECTION + andi. r0,r4,7 + bne .Lsrc_unaligned +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +20: ld r7,0(r4) +220: ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: ld r7,16(r4) +221: ld r6,24(r4) + addi r4,r4,32 +70: std r9,0(r3) +270: std r8,8(r3) +22: ld r9,0(r4) +222: ld r8,8(r4) +71: std r7,16(r3) +271: std r6,24(r3) + addi r3,r3,32 + bdnz 21b +72: std r9,0(r3) +272: std r8,8(r3) + andi. r5,r5,0xf + beq+ 3f + addi r4,r4,16 +.Ldo_tail: + addi r3,r3,16 + bf cr7*4+0,246f +244: ld r9,0(r4) + addi r4,r4,8 +245: std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +23: lwz r9,0(r4) + addi r4,r4,4 +73: stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f +44: lhz r9,0(r4) + addi r4,r4,2 +74: sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f +45: lbz r9,0(r4) +75: stb r9,0(r3) +3: li r3,0 + blr + +.Lsrc_unaligned: + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpldi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + bt cr7*4+0,28f + +24: ld r9,0(r4) /* 3+2n loads, 2+2n stores */ +25: ld r0,8(r4) + sLd r6,r9,r10 +26: ldu r9,16(r4) + sHd r7,r0,r11 + sLd r8,r0,r10 + or r7,r7,r6 + blt cr6,79f +27: ld r0,8(r4) + b 2f + +28: ld r0,0(r4) /* 4+2n loads, 3+2n stores */ +29: ldu r9,8(r4) + sLd r8,r0,r10 + addi r3,r3,-8 + blt cr6,5f +30: ld r0,8(r4) + sHd r12,r9,r11 + sLd r6,r9,r10 +31: ldu r9,16(r4) + or r12,r8,r12 + sHd r7,r0,r11 + sLd r8,r0,r10 + addi r3,r3,16 + beq cr6,78f + +1: or r7,r7,r6 +32: ld r0,8(r4) +76: std r12,8(r3) +2: sHd r12,r9,r11 + sLd r6,r9,r10 +33: ldu r9,16(r4) + or r12,r8,r12 +77: stdu r7,16(r3) + sHd r7,r0,r11 + sLd r8,r0,r10 + bdnz 1b + +78: std r12,8(r3) + or r7,r7,r6 +79: std r7,16(r3) +5: sHd r12,r9,r11 + or r12,r8,r12 +80: std r12,24(r3) + bne 6f + li r3,0 + blr +6: cmpwi cr1,r5,8 + addi r3,r3,32 + sLd r9,r9,r10 + ble cr1,7f +34: ld r0,8(r4) + sHd r7,r0,r11 + or r9,r7,r9 +7: + bf cr7*4+1,1f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,32 +#endif +94: stw r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,32 +#endif + addi r3,r3,4 +1: bf cr7*4+2,2f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,16 +#endif +95: sth r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,16 +#endif + addi r3,r3,2 +2: bf cr7*4+3,3f +#ifdef __BIG_ENDIAN__ + rotldi r9,r9,8 +#endif +96: stb r9,0(r3) +#ifdef __LITTLE_ENDIAN__ + rotrdi r9,r9,8 +#endif +3: li r3,0 + blr + +.Ldst_unaligned: + PPC_MTOCRF(0x01,r6) /* put #bytes to 8B bdry into cr7 */ + subf r5,r6,r5 + li r7,0 + cmpldi cr1,r5,16 + bf cr7*4+3,1f +35: lbz r0,0(r4) +81: stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f +36: lhzx r0,r7,r4 +82: sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f +37: lwzx r0,r7,r4 +83: stwx r0,r7,r3 +3: PPC_MTOCRF(0x01,r5) + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: + bf cr7*4+0,1f +38: lwz r0,0(r4) +39: lwz r9,4(r4) + addi r4,r4,8 +84: stw r0,0(r3) +85: stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f +40: lwz r0,0(r4) + addi r4,r4,4 +86: stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f +41: lhz r0,0(r4) + addi r4,r4,2 +87: sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f +42: lbz r0,0(r4) +88: stb r0,0(r3) +4: li r3,0 + blr + +/* + * exception handlers follow + * we have to return the number of bytes not copied + * for an exception on a load, we set the rest of the destination to 0 + */ + +136: +137: + add r3,r3,r7 + b 1f +130: +131: + addi r3,r3,8 +120: +320: +122: +322: +124: +125: +126: +127: +128: +129: +133: + addi r3,r3,8 +132: + addi r3,r3,8 +121: +321: +344: +134: +135: +138: +139: +140: +141: +142: +123: +144: +145: + +/* + * here we have had a fault on a load and r3 points to the first + * unmodified byte of the destination + */ +1: ld r6,-24(r1) + ld r4,-16(r1) + ld r5,-8(r1) + subf r6,r6,r3 + add r4,r4,r6 + subf r5,r6,r5 /* #bytes left to go */ + +/* + * first see if we can copy any more bytes before hitting another exception + */ + mtctr r5 +43: lbz r0,0(r4) + addi r4,r4,1 +89: stb r0,0(r3) + addi r3,r3,1 + bdnz 43b + li r3,0 /* huh? all copied successfully this time? */ + blr + +/* + * here we have trapped again, need to clear ctr bytes starting at r3 + */ +143: mfctr r5 + li r0,0 + mr r4,r3 + mr r3,r5 /* return the number of bytes not copied */ +1: andi. r9,r4,7 + beq 3f +90: stb r0,0(r4) + addic. r5,r5,-1 + addi r4,r4,1 + bne 1b + blr +3: cmpldi cr1,r5,8 + srdi r9,r5,3 + andi. r5,r5,7 + blt cr1,93f + mtctr r9 +91: std r0,0(r4) + addi r4,r4,8 + bdnz 91b +93: beqlr + mtctr r5 +92: stb r0,0(r4) + addi r4,r4,1 + bdnz 92b + blr + +/* + * exception handlers for stores: we just need to work + * out how many bytes weren't copied + */ +182: +183: + add r3,r3,r7 + b 1f +371: +180: + addi r3,r3,8 +171: +177: + addi r3,r3,8 +370: +372: +176: +178: + addi r3,r3,4 +185: + addi r3,r3,4 +170: +172: +345: +173: +174: +175: +179: +181: +184: +186: +187: +188: +189: +194: +195: +196: +1: + ld r6,-24(r1) + ld r5,-8(r1) + add r6,r6,r5 + subf r3,r3,r6 /* #bytes not copied */ +190: +191: +192: + blr /* #bytes not copied in r3 */ + + .section __ex_table,"a" + .align 3 + .llong 20b,120b + .llong 220b,320b + .llong 21b,121b + .llong 221b,321b + .llong 70b,170b + .llong 270b,370b + .llong 22b,122b + .llong 222b,322b + .llong 71b,171b + .llong 271b,371b + .llong 72b,172b + .llong 272b,372b + .llong 244b,344b + .llong 245b,345b + .llong 23b,123b + .llong 73b,173b + .llong 44b,144b + .llong 74b,174b + .llong 45b,145b + .llong 75b,175b + .llong 24b,124b + .llong 25b,125b + .llong 26b,126b + .llong 27b,127b + .llong 28b,128b + .llong 29b,129b + .llong 30b,130b + .llong 31b,131b + .llong 32b,132b + .llong 76b,176b + .llong 33b,133b + .llong 77b,177b + .llong 78b,178b + .llong 79b,179b + .llong 80b,180b + .llong 34b,134b + .llong 94b,194b + .llong 95b,195b + .llong 96b,196b + .llong 35b,135b + .llong 81b,181b + .llong 36b,136b + .llong 82b,182b + .llong 37b,137b + .llong 83b,183b + .llong 38b,138b + .llong 39b,139b + .llong 84b,184b + .llong 85b,185b + .llong 40b,140b + .llong 86b,186b + .llong 41b,141b + .llong 87b,187b + .llong 42b,142b + .llong 88b,188b + .llong 43b,143b + .llong 89b,189b + .llong 90b,190b + .llong 91b,191b + .llong 92b,192b + + .text + +/* + * Routine to copy a whole page of data, optimized for POWER4. + * On POWER4 it is more than 50% faster than the simple loop + * above (following the .Ldst_aligned label). + */ +.Lcopy_page_4K: + std r31,-32(1) + std r30,-40(1) + std r29,-48(1) + std r28,-56(1) + std r27,-64(1) + std r26,-72(1) + std r25,-80(1) + std r24,-88(1) + std r23,-96(1) + std r22,-104(1) + std r21,-112(1) + std r20,-120(1) + li r5,4096/32 - 1 + addi r3,r3,-8 + li r0,5 +0: addi r5,r5,-24 + mtctr r0 +20: ld r22,640(4) +21: ld r21,512(4) +22: ld r20,384(4) +23: ld r11,256(4) +24: ld r9,128(4) +25: ld r7,0(4) +26: ld r25,648(4) +27: ld r24,520(4) +28: ld r23,392(4) +29: ld r10,264(4) +30: ld r8,136(4) +31: ldu r6,8(4) + cmpwi r5,24 +1: +32: std r22,648(3) +33: std r21,520(3) +34: std r20,392(3) +35: std r11,264(3) +36: std r9,136(3) +37: std r7,8(3) +38: ld r28,648(4) +39: ld r27,520(4) +40: ld r26,392(4) +41: ld r31,264(4) +42: ld r30,136(4) +43: ld r29,8(4) +44: std r25,656(3) +45: std r24,528(3) +46: std r23,400(3) +47: std r10,272(3) +48: std r8,144(3) +49: std r6,16(3) +50: ld r22,656(4) +51: ld r21,528(4) +52: ld r20,400(4) +53: ld r11,272(4) +54: ld r9,144(4) +55: ld r7,16(4) +56: std r28,664(3) +57: std r27,536(3) +58: std r26,408(3) +59: std r31,280(3) +60: std r30,152(3) +61: stdu r29,24(3) +62: ld r25,664(4) +63: ld r24,536(4) +64: ld r23,408(4) +65: ld r10,280(4) +66: ld r8,152(4) +67: ldu r6,24(4) + bdnz 1b +68: std r22,648(3) +69: std r21,520(3) +70: std r20,392(3) +71: std r11,264(3) +72: std r9,136(3) +73: std r7,8(3) +74: addi r4,r4,640 +75: addi r3,r3,648 + bge 0b + mtctr r5 +76: ld r7,0(4) +77: ld r8,8(4) +78: ldu r9,16(4) +3: +79: ld r10,8(4) +80: std r7,8(3) +81: ld r7,16(4) +82: std r8,16(3) +83: ld r8,24(4) +84: std r9,24(3) +85: ldu r9,32(4) +86: stdu r10,32(3) + bdnz 3b +4: +87: ld r10,8(4) +88: std r7,8(3) +89: std r8,16(3) +90: std r9,24(3) +91: std r10,32(3) +9: ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + li r3,0 + blr + +/* + * on an exception, reset to the beginning and jump back into the + * standard __copy_tofrom_user + */ +100: ld r20,-120(1) + ld r21,-112(1) + ld r22,-104(1) + ld r23,-96(1) + ld r24,-88(1) + ld r25,-80(1) + ld r26,-72(1) + ld r27,-64(1) + ld r28,-56(1) + ld r29,-48(1) + ld r30,-40(1) + ld r31,-32(1) + ld r3,-24(r1) + ld r4,-16(r1) + li r5,4096 + b .Ldst_aligned + + .section __ex_table,"a" + .align 3 + .llong 20b,100b + .llong 21b,100b + .llong 22b,100b + .llong 23b,100b + .llong 24b,100b + .llong 25b,100b + .llong 26b,100b + .llong 27b,100b + .llong 28b,100b + .llong 29b,100b + .llong 30b,100b + .llong 31b,100b + .llong 32b,100b + .llong 33b,100b + .llong 34b,100b + .llong 35b,100b + .llong 36b,100b + .llong 37b,100b + .llong 38b,100b + .llong 39b,100b + .llong 40b,100b + .llong 41b,100b + .llong 42b,100b + .llong 43b,100b + .llong 44b,100b + .llong 45b,100b + .llong 46b,100b + .llong 47b,100b + .llong 48b,100b + .llong 49b,100b + .llong 50b,100b + .llong 51b,100b + .llong 52b,100b + .llong 53b,100b + .llong 54b,100b + .llong 55b,100b + .llong 56b,100b + .llong 57b,100b + .llong 58b,100b + .llong 59b,100b + .llong 60b,100b + .llong 61b,100b + .llong 62b,100b + .llong 63b,100b + .llong 64b,100b + .llong 65b,100b + .llong 66b,100b + .llong 67b,100b + .llong 68b,100b + .llong 69b,100b + .llong 70b,100b + .llong 71b,100b + .llong 72b,100b + .llong 73b,100b + .llong 74b,100b + .llong 75b,100b + .llong 76b,100b + .llong 77b,100b + .llong 78b,100b + .llong 79b,100b + .llong 80b,100b + .llong 81b,100b + .llong 82b,100b + .llong 83b,100b + .llong 84b,100b + .llong 85b,100b + .llong 86b,100b + .llong 87b,100b + .llong 88b,100b + .llong 89b,100b + .llong 90b,100b + .llong 91b,100b diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S new file mode 100644 index 000000000..da0c568d1 --- /dev/null +++ b/arch/powerpc/lib/copyuser_power7.S @@ -0,0 +1,721 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + + .macro err4 +400: + .section __ex_table,"a" + .align 3 + .llong 400b,.Ldo_err4 + .previous + .endm + + +.Ldo_err4: + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Ldo_err3: + bl exit_vmx_usercopy + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(R22)(r1) + ld r21,STK_REG(R21)(r1) + ld r20,STK_REG(R20)(r1) + ld r19,STK_REG(R19)(r1) + ld r18,STK_REG(R18)(r1) + ld r17,STK_REG(R17)(r1) + ld r16,STK_REG(R16)(r1) + ld r15,STK_REG(R15)(r1) + ld r14,STK_REG(R14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + ld r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + ld r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_usercopy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r6,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + /* setup write stream 1 */ + dcbtst r0,r9,0b01000 /* addr to */ + dcbtst r0,r10,0b01010 /* length and depth to */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx v1,r0,r4 + addi r4,r4,16 +err3; stvx v1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx v1,r0,r4 +err3; lvx v0,r4,r9 + addi r4,r4,32 +err3; stvx v1,r0,r3 +err3; stvx v0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx v3,r0,r4 +err3; lvx v2,r4,r9 +err3; lvx v1,r4,r10 +err3; lvx v0,r4,r11 + addi r4,r4,64 +err3; stvx v3,r0,r3 +err3; stvx v2,r3,r9 +err3; stvx v1,r3,r10 +err3; stvx v0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx v7,r0,r4 +err4; lvx v6,r4,r9 +err4; lvx v5,r4,r10 +err4; lvx v4,r4,r11 +err4; lvx v3,r4,r12 +err4; lvx v2,r4,r14 +err4; lvx v1,r4,r15 +err4; lvx v0,r4,r16 + addi r4,r4,128 +err4; stvx v7,r0,r3 +err4; stvx v6,r3,r9 +err4; stvx v5,r3,r10 +err4; stvx v4,r3,r11 +err4; stvx v3,r3,r12 +err4; stvx v2,r3,r14 +err4; stvx v1,r3,r15 +err4; stvx v0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx v3,r0,r4 +err3; lvx v2,r4,r9 +err3; lvx v1,r4,r10 +err3; lvx v0,r4,r11 + addi r4,r4,64 +err3; stvx v3,r0,r3 +err3; stvx v2,r3,r9 +err3; stvx v1,r3,r10 +err3; stvx v0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx v1,r0,r4 +err3; lvx v0,r4,r9 + addi r4,r4,32 +err3; stvx v1,r0,r3 +err3; stvx v0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx v1,r0,r4 + addi r4,r4,16 +err3; stvx v1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(v16,0,r4) /* Setup permute control vector */ +err3; lvx v0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 +err3; stvx v8,r0,r3 + addi r3,r3,16 + vor v0,v1,v1 + +5: bf cr7*4+2,6f +err3; lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) +err3; lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 +err3; stvx v8,r0,r3 +err3; stvx v9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx v3,r0,r4 + VPERM(v8,v0,v3,v16) +err3; lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) +err3; lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) +err3; lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 +err3; stvx v8,r0,r3 +err3; stvx v9,r3,r9 +err3; stvx v10,r3,r10 +err3; stvx v11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx v7,r0,r4 + VPERM(v8,v0,v7,v16) +err4; lvx v6,r4,r9 + VPERM(v9,v7,v6,v16) +err4; lvx v5,r4,r10 + VPERM(v10,v6,v5,v16) +err4; lvx v4,r4,r11 + VPERM(v11,v5,v4,v16) +err4; lvx v3,r4,r12 + VPERM(v12,v4,v3,v16) +err4; lvx v2,r4,r14 + VPERM(v13,v3,v2,v16) +err4; lvx v1,r4,r15 + VPERM(v14,v2,v1,v16) +err4; lvx v0,r4,r16 + VPERM(v15,v1,v0,v16) + addi r4,r4,128 +err4; stvx v8,r0,r3 +err4; stvx v9,r3,r9 +err4; stvx v10,r3,r10 +err4; stvx v11,r3,r11 +err4; stvx v12,r3,r12 +err4; stvx v13,r3,r14 +err4; stvx v14,r3,r15 +err4; stvx v15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx v3,r0,r4 + VPERM(v8,v0,v3,v16) +err3; lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) +err3; lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) +err3; lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 +err3; stvx v8,r0,r3 +err3; stvx v9,r3,r9 +err3; stvx v10,r3,r10 +err3; stvx v11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) +err3; lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 +err3; stvx v8,r0,r3 +err3; stvx v9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 +err3; stvx v8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b exit_vmx_usercopy /* tail call optimise */ +#endif /* CONFIG_ALTIVEC */ diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S new file mode 100644 index 000000000..18af0b3d3 --- /dev/null +++ b/arch/powerpc/lib/crtsavres.S @@ -0,0 +1,547 @@ +/* + * Special support for eabi and SVR4 + * + * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc. + * Copyright 2008 Freescale Semiconductor, Inc. + * Written By Michael Meissner + * + * Based on gcc/config/rs6000/crtsavres.asm from gcc + * 64 bit additions from reading the PPC elf64abi document. + * + * This file is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * In addition to the permissions in the GNU General Public License, the + * Free Software Foundation gives you unlimited permission to link the + * compiled version of this file with other programs, and to distribute + * those programs without any restriction coming from the use of this + * file. (The General Public License restrictions do apply in other + * respects; for example, they cover modification of the file, and + * distribution when not linked into another program.) + * + * This file is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + * As a special exception, if you link this library with files + * compiled with GCC to produce an executable, this does not cause + * the resulting executable to be covered by the GNU General Public License. + * This exception does not however invalidate any other reasons why + * the executable file might be covered by the GNU General Public License. + */ + +#include <asm/ppc_asm.h> + + .file "crtsavres.S" + +#ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + +#ifndef CONFIG_PPC64 + + .section ".text" + +/* Routines for saving integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer save area. */ + +_GLOBAL(_savegpr_14) +_GLOBAL(_save32gpr_14) + stw 14,-72(11) /* save gp registers */ +_GLOBAL(_savegpr_15) +_GLOBAL(_save32gpr_15) + stw 15,-68(11) +_GLOBAL(_savegpr_16) +_GLOBAL(_save32gpr_16) + stw 16,-64(11) +_GLOBAL(_savegpr_17) +_GLOBAL(_save32gpr_17) + stw 17,-60(11) +_GLOBAL(_savegpr_18) +_GLOBAL(_save32gpr_18) + stw 18,-56(11) +_GLOBAL(_savegpr_19) +_GLOBAL(_save32gpr_19) + stw 19,-52(11) +_GLOBAL(_savegpr_20) +_GLOBAL(_save32gpr_20) + stw 20,-48(11) +_GLOBAL(_savegpr_21) +_GLOBAL(_save32gpr_21) + stw 21,-44(11) +_GLOBAL(_savegpr_22) +_GLOBAL(_save32gpr_22) + stw 22,-40(11) +_GLOBAL(_savegpr_23) +_GLOBAL(_save32gpr_23) + stw 23,-36(11) +_GLOBAL(_savegpr_24) +_GLOBAL(_save32gpr_24) + stw 24,-32(11) +_GLOBAL(_savegpr_25) +_GLOBAL(_save32gpr_25) + stw 25,-28(11) +_GLOBAL(_savegpr_26) +_GLOBAL(_save32gpr_26) + stw 26,-24(11) +_GLOBAL(_savegpr_27) +_GLOBAL(_save32gpr_27) + stw 27,-20(11) +_GLOBAL(_savegpr_28) +_GLOBAL(_save32gpr_28) + stw 28,-16(11) +_GLOBAL(_savegpr_29) +_GLOBAL(_save32gpr_29) + stw 29,-12(11) +_GLOBAL(_savegpr_30) +_GLOBAL(_save32gpr_30) + stw 30,-8(11) +_GLOBAL(_savegpr_31) +_GLOBAL(_save32gpr_31) + stw 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14) +_GLOBAL(_rest32gpr_14) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15) +_GLOBAL(_rest32gpr_15) + lwz 15,-68(11) +_GLOBAL(_restgpr_16) +_GLOBAL(_rest32gpr_16) + lwz 16,-64(11) +_GLOBAL(_restgpr_17) +_GLOBAL(_rest32gpr_17) + lwz 17,-60(11) +_GLOBAL(_restgpr_18) +_GLOBAL(_rest32gpr_18) + lwz 18,-56(11) +_GLOBAL(_restgpr_19) +_GLOBAL(_rest32gpr_19) + lwz 19,-52(11) +_GLOBAL(_restgpr_20) +_GLOBAL(_rest32gpr_20) + lwz 20,-48(11) +_GLOBAL(_restgpr_21) +_GLOBAL(_rest32gpr_21) + lwz 21,-44(11) +_GLOBAL(_restgpr_22) +_GLOBAL(_rest32gpr_22) + lwz 22,-40(11) +_GLOBAL(_restgpr_23) +_GLOBAL(_rest32gpr_23) + lwz 23,-36(11) +_GLOBAL(_restgpr_24) +_GLOBAL(_rest32gpr_24) + lwz 24,-32(11) +_GLOBAL(_restgpr_25) +_GLOBAL(_rest32gpr_25) + lwz 25,-28(11) +_GLOBAL(_restgpr_26) +_GLOBAL(_rest32gpr_26) + lwz 26,-24(11) +_GLOBAL(_restgpr_27) +_GLOBAL(_rest32gpr_27) + lwz 27,-20(11) +_GLOBAL(_restgpr_28) +_GLOBAL(_rest32gpr_28) + lwz 28,-16(11) +_GLOBAL(_restgpr_29) +_GLOBAL(_rest32gpr_29) + lwz 29,-12(11) +_GLOBAL(_restgpr_30) +_GLOBAL(_rest32gpr_30) + lwz 30,-8(11) +_GLOBAL(_restgpr_31) +_GLOBAL(_rest32gpr_31) + lwz 31,-4(11) + blr + +/* Routines for restoring integer registers, called by the compiler. */ +/* Called with r11 pointing to the stack header word of the caller of the */ +/* function, just beyond the end of the integer restore area. */ + +_GLOBAL(_restgpr_14_x) +_GLOBAL(_rest32gpr_14_x) + lwz 14,-72(11) /* restore gp registers */ +_GLOBAL(_restgpr_15_x) +_GLOBAL(_rest32gpr_15_x) + lwz 15,-68(11) +_GLOBAL(_restgpr_16_x) +_GLOBAL(_rest32gpr_16_x) + lwz 16,-64(11) +_GLOBAL(_restgpr_17_x) +_GLOBAL(_rest32gpr_17_x) + lwz 17,-60(11) +_GLOBAL(_restgpr_18_x) +_GLOBAL(_rest32gpr_18_x) + lwz 18,-56(11) +_GLOBAL(_restgpr_19_x) +_GLOBAL(_rest32gpr_19_x) + lwz 19,-52(11) +_GLOBAL(_restgpr_20_x) +_GLOBAL(_rest32gpr_20_x) + lwz 20,-48(11) +_GLOBAL(_restgpr_21_x) +_GLOBAL(_rest32gpr_21_x) + lwz 21,-44(11) +_GLOBAL(_restgpr_22_x) +_GLOBAL(_rest32gpr_22_x) + lwz 22,-40(11) +_GLOBAL(_restgpr_23_x) +_GLOBAL(_rest32gpr_23_x) + lwz 23,-36(11) +_GLOBAL(_restgpr_24_x) +_GLOBAL(_rest32gpr_24_x) + lwz 24,-32(11) +_GLOBAL(_restgpr_25_x) +_GLOBAL(_rest32gpr_25_x) + lwz 25,-28(11) +_GLOBAL(_restgpr_26_x) +_GLOBAL(_rest32gpr_26_x) + lwz 26,-24(11) +_GLOBAL(_restgpr_27_x) +_GLOBAL(_rest32gpr_27_x) + lwz 27,-20(11) +_GLOBAL(_restgpr_28_x) +_GLOBAL(_rest32gpr_28_x) + lwz 28,-16(11) +_GLOBAL(_restgpr_29_x) +_GLOBAL(_rest32gpr_29_x) + lwz 29,-12(11) +_GLOBAL(_restgpr_30_x) +_GLOBAL(_rest32gpr_30_x) + lwz 30,-8(11) +_GLOBAL(_restgpr_31_x) +_GLOBAL(_rest32gpr_31_x) + lwz 0,4(11) + lwz 31,-4(11) + mtlr 0 + mr 1,11 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +_GLOBAL(_savevr_20) + li r11,-192 + stvx v20,r11,r0 +_GLOBAL(_savevr_21) + li r11,-176 + stvx v21,r11,r0 +_GLOBAL(_savevr_22) + li r11,-160 + stvx v22,r11,r0 +_GLOBAL(_savevr_23) + li r11,-144 + stvx v23,r11,r0 +_GLOBAL(_savevr_24) + li r11,-128 + stvx v24,r11,r0 +_GLOBAL(_savevr_25) + li r11,-112 + stvx v25,r11,r0 +_GLOBAL(_savevr_26) + li r11,-96 + stvx v26,r11,r0 +_GLOBAL(_savevr_27) + li r11,-80 + stvx v27,r11,r0 +_GLOBAL(_savevr_28) + li r11,-64 + stvx v28,r11,r0 +_GLOBAL(_savevr_29) + li r11,-48 + stvx v29,r11,r0 +_GLOBAL(_savevr_30) + li r11,-32 + stvx v30,r11,r0 +_GLOBAL(_savevr_31) + li r11,-16 + stvx v31,r11,r0 + blr + +_GLOBAL(_restvr_20) + li r11,-192 + lvx v20,r11,r0 +_GLOBAL(_restvr_21) + li r11,-176 + lvx v21,r11,r0 +_GLOBAL(_restvr_22) + li r11,-160 + lvx v22,r11,r0 +_GLOBAL(_restvr_23) + li r11,-144 + lvx v23,r11,r0 +_GLOBAL(_restvr_24) + li r11,-128 + lvx v24,r11,r0 +_GLOBAL(_restvr_25) + li r11,-112 + lvx v25,r11,r0 +_GLOBAL(_restvr_26) + li r11,-96 + lvx v26,r11,r0 +_GLOBAL(_restvr_27) + li r11,-80 + lvx v27,r11,r0 +_GLOBAL(_restvr_28) + li r11,-64 + lvx v28,r11,r0 +_GLOBAL(_restvr_29) + li r11,-48 + lvx v29,r11,r0 +_GLOBAL(_restvr_30) + li r11,-32 + lvx v30,r11,r0 +_GLOBAL(_restvr_31) + li r11,-16 + lvx v31,r11,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#else /* CONFIG_PPC64 */ + + .section ".text.save.restore","ax",@progbits + +.globl _savegpr0_14 +_savegpr0_14: + std r14,-144(r1) +.globl _savegpr0_15 +_savegpr0_15: + std r15,-136(r1) +.globl _savegpr0_16 +_savegpr0_16: + std r16,-128(r1) +.globl _savegpr0_17 +_savegpr0_17: + std r17,-120(r1) +.globl _savegpr0_18 +_savegpr0_18: + std r18,-112(r1) +.globl _savegpr0_19 +_savegpr0_19: + std r19,-104(r1) +.globl _savegpr0_20 +_savegpr0_20: + std r20,-96(r1) +.globl _savegpr0_21 +_savegpr0_21: + std r21,-88(r1) +.globl _savegpr0_22 +_savegpr0_22: + std r22,-80(r1) +.globl _savegpr0_23 +_savegpr0_23: + std r23,-72(r1) +.globl _savegpr0_24 +_savegpr0_24: + std r24,-64(r1) +.globl _savegpr0_25 +_savegpr0_25: + std r25,-56(r1) +.globl _savegpr0_26 +_savegpr0_26: + std r26,-48(r1) +.globl _savegpr0_27 +_savegpr0_27: + std r27,-40(r1) +.globl _savegpr0_28 +_savegpr0_28: + std r28,-32(r1) +.globl _savegpr0_29 +_savegpr0_29: + std r29,-24(r1) +.globl _savegpr0_30 +_savegpr0_30: + std r30,-16(r1) +.globl _savegpr0_31 +_savegpr0_31: + std r31,-8(r1) + std r0,16(r1) + blr + +.globl _restgpr0_14 +_restgpr0_14: + ld r14,-144(r1) +.globl _restgpr0_15 +_restgpr0_15: + ld r15,-136(r1) +.globl _restgpr0_16 +_restgpr0_16: + ld r16,-128(r1) +.globl _restgpr0_17 +_restgpr0_17: + ld r17,-120(r1) +.globl _restgpr0_18 +_restgpr0_18: + ld r18,-112(r1) +.globl _restgpr0_19 +_restgpr0_19: + ld r19,-104(r1) +.globl _restgpr0_20 +_restgpr0_20: + ld r20,-96(r1) +.globl _restgpr0_21 +_restgpr0_21: + ld r21,-88(r1) +.globl _restgpr0_22 +_restgpr0_22: + ld r22,-80(r1) +.globl _restgpr0_23 +_restgpr0_23: + ld r23,-72(r1) +.globl _restgpr0_24 +_restgpr0_24: + ld r24,-64(r1) +.globl _restgpr0_25 +_restgpr0_25: + ld r25,-56(r1) +.globl _restgpr0_26 +_restgpr0_26: + ld r26,-48(r1) +.globl _restgpr0_27 +_restgpr0_27: + ld r27,-40(r1) +.globl _restgpr0_28 +_restgpr0_28: + ld r28,-32(r1) +.globl _restgpr0_29 +_restgpr0_29: + ld r0,16(r1) + ld r29,-24(r1) + mtlr r0 + ld r30,-16(r1) + ld r31,-8(r1) + blr + +.globl _restgpr0_30 +_restgpr0_30: + ld r30,-16(r1) +.globl _restgpr0_31 +_restgpr0_31: + ld r0,16(r1) + ld r31,-8(r1) + mtlr r0 + blr + +#ifdef CONFIG_ALTIVEC +/* Called with r0 pointing just beyond the end of the vector save area. */ + +.globl _savevr_20 +_savevr_20: + li r12,-192 + stvx v20,r12,r0 +.globl _savevr_21 +_savevr_21: + li r12,-176 + stvx v21,r12,r0 +.globl _savevr_22 +_savevr_22: + li r12,-160 + stvx v22,r12,r0 +.globl _savevr_23 +_savevr_23: + li r12,-144 + stvx v23,r12,r0 +.globl _savevr_24 +_savevr_24: + li r12,-128 + stvx v24,r12,r0 +.globl _savevr_25 +_savevr_25: + li r12,-112 + stvx v25,r12,r0 +.globl _savevr_26 +_savevr_26: + li r12,-96 + stvx v26,r12,r0 +.globl _savevr_27 +_savevr_27: + li r12,-80 + stvx v27,r12,r0 +.globl _savevr_28 +_savevr_28: + li r12,-64 + stvx v28,r12,r0 +.globl _savevr_29 +_savevr_29: + li r12,-48 + stvx v29,r12,r0 +.globl _savevr_30 +_savevr_30: + li r12,-32 + stvx v30,r12,r0 +.globl _savevr_31 +_savevr_31: + li r12,-16 + stvx v31,r12,r0 + blr + +.globl _restvr_20 +_restvr_20: + li r12,-192 + lvx v20,r12,r0 +.globl _restvr_21 +_restvr_21: + li r12,-176 + lvx v21,r12,r0 +.globl _restvr_22 +_restvr_22: + li r12,-160 + lvx v22,r12,r0 +.globl _restvr_23 +_restvr_23: + li r12,-144 + lvx v23,r12,r0 +.globl _restvr_24 +_restvr_24: + li r12,-128 + lvx v24,r12,r0 +.globl _restvr_25 +_restvr_25: + li r12,-112 + lvx v25,r12,r0 +.globl _restvr_26 +_restvr_26: + li r12,-96 + lvx v26,r12,r0 +.globl _restvr_27 +_restvr_27: + li r12,-80 + lvx v27,r12,r0 +.globl _restvr_28 +_restvr_28: + li r12,-64 + lvx v28,r12,r0 +.globl _restvr_29 +_restvr_29: + li r12,-48 + lvx v29,r12,r0 +.globl _restvr_30 +_restvr_30: + li r12,-32 + lvx v30,r12,r0 +.globl _restvr_31 +_restvr_31: + li r12,-16 + lvx v31,r12,r0 + blr + +#endif /* CONFIG_ALTIVEC */ + +#endif /* CONFIG_PPC64 */ + +#endif diff --git a/arch/powerpc/lib/div64.S b/arch/powerpc/lib/div64.S new file mode 100644 index 000000000..83d9832fd --- /dev/null +++ b/arch/powerpc/lib/div64.S @@ -0,0 +1,59 @@ +/* + * Divide a 64-bit unsigned number by a 32-bit unsigned number. + * This routine assumes that the top 32 bits of the dividend are + * non-zero to start with. + * On entry, r3 points to the dividend, which get overwritten with + * the 64-bit quotient, and r4 contains the divisor. + * On exit, r3 contains the remainder. + * + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc_asm.h> +#include <asm/processor.h> + +_GLOBAL(__div64_32) + lwz r5,0(r3) # get the dividend into r5/r6 + lwz r6,4(r3) + cmplw r5,r4 + li r7,0 + li r8,0 + blt 1f + divwu r7,r5,r4 # if dividend.hi >= divisor, + mullw r0,r7,r4 # quotient.hi = dividend.hi / divisor + subf. r5,r0,r5 # dividend.hi %= divisor + beq 3f +1: mr r11,r5 # here dividend.hi != 0 + andis. r0,r5,0xc000 + bne 2f + cntlzw r0,r5 # we are shifting the dividend right + li r10,-1 # to make it < 2^32, and shifting + srw r10,r10,r0 # the divisor right the same amount, + addc r9,r4,r10 # rounding up (so the estimate cannot + andc r11,r6,r10 # ever be too large, only too small) + andc r9,r9,r10 + addze r9,r9 + or r11,r5,r11 + rotlw r9,r9,r0 + rotlw r11,r11,r0 + divwu r11,r11,r9 # then we divide the shifted quantities +2: mullw r10,r11,r4 # to get an estimate of the quotient, + mulhwu r9,r11,r4 # multiply the estimate by the divisor, + subfc r6,r10,r6 # take the product from the divisor, + add r8,r8,r11 # and add the estimate to the accumulated + subfe. r5,r9,r5 # quotient + bne 1b +3: cmplw r6,r4 + blt 4f + divwu r0,r6,r4 # perform the remaining 32-bit division + mullw r10,r0,r4 # and get the remainder + add r8,r8,r0 + subf r6,r10,r6 +4: stw r7,0(r3) # return the quotient in *r3 + stw r8,4(r3) + mr r3,r6 # return the remainder in r3 + blr diff --git a/arch/powerpc/lib/feature-fixups-test.S b/arch/powerpc/lib/feature-fixups-test.S new file mode 100644 index 000000000..f46131181 --- /dev/null +++ b/arch/powerpc/lib/feature-fixups-test.S @@ -0,0 +1,761 @@ +/* + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/feature-fixups.h> +#include <asm/ppc_asm.h> +#include <asm/synch.h> + + .text + +#define globl(x) \ + .globl x; \ +x: + +globl(ftr_fixup_test1) + or 1,1,1 + or 2,2,2 /* fixup will nop out this instruction */ + or 3,3,3 + +globl(end_ftr_fixup_test1) + +globl(ftr_fixup_test1_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test1_expected) + or 1,1,1 + nop + or 3,3,3 + +globl(ftr_fixup_test2) + or 1,1,1 + or 2,2,2 /* fixup will replace this with ftr_fixup_test2_alt */ + or 3,3,3 + +globl(end_ftr_fixup_test2) + +globl(ftr_fixup_test2_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test2_alt) + or 31,31,31 + +globl(ftr_fixup_test2_expected) + or 1,1,1 + or 31,31,31 + or 3,3,3 + +globl(ftr_fixup_test3) + or 1,1,1 + or 2,2,2 /* fixup will fail to replace this */ + or 3,3,3 + +globl(end_ftr_fixup_test3) + +globl(ftr_fixup_test3_orig) + or 1,1,1 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test3_alt) + or 31,31,31 + or 31,31,31 + +globl(ftr_fixup_test4) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(end_ftr_fixup_test4) + +globl(ftr_fixup_test4_expected) + or 1,1,1 + or 31,31,31 + or 31,31,31 + nop + nop + or 3,3,3 + +globl(ftr_fixup_test4_orig) + or 1,1,1 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 3,3,3 + +globl(ftr_fixup_test4_alt) + or 31,31,31 + or 31,31,31 + + +globl(ftr_fixup_test5) + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 + +globl(end_ftr_fixup_test5) + +globl(ftr_fixup_test5_expected) + or 1,1,1 +2: b 3f +3: or 5,5,5 + beq 3b + b 1f + or 6,6,6 + b 2b +1: bdnz 3b + or 1,1,1 + +globl(ftr_fixup_test6) +1: or 1,1,1 +BEGIN_FTR_SECTION + or 5,5,5 +2: PPC_LCMPI r3,0 + beq 4f + blt 2b + b 1b + b 4f +FTR_SECTION_ELSE +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +ALT_FTR_SECTION_END(0, 1) +3: or 1,1,1 + or 2,2,2 +4: or 3,3,3 + +globl(end_ftr_fixup_test6) + +globl(ftr_fixup_test6_expected) +1: or 1,1,1 +2: or 2,2,2 + PPC_LCMPI r3,1 + beq 3f + blt 2b + b 3f + b 1b +2: or 1,1,1 + or 2,2,2 +3: or 3,3,3 + + +#if 0 +/* Test that if we have a larger else case the assembler spots it and + * reports an error. #if 0'ed so as not to break the build normally. + */ +ftr_fixup_test7: + or 1,1,1 +BEGIN_FTR_SECTION + or 2,2,2 + or 2,2,2 + or 2,2,2 +FTR_SECTION_ELSE + or 3,3,3 + or 3,3,3 + or 3,3,3 + or 3,3,3 +ALT_FTR_SECTION_END(0, 1) + or 1,1,1 +#endif + +#define MAKE_MACRO_TEST(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(80) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 0, 80) \ + or 2,2,2; \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 2,2,2; \ + or 2,2,2; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +BEGIN_##TYPE##_SECTION \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +##TYPE##_SECTION_ELSE \ + or 5,5,5; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 3,3,3; \ +END_##TYPE##_SECTION_NESTED(0, 1, 95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +END_##TYPE##_SECTION(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 0) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ +BEGIN_##TYPE##_SECTION_NESTED(95) \ + or 1,1,1; \ +##TYPE##_SECTION_ELSE_NESTED(95) \ + or 5,5,5; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ + or 31,31,31; \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ + or 31,31,31; \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +BEGIN_##TYPE##_SECTION \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +##TYPE##_SECTION_ELSE \ +BEGIN_##TYPE##_SECTION_NESTED(94) \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ + or 5,5,5; \ +##TYPE##_SECTION_ELSE_NESTED(94) \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) \ +ALT_##TYPE##_SECTION_END(0, 1) \ + or 1,1,1; \ + or 1,1,1; + +#define MAKE_MACRO_TEST_EXPECTED(TYPE) \ +globl(ftr_fixup_test_ ##TYPE##_macros_expected) \ + or 1,1,1; \ + /* Basic test, this section should all be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic test, this section should NOT be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, inner section should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, whole section should be nop'ed */ \ + /* NB. inner section is not nop'ed, but then entire outer is */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nesting test, none should be nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(80) */ \ + or 3,3,3; \ + or 3,3,3; \ +/* END_##TYPE##_SECTION_NESTED(0, 0, 80) */ \ + or 2,2,2; \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, default case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Basic alt section test, else case should be taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + or 31,31,31; \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt with smaller else case, should be padded with nops */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ + nop; \ + nop; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in default case */ \ + /* Default case should be taken, with nop'ed inner section */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 3,3,3; \ + or 3,3,3; \ + or 3,3,3; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 5,5,5; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 3,3,3; */ \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Alt section with nested section in else, else taken & nop */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ + /* or 3,3,3; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 5,5,5; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* END_##TYPE##_SECTION_NESTED(0, 1, 95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* END_##TYPE##_SECTION(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Feature section with nested alt section, all nop'ed */ \ +/* BEGIN_##TYPE##_SECTION */ \ + nop; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + nop; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + nop; \ +/* END_##TYPE##_SECTION(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + or 1,1,1; \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, default with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + or 2,2,2; \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + or 5,5,5; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + or 2,2,2; \ +/* ##TYPE##_SECTION_ELSE */ \ + /* or 31,31,31; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + /* or 31,31,31; */ \ +/* ALT_##TYPE##_SECTION_END(0, 0) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner default taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + or 5,5,5; \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + /* or 1,1,1; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 0, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else with inner else taken */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(95) */ \ + /* or 1,1,1; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(95) */ \ + /* or 5,5,5; */ \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 95) */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ + or 31,31,31; \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ + or 31,31,31; \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; \ + /* Nested alt sections, else can have large else case */ \ +/* BEGIN_##TYPE##_SECTION */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ + /* or 2,2,2; */ \ +/* ##TYPE##_SECTION_ELSE */ \ +/* BEGIN_##TYPE##_SECTION_NESTED(94) */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ + /* or 5,5,5; */ \ +/* ##TYPE##_SECTION_ELSE_NESTED(94) */ \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ + or 1,1,1; \ +/* ALT_##TYPE##_SECTION_END_NESTED(0, 1, 94) */ \ +/* ALT_##TYPE##_SECTION_END(0, 1) */ \ + or 1,1,1; \ + or 1,1,1; + +MAKE_MACRO_TEST(FTR); +MAKE_MACRO_TEST_EXPECTED(FTR); + +#ifdef CONFIG_PPC64 +MAKE_MACRO_TEST(FW_FTR); +MAKE_MACRO_TEST_EXPECTED(FW_FTR); +#endif + +globl(lwsync_fixup_test) +1: or 1,1,1 + LWSYNC +globl(end_lwsync_fixup_test) + +globl(lwsync_fixup_test_expected_LWSYNC) +1: or 1,1,1 + lwsync + +globl(lwsync_fixup_test_expected_SYNC) +1: or 1,1,1 + sync + diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c new file mode 100644 index 000000000..7ce3870d7 --- /dev/null +++ b/arch/powerpc/lib/feature-fixups.c @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * + * Copyright 2008 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/init.h> +#include <asm/cputable.h> +#include <asm/code-patching.h> +#include <asm/page.h> +#include <asm/sections.h> + + +struct fixup_entry { + unsigned long mask; + unsigned long value; + long start_off; + long end_off; + long alt_start_off; + long alt_end_off; +}; + +static unsigned int *calc_addr(struct fixup_entry *fcur, long offset) +{ + /* + * We store the offset to the code as a negative offset from + * the start of the alt_entry, to support the VDSO. This + * routine converts that back into an actual address. + */ + return (unsigned int *)((unsigned long)fcur + offset); +} + +static int patch_alt_instruction(unsigned int *src, unsigned int *dest, + unsigned int *alt_start, unsigned int *alt_end) +{ + unsigned int instr; + + instr = *src; + + if (instr_is_relative_branch(*src)) { + unsigned int *target = (unsigned int *)branch_target(src); + + /* Branch within the section doesn't need translating */ + if (target < alt_start || target >= alt_end) { + instr = translate_branch(dest, src); + if (!instr) + return 1; + } + } + + patch_instruction(dest, instr); + + return 0; +} + +static int patch_feature_section(unsigned long value, struct fixup_entry *fcur) +{ + unsigned int *start, *end, *alt_start, *alt_end, *src, *dest; + + start = calc_addr(fcur, fcur->start_off); + end = calc_addr(fcur, fcur->end_off); + alt_start = calc_addr(fcur, fcur->alt_start_off); + alt_end = calc_addr(fcur, fcur->alt_end_off); + + if ((alt_end - alt_start) > (end - start)) + return 1; + + if ((value & fcur->mask) == fcur->value) + return 0; + + src = alt_start; + dest = start; + + for (; src < alt_end; src++, dest++) { + if (patch_alt_instruction(src, dest, alt_start, alt_end)) + return 1; + } + + for (; dest < end; dest++) + patch_instruction(dest, PPC_INST_NOP); + + return 0; +} + +void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + struct fixup_entry *fcur, *fend; + + fcur = fixup_start; + fend = fixup_end; + + for (; fcur < fend; fcur++) { + if (patch_feature_section(value, fcur)) { + WARN_ON(1); + printk("Unable to patch feature section at %p - %p" \ + " with %p - %p\n", + calc_addr(fcur, fcur->start_off), + calc_addr(fcur, fcur->end_off), + calc_addr(fcur, fcur->alt_start_off), + calc_addr(fcur, fcur->alt_end_off)); + } + } +} + +void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) +{ + long *start, *end; + unsigned int *dest; + + if (!(value & CPU_FTR_LWSYNC)) + return ; + + start = fixup_start; + end = fixup_end; + + for (; start < end; start++) { + dest = (void *)start + *start; + patch_instruction(dest, PPC_INST_LWSYNC); + } +} + +void do_final_fixups(void) +{ +#if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE) + int *src, *dest; + unsigned long length; + + if (PHYSICAL_START == 0) + return; + + src = (int *)(KERNELBASE + PHYSICAL_START); + dest = (int *)KERNELBASE; + length = (__end_interrupts - _stext) / sizeof(int); + + while (length--) { + patch_instruction(dest, *src); + src++; + dest++; + } +#endif +} + +#ifdef CONFIG_FTR_FIXUP_SELFTEST + +#define check(x) \ + if (!(x)) printk("feature-fixups: test failed at line %d\n", __LINE__); + +/* This must be after the text it fixes up, vmlinux.lds.S enforces that atm */ +static struct fixup_entry fixup; + +static long calc_offset(struct fixup_entry *entry, unsigned int *p) +{ + return (unsigned long)p - (unsigned long)entry; +} + +static void test_basic_patching(void) +{ + extern unsigned int ftr_fixup_test1; + extern unsigned int end_ftr_fixup_test1; + extern unsigned int ftr_fixup_test1_orig; + extern unsigned int ftr_fixup_test1_expected; + int size = &end_ftr_fixup_test1 - &ftr_fixup_test1; + + fixup.value = fixup.mask = 8; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test1 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test1 + 2); + fixup.alt_start_off = fixup.alt_end_off = 0; + + /* Sanity check */ + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(8, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test1, &ftr_fixup_test1_orig, size); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_orig, size) == 0); + patch_feature_section(~8, &fixup); + check(memcmp(&ftr_fixup_test1, &ftr_fixup_test1_expected, size) == 0); +} + +static void test_alternative_patching(void) +{ + extern unsigned int ftr_fixup_test2; + extern unsigned int end_ftr_fixup_test2; + extern unsigned int ftr_fixup_test2_orig; + extern unsigned int ftr_fixup_test2_alt; + extern unsigned int ftr_fixup_test2_expected; + int size = &end_ftr_fixup_test2 - &ftr_fixup_test2; + + fixup.value = fixup.mask = 0xF; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test2 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test2 + 2); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test2_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test2_alt + 1); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(0xF, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test2, &ftr_fixup_test2_orig, size); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_orig, size) == 0); + patch_feature_section(~0xF, &fixup); + check(memcmp(&ftr_fixup_test2, &ftr_fixup_test2_expected, size) == 0); +} + +static void test_alternative_case_too_big(void) +{ + extern unsigned int ftr_fixup_test3; + extern unsigned int end_ftr_fixup_test3; + extern unsigned int ftr_fixup_test3_orig; + extern unsigned int ftr_fixup_test3_alt; + int size = &end_ftr_fixup_test3 - &ftr_fixup_test3; + + fixup.value = fixup.mask = 0xC; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test3 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test3 + 2); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test3_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test3_alt + 2); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + + /* Expect nothing to be patched, and the error returned to us */ + check(patch_feature_section(0xF, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(0, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); + check(patch_feature_section(~0xF, &fixup) == 1); + check(memcmp(&ftr_fixup_test3, &ftr_fixup_test3_orig, size) == 0); +} + +static void test_alternative_case_too_small(void) +{ + extern unsigned int ftr_fixup_test4; + extern unsigned int end_ftr_fixup_test4; + extern unsigned int ftr_fixup_test4_orig; + extern unsigned int ftr_fixup_test4_alt; + extern unsigned int ftr_fixup_test4_expected; + int size = &end_ftr_fixup_test4 - &ftr_fixup_test4; + unsigned long flag; + + /* Check a high-bit flag */ + flag = 1UL << ((sizeof(unsigned long) - 1) * 8); + fixup.value = fixup.mask = flag; + fixup.start_off = calc_offset(&fixup, &ftr_fixup_test4 + 1); + fixup.end_off = calc_offset(&fixup, &ftr_fixup_test4 + 5); + fixup.alt_start_off = calc_offset(&fixup, &ftr_fixup_test4_alt); + fixup.alt_end_off = calc_offset(&fixup, &ftr_fixup_test4_alt + 2); + + /* Sanity check */ + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + + /* Check we don't patch if the value matches */ + patch_feature_section(flag, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + + /* Check we do patch if the value doesn't match */ + patch_feature_section(0, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); + + /* Check we do patch if the mask doesn't match */ + memcpy(&ftr_fixup_test4, &ftr_fixup_test4_orig, size); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_orig, size) == 0); + patch_feature_section(~flag, &fixup); + check(memcmp(&ftr_fixup_test4, &ftr_fixup_test4_expected, size) == 0); +} + +static void test_alternative_case_with_branch(void) +{ + extern unsigned int ftr_fixup_test5; + extern unsigned int end_ftr_fixup_test5; + extern unsigned int ftr_fixup_test5_expected; + int size = &end_ftr_fixup_test5 - &ftr_fixup_test5; + + check(memcmp(&ftr_fixup_test5, &ftr_fixup_test5_expected, size) == 0); +} + +static void test_alternative_case_with_external_branch(void) +{ + extern unsigned int ftr_fixup_test6; + extern unsigned int end_ftr_fixup_test6; + extern unsigned int ftr_fixup_test6_expected; + int size = &end_ftr_fixup_test6 - &ftr_fixup_test6; + + check(memcmp(&ftr_fixup_test6, &ftr_fixup_test6_expected, size) == 0); +} + +static void test_cpu_macros(void) +{ + extern u8 ftr_fixup_test_FTR_macros; + extern u8 ftr_fixup_test_FTR_macros_expected; + unsigned long size = &ftr_fixup_test_FTR_macros_expected - + &ftr_fixup_test_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(&ftr_fixup_test_FTR_macros, + &ftr_fixup_test_FTR_macros_expected, size) == 0); +} + +static void test_fw_macros(void) +{ +#ifdef CONFIG_PPC64 + extern u8 ftr_fixup_test_FW_FTR_macros; + extern u8 ftr_fixup_test_FW_FTR_macros_expected; + unsigned long size = &ftr_fixup_test_FW_FTR_macros_expected - + &ftr_fixup_test_FW_FTR_macros; + + /* The fixups have already been done for us during boot */ + check(memcmp(&ftr_fixup_test_FW_FTR_macros, + &ftr_fixup_test_FW_FTR_macros_expected, size) == 0); +#endif +} + +static void test_lwsync_macros(void) +{ + extern u8 lwsync_fixup_test; + extern u8 end_lwsync_fixup_test; + extern u8 lwsync_fixup_test_expected_LWSYNC; + extern u8 lwsync_fixup_test_expected_SYNC; + unsigned long size = &end_lwsync_fixup_test - + &lwsync_fixup_test; + + /* The fixups have already been done for us during boot */ + if (cur_cpu_spec->cpu_features & CPU_FTR_LWSYNC) { + check(memcmp(&lwsync_fixup_test, + &lwsync_fixup_test_expected_LWSYNC, size) == 0); + } else { + check(memcmp(&lwsync_fixup_test, + &lwsync_fixup_test_expected_SYNC, size) == 0); + } +} + +static int __init test_feature_fixups(void) +{ + printk(KERN_DEBUG "Running feature fixup self-tests ...\n"); + + test_basic_patching(); + test_alternative_patching(); + test_alternative_case_too_big(); + test_alternative_case_too_small(); + test_alternative_case_with_branch(); + test_alternative_case_with_external_branch(); + test_cpu_macros(); + test_fw_macros(); + test_lwsync_macros(); + + return 0; +} +late_initcall(test_feature_fixups); + +#endif /* CONFIG_FTR_FIXUP_SELFTEST */ diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S new file mode 100644 index 000000000..19e66001a --- /dev/null +++ b/arch/powerpc/lib/hweight_64.S @@ -0,0 +1,110 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + +/* Note: This code relies on -mminimal-toc */ + +_GLOBAL(__arch_hweight8) +BEGIN_FTR_SECTION + b __sw_hweight8 + nop + nop +FTR_SECTION_ELSE + PPC_POPCNTB(R3,R3) + clrldi r3,r3,64-8 + blr +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight16) +BEGIN_FTR_SECTION + b __sw_hweight16 + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(50) + PPC_POPCNTB(R3,R3) + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(50) + clrlwi r3,r3,16 + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight32) +BEGIN_FTR_SECTION + b __sw_hweight32 + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(51) + PPC_POPCNTB(R3,R3) + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(51) + PPC_POPCNTW(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) + +_GLOBAL(__arch_hweight64) +BEGIN_FTR_SECTION + b __sw_hweight64 + nop + nop + nop + nop + nop + nop + nop + nop +FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(52) + PPC_POPCNTB(R3,R3) + srdi r4,r3,32 + add r3,r4,r3 + srdi r4,r3,16 + add r3,r4,r3 + srdi r4,r3,8 + add r3,r4,r3 + clrldi r3,r3,64-8 + blr + FTR_SECTION_ELSE_NESTED(52) + PPC_POPCNTD(R3,R3) + clrldi r3,r3,64-8 + blr + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB) diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S new file mode 100644 index 000000000..5d0cdbfbe --- /dev/null +++ b/arch/powerpc/lib/ldstfp.S @@ -0,0 +1,379 @@ +/* + * Floating-point, VMX/Altivec and VSX loads and stores + * for use in instruction emulation. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/ppc-opcode.h> +#include <asm/reg.h> +#include <asm/asm-offsets.h> +#include <linux/errno.h> + +#ifdef CONFIG_PPC_FPU + +#define STKFRM (PPC_MIN_STKFRM + 16) + + .macro extab instr,handler + .section __ex_table,"a" + PPC_LONG \instr,\handler + .previous + .endm + + .macro inst32 op +reg = 0 + .rept 32 +20: \op reg,0,r4 + b 3f + extab 20b,99f +reg = reg + 1 + .endr + .endm + +/* Get the contents of frN into fr0; N is in r3. */ +_GLOBAL(get_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr fr0,reg + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of fr0 into frN; N is in r3. */ +_GLOBAL(put_fpr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* fr0 is already in fr0 */ + nop +reg = 1 + .rept 31 + fmr reg,fr0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load FP reg N from float at *p. N is in r3, p in r4. */ +_GLOBAL(do_lfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfs fr0,0(r4) + li r9,0 +3: bl put_fpr + beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Load FP reg N from double at *p. N is in r3, p in r4. */ +_GLOBAL(do_lfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) +1: li r9,-EFAULT +2: lfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + bl put_fpr + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to float at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfs) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfs fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store FP reg N to double at *p. N is in r3, p in r4. */ +_GLOBAL(do_stfd) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + ori r7,r6,MSR_FP + cmpwi cr7,r3,0 + MTMSRD(r7) + isync + beq cr7,1f + stfd fr0,STKFRM-16(r1) + bl get_fpr +1: li r9,-EFAULT +2: stfd fr0,0(r4) + li r9,0 +3: beq cr7,4f + lfd fr0,STKFRM-16(r1) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#ifdef CONFIG_ALTIVEC +/* Get the contents of vrN into v0; N is in r3. */ +_GLOBAL(get_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* v0 is already in v0 */ + nop +reg = 1 + .rept 31 + vor v0,reg,reg /* assembler doesn't know vmr? */ + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of v0 into vrN; N is in r3. */ +_GLOBAL(put_vr) + mflr r0 + rlwinm r3,r3,3,0xf8 + bcl 20,31,1f + blr /* v0 is already in v0 */ + nop +reg = 1 + .rept 31 + vor reg,v0,v0 + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load vector reg N from *p. N is in r3, p in r4. */ +_GLOBAL(do_lvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx v0,r1,r8 +1: li r9,-EFAULT +2: lvx v0,0,r4 + li r9,0 +3: beq cr7,4f + bl put_vr + lvx v0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store vector reg N to *p. N is in r3, p in r4. */ +_GLOBAL(do_stvx) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VEC@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + stvx v0,r1,r8 + bl get_vr +1: li r9,-EFAULT +2: stvx v0,0,r4 + li r9,0 +3: beq cr7,4f + lvx v0,r1,r8 +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +/* Get the contents of vsN into vs0; N is in r3. */ +_GLOBAL(get_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* vs0 is already in vs0 */ + nop +reg = 1 + .rept 63 + XXLOR(0,reg,reg) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Put the contents of vs0 into vsN; N is in r3. */ +_GLOBAL(put_vsr) + mflr r0 + rlwinm r3,r3,3,0x1f8 + bcl 20,31,1f + blr /* v0 is already in v0 */ + nop +reg = 1 + .rept 63 + XXLOR(reg,0,0) + blr +reg = reg + 1 + .endr +1: mflr r5 + add r5,r3,r5 + mtctr r5 + mtlr r0 + bctr + +/* Load VSX reg N from vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_lxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) +1: li r9,-EFAULT +2: LXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + bl put_vsr + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +/* Store VSX reg N to vector doubleword *p. N is in r3, p in r4. */ +_GLOBAL(do_stxvd2x) + PPC_STLU r1,-STKFRM(r1) + mflr r0 + PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1) + mfmsr r6 + oris r7,r6,MSR_VSX@h + cmpwi cr7,r3,0 + li r8,STKFRM-16 + MTMSRD(r7) + isync + beq cr7,1f + STXVD2X(0,R1,R8) + bl get_vsr +1: li r9,-EFAULT +2: STXVD2X(0,R0,R4) + li r9,0 +3: beq cr7,4f + LXVD2X(0,R1,R8) +4: PPC_LL r0,STKFRM+PPC_LR_STKOFF(r1) + mtlr r0 + MTMSRD(r6) + isync + mr r3,r9 + addi r1,r1,STKFRM + blr + extab 2b,3b + +#endif /* CONFIG_VSX */ + +#endif /* CONFIG_PPC_FPU */ diff --git a/arch/powerpc/lib/locks.c b/arch/powerpc/lib/locks.c new file mode 100644 index 000000000..f7deebdf3 --- /dev/null +++ b/arch/powerpc/lib/locks.c @@ -0,0 +1,86 @@ +/* + * Spin and read/write lock operations. + * + * Copyright (C) 2001-2004 Paul Mackerras <paulus@au.ibm.com>, IBM + * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM + * Copyright (C) 2002 Dave Engebretsen <engebret@us.ibm.com>, IBM + * Rework to support virtual processors + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/spinlock.h> +#include <linux/export.h> +#include <linux/stringify.h> +#include <linux/smp.h> + +/* waiting for a spinlock... */ +#if defined(CONFIG_PPC_SPLPAR) +#include <asm/hvcall.h> +#include <asm/smp.h> + +void __spin_yield(arch_spinlock_t *lock) +{ + unsigned int lock_value, holder_cpu, yield_count; + + lock_value = lock->slock; + if (lock_value == 0) + return; + holder_cpu = lock_value & 0xffff; + BUG_ON(holder_cpu >= NR_CPUS); + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (lock->slock != lock_value) + return; /* something has changed */ + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); +} +EXPORT_SYMBOL_GPL(__spin_yield); + +/* + * Waiting for a read lock or a write lock on a rwlock... + * This turns out to be the same for read and write locks, since + * we only know the holder if it is write-locked. + */ +void __rw_yield(arch_rwlock_t *rw) +{ + int lock_value; + unsigned int holder_cpu, yield_count; + + lock_value = rw->lock; + if (lock_value >= 0) + return; /* no write lock at present */ + holder_cpu = lock_value & 0xffff; + BUG_ON(holder_cpu >= NR_CPUS); + yield_count = be32_to_cpu(lppaca_of(holder_cpu).yield_count); + if ((yield_count & 1) == 0) + return; /* virtual cpu is currently running */ + rmb(); + if (rw->lock != lock_value) + return; /* something has changed */ + plpar_hcall_norets(H_CONFER, + get_hard_smp_processor_id(holder_cpu), yield_count); +} +#endif + +void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + smp_mb(); + + while (lock->slock) { + HMT_low(); + if (SHARED_PROCESSOR) + __spin_yield(lock); + } + HMT_medium(); + + smp_mb(); +} + +EXPORT_SYMBOL(arch_spin_unlock_wait); diff --git a/arch/powerpc/lib/mem_64.S b/arch/powerpc/lib/mem_64.S new file mode 100644 index 000000000..43435c689 --- /dev/null +++ b/arch/powerpc/lib/mem_64.S @@ -0,0 +1,119 @@ +/* + * String handling functions for PowerPC. + * + * Copyright (C) 1996 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + +_GLOBAL(memset) + neg r0,r3 + rlwimi r4,r4,8,16,23 + andi. r0,r0,7 /* # bytes to be 8-byte aligned */ + rlwimi r4,r4,16,0,15 + cmplw cr1,r5,r0 /* do we get that far? */ + rldimi r4,r4,32,0 + PPC_MTOCRF(1,r0) + mr r6,r3 + blt cr1,8f + beq+ 3f /* if already 8-byte aligned */ + subf r5,r0,r5 + bf 31,1f + stb r4,0(r6) + addi r6,r6,1 +1: bf 30,2f + sth r4,0(r6) + addi r6,r6,2 +2: bf 29,3f + stw r4,0(r6) + addi r6,r6,4 +3: srdi. r0,r5,6 + clrldi r5,r5,58 + mtctr r0 + beq 5f +4: std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + std r4,32(r6) + std r4,40(r6) + std r4,48(r6) + std r4,56(r6) + addi r6,r6,64 + bdnz 4b +5: srwi. r0,r5,3 + clrlwi r5,r5,29 + PPC_MTOCRF(1,r0) + beq 8f + bf 29,6f + std r4,0(r6) + std r4,8(r6) + std r4,16(r6) + std r4,24(r6) + addi r6,r6,32 +6: bf 30,7f + std r4,0(r6) + std r4,8(r6) + addi r6,r6,16 +7: bf 31,8f + std r4,0(r6) + addi r6,r6,8 +8: cmpwi r5,0 + PPC_MTOCRF(1,r5) + beqlr+ + bf 29,9f + stw r4,0(r6) + addi r6,r6,4 +9: bf 30,10f + sth r4,0(r6) + addi r6,r6,2 +10: bflr 31 + stb r4,0(r6) + blr + +_GLOBAL_TOC(memmove) + cmplw 0,r3,r4 + bgt backwards_memcpy + b memcpy + +_GLOBAL(backwards_memcpy) + rlwinm. r7,r5,32-3,3,31 /* r0 = r5 >> 3 */ + add r6,r3,r5 + add r4,r4,r5 + beq 2f + andi. r0,r6,3 + mtctr r7 + bne 5f +1: lwz r7,-4(r4) + lwzu r8,-8(r4) + stw r7,-4(r6) + stwu r8,-8(r6) + bdnz 1b + andi. r5,r5,7 +2: cmplwi 0,r5,4 + blt 3f + lwzu r0,-4(r4) + subi r5,r5,4 + stwu r0,-4(r6) +3: cmpwi 0,r5,0 + beqlr + mtctr r5 +4: lbzu r0,-1(r4) + stbu r0,-1(r6) + bdnz 4b + blr +5: mtctr r0 +6: lbzu r7,-1(r4) + stbu r7,-1(r6) + bdnz 6b + subf r5,r0,r5 + rlwinm. r7,r5,32-3,3,31 + beq 2b + mtctr r7 + b 1b diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S new file mode 100644 index 000000000..8953d2382 --- /dev/null +++ b/arch/powerpc/lib/memcmp_64.S @@ -0,0 +1,233 @@ +/* + * Author: Anton Blanchard <anton@au.ibm.com> + * Copyright 2015 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/ppc_asm.h> + +#define off8 r6 +#define off16 r7 +#define off24 r8 + +#define rA r9 +#define rB r10 +#define rC r11 +#define rD r27 +#define rE r28 +#define rF r29 +#define rG r30 +#define rH r31 + +#ifdef __LITTLE_ENDIAN__ +#define LD ldbrx +#else +#define LD ldx +#endif + +_GLOBAL(memcmp) + cmpdi cr1,r5,0 + + /* Use the short loop if both strings are not 8B aligned */ + or r6,r3,r4 + andi. r6,r6,7 + + /* Use the short loop if length is less than 32B */ + cmpdi cr6,r5,31 + + beq cr1,.Lzero + bne .Lshort + bgt cr6,.Llong + +.Lshort: + mtctr r5 + +1: lbz rA,0(r3) + lbz rB,0(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,1(r3) + lbz rB,1(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,2(r3) + lbz rB,2(r4) + subf. rC,rB,rA + bne .Lnon_zero + bdz .Lzero + + lbz rA,3(r3) + lbz rB,3(r4) + subf. rC,rB,rA + bne .Lnon_zero + + addi r3,r3,4 + addi r4,r4,4 + + bdnz 1b + +.Lzero: + li r3,0 + blr + +.Lnon_zero: + mr r3,rC + blr + +.Llong: + li off8,8 + li off16,16 + li off24,24 + + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + + srdi r0,r5,5 + mtctr r0 + andi. r5,r5,31 + + LD rA,0,r3 + LD rB,0,r4 + + LD rC,off8,r3 + LD rD,off8,r4 + + LD rE,off16,r3 + LD rF,off16,r4 + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lfirst32 + + LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdz .Lsecond32 + + .balign 16 + +1: LD rA,0,r3 + LD rB,0,r4 + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + LD rC,off8,r3 + LD rD,off8,r4 + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + LD rE,off16,r3 + LD rF,off16,r4 + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + LD rG,off24,r3 + LD rH,off24,r4 + cmpld cr0,rA,rB + bne cr1,.LcmpCD + + addi r3,r3,32 + addi r4,r4,32 + + bdnz 1b + +.Lsecond32: + cmpld cr1,rC,rD + bne cr6,.LcmpEF + + cmpld cr6,rE,rF + bne cr7,.LcmpGH + + cmpld cr7,rG,rH + bne cr0,.LcmpAB + + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + +.Ltail: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + + cmpdi r5,0 + beq .Lzero + b .Lshort + +.Lfirst32: + cmpld cr1,rC,rD + cmpld cr6,rE,rF + cmpld cr7,rG,rH + + bne cr0,.LcmpAB + bne cr1,.LcmpCD + bne cr6,.LcmpEF + bne cr7,.LcmpGH + + b .Ltail + +.LcmpAB: + li r3,1 + bgt cr0,.Lout + li r3,-1 + b .Lout + +.LcmpCD: + li r3,1 + bgt cr1,.Lout + li r3,-1 + b .Lout + +.LcmpEF: + li r3,1 + bgt cr6,.Lout + li r3,-1 + b .Lout + +.LcmpGH: + li r3,1 + bgt cr7,.Lout + li r3,-1 + +.Lout: + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + blr diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S new file mode 100644 index 000000000..32a06ec39 --- /dev/null +++ b/arch/powerpc/lib/memcpy_64.S @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2002 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/ppc_asm.h> + + .align 7 +_GLOBAL_TOC(memcpy) +BEGIN_FTR_SECTION +#ifdef __LITTLE_ENDIAN__ + cmpdi cr7,r5,0 +#else + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* save destination pointer for return value */ +#endif +FTR_SECTION_ELSE +#ifndef SELFTEST + b memcpy_power7 +#endif +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ + /* dumb little-endian memcpy that will get replaced at runtime */ + addi r9,r3,-1 + addi r4,r4,-1 + beqlr cr7 + mtctr r5 +1: lbzu r10,1(r4) + stbu r10,1(r9) + bdnz 1b + blr +#else + PPC_MTOCRF(0x01,r5) + cmpldi cr1,r5,16 + neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry + andi. r6,r6,7 + dcbt 0,r4 + blt cr1,.Lshort_copy +/* Below we want to nop out the bne if we're on a CPU that has the + CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit + cleared. + At the time of writing the only CPU that has this combination of bits + set is Power6. */ +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + bne .Ldst_unaligned +ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \ + CPU_FTR_UNALIGNED_LD_STD) +.Ldst_aligned: + addi r3,r3,-16 +BEGIN_FTR_SECTION + andi. r0,r4,7 + bne .Lsrc_unaligned +END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) + srdi r7,r5,4 + ld r9,0(r4) + addi r4,r4,-8 + mtctr r7 + andi. r5,r5,7 + bf cr7*4+0,2f + addi r3,r3,8 + addi r4,r4,8 + mr r8,r9 + blt cr1,3f +1: ld r9,8(r4) + std r8,8(r3) +2: ldu r8,16(r4) + stdu r9,16(r3) + bdnz 1b +3: std r8,8(r3) + beq 3f + addi r3,r3,16 +.Ldo_tail: + bf cr7*4+1,1f + lwz r9,8(r4) + addi r4,r4,4 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + lhz r9,8(r4) + addi r4,r4,2 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + lbz r9,8(r4) + stb r9,0(r3) +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr + +.Lsrc_unaligned: + srdi r6,r5,3 + addi r5,r5,-16 + subf r4,r0,r4 + srdi r7,r5,4 + sldi r10,r0,3 + cmpdi cr6,r6,3 + andi. r5,r5,7 + mtctr r7 + subfic r11,r10,64 + add r5,r5,r0 + + bt cr7*4+0,0f + + ld r9,0(r4) # 3+2n loads, 2+2n stores + ld r0,8(r4) + sld r6,r9,r10 + ldu r9,16(r4) + srd r7,r0,r11 + sld r8,r0,r10 + or r7,r7,r6 + blt cr6,4f + ld r0,8(r4) + # s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12 + b 2f + +0: ld r0,0(r4) # 4+2n loads, 3+2n stores + ldu r9,8(r4) + sld r8,r0,r10 + addi r3,r3,-8 + blt cr6,5f + ld r0,8(r4) + srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + srd r7,r0,r11 + sld r8,r0,r10 + addi r3,r3,16 + beq cr6,3f + + # d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9 +1: or r7,r7,r6 + ld r0,8(r4) + std r12,8(r3) +2: srd r12,r9,r11 + sld r6,r9,r10 + ldu r9,16(r4) + or r12,r8,r12 + stdu r7,16(r3) + srd r7,r0,r11 + sld r8,r0,r10 + bdnz 1b + +3: std r12,8(r3) + or r7,r7,r6 +4: std r7,16(r3) +5: srd r12,r9,r11 + or r12,r8,r12 + std r12,24(r3) + beq 4f + cmpwi cr1,r5,8 + addi r3,r3,32 + sld r9,r9,r10 + ble cr1,6f + ld r0,8(r4) + srd r7,r0,r11 + or r9,r7,r9 +6: + bf cr7*4+1,1f + rotldi r9,r9,32 + stw r9,0(r3) + addi r3,r3,4 +1: bf cr7*4+2,2f + rotldi r9,r9,16 + sth r9,0(r3) + addi r3,r3,2 +2: bf cr7*4+3,3f + rotldi r9,r9,8 + stb r9,0(r3) +3: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr + +.Ldst_unaligned: + PPC_MTOCRF(0x01,r6) # put #bytes to 8B bdry into cr7 + subf r5,r6,r5 + li r7,0 + cmpldi cr1,r5,16 + bf cr7*4+3,1f + lbz r0,0(r4) + stb r0,0(r3) + addi r7,r7,1 +1: bf cr7*4+2,2f + lhzx r0,r7,r4 + sthx r0,r7,r3 + addi r7,r7,2 +2: bf cr7*4+1,3f + lwzx r0,r7,r4 + stwx r0,r7,r3 +3: PPC_MTOCRF(0x01,r5) + add r4,r6,r4 + add r3,r6,r3 + b .Ldst_aligned + +.Lshort_copy: + bf cr7*4+0,1f + lwz r0,0(r4) + lwz r9,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r9,4(r3) + addi r3,r3,8 +1: bf cr7*4+1,2f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 +2: bf cr7*4+2,3f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 +3: bf cr7*4+3,4f + lbz r0,0(r4) + stb r0,0(r3) +4: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) /* return dest pointer */ + blr +#endif diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S new file mode 100644 index 000000000..786234fd4 --- /dev/null +++ b/arch/powerpc/lib/memcpy_power7.S @@ -0,0 +1,656 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/ppc_asm.h> + +_GLOBAL(memcpy_power7) + +#ifdef __BIG_ENDIAN__ +#define LVS(VRT,RA,RB) lvsl VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRA,VRB,VRC +#else +#define LVS(VRT,RA,RB) lvsr VRT,RA,RB +#define VPERM(VRT,VRA,VRB,VRC) vperm VRT,VRB,VRA,VRC +#endif + +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + std r17,STK_REG(R17)(r1) + std r18,STK_REG(R18)(r1) + std r19,STK_REG(R19)(r1) + std r20,STK_REG(R20)(r1) + std r21,STK_REG(R21)(r1) + std r22,STK_REG(R22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + ld r17,STK_REG(R17)(r1) + ld r18,STK_REG(R18)(r1) + ld r19,STK_REG(R19)(r1) + ld r20,STK_REG(R20)(r1) + ld r21,STK_REG(R21)(r1) + ld r22,STK_REG(R22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r5,-STACKFRAMESIZE+STK_REG(R29)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + ld r5,STK_REG(R29)(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx v1,r0,r4 + addi r4,r4,16 + stvx v1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx v1,r0,r4 + lvx v0,r4,r9 + addi r4,r4,32 + stvx v1,r0,r3 + stvx v0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx v3,r0,r4 + lvx v2,r4,r9 + lvx v1,r4,r10 + lvx v0,r4,r11 + addi r4,r4,64 + stvx v3,r0,r3 + stvx v2,r3,r9 + stvx v1,r3,r10 + stvx v0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx v7,r0,r4 + lvx v6,r4,r9 + lvx v5,r4,r10 + lvx v4,r4,r11 + lvx v3,r4,r12 + lvx v2,r4,r14 + lvx v1,r4,r15 + lvx v0,r4,r16 + addi r4,r4,128 + stvx v7,r0,r3 + stvx v6,r3,r9 + stvx v5,r3,r10 + stvx v4,r3,r11 + stvx v3,r3,r12 + stvx v2,r3,r14 + stvx v1,r3,r15 + stvx v0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx v3,r0,r4 + lvx v2,r4,r9 + lvx v1,r4,r10 + lvx v0,r4,r11 + addi r4,r4,64 + stvx v3,r0,r3 + stvx v2,r3,r9 + stvx v1,r3,r10 + stvx v0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx v1,r0,r4 + lvx v0,r4,r9 + addi r4,r4,32 + stvx v1,r0,r3 + stvx v0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx v1,r0,r4 + addi r4,r4,16 + stvx v1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + LVS(v16,0,r4) /* Setup permute control vector */ + lvx v0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 + stvx v8,r0,r3 + addi r3,r3,16 + vor v0,v1,v1 + +5: bf cr7*4+2,6f + lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 + stvx v8,r0,r3 + stvx v9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx v3,r0,r4 + VPERM(v8,v0,v3,v16) + lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) + lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) + lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 + stvx v8,r0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(R14)(r1) + std r15,STK_REG(R15)(r1) + std r16,STK_REG(R16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx v7,r0,r4 + VPERM(v8,v0,v7,v16) + lvx v6,r4,r9 + VPERM(v9,v7,v6,v16) + lvx v5,r4,r10 + VPERM(v10,v6,v5,v16) + lvx v4,r4,r11 + VPERM(v11,v5,v4,v16) + lvx v3,r4,r12 + VPERM(v12,v4,v3,v16) + lvx v2,r4,r14 + VPERM(v13,v3,v2,v16) + lvx v1,r4,r15 + VPERM(v14,v2,v1,v16) + lvx v0,r4,r16 + VPERM(v15,v1,v0,v16) + addi r4,r4,128 + stvx v8,r0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + stvx v12,r3,r12 + stvx v13,r3,r14 + stvx v14,r3,r15 + stvx v15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(R14)(r1) + ld r15,STK_REG(R15)(r1) + ld r16,STK_REG(R16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx v3,r0,r4 + VPERM(v8,v0,v3,v16) + lvx v2,r4,r9 + VPERM(v9,v3,v2,v16) + lvx v1,r4,r10 + VPERM(v10,v2,v1,v16) + lvx v0,r4,r11 + VPERM(v11,v1,v0,v16) + addi r4,r4,64 + stvx v8,r0,r3 + stvx v9,r3,r9 + stvx v10,r3,r10 + stvx v11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + lvx v0,r4,r9 + VPERM(v9,v1,v0,v16) + addi r4,r4,32 + stvx v8,r0,r3 + stvx v9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx v1,r0,r4 + VPERM(v8,v0,v1,v16) + addi r4,r4,16 + stvx v8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + b exit_vmx_copy /* tail call optimise */ +#endif /* CONFIG_ALTIVEC */ diff --git a/arch/powerpc/lib/ppc_ksyms.c b/arch/powerpc/lib/ppc_ksyms.c new file mode 100644 index 000000000..c7f8e9586 --- /dev/null +++ b/arch/powerpc/lib/ppc_ksyms.c @@ -0,0 +1,35 @@ +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/bitops.h> +#include <net/checksum.h> + +EXPORT_SYMBOL(memcpy); +EXPORT_SYMBOL(memset); +EXPORT_SYMBOL(memmove); +EXPORT_SYMBOL(memcmp); +EXPORT_SYMBOL(memchr); + +EXPORT_SYMBOL(strcpy); +EXPORT_SYMBOL(strncpy); +EXPORT_SYMBOL(strcat); +EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strcmp); +EXPORT_SYMBOL(strncmp); + +#ifndef CONFIG_GENERIC_CSUM +EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(csum_partial_copy_generic); +EXPORT_SYMBOL(ip_fast_csum); +EXPORT_SYMBOL(csum_tcpudp_magic); +#endif + +EXPORT_SYMBOL(__copy_tofrom_user); +EXPORT_SYMBOL(__clear_user); +EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_PPC64 +EXPORT_SYMBOL(__arch_hweight8); +EXPORT_SYMBOL(__arch_hweight16); +EXPORT_SYMBOL(__arch_hweight32); +EXPORT_SYMBOL(__arch_hweight64); +#endif diff --git a/arch/powerpc/lib/rheap.c b/arch/powerpc/lib/rheap.c new file mode 100644 index 000000000..69abf844c --- /dev/null +++ b/arch/powerpc/lib/rheap.c @@ -0,0 +1,747 @@ +/* + * A Remote Heap. Remote means that we don't touch the memory that the + * heap points to. Normal heap implementations use the memory they manage + * to place their list. We cannot do that because the memory we manage may + * have special properties, for example it is uncachable or of different + * endianess. + * + * Author: Pantelis Antoniou <panto@intracom.gr> + * + * 2004 (c) INTRACOM S.A. Greece. This file is licensed under + * the terms of the GNU General Public License version 2. This program + * is licensed "as is" without any warranty of any kind, whether express + * or implied. + */ +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <asm/rheap.h> + +/* + * Fixup a list_head, needed when copying lists. If the pointers fall + * between s and e, apply the delta. This assumes that + * sizeof(struct list_head *) == sizeof(unsigned long *). + */ +static inline void fixup(unsigned long s, unsigned long e, int d, + struct list_head *l) +{ + unsigned long *pp; + + pp = (unsigned long *)&l->next; + if (*pp >= s && *pp < e) + *pp += d; + + pp = (unsigned long *)&l->prev; + if (*pp >= s && *pp < e) + *pp += d; +} + +/* Grow the allocated blocks */ +static int grow(rh_info_t * info, int max_blocks) +{ + rh_block_t *block, *blk; + int i, new_blocks; + int delta; + unsigned long blks, blke; + + if (max_blocks <= info->max_blocks) + return -EINVAL; + + new_blocks = max_blocks - info->max_blocks; + + block = kmalloc(sizeof(rh_block_t) * max_blocks, GFP_ATOMIC); + if (block == NULL) + return -ENOMEM; + + if (info->max_blocks > 0) { + + /* copy old block area */ + memcpy(block, info->block, + sizeof(rh_block_t) * info->max_blocks); + + delta = (char *)block - (char *)info->block; + + /* and fixup list pointers */ + blks = (unsigned long)info->block; + blke = (unsigned long)(info->block + info->max_blocks); + + for (i = 0, blk = block; i < info->max_blocks; i++, blk++) + fixup(blks, blke, delta, &blk->list); + + fixup(blks, blke, delta, &info->empty_list); + fixup(blks, blke, delta, &info->free_list); + fixup(blks, blke, delta, &info->taken_list); + + /* free the old allocated memory */ + if ((info->flags & RHIF_STATIC_BLOCK) == 0) + kfree(info->block); + } + + info->block = block; + info->empty_slots += new_blocks; + info->max_blocks = max_blocks; + info->flags &= ~RHIF_STATIC_BLOCK; + + /* add all new blocks to the free list */ + blk = block + info->max_blocks - new_blocks; + for (i = 0; i < new_blocks; i++, blk++) + list_add(&blk->list, &info->empty_list); + + return 0; +} + +/* + * Assure at least the required amount of empty slots. If this function + * causes a grow in the block area then all pointers kept to the block + * area are invalid! + */ +static int assure_empty(rh_info_t * info, int slots) +{ + int max_blocks; + + /* This function is not meant to be used to grow uncontrollably */ + if (slots >= 4) + return -EINVAL; + + /* Enough space */ + if (info->empty_slots >= slots) + return 0; + + /* Next 16 sized block */ + max_blocks = ((info->max_blocks + slots) + 15) & ~15; + + return grow(info, max_blocks); +} + +static rh_block_t *get_slot(rh_info_t * info) +{ + rh_block_t *blk; + + /* If no more free slots, and failure to extend. */ + /* XXX: You should have called assure_empty before */ + if (info->empty_slots == 0) { + printk(KERN_ERR "rh: out of slots; crash is imminent.\n"); + return NULL; + } + + /* Get empty slot to use */ + blk = list_entry(info->empty_list.next, rh_block_t, list); + list_del_init(&blk->list); + info->empty_slots--; + + /* Initialize */ + blk->start = 0; + blk->size = 0; + blk->owner = NULL; + + return blk; +} + +static inline void release_slot(rh_info_t * info, rh_block_t * blk) +{ + list_add(&blk->list, &info->empty_list); + info->empty_slots++; +} + +static void attach_free_block(rh_info_t * info, rh_block_t * blkn) +{ + rh_block_t *blk; + rh_block_t *before; + rh_block_t *after; + rh_block_t *next; + int size; + unsigned long s, e, bs, be; + struct list_head *l; + + /* We assume that they are aligned properly */ + size = blkn->size; + s = blkn->start; + e = s + size; + + /* Find the blocks immediately before and after the given one + * (if any) */ + before = NULL; + after = NULL; + next = NULL; + + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + + bs = blk->start; + be = bs + blk->size; + + if (next == NULL && s >= bs) + next = blk; + + if (be == s) + before = blk; + + if (e == bs) + after = blk; + + /* If both are not null, break now */ + if (before != NULL && after != NULL) + break; + } + + /* Now check if they are really adjacent */ + if (before && s != (before->start + before->size)) + before = NULL; + + if (after && e != after->start) + after = NULL; + + /* No coalescing; list insert and return */ + if (before == NULL && after == NULL) { + + if (next != NULL) + list_add(&blkn->list, &next->list); + else + list_add(&blkn->list, &info->free_list); + + return; + } + + /* We don't need it anymore */ + release_slot(info, blkn); + + /* Grow the before block */ + if (before != NULL && after == NULL) { + before->size += size; + return; + } + + /* Grow the after block backwards */ + if (before == NULL && after != NULL) { + after->start -= size; + after->size += size; + return; + } + + /* Grow the before block, and release the after block */ + before->size += size + after->size; + list_del(&after->list); + release_slot(info, after); +} + +static void attach_taken_block(rh_info_t * info, rh_block_t * blkn) +{ + rh_block_t *blk; + struct list_head *l; + + /* Find the block immediately before the given one (if any) */ + list_for_each(l, &info->taken_list) { + blk = list_entry(l, rh_block_t, list); + if (blk->start > blkn->start) { + list_add_tail(&blkn->list, &blk->list); + return; + } + } + + list_add_tail(&blkn->list, &info->taken_list); +} + +/* + * Create a remote heap dynamically. Note that no memory for the blocks + * are allocated. It will upon the first allocation + */ +rh_info_t *rh_create(unsigned int alignment) +{ + rh_info_t *info; + + /* Alignment must be a power of two */ + if ((alignment & (alignment - 1)) != 0) + return ERR_PTR(-EINVAL); + + info = kmalloc(sizeof(*info), GFP_ATOMIC); + if (info == NULL) + return ERR_PTR(-ENOMEM); + + info->alignment = alignment; + + /* Initially everything as empty */ + info->block = NULL; + info->max_blocks = 0; + info->empty_slots = 0; + info->flags = 0; + + INIT_LIST_HEAD(&info->empty_list); + INIT_LIST_HEAD(&info->free_list); + INIT_LIST_HEAD(&info->taken_list); + + return info; +} +EXPORT_SYMBOL_GPL(rh_create); + +/* + * Destroy a dynamically created remote heap. Deallocate only if the areas + * are not static + */ +void rh_destroy(rh_info_t * info) +{ + if ((info->flags & RHIF_STATIC_BLOCK) == 0) + kfree(info->block); + + if ((info->flags & RHIF_STATIC_INFO) == 0) + kfree(info); +} +EXPORT_SYMBOL_GPL(rh_destroy); + +/* + * Initialize in place a remote heap info block. This is needed to support + * operation very early in the startup of the kernel, when it is not yet safe + * to call kmalloc. + */ +void rh_init(rh_info_t * info, unsigned int alignment, int max_blocks, + rh_block_t * block) +{ + int i; + rh_block_t *blk; + + /* Alignment must be a power of two */ + if ((alignment & (alignment - 1)) != 0) + return; + + info->alignment = alignment; + + /* Initially everything as empty */ + info->block = block; + info->max_blocks = max_blocks; + info->empty_slots = max_blocks; + info->flags = RHIF_STATIC_INFO | RHIF_STATIC_BLOCK; + + INIT_LIST_HEAD(&info->empty_list); + INIT_LIST_HEAD(&info->free_list); + INIT_LIST_HEAD(&info->taken_list); + + /* Add all new blocks to the free list */ + for (i = 0, blk = block; i < max_blocks; i++, blk++) + list_add(&blk->list, &info->empty_list); +} +EXPORT_SYMBOL_GPL(rh_init); + +/* Attach a free memory region, coalesces regions if adjuscent */ +int rh_attach_region(rh_info_t * info, unsigned long start, int size) +{ + rh_block_t *blk; + unsigned long s, e, m; + int r; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (IS_ERR_VALUE(e) || (e < s)) + return -ERANGE; + + /* Take final values */ + start = s; + size = e - s; + + /* Grow the blocks, if needed */ + r = assure_empty(info, 1); + if (r < 0) + return r; + + blk = get_slot(info); + blk->start = start; + blk->size = size; + blk->owner = NULL; + + attach_free_block(info, blk); + + return 0; +} +EXPORT_SYMBOL_GPL(rh_attach_region); + +/* Detatch given address range, splits free block if needed. */ +unsigned long rh_detach_region(rh_info_t * info, unsigned long start, int size) +{ + struct list_head *l; + rh_block_t *blk, *newblk; + unsigned long s, e, m, bs, be; + + /* Validate size */ + if (size <= 0) + return (unsigned long) -EINVAL; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (assure_empty(info, 1) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + /* The range must lie entirely inside one free block */ + bs = blk->start; + be = blk->start + blk->size; + if (s >= bs && e <= be) + break; + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Perfect fit */ + if (bs == s && be == e) { + /* Delete from free list, release slot */ + list_del(&blk->list); + release_slot(info, blk); + return s; + } + + /* blk still in free list, with updated start and/or size */ + if (bs == s || be == e) { + if (bs == s) + blk->start += size; + blk->size -= size; + + } else { + /* The front free fragment */ + blk->size = s - bs; + + /* the back free fragment */ + newblk = get_slot(info); + newblk->start = e; + newblk->size = be - e; + + list_add(&newblk->list, &blk->list); + } + + return s; +} +EXPORT_SYMBOL_GPL(rh_detach_region); + +/* Allocate a block of memory at the specified alignment. The value returned + * is an offset into the buffer initialized by rh_init(), or a negative number + * if there is an error. + */ +unsigned long rh_alloc_align(rh_info_t * info, int size, int alignment, const char *owner) +{ + struct list_head *l; + rh_block_t *blk; + rh_block_t *newblk; + unsigned long start, sp_size; + + /* Validate size, and alignment must be power of two */ + if (size <= 0 || (alignment & (alignment - 1)) != 0) + return (unsigned long) -EINVAL; + + /* Align to configured alignment */ + size = (size + (info->alignment - 1)) & ~(info->alignment - 1); + + if (assure_empty(info, 2) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + if (size <= blk->size) { + start = (blk->start + alignment - 1) & ~(alignment - 1); + if (start + size <= blk->start + blk->size) + break; + } + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Just fits */ + if (blk->size == size) { + /* Move from free list to taken list */ + list_del(&blk->list); + newblk = blk; + } else { + /* Fragment caused, split if needed */ + /* Create block for fragment in the beginning */ + sp_size = start - blk->start; + if (sp_size) { + rh_block_t *spblk; + + spblk = get_slot(info); + spblk->start = blk->start; + spblk->size = sp_size; + /* add before the blk */ + list_add(&spblk->list, blk->list.prev); + } + newblk = get_slot(info); + newblk->start = start; + newblk->size = size; + + /* blk still in free list, with updated start and size + * for fragment in the end */ + blk->start = start + size; + blk->size -= sp_size + size; + /* No fragment in the end, remove blk */ + if (blk->size == 0) { + list_del(&blk->list); + release_slot(info, blk); + } + } + + newblk->owner = owner; + attach_taken_block(info, newblk); + + return start; +} +EXPORT_SYMBOL_GPL(rh_alloc_align); + +/* Allocate a block of memory at the default alignment. The value returned is + * an offset into the buffer initialized by rh_init(), or a negative number if + * there is an error. + */ +unsigned long rh_alloc(rh_info_t * info, int size, const char *owner) +{ + return rh_alloc_align(info, size, info->alignment, owner); +} +EXPORT_SYMBOL_GPL(rh_alloc); + +/* Allocate a block of memory at the given offset, rounded up to the default + * alignment. The value returned is an offset into the buffer initialized by + * rh_init(), or a negative number if there is an error. + */ +unsigned long rh_alloc_fixed(rh_info_t * info, unsigned long start, int size, const char *owner) +{ + struct list_head *l; + rh_block_t *blk, *newblk1, *newblk2; + unsigned long s, e, m, bs = 0, be = 0; + + /* Validate size */ + if (size <= 0) + return (unsigned long) -EINVAL; + + /* The region must be aligned */ + s = start; + e = s + size; + m = info->alignment - 1; + + /* Round start up */ + s = (s + m) & ~m; + + /* Round end down */ + e = e & ~m; + + if (assure_empty(info, 2) < 0) + return (unsigned long) -ENOMEM; + + blk = NULL; + list_for_each(l, &info->free_list) { + blk = list_entry(l, rh_block_t, list); + /* The range must lie entirely inside one free block */ + bs = blk->start; + be = blk->start + blk->size; + if (s >= bs && e <= be) + break; + blk = NULL; + } + + if (blk == NULL) + return (unsigned long) -ENOMEM; + + /* Perfect fit */ + if (bs == s && be == e) { + /* Move from free list to taken list */ + list_del(&blk->list); + blk->owner = owner; + + start = blk->start; + attach_taken_block(info, blk); + + return start; + + } + + /* blk still in free list, with updated start and/or size */ + if (bs == s || be == e) { + if (bs == s) + blk->start += size; + blk->size -= size; + + } else { + /* The front free fragment */ + blk->size = s - bs; + + /* The back free fragment */ + newblk2 = get_slot(info); + newblk2->start = e; + newblk2->size = be - e; + + list_add(&newblk2->list, &blk->list); + } + + newblk1 = get_slot(info); + newblk1->start = s; + newblk1->size = e - s; + newblk1->owner = owner; + + start = newblk1->start; + attach_taken_block(info, newblk1); + + return start; +} +EXPORT_SYMBOL_GPL(rh_alloc_fixed); + +/* Deallocate the memory previously allocated by one of the rh_alloc functions. + * The return value is the size of the deallocated block, or a negative number + * if there is an error. + */ +int rh_free(rh_info_t * info, unsigned long start) +{ + rh_block_t *blk, *blk2; + struct list_head *l; + int size; + + /* Linear search for block */ + blk = NULL; + list_for_each(l, &info->taken_list) { + blk2 = list_entry(l, rh_block_t, list); + if (start < blk2->start) + break; + blk = blk2; + } + + if (blk == NULL || start > (blk->start + blk->size)) + return -EINVAL; + + /* Remove from taken list */ + list_del(&blk->list); + + /* Get size of freed block */ + size = blk->size; + attach_free_block(info, blk); + + return size; +} +EXPORT_SYMBOL_GPL(rh_free); + +int rh_get_stats(rh_info_t * info, int what, int max_stats, rh_stats_t * stats) +{ + rh_block_t *blk; + struct list_head *l; + struct list_head *h; + int nr; + + switch (what) { + + case RHGS_FREE: + h = &info->free_list; + break; + + case RHGS_TAKEN: + h = &info->taken_list; + break; + + default: + return -EINVAL; + } + + /* Linear search for block */ + nr = 0; + list_for_each(l, h) { + blk = list_entry(l, rh_block_t, list); + if (stats != NULL && nr < max_stats) { + stats->start = blk->start; + stats->size = blk->size; + stats->owner = blk->owner; + stats++; + } + nr++; + } + + return nr; +} +EXPORT_SYMBOL_GPL(rh_get_stats); + +int rh_set_owner(rh_info_t * info, unsigned long start, const char *owner) +{ + rh_block_t *blk, *blk2; + struct list_head *l; + int size; + + /* Linear search for block */ + blk = NULL; + list_for_each(l, &info->taken_list) { + blk2 = list_entry(l, rh_block_t, list); + if (start < blk2->start) + break; + blk = blk2; + } + + if (blk == NULL || start > (blk->start + blk->size)) + return -EINVAL; + + blk->owner = owner; + size = blk->size; + + return size; +} +EXPORT_SYMBOL_GPL(rh_set_owner); + +void rh_dump(rh_info_t * info) +{ + static rh_stats_t st[32]; /* XXX maximum 32 blocks */ + int maxnr; + int i, nr; + + maxnr = ARRAY_SIZE(st); + + printk(KERN_INFO + "info @0x%p (%d slots empty / %d max)\n", + info, info->empty_slots, info->max_blocks); + + printk(KERN_INFO " Free:\n"); + nr = rh_get_stats(info, RHGS_FREE, maxnr, st); + if (nr > maxnr) + nr = maxnr; + for (i = 0; i < nr; i++) + printk(KERN_INFO + " 0x%lx-0x%lx (%u)\n", + st[i].start, st[i].start + st[i].size, + st[i].size); + printk(KERN_INFO "\n"); + + printk(KERN_INFO " Taken:\n"); + nr = rh_get_stats(info, RHGS_TAKEN, maxnr, st); + if (nr > maxnr) + nr = maxnr; + for (i = 0; i < nr; i++) + printk(KERN_INFO + " 0x%lx-0x%lx (%u) %s\n", + st[i].start, st[i].start + st[i].size, + st[i].size, st[i].owner != NULL ? st[i].owner : ""); + printk(KERN_INFO "\n"); +} +EXPORT_SYMBOL_GPL(rh_dump); + +void rh_dump_blk(rh_info_t * info, rh_block_t * blk) +{ + printk(KERN_INFO + "blk @0x%p: 0x%lx-0x%lx (%u)\n", + blk, blk->start, blk->start + blk->size, blk->size); +} +EXPORT_SYMBOL_GPL(rh_dump_blk); + diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c new file mode 100644 index 000000000..dc885b30f --- /dev/null +++ b/arch/powerpc/lib/sstep.c @@ -0,0 +1,2015 @@ +/* + * Single-step support. + * + * Copyright (C) 2004 Paul Mackerras <paulus@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/prefetch.h> +#include <asm/sstep.h> +#include <asm/processor.h> +#include <asm/uaccess.h> +#include <asm/cputable.h> + +extern char system_call_common[]; + +#ifdef CONFIG_PPC64 +/* Bits in SRR1 that are copied from MSR */ +#define MSR_MASK 0xffffffff87c0ffffUL +#else +#define MSR_MASK 0x87c0ffff +#endif + +/* Bits in XER */ +#define XER_SO 0x80000000U +#define XER_OV 0x40000000U +#define XER_CA 0x20000000U + +#ifdef CONFIG_PPC_FPU +/* + * Functions in ldstfp.S + */ +extern int do_lfs(int rn, unsigned long ea); +extern int do_lfd(int rn, unsigned long ea); +extern int do_stfs(int rn, unsigned long ea); +extern int do_stfd(int rn, unsigned long ea); +extern int do_lvx(int rn, unsigned long ea); +extern int do_stvx(int rn, unsigned long ea); +extern int do_lxvd2x(int rn, unsigned long ea); +extern int do_stxvd2x(int rn, unsigned long ea); +#endif + +/* + * Emulate the truncation of 64 bit values in 32-bit mode. + */ +static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val) +{ +#ifdef __powerpc64__ + if ((msr & MSR_64BIT) == 0) + val &= 0xffffffffUL; +#endif + return val; +} + +/* + * Determine whether a conditional branch instruction would branch. + */ +static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs) +{ + unsigned int bo = (instr >> 21) & 0x1f; + unsigned int bi; + + if ((bo & 4) == 0) { + /* decrement counter */ + --regs->ctr; + if (((bo >> 1) & 1) ^ (regs->ctr == 0)) + return 0; + } + if ((bo & 0x10) == 0) { + /* check bit from CR */ + bi = (instr >> 16) & 0x1f; + if (((regs->ccr >> (31 - bi)) & 1) != ((bo >> 3) & 1)) + return 0; + } + return 1; +} + + +static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb) +{ + if (!user_mode(regs)) + return 1; + return __access_ok(ea, nb, USER_DS); +} + +/* + * Calculate effective address for a D-form instruction + */ +static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) instr; /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return truncate_if_32bit(regs->msr, ea); +} + +#ifdef __powerpc64__ +/* + * Calculate effective address for a DS-form instruction + */ +static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs) +{ + int ra; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + ea = (signed short) (instr & ~3); /* sign-extend */ + if (ra) + ea += regs->gpr[ra]; + + return truncate_if_32bit(regs->msr, ea); +} +#endif /* __powerpc64 */ + +/* + * Calculate effective address for an X-form instruction + */ +static unsigned long __kprobes xform_ea(unsigned int instr, + struct pt_regs *regs) +{ + int ra, rb; + unsigned long ea; + + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + ea = regs->gpr[rb]; + if (ra) + ea += regs->gpr[ra]; + + return truncate_if_32bit(regs->msr, ea); +} + +/* + * Return the largest power of 2, not greater than sizeof(unsigned long), + * such that x is a multiple of it. + */ +static inline unsigned long max_align(unsigned long x) +{ + x |= sizeof(unsigned long); + return x & -x; /* isolates rightmost bit */ +} + + +static inline unsigned long byterev_2(unsigned long x) +{ + return ((x >> 8) & 0xff) | ((x & 0xff) << 8); +} + +static inline unsigned long byterev_4(unsigned long x) +{ + return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) | + ((x & 0xff00) << 8) | ((x & 0xff) << 24); +} + +#ifdef __powerpc64__ +static inline unsigned long byterev_8(unsigned long x) +{ + return (byterev_4(x) << 32) | byterev_4(x >> 32); +} +#endif + +static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea, + int nb) +{ + int err = 0; + unsigned long x = 0; + + switch (nb) { + case 1: + err = __get_user(x, (unsigned char __user *) ea); + break; + case 2: + err = __get_user(x, (unsigned short __user *) ea); + break; + case 4: + err = __get_user(x, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __get_user(x, (unsigned long __user *) ea); + break; +#endif + } + if (!err) + *dest = x; + return err; +} + +static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long x, b, c; +#ifdef __LITTLE_ENDIAN__ + int len = nb; /* save a copy of the length for byte reversal */ +#endif + + /* unaligned, do this in pieces */ + x = 0; + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = read_mem_aligned(&b, ea, c); + if (err) + return err; + x = (x << (8 * c)) + b; + ea += c; + } +#ifdef __LITTLE_ENDIAN__ + switch (len) { + case 2: + *dest = byterev_2(x); + break; + case 4: + *dest = byterev_4(x); + break; +#ifdef __powerpc64__ + case 8: + *dest = byterev_8(x); + break; +#endif + } +#endif +#ifdef __BIG_ENDIAN__ + *dest = x; +#endif + return 0; +} + +/* + * Read memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. + */ +static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return read_mem_aligned(dest, ea, nb); + return read_mem_unaligned(dest, ea, nb, regs); +} + +static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea, + int nb) +{ + int err = 0; + + switch (nb) { + case 1: + err = __put_user(val, (unsigned char __user *) ea); + break; + case 2: + err = __put_user(val, (unsigned short __user *) ea); + break; + case 4: + err = __put_user(val, (unsigned int __user *) ea); + break; +#ifdef __powerpc64__ + case 8: + err = __put_user(val, (unsigned long __user *) ea); + break; +#endif + } + return err; +} + +static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea, + int nb, struct pt_regs *regs) +{ + int err; + unsigned long c; + +#ifdef __LITTLE_ENDIAN__ + switch (nb) { + case 2: + val = byterev_2(val); + break; + case 4: + val = byterev_4(val); + break; +#ifdef __powerpc64__ + case 8: + val = byterev_8(val); + break; +#endif + } +#endif + /* unaligned or little-endian, do this in pieces */ + for (; nb > 0; nb -= c) { +#ifdef __LITTLE_ENDIAN__ + c = 1; +#endif +#ifdef __BIG_ENDIAN__ + c = max_align(ea); +#endif + if (c > nb) + c = max_align(nb); + err = write_mem_aligned(val >> (nb - c) * 8, ea, c); + if (err) + return err; + ea += c; + } + return 0; +} + +/* + * Write memory at address ea for nb bytes, return 0 for success + * or -EFAULT if an error occurred. + */ +static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb, + struct pt_regs *regs) +{ + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & (nb - 1)) == 0) + return write_mem_aligned(val, ea, nb); + return write_mem_unaligned(val, ea, nb, regs); +} + +#ifdef CONFIG_PPC_FPU +/* + * Check the address and alignment, and call func to do the actual + * load or store. + */ +static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul; + if (sizeof(unsigned long) == 8 || nb == 4) { + err = read_mem_unaligned(&data.ul[0], ea, nb, regs); + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + } else { + /* reading a double on 32-bit */ + err = read_mem_unaligned(&data.ul[0], ea, 4, regs); + if (!err) + err = read_mem_unaligned(&data.ul[1], ea + 4, 4, regs); + } + if (err) + return err; + return (*func)(rn, ptr); +} + +static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, int nb, + struct pt_regs *regs) +{ + int err; + union { + double dbl; + unsigned long ul[2]; + struct { +#ifdef __BIG_ENDIAN__ + unsigned _pad_; + unsigned word; +#endif +#ifdef __LITTLE_ENDIAN__ + unsigned word; + unsigned _pad_; +#endif + } single; + } data; + unsigned long ptr; + + if (!address_ok(regs, ea, nb)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + ptr = (unsigned long) &data.ul[0]; + if (sizeof(unsigned long) == 8 || nb == 4) { + if (nb == 4) + ptr = (unsigned long)&(data.single.word); + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, nb, regs); + } else { + /* writing a double on 32-bit */ + err = (*func)(rn, ptr); + if (err) + return err; + err = write_mem_unaligned(data.ul[0], ea, 4, regs); + if (!err) + err = write_mem_unaligned(data.ul[1], ea + 4, 4, regs); + } + return err; +} +#endif + +#ifdef CONFIG_ALTIVEC +/* For Altivec/VMX, no need to worry about alignment */ +static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} + +static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + if (!address_ok(regs, ea & ~0xfUL, 16)) + return -EFAULT; + return (*func)(rn, ea); +} +#endif /* CONFIG_ALTIVEC */ + +#ifdef CONFIG_VSX +static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = read_mem_unaligned(&val[0], ea, 8, regs); + if (!err) + err = read_mem_unaligned(&val[1], ea + 8, 8, regs); + if (!err) + err = (*func)(rn, (unsigned long) &val[0]); + return err; +} + +static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long), + unsigned long ea, struct pt_regs *regs) +{ + int err; + unsigned long val[2]; + + if (!address_ok(regs, ea, 16)) + return -EFAULT; + if ((ea & 3) == 0) + return (*func)(rn, ea); + err = (*func)(rn, (unsigned long) &val[0]); + if (err) + return err; + err = write_mem_unaligned(val[0], ea, 8, regs); + if (!err) + err = write_mem_unaligned(val[1], ea + 8, 8, regs); + return err; +} +#endif /* CONFIG_VSX */ + +#define __put_user_asmx(x, addr, err, op, cr) \ + __asm__ __volatile__( \ + "1: " op " %2,0,%3\n" \ + " mfcr %1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%4\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (cr) \ + : "r" (x), "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __get_user_asmx(x, addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" %1,0,%2\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err), "=r" (x) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +#define __cacheop_user_asmx(addr, err, op) \ + __asm__ __volatile__( \ + "1: "op" 0,%1\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + ".section __ex_table,\"a\"\n" \ + PPC_LONG_ALIGN "\n" \ + PPC_LONG "1b,3b\n" \ + ".previous" \ + : "=r" (err) \ + : "r" (addr), "i" (-EFAULT), "0" (err)) + +static void __kprobes set_cr0(struct pt_regs *regs, int rd) +{ + long val = regs->gpr[rd]; + + regs->ccr = (regs->ccr & 0x0fffffff) | ((regs->xer >> 3) & 0x10000000); +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) + val = (int) val; +#endif + if (val < 0) + regs->ccr |= 0x80000000; + else if (val > 0) + regs->ccr |= 0x40000000; + else + regs->ccr |= 0x20000000; +} + +static void __kprobes add_with_carry(struct pt_regs *regs, int rd, + unsigned long val1, unsigned long val2, + unsigned long carry_in) +{ + unsigned long val = val1 + val2; + + if (carry_in) + ++val; + regs->gpr[rd] = val; +#ifdef __powerpc64__ + if (!(regs->msr & MSR_64BIT)) { + val = (unsigned int) val; + val1 = (unsigned int) val1; + } +#endif + if (val < val1 || (carry_in && val == val1)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; +} + +static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2, + int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1, + unsigned long v2, int crfld) +{ + unsigned int crval, shift; + + crval = (regs->xer >> 31) & 1; /* get SO bit */ + if (v1 < v2) + crval |= 8; + else if (v1 > v2) + crval |= 4; + else + crval |= 2; + shift = (7 - crfld) * 4; + regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift); +} + +static int __kprobes trap_compare(long v1, long v2) +{ + int ret = 0; + + if (v1 < v2) + ret |= 0x10; + else if (v1 > v2) + ret |= 0x08; + else + ret |= 0x04; + if ((unsigned long)v1 < (unsigned long)v2) + ret |= 0x02; + else if ((unsigned long)v1 > (unsigned long)v2) + ret |= 0x01; + return ret; +} + +/* + * Elements of 32-bit rotate and mask instructions. + */ +#define MASK32(mb, me) ((0xffffffffUL >> (mb)) + \ + ((signed long)-0x80000000L >> (me)) + ((me) >= (mb))) +#ifdef __powerpc64__ +#define MASK64_L(mb) (~0UL >> (mb)) +#define MASK64_R(me) ((signed long)-0x8000000000000000L >> (me)) +#define MASK64(mb, me) (MASK64_L(mb) + MASK64_R(me) + ((me) >= (mb))) +#define DATA32(x) (((x) & 0xffffffffUL) | (((x) & 0xffffffffUL) << 32)) +#else +#define DATA32(x) (x) +#endif +#define ROTATE(x, n) ((n) ? (((x) << (n)) | ((x) >> (8 * sizeof(long) - (n)))) : (x)) + +/* + * Decode an instruction, and execute it if that can be done just by + * modifying *regs (i.e. integer arithmetic and logical instructions, + * branches, and barrier instructions). + * Returns 1 if the instruction has been executed, or 0 if not. + * Sets *op to indicate what the instruction does. + */ +int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs, + unsigned int instr) +{ + unsigned int opcode, ra, rb, rd, spr, u; + unsigned long int imm; + unsigned long int val, val2; + unsigned int mb, me, sh; + long ival; + + op->type = COMPUTE; + + opcode = instr >> 26; + switch (opcode) { + case 16: /* bc */ + op->type = BRANCH; + imm = (signed short)(instr & 0xfffc); + if ((instr & 2) == 0) + imm += regs->nip; + regs->nip += 4; + regs->nip = truncate_if_32bit(regs->msr, regs->nip); + if (instr & 1) + regs->link = regs->nip; + if (branch_taken(instr, regs)) + regs->nip = truncate_if_32bit(regs->msr, imm); + return 1; +#ifdef CONFIG_PPC64 + case 17: /* sc */ + if ((instr & 0xfe2) == 2) + op->type = SYSCALL; + else + op->type = UNKNOWN; + return 0; +#endif + case 18: /* b */ + op->type = BRANCH; + imm = instr & 0x03fffffc; + if (imm & 0x02000000) + imm -= 0x04000000; + if ((instr & 2) == 0) + imm += regs->nip; + if (instr & 1) + regs->link = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); + regs->nip = imm; + return 1; + case 19: + switch ((instr >> 1) & 0x3ff) { + case 0: /* mcrf */ + rd = (instr >> 21) & 0x1c; + ra = (instr >> 16) & 0x1c; + val = (regs->ccr >> ra) & 0xf; + regs->ccr = (regs->ccr & ~(0xfUL << rd)) | (val << rd); + goto instr_done; + + case 16: /* bclr */ + case 528: /* bcctr */ + op->type = BRANCH; + imm = (instr & 0x400)? regs->ctr: regs->link; + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + imm = truncate_if_32bit(regs->msr, imm); + if (instr & 1) + regs->link = regs->nip; + if (branch_taken(instr, regs)) + regs->nip = imm; + return 1; + + case 18: /* rfid, scary */ + if (regs->msr & MSR_PR) + goto priv; + op->type = RFI; + return 0; + + case 150: /* isync */ + op->type = BARRIER; + isync(); + goto instr_done; + + case 33: /* crnor */ + case 129: /* crandc */ + case 193: /* crxor */ + case 225: /* crnand */ + case 257: /* crand */ + case 289: /* creqv */ + case 417: /* crorc */ + case 449: /* cror */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + rd = (instr >> 21) & 0x1f; + ra = (regs->ccr >> (31 - ra)) & 1; + rb = (regs->ccr >> (31 - rb)) & 1; + val = (instr >> (6 + ra * 2 + rb)) & 1; + regs->ccr = (regs->ccr & ~(1UL << (31 - rd))) | + (val << (31 - rd)); + goto instr_done; + } + break; + case 31: + switch ((instr >> 1) & 0x3ff) { + case 598: /* sync */ + op->type = BARRIER; +#ifdef __powerpc64__ + switch ((instr >> 21) & 3) { + case 1: /* lwsync */ + asm volatile("lwsync" : : : "memory"); + goto instr_done; + case 2: /* ptesync */ + asm volatile("ptesync" : : : "memory"); + goto instr_done; + } +#endif + mb(); + goto instr_done; + + case 854: /* eieio */ + op->type = BARRIER; + eieio(); + goto instr_done; + } + break; + } + + /* Following cases refer to regs->gpr[], so we need all regs */ + if (!FULL_REGS(regs)) + return 0; + + rd = (instr >> 21) & 0x1f; + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + + switch (opcode) { +#ifdef __powerpc64__ + case 2: /* tdi */ + if (rd & trap_compare(regs->gpr[ra], (short) instr)) + goto trap; + goto instr_done; +#endif + case 3: /* twi */ + if (rd & trap_compare((int)regs->gpr[ra], (short) instr)) + goto trap; + goto instr_done; + + case 7: /* mulli */ + regs->gpr[rd] = regs->gpr[ra] * (short) instr; + goto instr_done; + + case 8: /* subfic */ + imm = (short) instr; + add_with_carry(regs, rd, ~regs->gpr[ra], imm, 1); + goto instr_done; + + case 10: /* cmpli */ + imm = (unsigned short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (unsigned int) val; +#endif + do_cmp_unsigned(regs, val, imm, rd >> 2); + goto instr_done; + + case 11: /* cmpi */ + imm = (short) instr; + val = regs->gpr[ra]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) + val = (int) val; +#endif + do_cmp_signed(regs, val, imm, rd >> 2); + goto instr_done; + + case 12: /* addic */ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + goto instr_done; + + case 13: /* addic. */ + imm = (short) instr; + add_with_carry(regs, rd, regs->gpr[ra], imm, 0); + set_cr0(regs, rd); + goto instr_done; + + case 14: /* addi */ + imm = (short) instr; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 15: /* addis */ + imm = ((short) instr) << 16; + if (ra) + imm += regs->gpr[ra]; + regs->gpr[rd] = imm; + goto instr_done; + + case 20: /* rlwimi */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + imm = MASK32(mb, me); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | (ROTATE(val, rb) & imm); + goto logical_done; + + case 21: /* rlwinm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 23: /* rlwnm */ + mb = (instr >> 6) & 0x1f; + me = (instr >> 1) & 0x1f; + rb = regs->gpr[rb] & 0x1f; + val = DATA32(regs->gpr[rd]); + regs->gpr[ra] = ROTATE(val, rb) & MASK32(mb, me); + goto logical_done; + + case 24: /* ori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | imm; + goto instr_done; + + case 25: /* oris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] | (imm << 16); + goto instr_done; + + case 26: /* xori */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ imm; + goto instr_done; + + case 27: /* xoris */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] ^ (imm << 16); + goto instr_done; + + case 28: /* andi. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & imm; + set_cr0(regs, ra); + goto instr_done; + + case 29: /* andis. */ + imm = (unsigned short) instr; + regs->gpr[ra] = regs->gpr[rd] & (imm << 16); + set_cr0(regs, ra); + goto instr_done; + +#ifdef __powerpc64__ + case 30: /* rld* */ + mb = ((instr >> 6) & 0x1f) | (instr & 0x20); + val = regs->gpr[rd]; + if ((instr & 0x10) == 0) { + sh = rb | ((instr & 2) << 4); + val = ROTATE(val, sh); + switch ((instr >> 2) & 3) { + case 0: /* rldicl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldicr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + case 2: /* rldic */ + regs->gpr[ra] = val & MASK64(mb, 63 - sh); + goto logical_done; + case 3: /* rldimi */ + imm = MASK64(mb, 63 - sh); + regs->gpr[ra] = (regs->gpr[ra] & ~imm) | + (val & imm); + goto logical_done; + } + } else { + sh = regs->gpr[rb] & 0x3f; + val = ROTATE(val, sh); + switch ((instr >> 1) & 7) { + case 0: /* rldcl */ + regs->gpr[ra] = val & MASK64_L(mb); + goto logical_done; + case 1: /* rldcr */ + regs->gpr[ra] = val & MASK64_R(mb); + goto logical_done; + } + } +#endif + + case 31: + switch ((instr >> 1) & 0x3ff) { + case 4: /* tw */ + if (rd == 0x1f || + (rd & trap_compare((int)regs->gpr[ra], + (int)regs->gpr[rb]))) + goto trap; + goto instr_done; +#ifdef __powerpc64__ + case 68: /* td */ + if (rd & trap_compare(regs->gpr[ra], regs->gpr[rb])) + goto trap; + goto instr_done; +#endif + case 83: /* mfmsr */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MFMSR; + op->reg = rd; + return 0; + case 146: /* mtmsr */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MTMSR; + op->reg = rd; + op->val = 0xffffffff & ~(MSR_ME | MSR_LE); + return 0; +#ifdef CONFIG_PPC64 + case 178: /* mtmsrd */ + if (regs->msr & MSR_PR) + goto priv; + op->type = MTMSR; + op->reg = rd; + /* only MSR_EE and MSR_RI get changed if bit 15 set */ + /* mtmsrd doesn't change MSR_HV, MSR_ME or MSR_LE */ + imm = (instr & 0x10000)? 0x8002: 0xefffffffffffeffeUL; + op->val = imm; + return 0; +#endif + + case 19: /* mfcr */ + regs->gpr[rd] = regs->ccr; + regs->gpr[rd] &= 0xffffffffUL; + goto instr_done; + + case 144: /* mtcrf */ + imm = 0xf0000000UL; + val = regs->gpr[rd]; + for (sh = 0; sh < 8; ++sh) { + if (instr & (0x80000 >> sh)) + regs->ccr = (regs->ccr & ~imm) | + (val & imm); + imm >>= 4; + } + goto instr_done; + + case 339: /* mfspr */ + spr = ((instr >> 16) & 0x1f) | ((instr >> 6) & 0x3e0); + switch (spr) { + case SPRN_XER: /* mfxer */ + regs->gpr[rd] = regs->xer; + regs->gpr[rd] &= 0xffffffffUL; + goto instr_done; + case SPRN_LR: /* mflr */ + regs->gpr[rd] = regs->link; + goto instr_done; + case SPRN_CTR: /* mfctr */ + regs->gpr[rd] = regs->ctr; + goto instr_done; + default: + op->type = MFSPR; + op->reg = rd; + op->spr = spr; + return 0; + } + break; + + case 467: /* mtspr */ + spr = ((instr >> 16) & 0x1f) | ((instr >> 6) & 0x3e0); + switch (spr) { + case SPRN_XER: /* mtxer */ + regs->xer = (regs->gpr[rd] & 0xffffffffUL); + goto instr_done; + case SPRN_LR: /* mtlr */ + regs->link = regs->gpr[rd]; + goto instr_done; + case SPRN_CTR: /* mtctr */ + regs->ctr = regs->gpr[rd]; + goto instr_done; + default: + op->type = MTSPR; + op->val = regs->gpr[rd]; + op->spr = spr; + return 0; + } + break; + +/* + * Compare instructions + */ + case 0: /* cmp */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (int) val; + val2 = (int) val2; + } +#endif + do_cmp_signed(regs, val, val2, rd >> 2); + goto instr_done; + + case 32: /* cmpl */ + val = regs->gpr[ra]; + val2 = regs->gpr[rb]; +#ifdef __powerpc64__ + if ((rd & 1) == 0) { + /* word (32-bit) compare */ + val = (unsigned int) val; + val2 = (unsigned int) val2; + } +#endif + do_cmp_unsigned(regs, val, val2, rd >> 2); + goto instr_done; + +/* + * Arithmetic instructions + */ + case 8: /* subfc */ + add_with_carry(regs, rd, ~regs->gpr[ra], + regs->gpr[rb], 1); + goto arith_done; +#ifdef __powerpc64__ + case 9: /* mulhdu */ + asm("mulhdu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 10: /* addc */ + add_with_carry(regs, rd, regs->gpr[ra], + regs->gpr[rb], 0); + goto arith_done; + + case 11: /* mulhwu */ + asm("mulhwu %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 40: /* subf */ + regs->gpr[rd] = regs->gpr[rb] - regs->gpr[ra]; + goto arith_done; +#ifdef __powerpc64__ + case 73: /* mulhd */ + asm("mulhd %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; +#endif + case 75: /* mulhw */ + asm("mulhw %0,%1,%2" : "=r" (regs->gpr[rd]) : + "r" (regs->gpr[ra]), "r" (regs->gpr[rb])); + goto arith_done; + + case 104: /* neg */ + regs->gpr[rd] = -regs->gpr[ra]; + goto arith_done; + + case 136: /* subfe */ + add_with_carry(regs, rd, ~regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 138: /* adde */ + add_with_carry(regs, rd, regs->gpr[ra], regs->gpr[rb], + regs->xer & XER_CA); + goto arith_done; + + case 200: /* subfze */ + add_with_carry(regs, rd, ~regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 202: /* addze */ + add_with_carry(regs, rd, regs->gpr[ra], 0L, + regs->xer & XER_CA); + goto arith_done; + + case 232: /* subfme */ + add_with_carry(regs, rd, ~regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; +#ifdef __powerpc64__ + case 233: /* mulld */ + regs->gpr[rd] = regs->gpr[ra] * regs->gpr[rb]; + goto arith_done; +#endif + case 234: /* addme */ + add_with_carry(regs, rd, regs->gpr[ra], -1L, + regs->xer & XER_CA); + goto arith_done; + + case 235: /* mullw */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] * + (unsigned int) regs->gpr[rb]; + goto arith_done; + + case 266: /* add */ + regs->gpr[rd] = regs->gpr[ra] + regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 457: /* divdu */ + regs->gpr[rd] = regs->gpr[ra] / regs->gpr[rb]; + goto arith_done; +#endif + case 459: /* divwu */ + regs->gpr[rd] = (unsigned int) regs->gpr[ra] / + (unsigned int) regs->gpr[rb]; + goto arith_done; +#ifdef __powerpc64__ + case 489: /* divd */ + regs->gpr[rd] = (long int) regs->gpr[ra] / + (long int) regs->gpr[rb]; + goto arith_done; +#endif + case 491: /* divw */ + regs->gpr[rd] = (int) regs->gpr[ra] / + (int) regs->gpr[rb]; + goto arith_done; + + +/* + * Logical instructions + */ + case 26: /* cntlzw */ + asm("cntlzw %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#ifdef __powerpc64__ + case 58: /* cntlzd */ + asm("cntlzd %0,%1" : "=r" (regs->gpr[ra]) : + "r" (regs->gpr[rd])); + goto logical_done; +#endif + case 28: /* and */ + regs->gpr[ra] = regs->gpr[rd] & regs->gpr[rb]; + goto logical_done; + + case 60: /* andc */ + regs->gpr[ra] = regs->gpr[rd] & ~regs->gpr[rb]; + goto logical_done; + + case 124: /* nor */ + regs->gpr[ra] = ~(regs->gpr[rd] | regs->gpr[rb]); + goto logical_done; + + case 284: /* xor */ + regs->gpr[ra] = ~(regs->gpr[rd] ^ regs->gpr[rb]); + goto logical_done; + + case 316: /* xor */ + regs->gpr[ra] = regs->gpr[rd] ^ regs->gpr[rb]; + goto logical_done; + + case 412: /* orc */ + regs->gpr[ra] = regs->gpr[rd] | ~regs->gpr[rb]; + goto logical_done; + + case 444: /* or */ + regs->gpr[ra] = regs->gpr[rd] | regs->gpr[rb]; + goto logical_done; + + case 476: /* nand */ + regs->gpr[ra] = ~(regs->gpr[rd] & regs->gpr[rb]); + goto logical_done; + + case 922: /* extsh */ + regs->gpr[ra] = (signed short) regs->gpr[rd]; + goto logical_done; + + case 954: /* extsb */ + regs->gpr[ra] = (signed char) regs->gpr[rd]; + goto logical_done; +#ifdef __powerpc64__ + case 986: /* extsw */ + regs->gpr[ra] = (signed int) regs->gpr[rd]; + goto logical_done; +#endif + +/* + * Shift instructions + */ + case 24: /* slw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] << sh) & 0xffffffffUL; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 536: /* srw */ + sh = regs->gpr[rb] & 0x3f; + if (sh < 32) + regs->gpr[ra] = (regs->gpr[rd] & 0xffffffffUL) >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 792: /* sraw */ + sh = regs->gpr[rb] & 0x3f; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 32 ? sh : 31); + if (ival < 0 && (sh >= 32 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 824: /* srawi */ + sh = rb; + ival = (signed int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + +#ifdef __powerpc64__ + case 27: /* sld */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] << sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 539: /* srd */ + sh = regs->gpr[rb] & 0x7f; + if (sh < 64) + regs->gpr[ra] = regs->gpr[rd] >> sh; + else + regs->gpr[ra] = 0; + goto logical_done; + + case 794: /* srad */ + sh = regs->gpr[rb] & 0x7f; + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> (sh < 64 ? sh : 63); + if (ival < 0 && (sh >= 64 || (ival & ((1ul << sh) - 1)) != 0)) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; + + case 826: /* sradi with sh_5 = 0 */ + case 827: /* sradi with sh_5 = 1 */ + sh = rb | ((instr & 2) << 4); + ival = (signed long int) regs->gpr[rd]; + regs->gpr[ra] = ival >> sh; + if (ival < 0 && (ival & ((1ul << sh) - 1)) != 0) + regs->xer |= XER_CA; + else + regs->xer &= ~XER_CA; + goto logical_done; +#endif /* __powerpc64__ */ + +/* + * Cache instructions + */ + case 54: /* dcbst */ + op->type = MKOP(CACHEOP, DCBST, 0); + op->ea = xform_ea(instr, regs); + return 0; + + case 86: /* dcbf */ + op->type = MKOP(CACHEOP, DCBF, 0); + op->ea = xform_ea(instr, regs); + return 0; + + case 246: /* dcbtst */ + op->type = MKOP(CACHEOP, DCBTST, 0); + op->ea = xform_ea(instr, regs); + op->reg = rd; + return 0; + + case 278: /* dcbt */ + op->type = MKOP(CACHEOP, DCBTST, 0); + op->ea = xform_ea(instr, regs); + op->reg = rd; + return 0; + + case 982: /* icbi */ + op->type = MKOP(CACHEOP, ICBI, 0); + op->ea = xform_ea(instr, regs); + return 0; + } + break; + } + + /* + * Loads and stores. + */ + op->type = UNKNOWN; + op->update_reg = ra; + op->reg = rd; + op->val = regs->gpr[rd]; + u = (instr >> 20) & UPDATE; + + switch (opcode) { + case 31: + u = instr & UPDATE; + op->ea = xform_ea(instr, regs); + switch ((instr >> 1) & 0x3ff) { + case 20: /* lwarx */ + op->type = MKOP(LARX, 0, 4); + break; + + case 150: /* stwcx. */ + op->type = MKOP(STCX, 0, 4); + break; + +#ifdef __powerpc64__ + case 84: /* ldarx */ + op->type = MKOP(LARX, 0, 8); + break; + + case 214: /* stdcx. */ + op->type = MKOP(STCX, 0, 8); + break; + + case 21: /* ldx */ + case 53: /* ldux */ + op->type = MKOP(LOAD, u, 8); + break; +#endif + + case 23: /* lwzx */ + case 55: /* lwzux */ + op->type = MKOP(LOAD, u, 4); + break; + + case 87: /* lbzx */ + case 119: /* lbzux */ + op->type = MKOP(LOAD, u, 1); + break; + +#ifdef CONFIG_ALTIVEC + case 103: /* lvx */ + case 359: /* lvxl */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(LOAD_VMX, 0, 16); + break; + + case 231: /* stvx */ + case 487: /* stvxl */ + if (!(regs->msr & MSR_VEC)) + goto vecunavail; + op->type = MKOP(STORE_VMX, 0, 16); + break; +#endif /* CONFIG_ALTIVEC */ + +#ifdef __powerpc64__ + case 149: /* stdx */ + case 181: /* stdux */ + op->type = MKOP(STORE, u, 8); + break; +#endif + + case 151: /* stwx */ + case 183: /* stwux */ + op->type = MKOP(STORE, u, 4); + break; + + case 215: /* stbx */ + case 247: /* stbux */ + op->type = MKOP(STORE, u, 1); + break; + + case 279: /* lhzx */ + case 311: /* lhzux */ + op->type = MKOP(LOAD, u, 2); + break; + +#ifdef __powerpc64__ + case 341: /* lwax */ + case 373: /* lwaux */ + op->type = MKOP(LOAD, SIGNEXT | u, 4); + break; +#endif + + case 343: /* lhax */ + case 375: /* lhaux */ + op->type = MKOP(LOAD, SIGNEXT | u, 2); + break; + + case 407: /* sthx */ + case 439: /* sthux */ + op->type = MKOP(STORE, u, 2); + break; + +#ifdef __powerpc64__ + case 532: /* ldbrx */ + op->type = MKOP(LOAD, BYTEREV, 8); + break; + +#endif + case 533: /* lswx */ + op->type = MKOP(LOAD_MULTI, 0, regs->xer & 0x7f); + break; + + case 534: /* lwbrx */ + op->type = MKOP(LOAD, BYTEREV, 4); + break; + + case 597: /* lswi */ + if (rb == 0) + rb = 32; /* # bytes to load */ + op->type = MKOP(LOAD_MULTI, 0, rb); + op->ea = 0; + if (ra) + op->ea = truncate_if_32bit(regs->msr, + regs->gpr[ra]); + break; + +#ifdef CONFIG_PPC_FPU + case 535: /* lfsx */ + case 567: /* lfsux */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(LOAD_FP, u, 4); + break; + + case 599: /* lfdx */ + case 631: /* lfdux */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(LOAD_FP, u, 8); + break; + + case 663: /* stfsx */ + case 695: /* stfsux */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(STORE_FP, u, 4); + break; + + case 727: /* stfdx */ + case 759: /* stfdux */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(STORE_FP, u, 8); + break; +#endif + +#ifdef __powerpc64__ + case 660: /* stdbrx */ + op->type = MKOP(STORE, BYTEREV, 8); + op->val = byterev_8(regs->gpr[rd]); + break; + +#endif + case 661: /* stswx */ + op->type = MKOP(STORE_MULTI, 0, regs->xer & 0x7f); + break; + + case 662: /* stwbrx */ + op->type = MKOP(STORE, BYTEREV, 4); + op->val = byterev_4(regs->gpr[rd]); + break; + + case 725: + if (rb == 0) + rb = 32; /* # bytes to store */ + op->type = MKOP(STORE_MULTI, 0, rb); + op->ea = 0; + if (ra) + op->ea = truncate_if_32bit(regs->msr, + regs->gpr[ra]); + break; + + case 790: /* lhbrx */ + op->type = MKOP(LOAD, BYTEREV, 2); + break; + + case 918: /* sthbrx */ + op->type = MKOP(STORE, BYTEREV, 2); + op->val = byterev_2(regs->gpr[rd]); + break; + +#ifdef CONFIG_VSX + case 844: /* lxvd2x */ + case 876: /* lxvd2ux */ + if (!(regs->msr & MSR_VSX)) + goto vsxunavail; + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(LOAD_VSX, u, 16); + break; + + case 972: /* stxvd2x */ + case 1004: /* stxvd2ux */ + if (!(regs->msr & MSR_VSX)) + goto vsxunavail; + op->reg = rd | ((instr & 1) << 5); + op->type = MKOP(STORE_VSX, u, 16); + break; + +#endif /* CONFIG_VSX */ + } + break; + + case 32: /* lwz */ + case 33: /* lwzu */ + op->type = MKOP(LOAD, u, 4); + op->ea = dform_ea(instr, regs); + break; + + case 34: /* lbz */ + case 35: /* lbzu */ + op->type = MKOP(LOAD, u, 1); + op->ea = dform_ea(instr, regs); + break; + + case 36: /* stw */ + case 37: /* stwu */ + op->type = MKOP(STORE, u, 4); + op->ea = dform_ea(instr, regs); + break; + + case 38: /* stb */ + case 39: /* stbu */ + op->type = MKOP(STORE, u, 1); + op->ea = dform_ea(instr, regs); + break; + + case 40: /* lhz */ + case 41: /* lhzu */ + op->type = MKOP(LOAD, u, 2); + op->ea = dform_ea(instr, regs); + break; + + case 42: /* lha */ + case 43: /* lhau */ + op->type = MKOP(LOAD, SIGNEXT | u, 2); + op->ea = dform_ea(instr, regs); + break; + + case 44: /* sth */ + case 45: /* sthu */ + op->type = MKOP(STORE, u, 2); + op->ea = dform_ea(instr, regs); + break; + + case 46: /* lmw */ + if (ra >= rd) + break; /* invalid form, ra in range to load */ + op->type = MKOP(LOAD_MULTI, 0, 4 * (32 - rd)); + op->ea = dform_ea(instr, regs); + break; + + case 47: /* stmw */ + op->type = MKOP(STORE_MULTI, 0, 4 * (32 - rd)); + op->ea = dform_ea(instr, regs); + break; + +#ifdef CONFIG_PPC_FPU + case 48: /* lfs */ + case 49: /* lfsu */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(LOAD_FP, u, 4); + op->ea = dform_ea(instr, regs); + break; + + case 50: /* lfd */ + case 51: /* lfdu */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(LOAD_FP, u, 8); + op->ea = dform_ea(instr, regs); + break; + + case 52: /* stfs */ + case 53: /* stfsu */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(STORE_FP, u, 4); + op->ea = dform_ea(instr, regs); + break; + + case 54: /* stfd */ + case 55: /* stfdu */ + if (!(regs->msr & MSR_FP)) + goto fpunavail; + op->type = MKOP(STORE_FP, u, 8); + op->ea = dform_ea(instr, regs); + break; +#endif + +#ifdef __powerpc64__ + case 58: /* ld[u], lwa */ + op->ea = dsform_ea(instr, regs); + switch (instr & 3) { + case 0: /* ld */ + op->type = MKOP(LOAD, 0, 8); + break; + case 1: /* ldu */ + op->type = MKOP(LOAD, UPDATE, 8); + break; + case 2: /* lwa */ + op->type = MKOP(LOAD, SIGNEXT, 4); + break; + } + break; + + case 62: /* std[u] */ + op->ea = dsform_ea(instr, regs); + switch (instr & 3) { + case 0: /* std */ + op->type = MKOP(STORE, 0, 8); + break; + case 1: /* stdu */ + op->type = MKOP(STORE, UPDATE, 8); + break; + } + break; +#endif /* __powerpc64__ */ + + } + return 0; + + logical_done: + if (instr & 1) + set_cr0(regs, ra); + goto instr_done; + + arith_done: + if (instr & 1) + set_cr0(regs, rd); + + instr_done: + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + return 1; + + priv: + op->type = INTERRUPT | 0x700; + op->val = SRR1_PROGPRIV; + return 0; + + trap: + op->type = INTERRUPT | 0x700; + op->val = SRR1_PROGTRAP; + return 0; + +#ifdef CONFIG_PPC_FPU + fpunavail: + op->type = INTERRUPT | 0x800; + return 0; +#endif + +#ifdef CONFIG_ALTIVEC + vecunavail: + op->type = INTERRUPT | 0xf20; + return 0; +#endif + +#ifdef CONFIG_VSX + vsxunavail: + op->type = INTERRUPT | 0xf40; + return 0; +#endif +} +EXPORT_SYMBOL_GPL(analyse_instr); + +/* + * For PPC32 we always use stwu with r1 to change the stack pointer. + * So this emulated store may corrupt the exception frame, now we + * have to provide the exception frame trampoline, which is pushed + * below the kprobed function stack. So we only update gpr[1] but + * don't emulate the real store operation. We will do real store + * operation safely in exception return code by checking this flag. + */ +static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs) +{ +#ifdef CONFIG_PPC32 + /* + * Check if we will touch kernel stack overflow + */ + if (ea - STACK_INT_FRAME_SIZE <= current->thread.ksp_limit) { + printk(KERN_CRIT "Can't kprobe this since kernel stack would overflow.\n"); + return -EINVAL; + } +#endif /* CONFIG_PPC32 */ + /* + * Check if we already set since that means we'll + * lose the previous value. + */ + WARN_ON(test_thread_flag(TIF_EMULATE_STACK_STORE)); + set_thread_flag(TIF_EMULATE_STACK_STORE); + return 0; +} + +static __kprobes void do_signext(unsigned long *valp, int size) +{ + switch (size) { + case 2: + *valp = (signed short) *valp; + break; + case 4: + *valp = (signed int) *valp; + break; + } +} + +static __kprobes void do_byterev(unsigned long *valp, int size) +{ + switch (size) { + case 2: + *valp = byterev_2(*valp); + break; + case 4: + *valp = byterev_4(*valp); + break; +#ifdef __powerpc64__ + case 8: + *valp = byterev_8(*valp); + break; +#endif + } +} + +/* + * Emulate instructions that cause a transfer of control, + * loads and stores, and a few other instructions. + * Returns 1 if the step was emulated, 0 if not, + * or -1 if the instruction is one that should not be stepped, + * such as an rfid, or a mtmsrd that would clear MSR_RI. + */ +int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) +{ + struct instruction_op op; + int r, err, size; + unsigned long val; + unsigned int cr; + int i, rd, nb; + + r = analyse_instr(&op, regs, instr); + if (r != 0) + return r; + + err = 0; + size = GETSIZE(op.type); + switch (op.type & INSTR_TYPE_MASK) { + case CACHEOP: + if (!address_ok(regs, op.ea, 8)) + return 0; + switch (op.type & CACHEOP_MASK) { + case DCBST: + __cacheop_user_asmx(op.ea, err, "dcbst"); + break; + case DCBF: + __cacheop_user_asmx(op.ea, err, "dcbf"); + break; + case DCBTST: + if (op.reg == 0) + prefetchw((void *) op.ea); + break; + case DCBT: + if (op.reg == 0) + prefetch((void *) op.ea); + break; + case ICBI: + __cacheop_user_asmx(op.ea, err, "icbi"); + break; + } + if (err) + return 0; + goto instr_done; + + case LARX: + if (regs->msr & MSR_LE) + return 0; + if (op.ea & (size - 1)) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, op.ea, size)) + goto ldst_done; + err = 0; + switch (size) { + case 4: + __get_user_asmx(val, op.ea, err, "lwarx"); + break; + case 8: + __get_user_asmx(val, op.ea, err, "ldarx"); + break; + default: + return 0; + } + if (!err) + regs->gpr[op.reg] = val; + goto ldst_done; + + case STCX: + if (regs->msr & MSR_LE) + return 0; + if (op.ea & (size - 1)) + break; /* can't handle misaligned */ + err = -EFAULT; + if (!address_ok(regs, op.ea, size)) + goto ldst_done; + err = 0; + switch (size) { + case 4: + __put_user_asmx(op.val, op.ea, err, "stwcx.", cr); + break; + case 8: + __put_user_asmx(op.val, op.ea, err, "stdcx.", cr); + break; + default: + return 0; + } + if (!err) + regs->ccr = (regs->ccr & 0x0fffffff) | + (cr & 0xe0000000) | + ((regs->xer >> 3) & 0x10000000); + goto ldst_done; + + case LOAD: + if (regs->msr & MSR_LE) + return 0; + err = read_mem(®s->gpr[op.reg], op.ea, size, regs); + if (!err) { + if (op.type & SIGNEXT) + do_signext(®s->gpr[op.reg], size); + if (op.type & BYTEREV) + do_byterev(®s->gpr[op.reg], size); + } + goto ldst_done; + +#ifdef CONFIG_PPC_FPU + case LOAD_FP: + if (regs->msr & MSR_LE) + return 0; + if (size == 4) + err = do_fp_load(op.reg, do_lfs, op.ea, size, regs); + else + err = do_fp_load(op.reg, do_lfd, op.ea, size, regs); + goto ldst_done; +#endif +#ifdef CONFIG_ALTIVEC + case LOAD_VMX: + if (regs->msr & MSR_LE) + return 0; + err = do_vec_load(op.reg, do_lvx, op.ea & ~0xfUL, regs); + goto ldst_done; +#endif +#ifdef CONFIG_VSX + case LOAD_VSX: + if (regs->msr & MSR_LE) + return 0; + err = do_vsx_load(op.reg, do_lxvd2x, op.ea, regs); + goto ldst_done; +#endif + case LOAD_MULTI: + if (regs->msr & MSR_LE) + return 0; + rd = op.reg; + for (i = 0; i < size; i += 4) { + nb = size - i; + if (nb > 4) + nb = 4; + err = read_mem(®s->gpr[rd], op.ea, nb, regs); + if (err) + return 0; + if (nb < 4) /* left-justify last bytes */ + regs->gpr[rd] <<= 32 - 8 * nb; + op.ea += 4; + ++rd; + } + goto instr_done; + + case STORE: + if (regs->msr & MSR_LE) + return 0; + if ((op.type & UPDATE) && size == sizeof(long) && + op.reg == 1 && op.update_reg == 1 && + !(regs->msr & MSR_PR) && + op.ea >= regs->gpr[1] - STACK_INT_FRAME_SIZE) { + err = handle_stack_update(op.ea, regs); + goto ldst_done; + } + err = write_mem(op.val, op.ea, size, regs); + goto ldst_done; + +#ifdef CONFIG_PPC_FPU + case STORE_FP: + if (regs->msr & MSR_LE) + return 0; + if (size == 4) + err = do_fp_store(op.reg, do_stfs, op.ea, size, regs); + else + err = do_fp_store(op.reg, do_stfd, op.ea, size, regs); + goto ldst_done; +#endif +#ifdef CONFIG_ALTIVEC + case STORE_VMX: + if (regs->msr & MSR_LE) + return 0; + err = do_vec_store(op.reg, do_stvx, op.ea & ~0xfUL, regs); + goto ldst_done; +#endif +#ifdef CONFIG_VSX + case STORE_VSX: + if (regs->msr & MSR_LE) + return 0; + err = do_vsx_store(op.reg, do_stxvd2x, op.ea, regs); + goto ldst_done; +#endif + case STORE_MULTI: + if (regs->msr & MSR_LE) + return 0; + rd = op.reg; + for (i = 0; i < size; i += 4) { + val = regs->gpr[rd]; + nb = size - i; + if (nb > 4) + nb = 4; + else + val >>= 32 - 8 * nb; + err = write_mem(val, op.ea, nb, regs); + if (err) + return 0; + op.ea += 4; + ++rd; + } + goto instr_done; + + case MFMSR: + regs->gpr[op.reg] = regs->msr & MSR_MASK; + goto instr_done; + + case MTMSR: + val = regs->gpr[op.reg]; + if ((val & MSR_RI) == 0) + /* can't step mtmsr[d] that would clear MSR_RI */ + return -1; + /* here op.val is the mask of bits to change */ + regs->msr = (regs->msr & ~op.val) | (val & op.val); + goto instr_done; + +#ifdef CONFIG_PPC64 + case SYSCALL: /* sc */ + /* + * N.B. this uses knowledge about how the syscall + * entry code works. If that is changed, this will + * need to be changed also. + */ + if (regs->gpr[0] == 0x1ebe && + cpu_has_feature(CPU_FTR_REAL_LE)) { + regs->msr ^= MSR_LE; + goto instr_done; + } + regs->gpr[9] = regs->gpr[13]; + regs->gpr[10] = MSR_KERNEL; + regs->gpr[11] = regs->nip + 4; + regs->gpr[12] = regs->msr & MSR_MASK; + regs->gpr[13] = (unsigned long) get_paca(); + regs->nip = (unsigned long) &system_call_common; + regs->msr = MSR_KERNEL; + return 1; + + case RFI: + return -1; +#endif + } + return 0; + + ldst_done: + if (err) + return 0; + if (op.type & UPDATE) + regs->gpr[op.update_reg] = op.ea; + + instr_done: + regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4); + return 1; +} diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S new file mode 100644 index 000000000..c80fb49ce --- /dev/null +++ b/arch/powerpc/lib/string.S @@ -0,0 +1,166 @@ +/* + * String handling functions for PowerPC. + * + * Copyright (C) 1996 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/processor.h> +#include <asm/errno.h> +#include <asm/ppc_asm.h> + + .section __ex_table,"a" + PPC_LONG_ALIGN + .text + +_GLOBAL(strcpy) + addi r5,r3,-1 + addi r4,r4,-1 +1: lbzu r0,1(r4) + cmpwi 0,r0,0 + stbu r0,1(r5) + bne 1b + blr + +/* This clears out any unused part of the destination buffer, + just as the libc version does. -- paulus */ +_GLOBAL(strncpy) + PPC_LCMPI 0,r5,0 + beqlr + mtctr r5 + addi r6,r3,-1 + addi r4,r4,-1 +1: lbzu r0,1(r4) + cmpwi 0,r0,0 + stbu r0,1(r6) + bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ + bnelr /* if we didn't hit a null char, we're done */ + mfctr r5 + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */ + beqlr /* we know r0 == 0 here */ +2: stbu r0,1(r6) /* clear it out if so */ + bdnz 2b + blr + +_GLOBAL(strcat) + addi r5,r3,-1 + addi r4,r4,-1 +1: lbzu r0,1(r5) + cmpwi 0,r0,0 + bne 1b + addi r5,r5,-1 +1: lbzu r0,1(r4) + cmpwi 0,r0,0 + stbu r0,1(r5) + bne 1b + blr + +_GLOBAL(strcmp) + addi r5,r3,-1 + addi r4,r4,-1 +1: lbzu r3,1(r5) + cmpwi 1,r3,0 + lbzu r0,1(r4) + subf. r3,r0,r3 + beqlr 1 + beq 1b + blr + +_GLOBAL(strncmp) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r5,r3,-1 + addi r4,r4,-1 +1: lbzu r3,1(r5) + cmpwi 1,r3,0 + lbzu r0,1(r4) + subf. r3,r0,r3 + beqlr 1 + bdnzt eq,1b + blr +2: li r3,0 + blr + +_GLOBAL(strlen) + addi r4,r3,-1 +1: lbzu r0,1(r4) + cmpwi 0,r0,0 + bne 1b + subf r3,r3,r4 + blr + +#ifdef CONFIG_PPC32 +_GLOBAL(memcmp) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r6,r3,-1 + addi r4,r4,-1 +1: lbzu r3,1(r6) + lbzu r0,1(r4) + subf. r3,r0,r3 + bdnzt 2,1b + blr +2: li r3,0 + blr +#endif + +_GLOBAL(memchr) + PPC_LCMPI 0,r5,0 + beq- 2f + mtctr r5 + addi r3,r3,-1 +1: lbzu r0,1(r3) + cmpw 0,r0,r4 + bdnzf 2,1b + beqlr +2: li r3,0 + blr + +#ifdef CONFIG_PPC32 +_GLOBAL(__clear_user) + addi r6,r3,-4 + li r3,0 + li r5,0 + cmplwi 0,r4,4 + blt 7f + /* clear a single word */ +11: stwu r5,4(r6) + beqlr + /* clear word sized chunks */ + andi. r0,r6,3 + add r4,r0,r4 + subf r6,r0,r6 + srwi r0,r4,2 + andi. r4,r4,3 + mtctr r0 + bdz 7f +1: stwu r5,4(r6) + bdnz 1b + /* clear byte sized chunks */ +7: cmpwi 0,r4,0 + beqlr + mtctr r4 + addi r6,r6,3 +8: stbu r5,1(r6) + bdnz 8b + blr +90: mr r3,r4 + blr +91: mfctr r3 + slwi r3,r3,2 + add r3,r3,r4 + blr +92: mfctr r3 + blr + + .section __ex_table,"a" + PPC_LONG 11b,90b + PPC_LONG 1b,91b + PPC_LONG 8b,92b + .text +#endif diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 000000000..7bd9549a9 --- /dev/null +++ b/arch/powerpc/lib/string_64.S @@ -0,0 +1,202 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ + +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + + .section ".toc","aw" +PPC64_CACHES: + .tc ppc64_caches[TC],ppc64_caches + .section ".text" + +/** + * __clear_user: - Zero a block of memory in user space, with less checking. + * @to: Destination address, in user space. + * @n: Number of bytes to zero. + * + * Zero a block of memory in user space. Caller must check + * the specified block with access_ok() before calling this function. + * + * Returns number of bytes that could not be cleared. + * On success, this will be zero. + */ + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + +.Ldo_err1: + mr r3,r8 + +.Ldo_err2: + mtctr r4 +1: +err3; stb r0,0(r3) + addi r3,r3,1 + addi r4,r4,-1 + bdnz 1b + +.Ldo_err3: + mr r3,r4 + blr + +_GLOBAL_TOC(__clear_user) + cmpdi r4,32 + neg r6,r3 + li r0,0 + blt .Lshort_clear + mr r8,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + /* Get the destination 8 byte aligned */ + bf cr7*4+3,1f +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r4,r4,r6 + + cmpdi r4,32 + cmpdi cr1,r4,512 + blt .Lshort_clear + bgt cr1,.Llong_clear + +.Lmedium_clear: + srdi r6,r4,5 + mtctr r6 + + /* Do 32 byte chunks */ +4: +err2; std r0,0(r3) +err2; std r0,8(r3) +err2; std r0,16(r3) +err2; std r0,24(r3) + addi r3,r3,32 + addi r4,r4,-32 + bdnz 4b + +.Lshort_clear: + /* up to 31 bytes to go */ + cmpdi r4,16 + blt 6f +err2; std r0,0(r3) +err2; std r0,8(r3) + addi r3,r3,16 + addi r4,r4,-16 + + /* Up to 15 bytes to go */ +6: mr r8,r3 + clrldi r4,r4,(64-4) + mtocrf 0x01,r4 + bf cr7*4+0,7f +err1; std r0,0(r3) + addi r3,r3,8 + +7: bf cr7*4+1,8f +err1; stw r0,0(r3) + addi r3,r3,4 + +8: bf cr7*4+2,9f +err1; sth r0,0(r3) + addi r3,r3,2 + +9: bf cr7*4+3,10f +err1; stb r0,0(r3) + +10: li r3,0 + blr + +.Llong_clear: + ld r5,PPC64_CACHES@toc(r2) + + bf cr7*4+0,11f +err2; std r0,0(r3) + addi r3,r3,8 + addi r4,r4,-8 + + /* Destination is 16 byte aligned, need to get it cacheline aligned */ +11: lwz r7,DCACHEL1LOGLINESIZE(r5) + lwz r9,DCACHEL1LINESIZE(r5) + + /* + * With worst case alignment the long clear loop takes a minimum + * of 1 byte less than 2 cachelines. + */ + sldi r10,r9,2 + cmpd r4,r10 + blt .Lmedium_clear + + neg r6,r3 + addi r10,r9,-1 + and. r5,r6,r10 + beq 13f + + srdi r6,r5,4 + mtctr r6 + mr r8,r3 +12: +err1; std r0,0(r3) +err1; std r0,8(r3) + addi r3,r3,16 + bdnz 12b + + sub r4,r4,r5 + +13: srd r6,r4,r7 + mtctr r6 + mr r8,r3 +14: +err1; dcbz r0,r3 + add r3,r3,r9 + bdnz 14b + + and r4,r4,r10 + + cmpdi r4,32 + blt .Lshort_clear + b .Lmedium_clear diff --git a/arch/powerpc/lib/usercopy_64.c b/arch/powerpc/lib/usercopy_64.c new file mode 100644 index 000000000..5eea6f3c1 --- /dev/null +++ b/arch/powerpc/lib/usercopy_64.c @@ -0,0 +1,41 @@ +/* + * Functions which are too large to be inlined. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/module.h> +#include <asm/uaccess.h> + +unsigned long copy_from_user(void *to, const void __user *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_READ, from, n))) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} + +unsigned long copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_WRITE, to, n))) + n = __copy_to_user(to, from, n); + return n; +} + +unsigned long copy_in_user(void __user *to, const void __user *from, + unsigned long n) +{ + might_sleep(); + if (likely(access_ok(VERIFY_READ, from, n) && + access_ok(VERIFY_WRITE, to, n))) + n =__copy_tofrom_user(to, from, n); + return n; +} + +EXPORT_SYMBOL(copy_from_user); +EXPORT_SYMBOL(copy_to_user); +EXPORT_SYMBOL(copy_in_user); + diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c new file mode 100644 index 000000000..3cf529cee --- /dev/null +++ b/arch/powerpc/lib/vmx-helper.c @@ -0,0 +1,74 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com> + * Anton Blanchard <anton@au.ibm.com> + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> +#include <asm/switch_to.h> + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + /* This acts as preempt_disable() as well and will make + * enable_kernel_altivec(). We need to disable page faults + * as they can call schedule and thus make us lose the VMX + * context. So on page faults, we just fail which will cause + * a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + pagefault_enable(); + return 0; +} + +int enter_vmx_copy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ + preempt_enable(); + return dest; +} diff --git a/arch/powerpc/lib/xor_vmx.c b/arch/powerpc/lib/xor_vmx.c new file mode 100644 index 000000000..e905f7c2e --- /dev/null +++ b/arch/powerpc/lib/xor_vmx.c @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <altivec.h> + +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <asm/switch_to.h> + +typedef vector signed char unative_t; + +#define DEFINE(V) \ + unative_t *V = (unative_t *)V##_in; \ + unative_t V##_0, V##_1, V##_2, V##_3 + +#define LOAD(V) \ + do { \ + V##_0 = V[0]; \ + V##_1 = V[1]; \ + V##_2 = V[2]; \ + V##_3 = V[3]; \ + } while (0) + +#define STORE(V) \ + do { \ + V[0] = V##_0; \ + V[1] = V##_1; \ + V[2] = V##_2; \ + V[3] = V##_3; \ + } while (0) + +#define XOR(V1, V2) \ + do { \ + V1##_0 = vec_xor(V1##_0, V2##_0); \ + V1##_1 = vec_xor(V1##_1, V2##_1); \ + V1##_2 = vec_xor(V1##_2, V2##_2); \ + V1##_3 = vec_xor(V1##_3, V2##_3); \ + } while (0) + +void xor_altivec_2(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in) +{ + DEFINE(v1); + DEFINE(v2); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + XOR(v1, v2); + STORE(v1); + + v1 += 4; + v2 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_2); + +void xor_altivec_3(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + XOR(v1, v2); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_3); + +void xor_altivec_4(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_4); + +void xor_altivec_5(unsigned long bytes, unsigned long *v1_in, + unsigned long *v2_in, unsigned long *v3_in, + unsigned long *v4_in, unsigned long *v5_in) +{ + DEFINE(v1); + DEFINE(v2); + DEFINE(v3); + DEFINE(v4); + DEFINE(v5); + unsigned long lines = bytes / (sizeof(unative_t)) / 4; + + preempt_disable(); + enable_kernel_altivec(); + + do { + LOAD(v1); + LOAD(v2); + LOAD(v3); + LOAD(v4); + LOAD(v5); + XOR(v1, v2); + XOR(v3, v4); + XOR(v1, v5); + XOR(v1, v3); + STORE(v1); + + v1 += 4; + v2 += 4; + v3 += 4; + v4 += 4; + v5 += 4; + } while (--lines > 0); + + preempt_enable(); +} +EXPORT_SYMBOL(xor_altivec_5); |