diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
commit | 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch) | |
tree | 5e910f0e82173f4ef4f51111366a3f1299037a7b /lib/raid6 |
Initial import
Diffstat (limited to 'lib/raid6')
-rw-r--r-- | lib/raid6/.gitignore | 5 | ||||
-rw-r--r-- | lib/raid6/Makefile | 124 | ||||
-rw-r--r-- | lib/raid6/algos.c | 250 | ||||
-rw-r--r-- | lib/raid6/altivec.uc | 128 | ||||
-rw-r--r-- | lib/raid6/avx2.c | 254 | ||||
-rw-r--r-- | lib/raid6/int.uc | 156 | ||||
-rw-r--r-- | lib/raid6/mktables.c | 158 | ||||
-rw-r--r-- | lib/raid6/mmx.c | 144 | ||||
-rw-r--r-- | lib/raid6/neon.c | 59 | ||||
-rw-r--r-- | lib/raid6/neon.uc | 80 | ||||
-rw-r--r-- | lib/raid6/recov.c | 141 | ||||
-rw-r--r-- | lib/raid6/recov_avx2.c | 323 | ||||
-rw-r--r-- | lib/raid6/recov_ssse3.c | 338 | ||||
-rw-r--r-- | lib/raid6/sse1.c | 164 | ||||
-rw-r--r-- | lib/raid6/sse2.c | 485 | ||||
-rw-r--r-- | lib/raid6/test/Makefile | 126 | ||||
-rw-r--r-- | lib/raid6/test/test.c | 155 | ||||
-rw-r--r-- | lib/raid6/tilegx.uc | 87 | ||||
-rw-r--r-- | lib/raid6/unroll.awk | 20 | ||||
-rw-r--r-- | lib/raid6/x86.h | 70 |
20 files changed, 3267 insertions, 0 deletions
diff --git a/lib/raid6/.gitignore b/lib/raid6/.gitignore new file mode 100644 index 000000000..0a7e494b2 --- /dev/null +++ b/lib/raid6/.gitignore @@ -0,0 +1,5 @@ +mktables +altivec*.c +int*.c +tables.c +neon?.c diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile new file mode 100644 index 000000000..c7dab0645 --- /dev/null +++ b/lib/raid6/Makefile @@ -0,0 +1,124 @@ +obj-$(CONFIG_RAID6_PQ) += raid6_pq.o + +raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \ + int8.o int16.o int32.o + +raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o +raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o +raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o +raid6_pq-$(CONFIG_TILEGX) += tilegx8.o + +hostprogs-y += mktables + +quiet_cmd_unroll = UNROLL $@ + cmd_unroll = $(AWK) -f$(srctree)/$(src)/unroll.awk -vN=$(UNROLL) \ + < $< > $@ || ( rm -f $@ && exit 1 ) + +ifeq ($(CONFIG_ALTIVEC),y) +altivec_flags := -maltivec -mabi=altivec +endif + +# The GCC option -ffreestanding is required in order to compile code containing +# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel) +ifeq ($(CONFIG_KERNEL_MODE_NEON),y) +NEON_FLAGS := -ffreestanding +ifeq ($(ARCH),arm) +NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon +endif +ifeq ($(ARCH),arm64) +CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only +CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only +endif +endif + +targets += int1.c +$(obj)/int1.c: UNROLL := 1 +$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int2.c +$(obj)/int2.c: UNROLL := 2 +$(obj)/int2.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int4.c +$(obj)/int4.c: UNROLL := 4 +$(obj)/int4.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int8.c +$(obj)/int8.c: UNROLL := 8 +$(obj)/int8.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int16.c +$(obj)/int16.c: UNROLL := 16 +$(obj)/int16.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += int32.c +$(obj)/int32.c: UNROLL := 32 +$(obj)/int32.c: $(src)/int.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec1.o += $(altivec_flags) +targets += altivec1.c +$(obj)/altivec1.c: UNROLL := 1 +$(obj)/altivec1.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec2.o += $(altivec_flags) +targets += altivec2.c +$(obj)/altivec2.c: UNROLL := 2 +$(obj)/altivec2.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec4.o += $(altivec_flags) +targets += altivec4.c +$(obj)/altivec4.c: UNROLL := 4 +$(obj)/altivec4.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_altivec8.o += $(altivec_flags) +targets += altivec8.c +$(obj)/altivec8.c: UNROLL := 8 +$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon1.o += $(NEON_FLAGS) +targets += neon1.c +$(obj)/neon1.c: UNROLL := 1 +$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon2.o += $(NEON_FLAGS) +targets += neon2.c +$(obj)/neon2.c: UNROLL := 2 +$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon4.o += $(NEON_FLAGS) +targets += neon4.c +$(obj)/neon4.c: UNROLL := 4 +$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +CFLAGS_neon8.o += $(NEON_FLAGS) +targets += neon8.c +$(obj)/neon8.c: UNROLL := 8 +$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +targets += tilegx8.c +$(obj)/tilegx8.c: UNROLL := 8 +$(obj)/tilegx8.c: $(src)/tilegx.uc $(src)/unroll.awk FORCE + $(call if_changed,unroll) + +quiet_cmd_mktable = TABLE $@ + cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 ) + +targets += tables.c +$(obj)/tables.c: $(obj)/mktables FORCE + $(call if_changed,mktable) diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c new file mode 100644 index 000000000..975c6e043 --- /dev/null +++ b/lib/raid6/algos.c @@ -0,0 +1,250 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/algos.c + * + * Algorithm list and algorithm selection for RAID-6 + */ + +#include <linux/raid/pq.h> +#ifndef __KERNEL__ +#include <sys/mman.h> +#include <stdio.h> +#else +#include <linux/module.h> +#include <linux/gfp.h> +#if !RAID6_USE_EMPTY_ZERO_PAGE +/* In .bss so it's zeroed */ +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +EXPORT_SYMBOL(raid6_empty_zero_page); +#endif +#endif + +struct raid6_calls raid6_call; +EXPORT_SYMBOL_GPL(raid6_call); + +const struct raid6_calls * const raid6_algos[] = { +#if defined(__ia64__) + &raid6_intx16, + &raid6_intx32, +#endif +#if defined(__i386__) && !defined(__arch_um__) + &raid6_mmxx1, + &raid6_mmxx2, + &raid6_sse1x1, + &raid6_sse1x2, + &raid6_sse2x1, + &raid6_sse2x2, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, +#endif +#endif +#if defined(__x86_64__) && !defined(__arch_um__) + &raid6_sse2x1, + &raid6_sse2x2, + &raid6_sse2x4, +#ifdef CONFIG_AS_AVX2 + &raid6_avx2x1, + &raid6_avx2x2, + &raid6_avx2x4, +#endif +#endif +#ifdef CONFIG_ALTIVEC + &raid6_altivec1, + &raid6_altivec2, + &raid6_altivec4, + &raid6_altivec8, +#endif +#if defined(CONFIG_TILEGX) + &raid6_tilegx8, +#endif + &raid6_intx1, + &raid6_intx2, + &raid6_intx4, + &raid6_intx8, +#ifdef CONFIG_KERNEL_MODE_NEON + &raid6_neonx1, + &raid6_neonx2, + &raid6_neonx4, + &raid6_neonx8, +#endif + NULL +}; + +void (*raid6_2data_recov)(int, size_t, int, int, void **); +EXPORT_SYMBOL_GPL(raid6_2data_recov); + +void (*raid6_datap_recov)(int, size_t, int, void **); +EXPORT_SYMBOL_GPL(raid6_datap_recov); + +const struct raid6_recov_calls *const raid6_recov_algos[] = { +#ifdef CONFIG_AS_AVX2 + &raid6_recov_avx2, +#endif +#ifdef CONFIG_AS_SSSE3 + &raid6_recov_ssse3, +#endif + &raid6_recov_intx1, + NULL +}; + +#ifdef __KERNEL__ +#define RAID6_TIME_JIFFIES_LG2 4 +#else +/* Need more time to be stable in userspace */ +#define RAID6_TIME_JIFFIES_LG2 9 +#define time_before(x, y) ((x) < (y)) +#endif + +static inline const struct raid6_recov_calls *raid6_choose_recov(void) +{ + const struct raid6_recov_calls *const *algo; + const struct raid6_recov_calls *best; + + for (best = NULL, algo = raid6_recov_algos; *algo; algo++) + if (!best || (*algo)->priority > best->priority) + if (!(*algo)->valid || (*algo)->valid()) + best = *algo; + + if (best) { + raid6_2data_recov = best->data2; + raid6_datap_recov = best->datap; + + pr_info("raid6: using %s recovery algorithm\n", best->name); + } else + pr_err("raid6: Yikes! No recovery algorithm found!\n"); + + return best; +} + +static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) +{ + unsigned long perf, bestgenperf, bestxorperf, j0, j1; + int start = (disks>>1)-1, stop = disks-3; /* work on the second half of the disks */ + const struct raid6_calls *const *algo; + const struct raid6_calls *best; + + for (bestgenperf = 0, bestxorperf = 0, best = NULL, algo = raid6_algos; *algo; algo++) { + if (!best || (*algo)->prefer >= best->prefer) { + if ((*algo)->valid && !(*algo)->valid()) + continue; + + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { + (*algo)->gen_syndrome(disks, PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + if (perf > bestgenperf) { + bestgenperf = perf; + best = *algo; + } + pr_info("raid6: %-8s gen() %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + + if (!(*algo)->xor_syndrome) + continue; + + perf = 0; + + preempt_disable(); + j0 = jiffies; + while ((j1 = jiffies) == j0) + cpu_relax(); + while (time_before(jiffies, + j1 + (1<<RAID6_TIME_JIFFIES_LG2))) { + (*algo)->xor_syndrome(disks, start, stop, + PAGE_SIZE, *dptrs); + perf++; + } + preempt_enable(); + + if (best == *algo) + bestxorperf = perf; + + pr_info("raid6: %-8s xor() %5ld MB/s\n", (*algo)->name, + (perf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); + } + } + + if (best) { + pr_info("raid6: using algorithm %s gen() %ld MB/s\n", + best->name, + (bestgenperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2)); + if (best->xor_syndrome) + pr_info("raid6: .... xor() %ld MB/s, rmw enabled\n", + (bestxorperf*HZ) >> (20-16+RAID6_TIME_JIFFIES_LG2+1)); + raid6_call = *best; + } else + pr_err("raid6: Yikes! No algorithm found!\n"); + + return best; +} + + +/* Try to pick the best algorithm */ +/* This code uses the gfmul table as convenient data set to abuse */ + +int __init raid6_select_algo(void) +{ + const int disks = (65536/PAGE_SIZE)+2; + + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i; + + for (i = 0; i < disks-2; i++) + dptrs[i] = ((char *)raid6_gfmul) + PAGE_SIZE*i; + + /* Normal code - use a 2-page allocation to avoid D$ conflict */ + syndromes = (void *) __get_free_pages(GFP_KERNEL, 1); + + if (!syndromes) { + pr_err("raid6: Yikes! No memory available.\n"); + return -ENOMEM; + } + + dptrs[disks-2] = syndromes; + dptrs[disks-1] = syndromes + PAGE_SIZE; + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + + free_pages((unsigned long)syndromes, 1); + + return gen_best && rec_best ? 0 : -EINVAL; +} + +static void raid6_exit(void) +{ + do { } while (0); +} + +subsys_initcall(raid6_select_algo); +module_exit(raid6_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("RAID6 Q-syndrome calculations"); diff --git a/lib/raid6/altivec.uc b/lib/raid6/altivec.uc new file mode 100644 index 000000000..bec27fce7 --- /dev/null +++ b/lib/raid6/altivec.uc @@ -0,0 +1,128 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6altivec$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + * + * <benh> hpa: in process, + * you can just "steal" the vec unit with enable_kernel_altivec() (but + * bracked this with preempt_disable/enable or in a lock) + */ + +#include <linux/raid/pq.h> + +#include <altivec.h> +#ifdef __KERNEL__ +# include <asm/cputable.h> +# include <asm/switch_to.h> + +/* + * This is the C data type to use. We use a vector of + * signed char so vec_cmpgt() will generate the right + * instruction. + */ + +typedef vector signed char unative_t; + +#define NBYTES(x) ((vector signed char) {x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + return vec_add(v,v); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t zv = NBYTES(0); + + /* vec_cmpgt returns a vector bool char; thus the need for the cast */ + return (unative_t)vec_cmpgt(zv, v); +} + + +/* This is noinline to make damned sure that gcc doesn't move any of the + Altivec code around the enable/disable code */ +static void noinline +raid6_altivec$#_gen_syndrome_real(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ = vec_xor(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ = vec_and(w2$$, x1d); + w1$$ = vec_xor(w1$$, w2$$); + wq$$ = vec_xor(w1$$, wd$$); + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_altivec$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + preempt_disable(); + enable_kernel_altivec(); + + raid6_altivec$#_gen_syndrome_real(disks, bytes, ptrs); + + preempt_enable(); +} + +int raid6_have_altivec(void); +#if $# == 1 +int raid6_have_altivec(void) +{ + /* This assumes either all CPUs have Altivec or none does */ +# ifdef __KERNEL__ + return cpu_has_feature(CPU_FTR_ALTIVEC); +# else + return 1; +# endif +} +#endif + +const struct raid6_calls raid6_altivec$# = { + raid6_altivec$#_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_altivec, + "altivecx$#", + 0 +}; + +#endif /* CONFIG_ALTIVEC */ diff --git a/lib/raid6/avx2.c b/lib/raid6/avx2.c new file mode 100644 index 000000000..767340043 --- /dev/null +++ b/lib/raid6/avx2.c @@ -0,0 +1,254 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright (C) 2012 Intel Corporation + * Author: Yuanhan Liu <yuanhan.liu@linux.intel.com> + * + * Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * AVX2 implementation of RAID-6 syndrome functions + * + */ + +#ifdef CONFIG_AS_AVX2 + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_avx2_constants { + u64 x1d[4]; +} raid6_avx2_constants __aligned(32) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, + 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,}, +}; + +static int raid6_have_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX); +} + +/* + * Plain AVX2 implementation + */ +static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* Zero temp */ + + for (d = 0; d < bytes; d += 32) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("vmovdqa %ymm2,%ymm4");/* Q[0] */ + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z0-1][d])); + for (z = z0-2; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + asm volatile("vmovdqa %0,%%ymm6" : : "m" (dptr[z][d])); + } + asm volatile("vpcmpgtb %ymm4,%ymm3,%ymm5"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm6,%ymm2,%ymm2"); + asm volatile("vpxor %ymm6,%ymm4,%ymm4"); + + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x1 = { + raid6_avx21_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_avx2, + "avx2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 AVX2 implementation + */ +static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for (d = 0; d < bytes; d += 64) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d+32])); + asm volatile("vmovdqa %0,%%ymm2" : : "m" (dptr[z0][d]));/* P[0] */ + asm volatile("vmovdqa %0,%%ymm3" : : "m" (dptr[z0][d+32]));/* P[1] */ + asm volatile("vmovdqa %ymm2,%ymm4"); /* Q[0] */ + asm volatile("vmovdqa %ymm3,%ymm6"); /* Q[1] */ + for (z = z0-1; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x2 = { + raid6_avx22_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_avx2, + "avx2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 AVX2 implementation + */ +static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0])); + asm volatile("vpxor %ymm1,%ymm1,%ymm1"); /* Zero temp */ + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); /* P[0] */ + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); /* P[1] */ + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); /* Q[0] */ + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); /* Q[1] */ + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); /* P[2] */ + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); /* P[3] */ + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); /* Q[2] */ + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); /* Q[3] */ + + for (d = 0; d < bytes; d += 128) { + for (z = z0; z >= 0; z--) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+32])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+64])); + asm volatile("prefetchnta %0" : : "m" (dptr[z][d+96])); + asm volatile("vpcmpgtb %ymm4,%ymm1,%ymm5"); + asm volatile("vpcmpgtb %ymm6,%ymm1,%ymm7"); + asm volatile("vpcmpgtb %ymm12,%ymm1,%ymm13"); + asm volatile("vpcmpgtb %ymm14,%ymm1,%ymm15"); + asm volatile("vpaddb %ymm4,%ymm4,%ymm4"); + asm volatile("vpaddb %ymm6,%ymm6,%ymm6"); + asm volatile("vpaddb %ymm12,%ymm12,%ymm12"); + asm volatile("vpaddb %ymm14,%ymm14,%ymm14"); + asm volatile("vpand %ymm0,%ymm5,%ymm5"); + asm volatile("vpand %ymm0,%ymm7,%ymm7"); + asm volatile("vpand %ymm0,%ymm13,%ymm13"); + asm volatile("vpand %ymm0,%ymm15,%ymm15"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + asm volatile("vmovdqa %0,%%ymm5" : : "m" (dptr[z][d])); + asm volatile("vmovdqa %0,%%ymm7" : : "m" (dptr[z][d+32])); + asm volatile("vmovdqa %0,%%ymm13" : : "m" (dptr[z][d+64])); + asm volatile("vmovdqa %0,%%ymm15" : : "m" (dptr[z][d+96])); + asm volatile("vpxor %ymm5,%ymm2,%ymm2"); + asm volatile("vpxor %ymm7,%ymm3,%ymm3"); + asm volatile("vpxor %ymm13,%ymm10,%ymm10"); + asm volatile("vpxor %ymm15,%ymm11,%ymm11"); + asm volatile("vpxor %ymm5,%ymm4,%ymm4"); + asm volatile("vpxor %ymm7,%ymm6,%ymm6"); + asm volatile("vpxor %ymm13,%ymm12,%ymm12"); + asm volatile("vpxor %ymm15,%ymm14,%ymm14"); + } + asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d])); + asm volatile("vpxor %ymm2,%ymm2,%ymm2"); + asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32])); + asm volatile("vpxor %ymm3,%ymm3,%ymm3"); + asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64])); + asm volatile("vpxor %ymm10,%ymm10,%ymm10"); + asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96])); + asm volatile("vpxor %ymm11,%ymm11,%ymm11"); + asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d])); + asm volatile("vpxor %ymm4,%ymm4,%ymm4"); + asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32])); + asm volatile("vpxor %ymm6,%ymm6,%ymm6"); + asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64])); + asm volatile("vpxor %ymm12,%ymm12,%ymm12"); + asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96])); + asm volatile("vpxor %ymm14,%ymm14,%ymm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_avx2x4 = { + raid6_avx24_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_avx2, + "avx2x4", + 1 /* Has cache hints */ +}; +#endif + +#endif /* CONFIG_AS_AVX2 */ diff --git a/lib/raid6/int.uc b/lib/raid6/int.uc new file mode 100644 index 000000000..558aeac93 --- /dev/null +++ b/lib/raid6/int.uc @@ -0,0 +1,156 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * int$#.c + * + * $#-way unrolled portable integer math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <linux/raid/pq.h> + +/* + * This is the C data type to use + */ + +/* Change this from BITS_PER_LONG if there is something better... */ +#if BITS_PER_LONG == 64 +# define NBYTES(x) ((x) * 0x0101010101010101UL) +# define NSIZE 8 +# define NSHIFT 3 +# define NSTRING "64" +typedef u64 unative_t; +#else +# define NBYTES(x) ((x) * 0x01010101U) +# define NSIZE 4 +# define NSHIFT 2 +# define NSTRING "32" +typedef u32 unative_t; +#endif + + + +/* + * IA-64 wants insane amounts of unrolling. On other architectures that + * is just a waste of space. + */ +#if ($# <= 8) || defined(__ia64__) + + +/* + * These sub-operations are separate inlines since they can sometimes be + * specially optimized using architecture-specific hacks. + */ + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ unative_t SHLBYTE(unative_t v) +{ + unative_t vv; + + vv = (v << 1) & NBYTES(0xfe); + return vv; +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ unative_t MASK(unative_t v) +{ + unative_t vv; + + vv = v & NBYTES(0x80); + vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */ + return vv; +} + + +static void raid6_int$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + *(unative_t *)&p[d+NSIZE*$$] = wp$$; + *(unative_t *)&q[d+NSIZE*$$] = wq$$; + } +} + +static void raid6_int$#_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + /* P/Q data pages */ + wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; + for ( z = z0-1 ; z >= start ; z-- ) { + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; + wp$$ ^= wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + w1$$ ^= w2$$; + wq$$ = w1$$ ^ wd$$; + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ &= NBYTES(0x1d); + wq$$ = w1$$ ^ w2$$; + } + *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + } + +} + +const struct raid6_calls raid6_intx$# = { + raid6_int$#_gen_syndrome, + raid6_int$#_xor_syndrome, + NULL, /* always valid */ + "int" NSTRING "x$#", + 0 +}; + +#endif diff --git a/lib/raid6/mktables.c b/lib/raid6/mktables.c new file mode 100644 index 000000000..39787db58 --- /dev/null +++ b/lib/raid6/mktables.c @@ -0,0 +1,158 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * mktables.c + * + * Make RAID-6 tables. This is a host user space program to be run at + * compile time. + */ + +#include <stdio.h> +#include <string.h> +#include <inttypes.h> +#include <stdlib.h> +#include <time.h> + +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int main(int argc, char *argv[]) +{ + int i, j, k; + uint8_t v; + uint8_t exptbl[256], invtbl[256]; + + printf("#include <linux/raid/pq.h>\n"); + printf("#include <linux/export.h>\n"); + + /* Compute multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfmul[256][256] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 256; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfmul);\n"); + printf("#endif\n"); + + /* Compute vector multiplication table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_vgfmul[256][32] =\n" + "{\n"); + for (i = 0; i < 256; i++) { + printf("\t{\n"); + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, j + k), + (k == 7) ? '\n' : ' '); + } + for (j = 0; j < 16; j += 8) { + printf("\t\t"); + for (k = 0; k < 8; k++) + printf("0x%02x,%c", gfmul(i, (j + k) << 4), + (k == 7) ? '\n' : ' '); + } + printf("\t},\n"); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_vgfmul);\n"); + printf("#endif\n"); + + /* Compute power-of-2 table (exponent) */ + v = 1; + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexp[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + exptbl[i + j] = v; + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexp);\n"); + printf("#endif\n"); + + /* Compute inverse table x^-1 == x^254 */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfinv[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) { + invtbl[i + j] = v = gfpow(i + j, 254); + printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); + } + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfinv);\n"); + printf("#endif\n"); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + printf("\nconst u8 __attribute__((aligned(256)))\n" + "raid6_gfexi[256] =\n" "{\n"); + for (i = 0; i < 256; i += 8) { + printf("\t"); + for (j = 0; j < 8; j++) + printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], + (j == 7) ? '\n' : ' '); + } + printf("};\n"); + printf("#ifdef __KERNEL__\n"); + printf("EXPORT_SYMBOL(raid6_gfexi);\n"); + printf("#endif\n"); + + return 0; +} diff --git a/lib/raid6/mmx.c b/lib/raid6/mmx.c new file mode 100644 index 000000000..b3b0e1fcd --- /dev/null +++ b/lib/raid6/mmx.c @@ -0,0 +1,144 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/mmx.c + * + * MMX implementation of RAID-6 syndrome functions + */ + +#ifdef CONFIG_X86_32 + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Shared with raid6/sse1.c */ +const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants = { + 0x1d1d1d1d1d1d1d1dULL, +}; + +static int raid6_have_mmx(void) +{ + /* Not really "boot_cpu" but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX); +} + +/* + * Plain MMX implementation + */ +static void raid6_mmx1_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("pxor %mm2,%mm2"); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("pxor %mm4,%mm4"); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx1 = { + raid6_mmx1_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_mmx, + "mmxx1", + 0 +}; + +/* + * Unrolled-by-2 MMX implementation + */ +static void raid6_mmx2_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movq %%mm2,%0" : "=m" (p[d])); + asm volatile("movq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movq %%mm4,%0" : "=m" (q[d])); + asm volatile("movq %%mm6,%0" : "=m" (q[d+8])); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_mmxx2 = { + raid6_mmx2_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_mmx, + "mmxx2", + 0 +}; + +#endif diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c new file mode 100644 index 000000000..d9ad6ee28 --- /dev/null +++ b/lib/raid6/neon.c @@ -0,0 +1,59 @@ +/* + * linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics + * + * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/raid/pq.h> + +#ifdef __KERNEL__ +#include <asm/neon.h> +#else +#define kernel_neon_begin() +#define kernel_neon_end() +#define cpu_has_neon() (1) +#endif + +/* + * There are 2 reasons these wrappers are kept in a separate compilation unit + * from the actual implementations in neonN.c (generated from neon.uc by + * unroll.awk): + * - the actual implementations use NEON intrinsics, and the GCC support header + * (arm_neon.h) is not fully compatible (type wise) with the kernel; + * - the neonN.c files are compiled with -mfpu=neon and optimization enabled, + * and we have to make sure that we never use *any* NEON/VFP instructions + * outside a kernel_neon_begin()/kernel_neon_end() pair. + */ + +#define RAID6_NEON_WRAPPER(_n) \ + static void raid6_neon ## _n ## _gen_syndrome(int disks, \ + size_t bytes, void **ptrs) \ + { \ + void raid6_neon ## _n ## _gen_syndrome_real(int, \ + unsigned long, void**); \ + kernel_neon_begin(); \ + raid6_neon ## _n ## _gen_syndrome_real(disks, \ + (unsigned long)bytes, ptrs); \ + kernel_neon_end(); \ + } \ + struct raid6_calls const raid6_neonx ## _n = { \ + raid6_neon ## _n ## _gen_syndrome, \ + NULL, /* XOR not yet implemented */ \ + raid6_have_neon, \ + "neonx" #_n, \ + 0 \ + } + +static int raid6_have_neon(void) +{ + return cpu_has_neon(); +} + +RAID6_NEON_WRAPPER(1); +RAID6_NEON_WRAPPER(2); +RAID6_NEON_WRAPPER(4); +RAID6_NEON_WRAPPER(8); diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc new file mode 100644 index 000000000..1b9ed7933 --- /dev/null +++ b/lib/raid6/neon.uc @@ -0,0 +1,80 @@ +/* ----------------------------------------------------------------------- + * + * neon.uc - RAID-6 syndrome calculation using ARM NEON instructions + * + * Copyright (C) 2012 Rob Herring + * + * Based on altivec.uc: + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * neon$#.c + * + * $#-way unrolled NEON intrinsics math RAID-6 instruction set + * + * This file is postprocessed using unroll.awk + */ + +#include <arm_neon.h> + +typedef uint8x16_t unative_t; + +#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x}) +#define NSIZE sizeof(unative_t) + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline unative_t SHLBYTE(unative_t v) +{ + return vshlq_n_u8(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline unative_t MASK(unative_t v) +{ + const uint8x16_t temp = NBYTES(0); + return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp); +} + +void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs) +{ + uint8_t **dptr = (uint8_t **)ptrs; + uint8_t *p, *q; + int d, z, z0; + + register unative_t wd$$, wq$$, wp$$, w1$$, w2$$; + const unative_t x1d = NBYTES(0x1d); + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]); + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]); + wp$$ = veorq_u8(wp$$, wd$$); + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + + w2$$ = vandq_u8(w2$$, x1d); + w1$$ = veorq_u8(w1$$, w2$$); + wq$$ = veorq_u8(w1$$, wd$$); + } + vst1q_u8(&p[d+NSIZE*$$], wp$$); + vst1q_u8(&q[d+NSIZE*$$], wq$$); + } +} diff --git a/lib/raid6/recov.c b/lib/raid6/recov.c new file mode 100644 index 000000000..a95bccb84 --- /dev/null +++ b/lib/raid6/recov.c @@ -0,0 +1,141 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/recov.c + * + * RAID-6 data recovery in dual failure mode. In single failure mode, + * use the RAID-5 algorithm (or, in the case of Q failure, just reconstruct + * the syndrome.) + */ + +#include <linux/export.h> +#include <linux/raid/pq.h> + +/* Recover two failed data blocks. */ +static void raid6_2data_recov_intx1(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + u8 px, qx, db; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... */ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +static void raid6_datap_recov_intx1(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + + +const struct raid6_recov_calls raid6_recov_intx1 = { + .data2 = raid6_2data_recov_intx1, + .datap = raid6_datap_recov_intx1, + .valid = NULL, + .name = "intx1", + .priority = 0, +}; + +#ifndef __KERNEL__ +/* Testing only */ + +/* Recover two failed blocks. */ +void raid6_dual_recov(int disks, size_t bytes, int faila, int failb, void **ptrs) +{ + if ( faila > failb ) { + int tmp = faila; + faila = failb; + failb = tmp; + } + + if ( failb == disks-1 ) { + if ( faila == disks-2 ) { + /* P+Q failure. Just rebuild the syndrome. */ + raid6_call.gen_syndrome(disks, bytes, ptrs); + } else { + /* data+Q failure. Reconstruct data from P, + then rebuild syndrome. */ + /* NOT IMPLEMENTED - equivalent to RAID-5 */ + } + } else { + if ( failb == disks-2 ) { + /* data+P failure. */ + raid6_datap_recov(disks, bytes, faila, ptrs); + } else { + /* data+data failure. */ + raid6_2data_recov(disks, bytes, faila, failb, ptrs); + } + } +} + +#endif diff --git a/lib/raid6/recov_avx2.c b/lib/raid6/recov_avx2.c new file mode 100644 index 000000000..53fe3d7bd --- /dev/null +++ b/lib/raid6/recov_avx2.c @@ -0,0 +1,323 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas <james.t.kukunas@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#ifdef CONFIG_AS_AVX2 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_avx2(void) +{ + return boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX); +} + +static void raid6_2data_recov_avx2(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* ymm0 = x0f[16] */ + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm1" : : "m" (q[0])); + asm volatile("vmovdqa %0, %%ymm9" : : "m" (q[32])); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (p[32])); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm9, %%ymm9" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (dp[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (dp[32])); + + /* + * 1 = dq[0] ^ q[0] + * 9 = dq[32] ^ q[32] + * 0 = dp[0] ^ p[0] + * 8 = dp[32] ^ p[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpsraw $4, %ymm9, %ymm12"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm9, %ymm9"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm9, %ymm4, %ymm14"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm12, %ymm5, %ymm15"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm14, %ymm15, %ymm15"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* + * 5 = qx[0] + * 15 = qx[32] + */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpsraw $4, %ymm8, %ymm6"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm14"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm14, %ymm4, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm13"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + asm volatile("vpxor %ymm12, %ymm13, %ymm13"); + + /* + * 1 = pbmul[px[0]] + * 13 = pbmul[px[32]] + */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + asm volatile("vpxor %ymm15, %ymm13, %ymm13"); + + /* + * 1 = db = DQ + * 13 = db[32] = DQ[32] + */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm13,%0" : "=m" (dq[32])); + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vpxor %ymm13, %ymm8, %ymm8"); + + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + asm volatile("vmovdqa %%ymm8, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm1" : : "m" (*q)); + asm volatile("vmovdqa %0, %%ymm0" : : "m" (*p)); + asm volatile("vpxor %0, %%ymm1, %%ymm1" : : "m" (*dq)); + asm volatile("vpxor %0, %%ymm0, %%ymm0" : : "m" (*dp)); + + /* 1 = dq ^ q; 0 = dp ^ p */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm5" : : "m" (qmul[16])); + + /* + * 1 = dq ^ q + * 3 = dq ^ p >> 4 + */ + asm volatile("vpsraw $4, %ymm1, %ymm3"); + asm volatile("vpand %ymm7, %ymm1, %ymm1"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpshufb %ymm1, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm3, %ymm5, %ymm5"); + asm volatile("vpxor %ymm4, %ymm5, %ymm5"); + + /* 5 = qx */ + + asm volatile("vbroadcasti128 %0, %%ymm4" : : "m" (pbmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (pbmul[16])); + + asm volatile("vpsraw $4, %ymm0, %ymm2"); + asm volatile("vpand %ymm7, %ymm0, %ymm3"); + asm volatile("vpand %ymm7, %ymm2, %ymm2"); + asm volatile("vpshufb %ymm3, %ymm4, %ymm4"); + asm volatile("vpshufb %ymm2, %ymm1, %ymm1"); + asm volatile("vpxor %ymm4, %ymm1, %ymm1"); + + /* 1 = pbmul[px] */ + asm volatile("vpxor %ymm5, %ymm1, %ymm1"); + /* 1 = db = DQ */ + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + + asm volatile("vpxor %ymm1, %ymm0, %ymm0"); + asm volatile("vmovdqa %%ymm0, %0" : "=m" (dp[0])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_avx2(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + const u8 x0f = 0x0f; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("vpbroadcastb %0, %%ymm7" : : "m" (x0f)); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vmovdqa %0, %%ymm8" : : "m" (dq[32])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + asm volatile("vpxor %0, %%ymm8, %%ymm8" : : "m" (q[32])); + + /* + * 3 = q[0] ^ dq[0] + * 8 = q[32] ^ dq[32] + */ + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vmovapd %ymm0, %ymm13"); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + asm volatile("vmovapd %ymm1, %ymm14"); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpsraw $4, %ymm8, %ymm12"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm8, %ymm8"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpand %ymm7, %ymm12, %ymm12"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm8, %ymm13, %ymm13"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpshufb %ymm12, %ymm14, %ymm14"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + asm volatile("vpxor %ymm13, %ymm14, %ymm14"); + + /* + * 1 = qmul[q[0] ^ dq[0]] + * 14 = qmul[q[32] ^ dq[32]] + */ + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vmovdqa %0, %%ymm12" : : "m" (p[32])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + asm volatile("vpxor %ymm14, %ymm12, %ymm12"); + + /* + * 2 = p[0] ^ qmul[q[0] ^ dq[0]] + * 12 = p[32] ^ qmul[q[32] ^ dq[32]] + */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm14, %0" : "=m" (dq[32])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + asm volatile("vmovdqa %%ymm12,%0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; +#else + asm volatile("vmovdqa %0, %%ymm3" : : "m" (dq[0])); + asm volatile("vpxor %0, %%ymm3, %%ymm3" : : "m" (q[0])); + + /* 3 = q ^ dq */ + + asm volatile("vbroadcasti128 %0, %%ymm0" : : "m" (qmul[0])); + asm volatile("vbroadcasti128 %0, %%ymm1" : : "m" (qmul[16])); + + asm volatile("vpsraw $4, %ymm3, %ymm6"); + asm volatile("vpand %ymm7, %ymm3, %ymm3"); + asm volatile("vpand %ymm7, %ymm6, %ymm6"); + asm volatile("vpshufb %ymm3, %ymm0, %ymm0"); + asm volatile("vpshufb %ymm6, %ymm1, %ymm1"); + asm volatile("vpxor %ymm0, %ymm1, %ymm1"); + + /* 1 = qmul[q ^ dq] */ + + asm volatile("vmovdqa %0, %%ymm2" : : "m" (p[0])); + asm volatile("vpxor %ymm1, %ymm2, %ymm2"); + + /* 2 = p ^ qmul[q ^ dq] */ + + asm volatile("vmovdqa %%ymm1, %0" : "=m" (dq[0])); + asm volatile("vmovdqa %%ymm2, %0" : "=m" (p[0])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_avx2 = { + .data2 = raid6_2data_recov_avx2, + .datap = raid6_datap_recov_avx2, + .valid = raid6_has_avx2, +#ifdef CONFIG_X86_64 + .name = "avx2x2", +#else + .name = "avx2x1", +#endif + .priority = 2, +}; + +#else +#warning "your version of binutils lacks AVX2 support" +#endif diff --git a/lib/raid6/recov_ssse3.c b/lib/raid6/recov_ssse3.c new file mode 100644 index 000000000..cda33e56a --- /dev/null +++ b/lib/raid6/recov_ssse3.c @@ -0,0 +1,338 @@ +/* + * Copyright (C) 2012 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#ifdef CONFIG_AS_SSSE3 + +#include <linux/raid/pq.h> +#include "x86.h" + +static int raid6_has_ssse3(void) +{ + return boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2) && + boot_cpu_has(X86_FEATURE_SSSE3); +} + +static void raid6_2data_recov_ssse3(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks-2] = p; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ + raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm7" : : "m" (x0f[0])); + +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0,%%xmm6" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm14" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm15" : : "m" (pbmul[16])); +#endif + + /* Now do it... */ + while (bytes) { +#ifdef CONFIG_X86_64 + /* xmm6, xmm14, xmm15 */ + + asm volatile("movdqa %0,%%xmm1" : : "m" (q[0])); + asm volatile("movdqa %0,%%xmm9" : : "m" (q[16])); + asm volatile("movdqa %0,%%xmm0" : : "m" (p[0])); + asm volatile("movdqa %0,%%xmm8" : : "m" (p[16])); + asm volatile("pxor %0,%%xmm1" : : "m" (dq[0])); + asm volatile("pxor %0,%%xmm9" : : "m" (dq[16])); + asm volatile("pxor %0,%%xmm0" : : "m" (dp[0])); + asm volatile("pxor %0,%%xmm8" : : "m" (dp[16])); + + /* xmm0/8 = px */ + + asm volatile("movdqa %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + asm volatile("movdqa %xmm6,%xmm12"); + asm volatile("movdqa %xmm5,%xmm13"); + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("movdqa %xmm9,%xmm11"); + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2/10 = px */ + asm volatile("movdqa %xmm8,%xmm10"); + asm volatile("psraw $4,%xmm1"); + asm volatile("psraw $4,%xmm9"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pand %xmm7,%xmm9"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pshufb %xmm9,%xmm13"); + asm volatile("pxor %xmm4,%xmm5"); + asm volatile("pxor %xmm12,%xmm13"); + + /* xmm5/13 = qx */ + + asm volatile("movdqa %xmm14,%xmm4"); + asm volatile("movdqa %xmm15,%xmm1"); + asm volatile("movdqa %xmm14,%xmm12"); + asm volatile("movdqa %xmm15,%xmm9"); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("movdqa %xmm10,%xmm11"); + asm volatile("psraw $4,%xmm2"); + asm volatile("psraw $4,%xmm10"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm11"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pand %xmm7,%xmm10"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm11,%xmm12"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pshufb %xmm10,%xmm9"); + asm volatile("pxor %xmm4,%xmm1"); + asm volatile("pxor %xmm12,%xmm9"); + + /* xmm1/9 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + asm volatile("pxor %xmm13,%xmm9"); + /* xmm1/9 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm9,%0" : "=m" (dq[16])); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("pxor %xmm9,%xmm8"); + asm volatile("movdqa %%xmm0,%0" : "=m" (dp[0])); + asm volatile("movdqa %%xmm8,%0" : "=m" (dp[16])); + + bytes -= 32; + p += 32; + q += 32; + dp += 32; + dq += 32; +#else + asm volatile("movdqa %0,%%xmm1" : : "m" (*q)); + asm volatile("movdqa %0,%%xmm0" : : "m" (*p)); + asm volatile("pxor %0,%%xmm1" : : "m" (*dq)); + asm volatile("pxor %0,%%xmm0" : : "m" (*dp)); + + /* 1 = dq ^ q + * 0 = dp ^ p + */ + asm volatile("movdqa %0,%%xmm4" : : "m" (qmul[0])); + asm volatile("movdqa %0,%%xmm5" : : "m" (qmul[16])); + + asm volatile("movdqa %xmm1,%xmm3"); + asm volatile("psraw $4,%xmm1"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm1"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm1,%xmm5"); + asm volatile("pxor %xmm4,%xmm5"); + + asm volatile("movdqa %xmm0,%xmm2"); /* xmm2 = px */ + + /* xmm5 = qx */ + + asm volatile("movdqa %0,%%xmm4" : : "m" (pbmul[0])); + asm volatile("movdqa %0,%%xmm1" : : "m" (pbmul[16])); + asm volatile("movdqa %xmm2,%xmm3"); + asm volatile("psraw $4,%xmm2"); + asm volatile("pand %xmm7,%xmm3"); + asm volatile("pand %xmm7,%xmm2"); + asm volatile("pshufb %xmm3,%xmm4"); + asm volatile("pshufb %xmm2,%xmm1"); + asm volatile("pxor %xmm4,%xmm1"); + + /* xmm1 = pbmul[px] */ + asm volatile("pxor %xmm5,%xmm1"); + /* xmm1 = db = DQ */ + asm volatile("movdqa %%xmm1,%0" : "=m" (*dq)); + + asm volatile("pxor %xmm1,%xmm0"); + asm volatile("movdqa %%xmm0,%0" : "=m" (*dp)); + + bytes -= 16; + p += 16; + q += 16; + dp += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + + +static void raid6_datap_recov_ssse3(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + static const u8 __aligned(16) x0f[16] = { + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, + 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f}; + + p = (u8 *)ptrs[disks-2]; + q = (u8 *)ptrs[disks-1]; + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks-1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks-1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + asm volatile("movdqa %0, %%xmm7" : : "m" (x0f[0])); + + while (bytes) { +#ifdef CONFIG_X86_64 + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm4" : : "m" (dq[16])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + + /* xmm3 = q[0] ^ dq[0] */ + + asm volatile("pxor %0, %%xmm4" : : "m" (q[16])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm4 = q[16] ^ dq[16] */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %xmm4, %xmm8"); + + /* xmm4 = xmm8 = q[16] ^ dq[16] */ + + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("movdqa %0, %%xmm10" : : "m" (qmul[0])); + asm volatile("pxor %xmm0, %xmm1"); + asm volatile("movdqa %0, %%xmm11" : : "m" (qmul[16])); + + /* xmm1 = qmul[q[0] ^ dq[0]] */ + + asm volatile("psraw $4, %xmm4"); + asm volatile("pand %xmm7, %xmm8"); + asm volatile("pand %xmm7, %xmm4"); + asm volatile("pshufb %xmm8, %xmm10"); + asm volatile("pshufb %xmm4, %xmm11"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("pxor %xmm10, %xmm11"); + asm volatile("movdqa %0, %%xmm12" : : "m" (p[16])); + + /* xmm11 = qmul[q[16] ^ dq[16]] */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = p[0] ^ qmul[q[0] ^ dq[0]] */ + + asm volatile("pxor %xmm11, %xmm12"); + + /* xmm12 = p[16] ^ qmul[q[16] ^ dq[16]] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm11, %0" : "=m" (dq[16])); + + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + asm volatile("movdqa %%xmm12, %0" : "=m" (p[16])); + + bytes -= 32; + p += 32; + q += 32; + dq += 32; + +#else + asm volatile("movdqa %0, %%xmm3" : : "m" (dq[0])); + asm volatile("movdqa %0, %%xmm0" : : "m" (qmul[0])); + asm volatile("pxor %0, %%xmm3" : : "m" (q[0])); + asm volatile("movdqa %0, %%xmm1" : : "m" (qmul[16])); + + /* xmm3 = *q ^ *dq */ + + asm volatile("movdqa %xmm3, %xmm6"); + asm volatile("movdqa %0, %%xmm2" : : "m" (p[0])); + asm volatile("psraw $4, %xmm3"); + asm volatile("pand %xmm7, %xmm6"); + asm volatile("pand %xmm7, %xmm3"); + asm volatile("pshufb %xmm6, %xmm0"); + asm volatile("pshufb %xmm3, %xmm1"); + asm volatile("pxor %xmm0, %xmm1"); + + /* xmm1 = qmul[*q ^ *dq */ + + asm volatile("pxor %xmm1, %xmm2"); + + /* xmm2 = *p ^ qmul[*q ^ *dq] */ + + asm volatile("movdqa %%xmm1, %0" : "=m" (dq[0])); + asm volatile("movdqa %%xmm2, %0" : "=m" (p[0])); + + bytes -= 16; + p += 16; + q += 16; + dq += 16; +#endif + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_ssse3 = { + .data2 = raid6_2data_recov_ssse3, + .datap = raid6_datap_recov_ssse3, + .valid = raid6_has_ssse3, +#ifdef CONFIG_X86_64 + .name = "ssse3x2", +#else + .name = "ssse3x1", +#endif + .priority = 1, +}; + +#else +#warning "your version of binutils lacks SSSE3 support" +#endif diff --git a/lib/raid6/sse1.c b/lib/raid6/sse1.c new file mode 100644 index 000000000..9025b8ca9 --- /dev/null +++ b/lib/raid6/sse1.c @@ -0,0 +1,164 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse1.c + * + * SSE-1/MMXEXT implementation of RAID-6 syndrome functions + * + * This is really an MMX implementation, but it requires SSE-1 or + * AMD MMXEXT for prefetch support and a few other features. The + * support for nontemporal memory accesses is enough to make this + * worthwhile as a separate implementation. + */ + +#ifdef CONFIG_X86_32 + +#include <linux/raid/pq.h> +#include "x86.h" + +/* Defined in raid6/mmx.c */ +extern const struct raid6_mmx_constants { + u64 x1d; +} raid6_mmx_constants; + +static int raid6_have_sse1_or_mmxext(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + (boot_cpu_has(X86_FEATURE_XMM) || + boot_cpu_has(X86_FEATURE_MMXEXT)); +} + +/* + * Plain SSE1 implementation + */ +static void raid6_sse11_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 8 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %0,%%mm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + asm volatile("movq %0,%%mm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm6,%mm2"); + asm volatile("pxor %mm6,%mm4"); + + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x1 = { + raid6_sse11_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_sse1_or_mmxext, + "sse1x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE1 implementation + */ +static void raid6_sse12_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movq %0,%%mm0" : : "m" (raid6_mmx_constants.x1d)); + asm volatile("pxor %mm5,%mm5"); /* Zero temp */ + asm volatile("pxor %mm7,%mm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 16 bytes */ + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movq %0,%%mm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movq %0,%%mm3" : : "m" (dptr[z0][d+8])); /* P[1] */ + asm volatile("movq %mm2,%mm4"); /* Q[0] */ + asm volatile("movq %mm3,%mm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %mm4,%mm5"); + asm volatile("pcmpgtb %mm6,%mm7"); + asm volatile("paddb %mm4,%mm4"); + asm volatile("paddb %mm6,%mm6"); + asm volatile("pand %mm0,%mm5"); + asm volatile("pand %mm0,%mm7"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("movq %0,%%mm5" : : "m" (dptr[z][d])); + asm volatile("movq %0,%%mm7" : : "m" (dptr[z][d+8])); + asm volatile("pxor %mm5,%mm2"); + asm volatile("pxor %mm7,%mm3"); + asm volatile("pxor %mm5,%mm4"); + asm volatile("pxor %mm7,%mm6"); + asm volatile("pxor %mm5,%mm5"); + asm volatile("pxor %mm7,%mm7"); + } + asm volatile("movntq %%mm2,%0" : "=m" (p[d])); + asm volatile("movntq %%mm3,%0" : "=m" (p[d+8])); + asm volatile("movntq %%mm4,%0" : "=m" (q[d])); + asm volatile("movntq %%mm6,%0" : "=m" (q[d+8])); + } + + asm volatile("sfence" : :: "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse1x2 = { + raid6_sse12_gen_syndrome, + NULL, /* XOR not yet implemented */ + raid6_have_sse1_or_mmxext, + "sse1x2", + 1 /* Has cache hints */ +}; + +#endif diff --git a/lib/raid6/sse2.c b/lib/raid6/sse2.c new file mode 100644 index 000000000..1d2276b00 --- /dev/null +++ b/lib/raid6/sse2.c @@ -0,0 +1,485 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/sse2.c + * + * SSE-2 implementation of RAID-6 syndrome functions + * + */ + +#include <linux/raid/pq.h> +#include "x86.h" + +static const struct raid6_sse_constants { + u64 x1d[2]; +} raid6_sse_constants __attribute__((aligned(16))) = { + { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL }, +}; + +static int raid6_have_sse2(void) +{ + /* Not really boot_cpu but "all_cpus" */ + return boot_cpu_has(X86_FEATURE_MMX) && + boot_cpu_has(X86_FEATURE_FXSR) && + boot_cpu_has(X86_FEATURE_XMM) && + boot_cpu_has(X86_FEATURE_XMM2); +} + +/* + * Plain SSE2 implementation + */ +static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d])); + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d])); + for ( z = z0-2 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d])); + } + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm6,%xmm2"); + asm volatile("pxor %xmm6,%xmm4"); + + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + +static void raid6_sse21_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 16 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("pxor %xmm4,%xmm2"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm5,%xmm4"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pxor %xmm5,%xmm4"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + +const struct raid6_calls raid6_sse2x1 = { + raid6_sse21_gen_syndrome, + raid6_sse21_xor_syndrome, + raid6_have_sse2, + "sse2x1", + 1 /* Has cache hints */ +}; + +/* + * Unrolled-by-2 SSE2 implementation + */ +static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + + /* We uniformly assume a single prefetch covers at least 32 bytes */ + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */ + asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */ + asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */ + asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */ + for ( z = z0-1 ; z >= 0 ; z-- ) { + asm volatile("prefetchnta %0" : : "m" (dptr[z][d])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + static void raid6_sse22_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 32 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + } + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + /* Don't use movntdq for r/w memory area < cache line */ + asm volatile("movdqa %%xmm4,%0" : "=m" (q[d])); + asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movdqa %%xmm2,%0" : "=m" (p[d])); + asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16])); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + +const struct raid6_calls raid6_sse2x2 = { + raid6_sse22_gen_syndrome, + raid6_sse22_xor_syndrome, + raid6_have_sse2, + "sse2x2", + 1 /* Has cache hints */ +}; + +#ifdef CONFIG_X86_64 + +/* + * Unrolled-by-4 SSE2 implementation + */ +static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + asm volatile("pxor %xmm2,%xmm2"); /* P[0] */ + asm volatile("pxor %xmm3,%xmm3"); /* P[1] */ + asm volatile("pxor %xmm4,%xmm4"); /* Q[0] */ + asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */ + asm volatile("pxor %xmm6,%xmm6"); /* Q[1] */ + asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */ + asm volatile("pxor %xmm10,%xmm10"); /* P[2] */ + asm volatile("pxor %xmm11,%xmm11"); /* P[3] */ + asm volatile("pxor %xmm12,%xmm12"); /* Q[2] */ + asm volatile("pxor %xmm13,%xmm13"); /* Zero temp */ + asm volatile("pxor %xmm14,%xmm14"); /* Q[3] */ + asm volatile("pxor %xmm15,%xmm15"); /* Zero temp */ + + for ( d = 0 ; d < bytes ; d += 64 ) { + for ( z = z0 ; z >= 0 ; z-- ) { + /* The second prefetch seems to improve performance... */ + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("pxor %xmm2,%xmm2"); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("pxor %xmm3,%xmm3"); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("pxor %xmm10,%xmm10"); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %xmm11,%xmm11"); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("pxor %xmm4,%xmm4"); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("pxor %xmm6,%xmm6"); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("pxor %xmm12,%xmm12"); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + asm volatile("pxor %xmm14,%xmm14"); + } + + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); +} + + static void raid6_sse24_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) + { + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0])); + + for ( d = 0 ; d < bytes ; d += 64 ) { + asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d])); + asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16])); + asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32])); + asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48])); + asm volatile("movdqa %0,%%xmm2" : : "m" (p[d])); + asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16])); + asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32])); + asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48])); + asm volatile("pxor %xmm4,%xmm2"); + asm volatile("pxor %xmm6,%xmm3"); + asm volatile("pxor %xmm12,%xmm10"); + asm volatile("pxor %xmm14,%xmm11"); + /* P/Q data pages */ + for ( z = z0-1 ; z >= start ; z-- ) { + asm volatile("prefetchnta %0" :: "m" (dptr[z][d])); + asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32])); + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d])); + asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16])); + asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32])); + asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48])); + asm volatile("pxor %xmm5,%xmm2"); + asm volatile("pxor %xmm7,%xmm3"); + asm volatile("pxor %xmm13,%xmm10"); + asm volatile("pxor %xmm15,%xmm11"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("prefetchnta %0" :: "m" (q[d])); + asm volatile("prefetchnta %0" :: "m" (q[d+32])); + /* P/Q left side optimization */ + for ( z = start-1 ; z >= 0 ; z-- ) { + asm volatile("pxor %xmm5,%xmm5"); + asm volatile("pxor %xmm7,%xmm7"); + asm volatile("pxor %xmm13,%xmm13"); + asm volatile("pxor %xmm15,%xmm15"); + asm volatile("pcmpgtb %xmm4,%xmm5"); + asm volatile("pcmpgtb %xmm6,%xmm7"); + asm volatile("pcmpgtb %xmm12,%xmm13"); + asm volatile("pcmpgtb %xmm14,%xmm15"); + asm volatile("paddb %xmm4,%xmm4"); + asm volatile("paddb %xmm6,%xmm6"); + asm volatile("paddb %xmm12,%xmm12"); + asm volatile("paddb %xmm14,%xmm14"); + asm volatile("pand %xmm0,%xmm5"); + asm volatile("pand %xmm0,%xmm7"); + asm volatile("pand %xmm0,%xmm13"); + asm volatile("pand %xmm0,%xmm15"); + asm volatile("pxor %xmm5,%xmm4"); + asm volatile("pxor %xmm7,%xmm6"); + asm volatile("pxor %xmm13,%xmm12"); + asm volatile("pxor %xmm15,%xmm14"); + } + asm volatile("movntdq %%xmm2,%0" : "=m" (p[d])); + asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16])); + asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32])); + asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48])); + asm volatile("pxor %0,%%xmm4" : : "m" (q[d])); + asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16])); + asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32])); + asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48])); + asm volatile("movntdq %%xmm4,%0" : "=m" (q[d])); + asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16])); + asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32])); + asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48])); + } + asm volatile("sfence" : : : "memory"); + kernel_fpu_end(); + } + + +const struct raid6_calls raid6_sse2x4 = { + raid6_sse24_gen_syndrome, + raid6_sse24_xor_syndrome, + raid6_have_sse2, + "sse2x4", + 1 /* Has cache hints */ +}; + +#endif /* CONFIG_X86_64 */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile new file mode 100644 index 000000000..29090f3db --- /dev/null +++ b/lib/raid6/test/Makefile @@ -0,0 +1,126 @@ +# +# This is a simple Makefile to test some of the RAID-6 code +# from userspace. +# + +CC = gcc +OPTFLAGS = -O2 # Adjust as desired +CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) +LD = ld +AWK = awk -f +AR = ar +RANLIB = ranlib +OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o + +ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) +ifeq ($(ARCH),i386) + CFLAGS += -DCONFIG_X86_32 + IS_X86 = yes +endif +ifeq ($(ARCH),x86_64) + CFLAGS += -DCONFIG_X86_64 + IS_X86 = yes +endif + +ifeq ($(ARCH),arm) + CFLAGS += -I../../../arch/arm/include -mfpu=neon + HAS_NEON = yes +endif +ifeq ($(ARCH),arm64) + CFLAGS += -I../../../arch/arm64/include + HAS_NEON = yes +endif + +ifeq ($(IS_X86),yes) + OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o + CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \ + gcc -c -x assembler - >&/dev/null && \ + rm ./-.o && echo -DCONFIG_AS_AVX2=1) +else ifeq ($(HAS_NEON),yes) + OBJS += neon.o neon1.o neon2.o neon4.o neon8.o + CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 +else + HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\ + gcc -c -x c - >&/dev/null && \ + rm ./-.o && echo yes) + ifeq ($(HAS_ALTIVEC),yes) + OBJS += altivec1.o altivec2.o altivec4.o altivec8.o + endif +endif +ifeq ($(ARCH),tilegx) +OBJS += tilegx8.o +endif + +.c.o: + $(CC) $(CFLAGS) -c -o $@ $< + +%.c: ../%.c + cp -f $< $@ + +%.uc: ../%.uc + cp -f $< $@ + +all: raid6.a raid6test + +raid6.a: $(OBJS) + rm -f $@ + $(AR) cq $@ $^ + $(RANLIB) $@ + +raid6test: test.c raid6.a + $(CC) $(CFLAGS) -o raid6test $^ + +neon1.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < neon.uc > $@ + +neon2.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < neon.uc > $@ + +neon4.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < neon.uc > $@ + +neon8.c: neon.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < neon.uc > $@ + +altivec1.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < altivec.uc > $@ + +altivec2.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < altivec.uc > $@ + +altivec4.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < altivec.uc > $@ + +altivec8.c: altivec.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < altivec.uc > $@ + +int1.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=1 < int.uc > $@ + +int2.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=2 < int.uc > $@ + +int4.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=4 < int.uc > $@ + +int8.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < int.uc > $@ + +int16.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=16 < int.uc > $@ + +int32.c: int.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=32 < int.uc > $@ + +tilegx8.c: tilegx.uc ../unroll.awk + $(AWK) ../unroll.awk -vN=8 < tilegx.uc > $@ + +tables.c: mktables + ./mktables > tables.c + +clean: + rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test + rm -f tilegx*.c + +spotless: clean + rm -f *~ diff --git a/lib/raid6/test/test.c b/lib/raid6/test/test.c new file mode 100644 index 000000000..3bebbabdb --- /dev/null +++ b/lib/raid6/test/test.c @@ -0,0 +1,155 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002-2007 H. Peter Anvin - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6test.c + * + * Test RAID-6 recovery with various algorithms + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <linux/raid/pq.h> + +#define NDISKS 16 /* Including P and Q */ + +const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +struct raid6_calls raid6_call; + +char *dataptrs[NDISKS]; +char data[NDISKS][PAGE_SIZE]; +char recovi[PAGE_SIZE], recovj[PAGE_SIZE]; + +static void makedata(int start, int stop) +{ + int i, j; + + for (i = start; i <= stop; i++) { + for (j = 0; j < PAGE_SIZE; j++) + data[i][j] = rand(); + + dataptrs[i] = data[i]; + } +} + +static char disk_type(int d) +{ + switch (d) { + case NDISKS-2: + return 'P'; + case NDISKS-1: + return 'Q'; + default: + return 'D'; + } +} + +static int test_disks(int i, int j) +{ + int erra, errb; + + memset(recovi, 0xf0, PAGE_SIZE); + memset(recovj, 0xba, PAGE_SIZE); + + dataptrs[i] = recovi; + dataptrs[j] = recovj; + + raid6_dual_recov(NDISKS, PAGE_SIZE, i, j, (void **)&dataptrs); + + erra = memcmp(data[i], recovi, PAGE_SIZE); + errb = memcmp(data[j], recovj, PAGE_SIZE); + + if (i < NDISKS-2 && j == NDISKS-1) { + /* We don't implement the DQ failure scenario, since it's + equivalent to a RAID-5 failure (XOR, then recompute Q) */ + erra = errb = 0; + } else { + printf("algo=%-8s faila=%3d(%c) failb=%3d(%c) %s\n", + raid6_call.name, + i, disk_type(i), + j, disk_type(j), + (!erra && !errb) ? "OK" : + !erra ? "ERRB" : + !errb ? "ERRA" : "ERRAB"); + } + + dataptrs[i] = data[i]; + dataptrs[j] = data[j]; + + return erra || errb; +} + +int main(int argc, char *argv[]) +{ + const struct raid6_calls *const *algo; + const struct raid6_recov_calls *const *ra; + int i, j, p1, p2; + int err = 0; + + makedata(0, NDISKS-1); + + for (ra = raid6_recov_algos; *ra; ra++) { + if ((*ra)->valid && !(*ra)->valid()) + continue; + + raid6_2data_recov = (*ra)->data2; + raid6_datap_recov = (*ra)->datap; + + printf("using recovery %s\n", (*ra)->name); + + for (algo = raid6_algos; *algo; algo++) { + if ((*algo)->valid && !(*algo)->valid()) + continue; + + raid6_call = **algo; + + /* Nuke syndromes */ + memset(data[NDISKS-2], 0xee, 2*PAGE_SIZE); + + /* Generate assumed good syndrome */ + raid6_call.gen_syndrome(NDISKS, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + + if (!raid6_call.xor_syndrome) + continue; + + for (p1 = 0; p1 < NDISKS-2; p1++) + for (p2 = p1; p2 < NDISKS-2; p2++) { + + /* Simulate rmw run */ + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + makedata(p1, p2); + raid6_call.xor_syndrome(NDISKS, p1, p2, PAGE_SIZE, + (void **)&dataptrs); + + for (i = 0; i < NDISKS-1; i++) + for (j = i+1; j < NDISKS; j++) + err += test_disks(i, j); + } + + } + printf("\n"); + } + + printf("\n"); + /* Pick the best algorithm test */ + raid6_select_algo(); + + if (err) + printf("\n*** ERRORS FOUND ***\n"); + + return err; +} diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc new file mode 100644 index 000000000..2dd291a11 --- /dev/null +++ b/lib/raid6/tilegx.uc @@ -0,0 +1,87 @@ +/* -*- linux-c -*- ------------------------------------------------------- * + * + * Copyright 2002 H. Peter Anvin - All Rights Reserved + * Copyright 2012 Tilera Corporation - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * tilegx$#.c + * + * $#-way unrolled TILE-Gx SIMD for RAID-6 math. + * + * This file is postprocessed using unroll.awk. + * + */ + +#include <linux/raid/pq.h> + +/* Create 8 byte copies of constant byte */ +# define NBYTES(x) (__insn_v1addi(0, x)) +# define NSIZE 8 + +/* + * The SHLBYTE() operation shifts each byte left by 1, *not* + * rolling over into the next byte + */ +static inline __attribute_const__ u64 SHLBYTE(u64 v) +{ + /* Vector One Byte Shift Left Immediate. */ + return __insn_v1shli(v, 1); +} + +/* + * The MASK() operation returns 0xFF in any byte for which the high + * bit is 1, 0x00 for any byte for which the high bit is 0. + */ +static inline __attribute_const__ u64 MASK(u64 v) +{ + /* Vector One Byte Shift Right Signed Immediate. */ + return __insn_v1shrsi(v, 7); +} + + +void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u64 *p, *q; + int d, z, z0; + + u64 wd$$, wq$$, wp$$, w1$$, w2$$; + u64 x1d = NBYTES(0x1d); + u64 * z0ptr; + + z0 = disks - 3; /* Highest data disk */ + p = (u64 *)dptr[z0+1]; /* XOR parity */ + q = (u64 *)dptr[z0+2]; /* RS syndrome */ + + z0ptr = (u64 *)&dptr[z0][0]; + for ( d = 0 ; d < bytes ; d += NSIZE*$# ) { + wq$$ = wp$$ = *z0ptr++; + for ( z = z0-1 ; z >= 0 ; z-- ) { + wd$$ = *(u64 *)&dptr[z][d+$$*NSIZE]; + wp$$ = wp$$ ^ wd$$; + w2$$ = MASK(wq$$); + w1$$ = SHLBYTE(wq$$); + w2$$ = w2$$ & x1d; + w1$$ = w1$$ ^ w2$$; + wq$$ = w1$$ ^ wd$$; + } + *p++ = wp$$; + *q++ = wq$$; + } +} + +const struct raid6_calls raid6_tilegx$# = { + raid6_tilegx$#_gen_syndrome, + NULL, /* XOR not yet implemented */ + NULL, + "tilegx$#", + 0 +}; diff --git a/lib/raid6/unroll.awk b/lib/raid6/unroll.awk new file mode 100644 index 000000000..c6aa03631 --- /dev/null +++ b/lib/raid6/unroll.awk @@ -0,0 +1,20 @@ + +# This filter requires one command line option of form -vN=n +# where n must be a decimal number. +# +# Repeat each input line containing $$ n times, replacing $$ with 0...n-1. +# Replace each $# with n, and each $* with a single $. + +BEGIN { + n = N + 0 +} +{ + if (/\$\$/) { rep = n } else { rep = 1 } + for (i = 0; i < rep; ++i) { + tmp = $0 + gsub(/\$\$/, i, tmp) + gsub(/\$\#/, n, tmp) + gsub(/\$\*/, "$", tmp) + print tmp + } +} diff --git a/lib/raid6/x86.h b/lib/raid6/x86.h new file mode 100644 index 000000000..b7595484a --- /dev/null +++ b/lib/raid6/x86.h @@ -0,0 +1,70 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 53 Temple Place Ste 330, + * Boston MA 02111-1307, USA; either version 2 of the License, or + * (at your option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * raid6/x86.h + * + * Definitions common to x86 and x86-64 RAID-6 code only + */ + +#ifndef LINUX_RAID_RAID6X86_H +#define LINUX_RAID_RAID6X86_H + +#if (defined(__i386__) || defined(__x86_64__)) && !defined(__arch_um__) + +#ifdef __KERNEL__ /* Real code */ + +#include <asm/i387.h> + +#else /* Dummy code for user space testing */ + +static inline void kernel_fpu_begin(void) +{ +} + +static inline void kernel_fpu_end(void) +{ +} + +#define __aligned(x) __attribute__((aligned(x))) + +#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE and FXRSTOR instructions + * (fast save and restore) */ +#define X86_FEATURE_XMM (0*32+25) /* Streaming SIMD Extensions */ +#define X86_FEATURE_XMM2 (0*32+26) /* Streaming SIMD Extensions-2 */ +#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ + +/* Should work well enough on modern CPUs for testing */ +static inline int boot_cpu_has(int flag) +{ + u32 eax, ebx, ecx, edx; + + eax = (flag & 0x100) ? 7 : + (flag & 0x20) ? 0x80000001 : 1; + ecx = 0; + + asm volatile("cpuid" + : "+a" (eax), "=b" (ebx), "=d" (edx), "+c" (ecx)); + + return ((flag & 0x100 ? ebx : + (flag & 0x80) ? ecx : edx) >> (flag & 31)) & 1; +} + +#endif /* ndef __KERNEL__ */ + +#endif +#endif |