diff -urN pixman//configure.ac Pixman.Loongson//configure.ac --- pixman//configure.ac 2010-12-25 18:46:00.018699000 +0800 +++ Pixman.Loongson//configure.ac 2010-12-25 18:39:15.298778000 +0800 @@ -264,6 +264,43 @@ ]) dnl =========================================================================== +dnl Check for Loongson SIMD + +have_loongson_intrinsics=no +AC_MSG_CHECKING(whether to use Loongson SIMD intrinsics) + +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) +error "Need GCC >= 4.4 for Loongson SIMD compilation" +#endif +int main () { + /* Test with a loongson SIMD instruction. */ + asm volatile ( ".set arch = loongson2f \n\t" "and \$f0, \$f0, \$f0 \n\t" : : : "cc", "memory" ); + return 0; +}], have_loongson_intrinsics=yes) + + +AC_ARG_ENABLE(loongson, + [AC_HELP_STRING([--disable-loongson], + [disable Loongson fast paths])], + [enable_loongson=$enableval], [enable_loongson=auto]) + +if test $enable_loongson = no ; then + have_loongson_intrinsics=disabled +fi + +if test $have_loongson_intrinsics = yes ; then + AC_DEFINE(USE_LS, 1, [use Loongson compiler intrinsics]) +fi + +AC_MSG_RESULT($have_loongson_intrinsics) +if test $enable_loongson = yes && test $have_loongson_intrinsics = no ; then + AC_MSG_ERROR([Loongson intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_LS, test $have_loongson_intrinsics = yes) + +dnl =========================================================================== dnl Check for MMX if test "x$MMX_CFLAGS" = "x" ; then diff -urN pixman//pixman/Makefile.am Pixman.Loongson//pixman/Makefile.am --- pixman//pixman/Makefile.am 2010-12-25 18:46:00.025027000 +0800 +++ Pixman.Loongson//pixman/Makefile.am 2010-12-25 18:39:15.303599000 +0800 @@ -55,6 +55,19 @@ pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h +# loongson code +if USE_LS +noinst_LTLIBRARIES += libpixman-ls.la +libpixman_ls_la_SOURCES = \ + pixman-ls.c +libpixman_ls_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS) +libpixman_ls_la_LIBADD = $(DEP_LIBS) +libpixman_1_la_LDFLAGS += $(LS_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-ls.la + +ASM_CFLAGS_ls=$(LS_CFLAGS) +endif + # mmx code if USE_MMX noinst_LTLIBRARIES += libpixman-mmx.la diff -urN pixman//pixman/pixman-combine-ls.c Pixman.Loongson//pixman/pixman-combine-ls.c --- pixman//pixman/pixman-combine-ls.c 1970-01-01 08:00:00.000000000 +0800 +++ Pixman.Loongson//pixman/pixman-combine-ls.c 2010-12-25 18:39:15.344171000 +0800 @@ -0,0 +1,911 @@ +static force_inline uint32_t +combine (const uint32_t *src, const uint32_t *mask) +{ + uint32_t ssrc = *src; + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + store8888r($f8,%0) + :"+r"(ssrc):"r"(*mask):clobber + ); + } + return ssrc; +} + +static void +ls_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + uint32_t s = combine (src, mask); + uint32_t d = *dest; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + :"+r"(d):"r"(s):clobber + ); + + uint32_t sa = s >> 24; + uint32_t da = ~d >> 24; + + if (sa > da) + { + uint32_t dds = DIV_UN8 (da, sa) << 24; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f24) + 
expand_alpha($f24,$f24) + pix_multiply($f22,$f24) + save_to($f22) + ::"r"(dds):clobber + ); + } + __asm__ volatile ( + ".set arch=loongson2f \n\t" + pix_add($f20,$f22) + store8888r($f8,%0) + :"=r"(*dest)::clobber + ); + + ++src; + ++dest; + if (mask) + mask++; + } +} +static void +ls_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + negate($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + negate($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f20) + negate($f20,$f20) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f20) + negate($f20,$f20) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + negate($f26,$f26) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f20,$f28) + pix_multiply($f22,$f28) + save_to($f22) + negate($f22,$f22) + pix_multiply($f24,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + + +static void +ls_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + 
load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end; + + end = dest + width; + + while (dest < end) + { + if (mask){ + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + } +} + + +static void +ls_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f22,$f22) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f26,$f26) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + 
negate($f26,$f26) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f26,$f26) + negate($f22,$f22) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + + +static void +ls_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + } else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + } +} + +static void +ls_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f20,$f20) + pix_multiply($f22,$f20) + save_to($f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + } else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f22) + save_to($f26) + pix_multiply($f26,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + } 
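
All of the *_u combiners above share one shape: combine () folds the unified mask into the source (src x mask.alpha), and the per-pixel inline asm then applies the Porter-Duff operator to four 16-bit channels at once. As a reference for what that asm computes, here is a minimal scalar sketch of the IN case (dest = src x dest.alpha); mul_un8, mul_un8x4 and scalar_combine_in_u are illustrative names only, not part of the patch (pixman-combine32.h has equivalent MUL_UN8-style macros).

/* Scalar reference for ls_combine_in_u; names are illustrative only. */
#include <stdint.h>

static inline uint8_t
mul_un8 (uint8_t a, uint8_t b)      /* rounded a*b/255, as in pix_multiply */
{
    uint16_t t = a * b + 0x80;
    return (t + (t >> 8)) >> 8;
}

static inline uint32_t
mul_un8x4 (uint32_t x, uint8_t a)   /* apply mul_un8 to all four channels */
{
    uint32_t r = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
        r |= (uint32_t) mul_un8 ((x >> shift) & 0xff, a) << shift;
    return r;
}

static void
scalar_combine_in_u (uint32_t *dest, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    int i;

    for (i = 0; i < width; i++)
    {
        uint32_t s = src[i];

        if (mask)                                 /* unified mask: src *= mask.alpha */
            s = mul_un8x4 (s, mask[i] >> 24);

        dest[i] = mul_un8x4 (s, dest[i] >> 24);   /* IN: src * dest.alpha */
    }
}
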
+static void +ls_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + pix_multiply($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++mask; + ++dest; + } + +} + + +static void +ls_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + uint32_t ssrc = combine (src, mask); + uint32_t a = ssrc >> 24; + + if (a == 0xff) + { + *dest = ssrc; + } + else if (ssrc) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + expand_alpha($f20,$f24) + load8888r(%0,$f26) + over($f20,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(ssrc):clobber + ); + } + + ++dest; + ++src; + if (mask) + ++mask; + } +} + +static void +ls_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f26) + expand_alpha($f26,$f28) + over($f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f26) + expand_alpha($f26,$f28) + over($f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + + +static void +ls_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + expand_alpha($f22,$f26) + in_over($f22,$f26,$f24,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + +} + +static void +ls_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + in($f22,$f24) + save_to($f22) + expand_alpha($f20,$f28) + over($f20,$f28,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + +} + +static void +ls_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f22) + pix_add($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + 
load8888r(%1,$f20) + + load8888r(%0,$f22) + pix_add($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + pix_multiply($f22,$f24) + save_to($f22) + pix_add($f22,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} diff -urN pixman//pixman/pixman-composite-ls.c Pixman.Loongson//pixman/pixman-composite-ls.c --- pixman//pixman/pixman-composite-ls.c 1970-01-01 08:00:00.000000000 +0800 +++ Pixman.Loongson//pixman/pixman-composite-ls.c 2010-12-25 18:39:15.356667000 +0800 @@ -0,0 +1,967 @@ +static void +ls_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + uint32_t m; + uint32_t s, d; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + while (w--) + { + m = *mask++; + if (m) + { + s = *src | 0xff000000; + + if (m == 0xff) + { + *dst = s; + } + else + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + expand_alpha($f22,$f26) + expand_alpha_rev($f24,$f28) + in_over($f22,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(s),"r"(m):clobber + ); + +// __m64 sa = expand_alpha (s); +// __m64 vm = expand_alpha_rev (to_m64 (m)); +// __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); +// *dst = store8888 (vdest); + + } + } + src++; + dst++; + } + } +} + + + + + +static void +ls_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src; + a = s >> 24; + + if (a == 0xff) + { + *dst = s; + } + else if (s) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + load8888r(%0,$f20) + expand_alpha($f24,$f26) + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + } + 
dst++; + src++; + + } + } +} + + +static void +ls_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + __m64 srca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888(%1,$f24) + store64a($f24,%0) + :"=m"(vmask):"m"(mask):clobber + ); + + srca = ls_4x00ff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f22) + expand_alpha($f20,$f28) + in_over($f20,$f28,$f24,$f22) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + + w--; + dst++; + src++; + } + } +} + +static void +ls_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64($f24,%0) + expand_alpha($f24,$f26) + store64($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f28) + over($f24,$f26,$f28) + store8888r($f8,%0) + :"+r"(*dst)::clobber + ); + + w--; + dst++; + } + } +} + +static void +ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64($f24,%0) + expand_alpha($f24,$f26) + store64($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + int twidth = width; + uint32_t *p = (uint32_t *)mask_line; + uint32_t *q = (uint32_t *)dst_line; + + while (twidth) + { + + if (*p) + { + __asm__ volatile ( + ".set 
arch=loongson2f \n\t" + load8888r(%0,$f28) + load8888r(%1,$f20) + in_over($f24,$f26,$f20,$f28) + store8888r($f8,%0) + :"+r"(*q):"r"(*p):clobber + ); + } + twidth--; + p++; + q++; + } + + dst_line += dst_stride; + mask_line += mask_stride; + } +} + + +static void +ls_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca; + uint64_t srcsrc; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + srcsrc = (uint64_t)src << 32 | src; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w) + { + uint32_t m = *mask; + + if (m) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load32r(%1,$f22) + expand_alpha_rev($f22,$f28) + in_over($f24,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(m):clobber + ); + } + + w--; + mask++; + dst++; + } + } + +} + +static void +ls_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + __m64 srca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + + mask &= 0xff000000; + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + store64a($f24,%0) + :"=m"(vmask):"r"(mask):clobber + ); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load64a(%1,$f26) + store64a($f26,%0) + :"=m"(srca):"m"(ls_4x00ff):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + uint32_t src_tmp = *src | 0xff000000; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f22) + in_over($f20,$f26,$f24,$f22) + store8888r($f8,%0) + :"+r"(*dst):"r"(src_tmp):clobber + ); + + w--; + dst++; + src++; + } + } +} + + +static void +ls_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + 
uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + load8888r(%0,$f20) + expand_alpha($f24,$f26) + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(s):clobber + ); + + + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +ls_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t d; + uint16_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w) + { + + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(d)::clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + w--; + dst++; + } + } +} + +static void +ls_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, m, d; + uint16_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w) + { + m = *mask; + d = *dst; + + if (m) + { + + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load32r(%1,$f22) + expand_alpha_rev($f22,$f28) + in_over($f24,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(m):clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + } + + w--; + mask++; + dst++; + } + } +} + +static void +ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + 
pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, m, d; + uint16_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + int twidth = width; + uint32_t *p = (uint32_t *)mask_line; + uint16_t *q = (uint16_t *)dst_line; + + while (twidth) + { + + m = *(uint32_t *)p; + d = *q; + + if (m) + { + + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + in_over($f24,$f26,$f22,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(m):clobber + ); + + *q = CONVERT_8888_TO_0565 (d); + + } + + twidth--; + p++; + q++; + } + + mask_line += mask_stride; + dst_line += dst_stride; + } +} +static void +ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + over_rev_non_pre($f22,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + + w--; + dst++; + src++; + } + } +} +static void +ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, d; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f24) + over_rev_non_pre($f20,$f24) + store8888r($f8,%0) + :"+r"(d):"r"(*src):clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + w--; + 
dst++;
+            src++;
+        }
+    }
+}
+
+static void
+ls_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                           pixman_op_t op,
+                           pixman_image_t * src_image,
+                           pixman_image_t * mask_image,
+                           pixman_image_t * dst_image,
+                           int32_t src_x,
+                           int32_t src_y,
+                           int32_t mask_x,
+                           int32_t mask_y,
+                           int32_t dest_x,
+                           int32_t dest_y,
+                           int32_t width,
+                           int32_t height)
+{
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst, m;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+        pixman_fill_ls (dst_image->bits.bits, dst_image->bits.rowstride,
+                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                        dest_x, dest_y, width, height, 0);
+        return;
+    }
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    __asm__ volatile (
+        ".set arch=loongson2f \n\t"
+        load8888r(%2,$f24)
+        store64a($f24,%0)
+        expand_alpha($f24,$f26)
+        store64a($f26,%1)
+        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
+    );
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        while (w)
+        {
+            m = *mask;
+
+            if (m)
+            {
+                __asm__ volatile (
+                    ".set arch=loongson2f \n\t"
+                    load32r(%1,$f20)
+                    expand_alpha_rev($f20,$f28)
+                    in($f24,$f28)
+                    store8888r($f8,%0)
+                    :"=r"(*dst):"r"(m):clobber
+                );
+
+            }
+            else
+            {
+                *dst = 0;
+            }
+
+            w--;
+            mask++;
+            dst++;
+        }
+    }
+}
diff -urN pixman//pixman/pixman-cpu.c Pixman.Loongson//pixman/pixman-cpu.c
--- pixman//pixman/pixman-cpu.c 2010-12-25 18:46:00.073234000 +0800
+++ Pixman.Loongson//pixman/pixman-cpu.c 2010-12-25 18:39:15.360337000 +0800
@@ -579,7 +579,9 @@
     if (pixman_have_mmx ())
 	return _pixman_implementation_create_mmx ();
 #endif
-
+#ifdef USE_LS
+    return _pixman_implementation_create_ls ();
+#endif
 #ifdef USE_ARM_NEON
     if (pixman_have_arm_neon ())
 	return _pixman_implementation_create_arm_neon ();
diff -urN pixman//pixman/pixman-ls.c Pixman.Loongson//pixman/pixman-ls.c
--- pixman//pixman/pixman-ls.c 1970-01-01 08:00:00.000000000 +0800
+++ Pixman.Loongson//pixman/pixman-ls.c 2010-12-25 18:39:15.386759000 +0800
@@ -0,0 +1,538 @@
+/*
+* Based on pixman-mmx.c
+* Implemented for Loongson 2F only.
+* Free software under the GPL license.
+* Copyright 2010 WG Ge.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include
+#include
+#include
+#include
+#include
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "primitive.h"
+
+#define __m64 __attribute__ ((aligned (8))) uint64_t
+#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
+#define DECLARE_ALIGNED_8(t, v, ...)
DECLARE_ALIGNED(8, t, v) + +DECLARE_ALIGNED_8 (const uint64_t, ls_4x00ff ) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_4x0080 ) = 0x0080008000800080ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_rgb ) = 0x000001f0003f001fULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_unpack_multiplier ) = 0x0000008404100840ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_r ) = 0x000000f800000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_g ) = 0x0000000000fc0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_b ) = 0x00000000000000f8ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_0 ) = 0xffffffffffff0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_1 ) = 0xffffffff0000ffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_2 ) = 0xffff0000ffffffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_3 ) = 0x0000ffffffffffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_full_alpha ) = 0x00ff000000000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_ffff0000ffff0000 ) = 0xffff0000ffff0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_0000ffff00000000 ) = 0x0000ffff00000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_000000000000ffff ) = 0x000000000000ffffULL; + + +pixman_bool_t +pixman_fill_ls (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint64_t fill; + uint32_t byte_width; + uint8_t *byte_line; + + + + if (bpp != 16 && bpp != 32 && bpp != 8) + return FALSE; + + if (bpp == 8) + { + stride = stride * (int) sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width; + stride *= 1; + xor = (xor & 0xff) * 0x01010101; + } + else if (bpp == 16) + { + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; + xor = (xor & 0xffff) * 0x00010001; + } + else + { + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; + } + + fill = ((uint64_t)xor << 32) | xor; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "ldc1 $f24, %0 \n\t" + ::"m"(fill):"$f24" + ); + while (height--) + { + int w; + uint8_t *d = byte_line; + + byte_line += stride; + w = byte_width; + + while (w >= 1 && ((unsigned long)d & 1)) + { + *(uint8_t *)d = (xor & 0xff); + w--; + d++; + } + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = xor; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(uint32_t *)d = xor; + + w -= 4; + d += 4; + } + + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "dmfc1 $8, $f24 \n\t" + "sd $8 , (%0) \n\t" + "sd $8 , 8(%0) \n\t" + "sd $8 , 16(%0) \n\t" + "sd $8 , 24(%0) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $8 , 40(%0) \n\t" + "sd $8 , 48(%0) \n\t" + "sd $8 , 56(%0) \n\t" + ::"r"(d):"$8","memory","$f24" + ); + w -= 64; + d += 64; + } + + while (w >= 4) + { + *(uint32_t *)d = xor; + + w -= 4; + d += 4; + } + while (w >= 2) + { + *(uint16_t *)d = xor; + w -= 2; + d += 2; + } + while (w >= 1) + { + *(uint8_t *)d = (xor & 0xff); + w--; + d++; + } + + } + return TRUE; +} + +static pixman_bool_t +pixman_blt_ls (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = src_stride 
* (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + if ((unsigned long)s & 7) +{ + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "uld $8 , (%1) \n\t" + "uld $9 , 8(%1) \n\t" + "uld $10, 16(%1) \n\t" + "uld $11, 24(%1) \n\t" + "sd $8 , (%0) \n\t" + "sd $9 , 8(%0) \n\t" + "sd $10, 16(%0) \n\t" + "sd $11, 24(%0) \n\t" + + "uld $8 , 32(%1) \n\t" + "uld $9 , 40(%1) \n\t" + "uld $10, 48(%1) \n\t" + "uld $11, 56(%1) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $9 , 40(%0) \n\t" + "sd $10, 48(%0) \n\t" + "sd $11, 56(%0) \n\t" + ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" + ); + w -= 64; + s += 64; + d += 64; + } +} +else +{ + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "ld $8 , (%1) \n\t" + "ld $9 , 8(%1) \n\t" + "ld $10, 16(%1) \n\t" + "ld $11, 24(%1) \n\t" + "sd $8 , (%0) \n\t" + "sd $9 , 8(%0) \n\t" + "sd $10, 16(%0) \n\t" + "sd $11, 24(%0) \n\t" + + "ld $8 , 32(%1) \n\t" + "ld $9 , 40(%1) \n\t" + "ld $10, 48(%1) \n\t" + "ld $11, 56(%1) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $9 , 40(%0) \n\t" + "sd $10, 48(%0) \n\t" + "sd $11, 56(%0) \n\t" + ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" + ); + w -= 64; + s += 64; + d += 64; + } +} + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + return TRUE; +} + + +#include "pixman-composite-ls.c" +#include "pixman-combine-ls.c" + +static pixman_bool_t +ls_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_ls ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); + } + + return TRUE; +} + +static pixman_bool_t +ls_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +static void +ls_composite_copy_area 
(pixman_implementation_t *imp,
+                        pixman_op_t op,
+                        pixman_image_t * src_image,
+                        pixman_image_t * mask_image,
+                        pixman_image_t * dst_image,
+                        int32_t src_x,
+                        int32_t src_y,
+                        int32_t mask_x,
+                        int32_t mask_y,
+                        int32_t dest_x,
+                        int32_t dest_y,
+                        int32_t width,
+                        int32_t height)
+{
+    pixman_blt_ls (src_image->bits.bits,
+                   dst_image->bits.bits,
+                   src_image->bits.rowstride,
+                   dst_image->bits.rowstride,
+                   PIXMAN_FORMAT_BPP (src_image->bits.format),
+                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                   src_x, src_y, dest_x, dest_y, width, height);
+}
+
+
+static const pixman_fast_path_t ls_fast_paths[] =
+{
+
+// these are implemented so far
+#if 1
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, ls_composite_over_x888_8_8888 ),
+#endif
+
+#if 1
+// over_8888_0565: significant perf improvement; slightly better L1, L2, 30% better RT
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, ls_composite_over_8888_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, ls_composite_over_8888_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, ls_composite_over_pixbuf_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, ls_composite_over_pixbuf_0565 ),
+
+// big improvement, some cases close to 100%
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, ls_composite_over_n_8_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, ls_composite_over_n_8_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, ls_composite_over_n_0565 ),
+
+// unable to benchmark with the low-level benchmark; believed to be a perf gain
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, ls_composite_over_x888_n_8888 ),
+
+// performance regresses 30% in L1, L2, but significant improvement in RT
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, ls_composite_over_pixbuf_8888 ),
+
+// same performance in L1, L2, but significant improvement in RT (30-40%)
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, ls_composite_over_8888_n_8888 ),
+
+// significant perf improvement, 20%
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, ls_composite_over_n_8_8888 ),
+
+// 3x perf improvement
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+
+// significant performance boost
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, ls_composite_over_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, ls_composite_over_n_8888 ),
+// simple add, generic code is expected to perform better
+//  PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, ls_composite_add_8888_8888 ),
+//  PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, ls_composite_add_8888_8888 ),
+
+// FIXME: memory copies are not better than generic code
+#if 0
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+#endif
+
+// significant improvement
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, ls_composite_src_n_8_8888 ),
+
+#endif
+
+// these are not yet implemented
+
+#if 0
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, ls_composite_add_8000_8000 ),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, ls_composite_add_n_8_8 ),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, ls_composite_in_8_8 ),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, ls_composite_in_n_8_8 ),
+#endif
+
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ls (void)
+{
+    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
+    pixman_implementation_t *imp = _pixman_implementation_create (general, ls_fast_paths);
+
+// Turned on but unable to benchmark.
+#if 1
+    imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca;
+#endif
+
+// FIXME: blt and fill have not shown better perf than generic code
+#if 0
+    imp->blt = ls_blt;
+    imp->fill = ls_fill;
+#endif
+
+    return imp;
+}
+
diff -urN pixman//pixman/pixman-private.h Pixman.Loongson//pixman/pixman-private.h
--- pixman//pixman/pixman-private.h 2010-12-25 18:46:00.102841000 +0800
+++ Pixman.Loongson//pixman/pixman-private.h 2010-12-25 18:39:15.401808000 +0800
@@ -493,6 +493,11 @@
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (void);
+#ifdef USE_LS
+pixman_implementation_t *
+_pixman_implementation_create_ls (void);
+
+#endif
 
 #ifdef USE_MMX
 pixman_implementation_t *
 _pixman_implementation_create_mmx (void);
diff -urN pixman//pixman/primitive.h Pixman.Loongson//pixman/primitive.h
--- pixman//pixman/primitive.h 1970-01-01 08:00:00.000000000 +0800
+++ Pixman.Loongson//pixman/primitive.h 2010-12-25 18:39:15.457084000 +0800
@@ -0,0 +1,214 @@
+/*
+* MMX register usage protocol
+* return result: f8
+* tmp immediate f12
+* tmp register in primitive f14 f16 f18
+* tmp register in pixman f0,f4,f6,f10,f20,f22,
+* globals in function f24, f26, f28,f30
+* Exceptions for load and store:
+* load will specify dest FPR register
+* store will specify src FPR register
+* expand_alpha(_rev) implemented with GPR, dest FPR as the 2nd parameter
+*
+* Special alert: don't use return result $f8 as input, it might be overwritten
+*/
+
+
+/* primitive macros */
+
+#define clobber "$8","$9","$f0","$f2","$f8",\
+                "$f12","$f14","$f16","$f18","$f20",\
+                "$f22","$f24","$f26","$f28","$f30"
+
+#define DMTC1_IMM(regc1,imm) \
+        "dli $8, "#imm" \n\t" \
+        "dmtc1 $8, "#regc1" \n\t"
+
+#define MTC1_IMM(regc1,imm) \
+        "li $8, "#imm" \n\t" \
+        "dmtc1 $8, "#regc1" \n\t"
+
+
+#define save_to(reg1) "mov.d "#reg1", $f8 \n\t"
+#define zero(reg1) "xor "#reg1","#reg1","#reg1" \n\t"
+
+#define load32(sp,reg1) \
+        "ulw $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load32a(sp,reg1) \
+        "lw $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load32r(sp,reg1) \
+        "dmtc1 "#sp", "#reg1" \n\t"
+
+#define load64(sp,reg1) \
+        "uld $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load64a(sp,reg1) \
+        "ld $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+ + +#define store32(reg1,sp) \ + "dmfc1 $8, "#reg1" \n\t" \ + "usw $8, "#sp" \n\t" + +#define store32r(reg1,sp) \ + "dmfc1 "#sp", "#reg1" \n\t" + +#define store32a(reg1,sp) \ + "swc1 "#reg1", "#sp" \n\t" + +#define store64(reg1,sp) \ + "dmfc1 $8, "#reg1" \n\t" \ + "usd $8, "#sp" \n\t" + +#define store64a(reg1,sp) \ + "sdc1 "#reg1", "#sp" \n\t" + +#define load8888(sp,reg1) \ + load64(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888r(sp,reg1) \ + load32r(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888a(sp,reg1) \ + load64a(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888ah(sp,reg1) \ + load64a(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpckhbh "#reg1", "#reg1", $f12 \n\t" + +#define store8888(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store64(reg1,sp) + +#define store8888r(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store32r(reg1,sp) + +#define store8888a(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store64a(reg1,sp) + +#define pack8888(reg1,reg2) \ + "packushb $f8, "#reg1","#reg2" \n\t" + +#define unpack8888(reg1,reg2) \ + "punpcklbh $f8, "#reg1","#reg2" \n\t" + + +#define negate(sreg,dreg) \ + DMTC1_IMM($f12, 0x00ff00ff00ff00ff)\ + "xor "#dreg", "#sreg", $f12 \n\t" + +#define pix_add(reg1,reg2) \ + "paddusb $f8, "#reg1", "#reg2" \n\t" + +#define pix_multiply(reg1,reg2) \ + "pmullh $f14, "#reg1", "#reg2" \n\t " \ + DMTC1_IMM($f12, 0x0080008000800080) \ + "paddush $f14, $f14, $f12 \n\t "\ + MTC1_IMM($f12, 8) \ + "psrlh $f16, $f14, $f12 \n\t" \ + "paddush $f14, $f14, $f16 \n\t" \ + "psrlh $f8, $f14, $f12 \n\t" + +#define pix_add_mul(reg1,reg2,reg3,reg4) \ + pix_multiply(reg1,reg2) \ + "mov.d $f18, $f8 \n\t" \ + pix_multiply(reg3,reg4) \ + pix_add($f18,$f8) + +#define expand_alpha(sreg,dreg) \ + "dmfc1 $8, "#sreg" \n\t" \ + "dsrl32 $8, $8, 16 \n\t" \ + "dsll $9, $8, 16 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dsll32 $9, $8, 0 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dmtc1 $8, "#dreg" \n\t" + +#define expand_alpha_rev(sreg,dreg)\ + "dmfc1 $8, "#sreg" \n\t" \ + "dsll32 $8, $8, 16 \n\t" \ + "dsrl32 $8, $8, 16 \n\t" \ + "dsll $9, $8, 16 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dsll32 $9, $8, 0 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dmtc1 $8, "#dreg" \n\t" + +#define expand8888(reg1,pos) expand8888_##pos(reg1) + +#define expand8888_0(reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh $f8, "#reg1", $f12 \n\t" + +#define expand8888_1(reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpckhbh $f8, "#reg1", $f12 \n\t" + +#define expandx888(reg1,pos) \ + expand8888(reg1,pos) \ + DMTC1_IMM($f12, 0x00ff000000000000) \ + "or $f8, $f8, $f12 \n\t" + +#define invert_colors(reg1) \ + DMTC1_IMM($f12, 0xffff0000ffff0000) \ + "and $f14, "#reg1", $f12 \n\t" \ + DMTC1_IMM($f12, 0x000000000000ffff) \ + "and $f16, "#reg1", $f12 \n\t" \ + DMTC1_IMM($f12, 0x0000ffff00000000) \ + "and $f18, "#reg1", $f12 \n\t" \ + MTC1_IMM($f12, 32) \ + "dsll $f16, $f16, $f12 \n\t" \ + "dsrl $f18, $f18, $f12 \n\t" \ + "or $f14, $f14, $f16 \n\t" \ + "or $f8, $f14, $f18 \n\t" + +#define over(reg1,reg2,reg3) \ + negate(reg2,$f8) \ + pix_multiply(reg3, $f8)\ + pix_add(reg1, $f8) + + +#define over_rev_non_pre(reg1,reg2) \ + expand_alpha(reg1,$f0) \ + DMTC1_IMM($f12,0x00ff000000000000) \ + "or $f2, $f0, $f12 \n\t" \ + invert_colors(reg1) \ + pix_multiply($f8,$f2) 
\ + save_to($f2) \ + over($f2, $f0, reg2) + +#define in(reg1,reg2) pix_multiply(reg1,reg2) + +#define in_over_full_src_alpha(reg1,reg2,reg3) \ + DMTC1_IMM($f12,0x00ff000000000000) \ + "or $f0, "#reg1", $f12 \n\t" \ + in($f0,reg2) \ + save_to($f0) \ + over($f0,reg2,reg3) + +#define in_over(reg1,reg2,reg3,reg4) \ + in(reg1,reg3) \ + "mov.d $f0, $f8 \n\t" \ + pix_multiply(reg2,reg3) \ + "mov.d $f2, $f8 \n\t" \ + over($f0,$f2,reg4) + +
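
A note on the register-level tricks above: after load8888r the pixel sits in an FP register as four 16-bit channels (0A 0R 0G 0B), and expand_alpha broadcasts the alpha lane into all four lanes with GPR shifts; pix_multiply is then the usual rounded x*y/255 ((t + (t >> 8)) >> 8 with t = x*y + 0x80) applied per lane, and over()/in_over() compose these the same way as their pixman-mmx.c counterparts. A minimal scalar model of the alpha broadcast, with an illustrative function name that is not part of the patch:

#include <stdint.h>

/* Scalar model of expand_alpha: pix holds four 16-bit channels
 * (bits 63:48 = alpha); the result has that alpha in every lane. */
static inline uint64_t
expand_alpha_u64 (uint64_t pix)
{
    uint64_t a = pix >> 48;   /* dsrl32 $8, $8, 16: isolate the alpha lane */
    a |= a << 16;             /* dsll/or: alpha in lanes 0 and 1           */
    a |= a << 32;             /* dsll32/or: alpha in all four lanes        */
    return a;
}
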