Diffstat (limited to 'extra/pixman/pixman-loongson2f.patch')
-rw-r--r-- | extra/pixman/pixman-loongson2f.patch | 2745 |
1 file changed, 2745 insertions, 0 deletions
diff --git a/extra/pixman/pixman-loongson2f.patch b/extra/pixman/pixman-loongson2f.patch
new file mode 100644
index 000000000..15e01cb6b
--- /dev/null
+++ b/extra/pixman/pixman-loongson2f.patch
@@ -0,0 +1,2745 @@
+diff -urN pixman//configure.ac Pixman.Loongson//configure.ac
+--- pixman//configure.ac	2010-12-25 18:46:00.018699000 +0800
++++ Pixman.Loongson//configure.ac	2010-12-25 18:39:15.298778000 +0800
+@@ -264,6 +264,43 @@
+ ])
+
+ dnl ===========================================================================
++dnl Check for Loongson SIMD
++
++have_loongson_intrinsics=no
++AC_MSG_CHECKING(whether to use Loongson SIMD intrinsics)
++
++AC_COMPILE_IFELSE([
++#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
++error "Need GCC >= 4.4 for Loongson SIMD compilation"
++#endif
++int main () {
++    /* Test with a loongson SIMD instruction. */
++    asm volatile ( ".set arch = loongson2f \n\t" "and \$f0, \$f0, \$f0 \n\t" : : : "cc", "memory" );
++    return 0;
++}], have_loongson_intrinsics=yes)
++
++
++AC_ARG_ENABLE(loongson,
++   [AC_HELP_STRING([--disable-loongson],
++                   [disable Loongson fast paths])],
++   [enable_loongson=$enableval], [enable_loongson=auto])
++
++if test $enable_loongson = no ; then
++   have_loongson_intrinsics=disabled
++fi
++
++if test $have_loongson_intrinsics = yes ; then
++   AC_DEFINE(USE_LS, 1, [use Loongson compiler intrinsics])
++fi
++
++AC_MSG_RESULT($have_loongson_intrinsics)
++if test $enable_loongson = yes && test $have_loongson_intrinsics = no ; then
++   AC_MSG_ERROR([Loongson intrinsics not detected])
++fi
++
++AM_CONDITIONAL(USE_LS, test $have_loongson_intrinsics = yes)
++
++dnl ===========================================================================
+ dnl Check for MMX
+
+ if test "x$MMX_CFLAGS" = "x" ; then
+diff -urN pixman//pixman/Makefile.am Pixman.Loongson//pixman/Makefile.am
+--- pixman//pixman/Makefile.am	2010-12-25 18:46:00.025027000 +0800
++++ Pixman.Loongson//pixman/Makefile.am	2010-12-25 18:39:15.303599000 +0800
+@@ -55,6 +55,19 @@
+ 	pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h
+ CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
+
++# loongson code
++if USE_LS
++noinst_LTLIBRARIES += libpixman-ls.la
++libpixman_ls_la_SOURCES = \
++	pixman-ls.c
++libpixman_ls_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
++libpixman_ls_la_LIBADD = $(DEP_LIBS)
++libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
++libpixman_1_la_LIBADD += libpixman-ls.la
++
++ASM_CFLAGS_ls=$(LS_CFLAGS)
++endif
++
+ # mmx code
+ if USE_MMX
+ noinst_LTLIBRARIES += libpixman-mmx.la
+diff -urN pixman//pixman/pixman-combine-ls.c Pixman.Loongson//pixman/pixman-combine-ls.c
+--- pixman//pixman/pixman-combine-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-combine-ls.c	2010-12-25 18:39:15.344171000 +0800
+@@ -0,0 +1,911 @@
++static force_inline uint32_t
++combine (const uint32_t *src, const uint32_t *mask)
++{
++    uint32_t ssrc = *src;
++
++    if (mask)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%1,$f22)
++            load8888r(%0,$f20)
++            expand_alpha($f22,$f22)
++            pix_multiply($f20,$f22)
++            store8888r($f8,%0)
++            :"+r"(ssrc):"r"(*mask):clobber
++        );
++    }
++    return ssrc;
++}
++
++static void
++ls_combine_saturate_u (pixman_implementation_t *imp,
++                       pixman_op_t op,
++                       uint32_t * dest,
++                       const uint32_t * src,
++                       const uint32_t * mask,
++                       int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        uint32_t s = combine (src, mask);
++        uint32_t d = *dest;
++
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%1,$f22)
++            load8888r(%0,$f20)
++            :"+r"(d):"r"(s):clobber
++        );
++
++        uint32_t sa = s >> 24;
++        uint32_t da = ~d >> 24;
++
++        if (sa > da)
++        {
++            uint32_t dds = DIV_UN8 (da, sa) << 24;
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f22,$f24)
++                save_to($f22)
++                ::"r"(dds):clobber
++            );
++        }
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            pix_add($f20,$f22)
++            store8888r($f8,%0)
++            :"=r"(*dest)::clobber
++        );
++
++        ++src;
++        ++dest;
++        if (mask)
++            mask++;
++    }
++}
++static void
++ls_combine_out_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                negate($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                negate($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_out_reverse_u (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          uint32_t * dest,
++                          const uint32_t * src,
++                          const uint32_t * mask,
++                          int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f20)
++                negate($f20,$f20)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f20)
++                negate($f20,$f20)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_out_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            negate($f26,$f26)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_out_reverse_ca (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f20,$f28)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f22,$f22)
++            pix_multiply($f24,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++
++static void
++ls_combine_atop_u (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_atop_reverse_u (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end;
++
++    end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++
++static void
++ls_combine_atop_ca (pixman_implementation_t *imp,
++                    pixman_op_t op,
++                    uint32_t * dest,
++                    const uint32_t * src,
++                    const uint32_t * mask,
++                    int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f22,$f22)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_atop_reverse_ca (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            uint32_t * dest,
++                            const uint32_t * src,
++                            const uint32_t * mask,
++                            int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f26,$f26)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_xor_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_xor_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f26,$f26)
++            negate($f22,$f22)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++
++static void
++ls_combine_in_reverse_u (pixman_implementation_t *imp,
++                         pixman_op_t op,
++                         uint32_t * dest,
++                         const uint32_t * src,
++                         const uint32_t * mask,
++                         int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                pix_multiply($f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                pix_multiply($f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_in_reverse_ca (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          uint32_t * dest,
++                          const uint32_t * src,
++                          const uint32_t * mask,
++                          int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f20,$f20)
++            pix_multiply($f22,$f20)
++            save_to($f26)
++            pix_multiply($f24,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_in_u (pixman_implementation_t *imp,
++                 pixman_op_t op,
++                 uint32_t * dest,
++                 const uint32_t * src,
++                 const uint32_t * mask,
++                 int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_in_ca (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f24)
++            pix_multiply($f20,$f22)
++            save_to($f26)
++            pix_multiply($f26,$f24)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_src_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            pix_multiply($f20,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++mask;
++        ++dest;
++    }
++
++}
++
++
++static void
++ls_combine_over_u (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        uint32_t ssrc = combine (src, mask);
++        uint32_t a = ssrc >> 24;
++
++        if (a == 0xff)
++        {
++            *dest = ssrc;
++        }
++        else if (ssrc)
++        {
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                expand_alpha($f20,$f24)
++                load8888r(%0,$f26)
++                over($f20,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(ssrc):clobber
++            );
++        }
++
++        ++dest;
++        ++src;
++        if (mask)
++            ++mask;
++    }
++}
++
++static void
++ls_combine_over_reverse_u (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f26)
++                expand_alpha($f26,$f28)
++                over($f26,$f28,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f26)
++                expand_alpha($f26,$f28)
++                over($f26,$f28,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++
++static void
++ls_combine_over_ca (pixman_implementation_t *imp,
++                    pixman_op_t op,
++                    uint32_t * dest,
++                    const uint32_t * src,
++                    const uint32_t * mask,
++                    int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            expand_alpha($f22,$f26)
++            in_over($f22,$f26,$f24,$f20)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++
++}
++
++static void
++ls_combine_over_reverse_ca (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            uint32_t * dest,
++                            const uint32_t * src,
++                            const uint32_t * mask,
++                            int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            in($f22,$f24)
++            save_to($f22)
++            expand_alpha($f20,$f28)
++            over($f20,$f28,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++
++}
++
++static void
++ls_combine_add_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f22)
++                pix_add($f20,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f22)
++                pix_add($f20,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_add_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            pix_multiply($f22,$f24)
++            save_to($f22)
++            pix_add($f22,$f20)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
+diff -urN pixman//pixman/pixman-composite-ls.c Pixman.Loongson//pixman/pixman-composite-ls.c
+--- pixman//pixman/pixman-composite-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-composite-ls.c	2010-12-25 18:39:15.356667000 +0800
+@@ -0,0 +1,967 @@
++static void
++ls_composite_over_x888_8_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++
++    uint32_t *src, *src_line;
++    uint32_t *dst, *dst_line;
++    uint8_t *mask, *mask_line;
++    int src_stride, mask_stride, dst_stride;
++    uint32_t m;
++    uint32_t s, d;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    while (height--)
++    {
++        src = src_line;
++        src_line += src_stride;
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++
++        w = width;
++        while (w--)
++        {
++            m = *mask++;
++            if (m)
++            {
++                s = *src | 0xff000000;
++
++                if (m == 0xff)
++                {
++                    *dst = s;
++                }
++                else
++                {
++                    __asm__ volatile (
++                        ".set arch=loongson2f \n\t"
++                        load8888r(%0,$f20)
++                        load8888r(%1,$f22)
++                        load8888r(%2,$f24)
++                        expand_alpha($f22,$f26)
++                        expand_alpha_rev($f24,$f28)
++                        in_over($f22,$f26,$f28,$f20)
++                        store8888r($f8,%0)
++                        :"+r"(*dst):"r"(s),"r"(m):clobber
++                    );
++
++//                  __m64 sa = expand_alpha (s);
++//                  __m64 vm = expand_alpha_rev (to_m64 (m));
++//                  __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
++//                  *dst = store8888 (vdest);
++
++                }
++            }
++            src++;
++            dst++;
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_8888 (pixman_implementation_t *imp,
++                             pixman_op_t op,
++                             pixman_image_t * src_image,
++                             pixman_image_t * mask_image,
++                             pixman_image_t * dst_image,
++                             int32_t src_x,
++                             int32_t src_y,
++                             int32_t mask_x,
++                             int32_t mask_y,
++                             int32_t dest_x,
++                             int32_t dest_y,
++                             int32_t width,
++                             int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t s;
++    int dst_stride, src_stride;
++    uint8_t a;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w--)
++        {
++            s = *src;
++            a = s >> 24;
++
++            if (a == 0xff)
++            {
++                *dst = s;
++            }
++            else if (s)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%1,$f24)
++                    load8888r(%0,$f20)
++                    expand_alpha($f24,$f26)
++                    over($f24,$f26,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(*dst):"r"(*src):clobber
++                );
++            }
++            dst++;
++            src++;
++
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_n_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t mask;
++    __m64 vmask;
++    int dst_stride, src_stride;
++    int32_t w;
++    __m64 srca;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
++    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888(%1,$f24)
++        store64a($f24,%0)
++        :"=m"(vmask):"m"(mask):clobber
++    );
++
++    srca = ls_4x00ff;
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f22)
++                expand_alpha($f20,$f28)
++                in_over($f20,$f28,$f24,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(*src):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888 (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          pixman_image_t * src_image,
++                          pixman_image_t * mask_image,
++                          pixman_image_t * dst_image,
++                          int32_t src_x,
++                          int32_t src_y,
++                          int32_t mask_x,
++                          int32_t mask_y,
++                          int32_t dest_x,
++                          int32_t dest_y,
++                          int32_t width,
++                          int32_t height)
++{
++    uint32_t src;
++    uint32_t *dst_line, *dst;
++    int32_t w;
++    int dst_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64($f24,%0)
++        expand_alpha($f24,$f26)
++        store64($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f28)
++                over($f24,$f26,$f28)
++                store8888r($f8,%0)
++                :"+r"(*dst)::clobber
++            );
++
++            w--;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
++                                  pixman_op_t op,
++                                  pixman_image_t * src_image,
++                                  pixman_image_t * mask_image,
++                                  pixman_image_t * dst_image,
++                                  int32_t src_x,
++                                  int32_t src_y,
++                                  int32_t mask_x,
++                                  int32_t mask_y,
++                                  int32_t dest_x,
++                                  int32_t dest_y,
++                                  int32_t width,
++                                  int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line;
++    uint32_t *mask_line;
++    int dst_stride, mask_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64($f24,%0)
++        expand_alpha($f24,$f26)
++        store64($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        int twidth = width;
++        uint32_t *p = (uint32_t *)mask_line;
++        uint32_t *q = (uint32_t *)dst_line;
++
++        while (twidth)
++        {
++
++            if (*p)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f28)
++                    load8888r(%1,$f20)
++                    in_over($f24,$f26,$f20,$f28)
++                    store8888r($f8,%0)
++                    :"+r"(*q):"r"(*p):clobber
++                );
++            }
++            twidth--;
++            p++;
++            q++;
++        }
++
++        dst_line += dst_stride;
++        mask_line += mask_stride;
++    }
++}
++
++
++static void
++ls_composite_over_n_8_8888 (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            pixman_image_t * src_image,
++                            pixman_image_t * mask_image,
++                            pixman_image_t * dst_image,
++                            int32_t src_x,
++                            int32_t src_y,
++                            int32_t mask_x,
++                            int32_t mask_y,
++                            int32_t dest_x,
++                            int32_t dest_y,
++                            int32_t width,
++                            int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line, *dst;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++    uint64_t srcsrc;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    srcsrc = (uint64_t)src << 32 | src;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            uint32_t m = *mask;
++
++            if (m)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load32r(%1,$f22)
++                    expand_alpha_rev($f22,$f28)
++                    in_over($f24,$f26,$f28,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(*dst):"r"(m):clobber
++                );
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++
++}
++
++static void
++ls_composite_over_x888_n_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t mask;
++    __m64 vmask;
++    int dst_stride, src_stride;
++    int32_t w;
++    __m64 srca;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
++
++    mask &= 0xff000000;
++    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%1,$f24)
++        store64a($f24,%0)
++        :"=m"(vmask):"r"(mask):clobber
++    );
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load64a(%1,$f26)
++        store64a($f26,%0)
++        :"=m"(srca):"m"(ls_4x00ff):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            uint32_t src_tmp = *src | 0xff000000;
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f22)
++                in_over($f20,$f26,$f24,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(src_tmp):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_0565 (pixman_implementation_t *imp,
++                             pixman_op_t op,
++                             pixman_image_t * src_image,
++                             pixman_image_t * mask_image,
++                             pixman_image_t * dst_image,
++                             int32_t src_x,
++                             int32_t src_y,
++                             int32_t mask_x,
++                             int32_t mask_y,
++                             int32_t dest_x,
++                             int32_t dest_y,
++                             int32_t width,
++                             int32_t height)
++{
++    uint16_t *dst_line, *dst;
++    uint32_t d;
++    uint32_t *src_line, *src, s;
++    uint8_t a;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w--)
++        {
++            s = *src++;
++            a = s >> 24;
++            if (s)
++            {
++                if (a == 0xff)
++                {
++                    d = s;
++                }
++                else
++                {
++                    d = *dst;
++                    d = CONVERT_0565_TO_0888 (d);
++
++                    __asm__ volatile (
++                        ".set arch=loongson2f \n\t"
++                        load8888r(%1,$f24)
++                        load8888r(%0,$f20)
++                        expand_alpha($f24,$f26)
++                        over($f24,$f26,$f20)
++                        store8888r($f8,%0)
++                        :"+r"(d):"r"(s):clobber
++                    );
++
++                }
++                *dst = CONVERT_8888_TO_0565 (d);
++            }
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_0565 (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          pixman_image_t * src_image,
++                          pixman_image_t * mask_image,
++                          pixman_image_t * dst_image,
++                          int32_t src_x,
++                          int32_t src_y,
++                          int32_t mask_x,
++                          int32_t mask_y,
++                          int32_t dest_x,
++                          int32_t dest_y,
++                          int32_t width,
++                          int32_t height)
++{
++    uint32_t src;
++    uint32_t d;
++    uint16_t *dst_line, *dst;
++    int32_t w;
++    int dst_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        w = width;
++
++        while (w)
++        {
++
++            d = *dst;
++            d = CONVERT_0565_TO_0888 (d);
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f20)
++
++                over($f24,$f26,$f20)
++                store8888r($f8,%0)
++                :"+r"(d)::clobber
++            );
++
++            *dst = CONVERT_8888_TO_0565 (d);
++
++            w--;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8_0565 (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            pixman_image_t * src_image,
++                            pixman_image_t * mask_image,
++                            pixman_image_t * dst_image,
++                            int32_t src_x,
++                            int32_t src_y,
++                            int32_t mask_x,
++                            int32_t mask_y,
++                            int32_t dest_x,
++                            int32_t dest_y,
++                            int32_t width,
++                            int32_t height)
++{
++    uint32_t src, srca, m, d;
++    uint16_t *dst_line, *dst;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            m = *mask;
++            d = *dst;
++
++            if (m)
++            {
++
++                d = CONVERT_0565_TO_0888 (d);
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load32r(%1,$f22)
++                    expand_alpha_rev($f22,$f28)
++                    in_over($f24,$f26,$f28,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(d):"r"(m):clobber
++                );
++
++                *dst = CONVERT_8888_TO_0565 (d);
++
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
++                                  pixman_op_t op,
++                                  pixman_image_t * src_image,
++                                  pixman_image_t * mask_image,
++                                  pixman_image_t * dst_image,
++                                  int32_t src_x,
++                                  int32_t src_y,
++                                  int32_t mask_x,
++                                  int32_t mask_y,
++                                  int32_t dest_x,
++                                  int32_t dest_y,
++                                  int32_t width,
++                                  int32_t height)
++{
++    uint32_t src, srca, m, d;
++    uint16_t *dst_line;
++    uint32_t *mask_line;
++    int dst_stride, mask_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        int twidth = width;
++        uint32_t *p = (uint32_t *)mask_line;
++        uint16_t *q = (uint16_t *)dst_line;
++
++        while (twidth)
++        {
++
++            m = *(uint32_t *)p;
++            d = *q;
++
++            if (m)
++            {
++
++                d = CONVERT_0565_TO_0888 (d);
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load8888r(%1,$f22)
++                    in_over($f24,$f26,$f22,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(d):"r"(m):clobber
++                );
++
++                *q = CONVERT_8888_TO_0565 (d);
++
++            }
++
++            twidth--;
++            p++;
++            q++;
++        }
++
++        mask_line += mask_stride;
++        dst_line += dst_stride;
++    }
++}
++static void
++ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++#if 0
++    /* FIXME */
++    assert (src_image->drawable == mask_image->drawable);
++#endif
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f22)
++                load8888r(%0,$f20)
++                over_rev_non_pre($f22,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(*src):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++static void
++ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint16_t *dst_line, *dst;
++    uint32_t *src_line, *src, d;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++#if 0
++    /* FIXME */
++    assert (src_image->drawable == mask_image->drawable);
++#endif
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++
++            d = *dst;
++            d = CONVERT_0565_TO_0888 (d);
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f24)
++                over_rev_non_pre($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(d):"r"(*src):clobber
++            );
++
++            *dst = CONVERT_8888_TO_0565 (d);
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++static void
++ls_composite_src_n_8_8888 (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           pixman_image_t * src_image,
++                           pixman_image_t * mask_image,
++                           pixman_image_t * dst_image,
++                           int32_t src_x,
++                           int32_t src_y,
++                           int32_t mask_x,
++                           int32_t mask_y,
++                           int32_t dest_x,
++                           int32_t dest_y,
++                           int32_t width,
++                           int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line, *dst, m;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++    uint64_t srcsrc;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++    {
++        pixman_fill_ls (dst_image->bits.bits, dst_image->bits.rowstride,
++                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
++                        dest_x, dest_y, width, height, 0);
++        return;
++    }
++
++    srcsrc = (uint64_t)src << 32 | src;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            m = *mask;
++
++            if (m)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load32r(%1,$f20)
++                    expand_alpha_rev($f20,$f28)
++                    in($f24,$f28)
++                    store8888r($f8,%0)
++                    :"=r"(*dst):"r"(m):clobber
++                );
++
++            }
++            else
++            {
++                *dst = 0;
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++}
+diff -urN pixman//pixman/pixman-cpu.c Pixman.Loongson//pixman/pixman-cpu.c
+--- pixman//pixman/pixman-cpu.c	2010-12-25 18:46:00.073234000 +0800
++++ Pixman.Loongson//pixman/pixman-cpu.c	2010-12-25 18:39:15.360337000 +0800
+@@ -579,7 +579,9 @@
+     if (pixman_have_mmx ())
+ 	return _pixman_implementation_create_mmx ();
+ #endif
+-
++#ifdef USE_LS
++    return _pixman_implementation_create_ls ();
++#endif
+ #ifdef USE_ARM_NEON
+     if (pixman_have_arm_neon ())
+ 	return _pixman_implementation_create_arm_neon ();
+diff -urN pixman//pixman/pixman-ls.c Pixman.Loongson//pixman/pixman-ls.c
+--- pixman//pixman/pixman-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-ls.c	2010-12-25 18:39:15.386759000 +0800
+@@ -0,0 +1,538 @@
++/*
++* Based on pixman-mmx.c
++* Implemented for loongson 2F only.
++* Free software based on GPL licence.
++* Copyright 2010 WG Ge.
++*/
++
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++#include <stdlib.h>
++#include <string.h>
++#include <math.h>
++#include <limits.h>
++#include <stdio.h>
++#include "pixman-private.h"
++#include "pixman-combine32.h"
++#include "primitive.h"
++
++#define __m64 __attribute__ ((aligned (8))) uint64_t
++#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
++#define DECLARE_ALIGNED_8(t, v, ...) DECLARE_ALIGNED(8, t, v)
++
++DECLARE_ALIGNED_8 (const uint64_t, ls_4x00ff ) = 0x00ff00ff00ff00ffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_4x0080 ) = 0x0080008000800080ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_rgb ) = 0x000001f0003f001fULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_unpack_multiplier ) = 0x0000008404100840ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_r ) = 0x000000f800000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_g ) = 0x0000000000fc0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_b ) = 0x00000000000000f8ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_0 ) = 0xffffffffffff0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_1 ) = 0xffffffff0000ffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_2 ) = 0xffff0000ffffffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_3 ) = 0x0000ffffffffffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_full_alpha ) = 0x00ff000000000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_ffff0000ffff0000 ) = 0xffff0000ffff0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_0000ffff00000000 ) = 0x0000ffff00000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_000000000000ffff ) = 0x000000000000ffffULL;
++
++
++pixman_bool_t
++pixman_fill_ls (uint32_t *bits,
++                int stride,
++                int bpp,
++                int x,
++                int y,
++                int width,
++                int height,
++                uint32_t xor)
++{
++    uint64_t fill;
++    uint32_t byte_width;
++    uint8_t *byte_line;
++
++    if (bpp != 16 && bpp != 32 && bpp != 8)
++        return FALSE;
++
++    if (bpp == 8)
++    {
++        stride = stride * (int) sizeof (uint32_t) / 1;
++        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
++        byte_width = width;
++        stride *= 1;
++        xor = (xor & 0xff) * 0x01010101;
++    }
++    else if (bpp == 16)
++    {
++        stride = stride * (int) sizeof (uint32_t) / 2;
++        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
++        byte_width = 2 * width;
++        stride *= 2;
++        xor = (xor & 0xffff) * 0x00010001;
++    }
++    else
++    {
++        stride = stride * (int) sizeof (uint32_t) / 4;
++        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
++        byte_width = 4 * width;
++        stride *= 4;
++    }
++
++    fill = ((uint64_t)xor << 32) | xor;
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        "ldc1 $f24, %0 \n\t"
++        ::"m"(fill):"$f24"
++    );
++    while (height--)
++    {
++        int w;
++        uint8_t *d = byte_line;
++
++        byte_line += stride;
++        w = byte_width;
++
++        while (w >= 1 && ((unsigned long)d & 1))
++        {
++            *(uint8_t *)d = (xor & 0xff);
++            w--;
++            d++;
++        }
++
++        while (w >= 2 && ((unsigned long)d & 3))
++        {
++            *(uint16_t *)d = xor;
++            w -= 2;
++            d += 2;
++        }
++
++        while (w >= 4 && ((unsigned long)d & 7))
++        {
++            *(uint32_t *)d = xor;
++
++            w -= 4;
++            d += 4;
++        }
++
++        while (w >= 64)
++        {
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                "dmfc1 $8, $f24 \n\t"
++                "sd $8 , (%0) \n\t"
++                "sd $8 , 8(%0) \n\t"
++                "sd $8 , 16(%0) \n\t"
++                "sd $8 , 24(%0) \n\t"
++                "sd $8 , 32(%0) \n\t"
++                "sd $8 , 40(%0) \n\t"
++                "sd $8 , 48(%0) \n\t"
++                "sd $8 , 56(%0) \n\t"
++                ::"r"(d):"$8","memory","$f24"
++            );
++            w -= 64;
++            d += 64;
++        }
++
++        while (w >= 4)
++        {
++            *(uint32_t *)d = xor;
++
++            w -= 4;
++            d += 4;
++        }
++        while (w >= 2)
++        {
++            *(uint16_t *)d = xor;
++            w -= 2;
++            d += 2;
++        }
++        while (w >= 1)
++        {
++            *(uint8_t *)d = (xor & 0xff);
++            w--;
++            d++;
++        }
++
++    }
++    return TRUE;
++}
++
++static pixman_bool_t
++pixman_blt_ls (uint32_t *src_bits,
++               uint32_t *dst_bits,
++               int src_stride,
++               int dst_stride,
++               int src_bpp,
++               int dst_bpp,
++               int src_x,
++               int src_y,
++               int dst_x,
++               int dst_y,
++               int width,
++               int height)
++{
++    uint8_t * src_bytes;
++    uint8_t * dst_bytes;
++    int byte_width;
++
++    if (src_bpp != dst_bpp)
++        return FALSE;
++
++    if (src_bpp == 16)
++    {
++        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
++        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
++        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
++        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
++        byte_width = 2 * width;
++        src_stride *= 2;
++        dst_stride *= 2;
++    }
++    else if (src_bpp == 32)
++    {
++        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
++        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
++        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
++        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
++        byte_width = 4 * width;
++        src_stride *= 4;
++        dst_stride *= 4;
++    }
++    else
++    {
++        return FALSE;
++    }
++
++    while (height--)
++    {
++        int w;
++        uint8_t *s = src_bytes;
++        uint8_t *d = dst_bytes;
++        src_bytes += src_stride;
++        dst_bytes += dst_stride;
++        w = byte_width;
++
++        while (w >= 2 && ((unsigned long)d & 3))
++        {
++            *(uint16_t *)d = *(uint16_t *)s;
++            w -= 2;
++            s += 2;
++            d += 2;
++        }
++
++        while (w >= 4 && ((unsigned long)d & 7))
++        {
++            *(uint32_t *)d = *(uint32_t *)s;
++
++            w -= 4;
++            s += 4;
++            d += 4;
++        }
++        if ((unsigned long)s & 7)
++        {
++            while (w >= 64)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    "uld $8 , (%1) \n\t"
++                    "uld $9 , 8(%1) \n\t"
++                    "uld $10, 16(%1) \n\t"
++                    "uld $11, 24(%1) \n\t"
++                    "sd $8 , (%0) \n\t"
++                    "sd $9 , 8(%0) \n\t"
++                    "sd $10, 16(%0) \n\t"
++                    "sd $11, 24(%0) \n\t"
++
++                    "uld $8 , 32(%1) \n\t"
++                    "uld $9 , 40(%1) \n\t"
++                    "uld $10, 48(%1) \n\t"
++                    "uld $11, 56(%1) \n\t"
++                    "sd $8 , 32(%0) \n\t"
++                    "sd $9 , 40(%0) \n\t"
++                    "sd $10, 48(%0) \n\t"
++                    "sd $11, 56(%0) \n\t"
++                    ::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
++                );
++                w -= 64;
++                s += 64;
++                d += 64;
++            }
++        }
++        else
++        {
++            while (w >= 64)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    "ld $8 , (%1) \n\t"
++                    "ld $9 , 8(%1) \n\t"
++                    "ld $10, 16(%1) \n\t"
++                    "ld $11, 24(%1) \n\t"
++                    "sd $8 , (%0) \n\t"
++                    "sd $9 , 8(%0) \n\t"
++                    "sd $10, 16(%0) \n\t"
++                    "sd $11, 24(%0) \n\t"
++
++                    "ld $8 , 32(%1) \n\t"
++                    "ld $9 , 40(%1) \n\t"
++                    "ld $10, 48(%1) \n\t"
++                    "ld $11, 56(%1) \n\t"
++                    "sd $8 , 32(%0) \n\t"
++                    "sd $9 , 40(%0) \n\t"
++                    "sd $10, 48(%0) \n\t"
++                    "sd $11, 56(%0) \n\t"
++                    ::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
++                );
++                w -= 64;
++                s += 64;
++                d += 64;
++            }
++        }
++
++        while (w >= 4)
++        {
++            *(uint32_t *)d = *(uint32_t *)s;
++
++            w -= 4;
++            s += 4;
++            d += 4;
++        }
++        if (w >= 2)
++        {
++            *(uint16_t *)d = *(uint16_t *)s;
++            w -= 2;
++            s += 2;
++            d += 2;
++        }
++    }
++    return TRUE;
++}
++
++
++#include "pixman-composite-ls.c"
++#include "pixman-combine-ls.c"
++
++static pixman_bool_t
++ls_blt (pixman_implementation_t *imp,
++        uint32_t * src_bits,
++        uint32_t * dst_bits,
++        int src_stride,
++        int dst_stride,
++        int src_bpp,
++        int dst_bpp,
++        int src_x,
++        int src_y,
++        int dst_x,
++        int dst_y,
++        int width,
++        int height)
++{
++    if (!pixman_blt_ls (
++            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
++            src_x, src_y, dst_x, dst_y, width, height))
++    {
++        return _pixman_implementation_blt (
++            imp->delegate,
++            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
++            src_x, src_y, dst_x, dst_y, width, height);
++    }
++
++    return TRUE;
++}
++
++static pixman_bool_t
++ls_fill (pixman_implementation_t *imp,
++         uint32_t * bits,
++         int stride,
++         int bpp,
++         int x,
++         int y,
++         int width,
++         int height,
++         uint32_t xor)
++{
++    if (!pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor))
++    {
++        return _pixman_implementation_fill (
++            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
++    }
++
++    return TRUE;
++}
++
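The fill and blt routines above share a common shape for wide stores: advance the destination byte-, halfword-, and word-wise until it is 8-byte aligned, run a main loop that moves 64 bytes per iteration, then mirror the head in the tail. A minimal scalar sketch of that same shape, without the Loongson asm (the name fill64_sketch is hypothetical, not part of the patch, and the 1- and 2-byte head/tail steps are omitted for brevity):

#include <stdint.h>

/* Sketch only: head-align / 64-byte body / tail, as in pixman_fill_ls.
 * `fill` is the pattern already replicated into a 64-bit value. */
static void
fill64_sketch (uint8_t *d, int w, uint64_t fill)
{
    while (w >= 4 && ((uintptr_t)d & 7))   /* align head to 8 bytes */
    {
        *(uint32_t *)d = (uint32_t)fill;
        d += 4; w -= 4;
    }
    while (w >= 64)                        /* 64 bytes per iteration */
    {
        uint64_t *q = (uint64_t *)d;
        q[0] = q[1] = q[2] = q[3] = fill;
        q[4] = q[5] = q[6] = q[7] = fill;
        d += 64; w -= 64;
    }
    while (w >= 4)                         /* word tail */
    {
        *(uint32_t *)d = (uint32_t)fill;
        d += 4; w -= 4;
    }
}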
++static void
++ls_composite_copy_area (pixman_implementation_t *imp,
++                        pixman_op_t op,
++                        pixman_image_t * src_image,
++                        pixman_image_t * mask_image,
++                        pixman_image_t * dst_image,
++                        int32_t src_x,
++                        int32_t src_y,
++                        int32_t mask_x,
++                        int32_t mask_y,
++                        int32_t dest_x,
++                        int32_t dest_y,
++                        int32_t width,
++                        int32_t height)
++{
++    pixman_blt_ls (src_image->bits.bits,
++                   dst_image->bits.bits,
++                   src_image->bits.rowstride,
++                   dst_image->bits.rowstride,
++                   PIXMAN_FORMAT_BPP (src_image->bits.format),
++                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
++                   src_x, src_y, dest_x, dest_y, width, height);
++}
++
++
++static const pixman_fast_path_t ls_fast_paths[] =
++{
++
++//these are implemented so far
++#if 1
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, ls_composite_over_x888_8_8888 ),
++#endif
++
++#if 1
++//over_8888_0565: significant perf improvement, slightly better L1, L2, 30% better RT
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, ls_composite_over_8888_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, ls_composite_over_8888_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, ls_composite_over_pixbuf_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, ls_composite_over_pixbuf_0565 ),
++
++//big improvement, some approaching 100%
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, ls_composite_over_n_8888_0565_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, ls_composite_over_n_8888_0565_ca ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, ls_composite_over_n_8_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, ls_composite_over_n_8_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, ls_composite_over_n_0565 ),
++
++//unable to bench with lowlevel bench, believe it is a gain in perf
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, ls_composite_over_x888_n_8888 ),
++
++//performance regresses 30% in L1, L2, but significant improvement in RT
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, ls_composite_over_pixbuf_8888 ),
++
++//same performance in L1, L2, but significant improvement in RT 30-40%
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, ls_composite_over_8888_n_8888 ),
++
++//significant perf improvement 20%
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, ls_composite_over_n_8_8888 ),
++
++//3x perf improvement
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ),
++
++//significant performance boost
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, ls_composite_over_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, ls_composite_over_n_8888 ),
++//simple add, expect better perf in generic code
++//  PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, ls_composite_add_8888_8888 ),
++//  PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, ls_composite_add_8888_8888 ),
++
++// FIXME: copying memory is not better than generic code
++#if 0
++    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++#endif
++
++//significant improvement
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, ls_composite_src_n_8_8888 ),
++
++#endif
++
++//these are not yet implemented
++
++#if 0
++
++    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, ls_composite_add_8000_8000 ),
++    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, ls_composite_add_n_8_8 ),
++    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, ls_composite_in_8_8 ),
++    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, ls_composite_in_n_8_8 ),
++#endif
++
++
++    { PIXMAN_OP_NONE },
++};
++
++pixman_implementation_t *
++_pixman_implementation_create_ls (void)
++{
++    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
++    pixman_implementation_t *imp = _pixman_implementation_create (general, ls_fast_paths);
++
++//Turned on but unable to benchmark.
++#if 1
++    imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u;
++    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u;
++    imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u;
++    imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u;
++    imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u;
++    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u;
++    imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u;
++    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u;
++    imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u;
++    imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u;
++    imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u;
++
++    imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca;
++    imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca;
++    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca;
++    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca;
++    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca;
++    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca;
++    imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca;
++#endif
++
++//FIXME: blt and fill have not shown better perf than generic code
++#if 0
++    imp->blt = ls_blt;
++    imp->fill = ls_fill;
++#endif
++
++    return imp;
++}
++
+diff -urN pixman//pixman/pixman-private.h Pixman.Loongson//pixman/pixman-private.h
+--- pixman//pixman/pixman-private.h	2010-12-25 18:46:00.102841000 +0800
++++ Pixman.Loongson//pixman/pixman-private.h	2010-12-25 18:39:15.401808000 +0800
+@@ -493,6 +493,11 @@
+ pixman_implementation_t *
+ _pixman_implementation_create_fast_path (void);
+
++#ifdef USE_LS
++pixman_implementation_t *
++_pixman_implementation_create_ls (void);
++#endif
++
+ #ifdef USE_MMX
+ pixman_implementation_t *
+ _pixman_implementation_create_mmx (void);
+diff -urN pixman//pixman/primitive.h Pixman.Loongson//pixman/primitive.h
+--- pixman//pixman/primitive.h	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/primitive.h	2010-12-25 18:39:15.457084000 +0800
+@@ -0,0 +1,214 @@
++/*
++* MMX register usage protocol
++* return result: f8
++* tmp immediate f12
++* tmp register in primitive f14 f16 f18
++* tmp register in pixman f0,f4,f6,f10,f20,f22,
++* globals in function f24, f26, f28,f30
++* Exceptions for load and store:
++* load will specify dest FPR register
++* store will specify src FPR register
++* expand_alpha(_rev) implemented with GPR, dest FPR as the 2nd parameter
++*
++* Special alert: don't use return result $f8 as input, it might be overwritten
++*/
++
++
++/* primitive macros */
++
++#define clobber "$8","$9","$f0","$f2","$f8",\
++ "$f12","$f14","$f16","$f18","$f20",\
++ "$f22","$f24","$f26","$f28","$f30"
++
++#define DMTC1_IMM(regc1,imm) \
++ "dli $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++#define MTC1_IMM(regc1,imm) \
++ "li $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++
++#define save_to(reg1) "mov.d "#reg1", $f8 \n\t"
++#define zero(reg1) "xor "#reg1","#reg1","#reg1" \n\t"
++
++#define load32(sp,reg1) \
++ "ulw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32a(sp,reg1) \
++ "lw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32r(sp,reg1) \
++ "dmtc1 "#sp", "#reg1" \n\t"
++
++#define load64(sp,reg1) \
++ "uld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load64a(sp,reg1) \
++ "ld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++
++#define store32(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usw $8, "#sp" \n\t"
++
++#define store32r(reg1,sp) \
++ "dmfc1 "#sp", "#reg1" \n\t"
++
++#define store32a(reg1,sp) \
++ "swc1 "#reg1", "#sp" \n\t"
++
++#define store64(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usd $8, "#sp" \n\t"
++
++#define store64a(reg1,sp) \
++ "sdc1 "#reg1", "#sp" \n\t"
++
++#define load8888(sp,reg1) \
++ load64(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888r(sp,reg1) \
++ load32r(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888a(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888ah(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh "#reg1", "#reg1", $f12 \n\t"
++
++#define store8888(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64(reg1,sp)
++
++#define store8888r(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store32r(reg1,sp)
++
++#define store8888a(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64a(reg1,sp)
++
++#define pack8888(reg1,reg2) \
++ "packushb $f8, "#reg1","#reg2" \n\t"
++
++#define unpack8888(reg1,reg2) \
++ "punpcklbh $f8, "#reg1","#reg2" \n\t"
++
++
++#define negate(sreg,dreg) \
++ DMTC1_IMM($f12, 0x00ff00ff00ff00ff)\
++ "xor "#dreg", "#sreg", $f12 \n\t"
++
++#define pix_add(reg1,reg2) \
++ "paddusb $f8, "#reg1", "#reg2" \n\t"
++
++#define pix_multiply(reg1,reg2) \
++ "pmullh $f14, "#reg1", "#reg2" \n\t " \
++ DMTC1_IMM($f12, 0x0080008000800080) \
++ "paddush $f14, $f14, $f12 \n\t "\
++ MTC1_IMM($f12, 8) \
++ "psrlh $f16, $f14, $f12 \n\t" \
++ "paddush $f14, $f14, $f16 \n\t" \
++ "psrlh $f8, $f14, $f12 \n\t"
++
++#define pix_add_mul(reg1,reg2,reg3,reg4) \
++ pix_multiply(reg1,reg2) \
++ "mov.d $f18, $f8 \n\t" \
++ pix_multiply(reg3,reg4) \
++ pix_add($f18,$f8)
++
++#define expand_alpha(sreg,dreg) \
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
++#define expand_alpha_rev(sreg,dreg)\
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsll32 $8, $8, 16 \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
++#define expand8888(reg1,pos) expand8888_##pos(reg1)
++
++#define expand8888_0(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh $f8, "#reg1", $f12 \n\t"
++
++#define expand8888_1(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh $f8, "#reg1", $f12 \n\t"
++
++#define expandx888(reg1,pos) \
++ expand8888(reg1,pos) \
++ DMTC1_IMM($f12, 0x00ff000000000000) \
++ "or $f8, $f8, $f12 \n\t"
++
++#define invert_colors(reg1) \
++ DMTC1_IMM($f12, 0xffff0000ffff0000) \
++ "and $f14, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x000000000000ffff) \
++ "and $f16, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x0000ffff00000000) \
++ "and $f18, "#reg1", $f12 \n\t" \
++ MTC1_IMM($f12, 32) \
++ "dsll $f16, $f16, $f12 \n\t" \
++ "dsrl $f18, $f18, $f12 \n\t" \
++ "or $f14, $f14, $f16 \n\t" \
++ "or $f8, $f14, $f18 \n\t"
++
++#define over(reg1,reg2,reg3) \
++ negate(reg2,$f8) \
++ pix_multiply(reg3, $f8)\
++ pix_add(reg1, $f8)
++
++
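over() is the premultiplied Porter-Duff OVER: negate the broadcast source alpha, scale the destination by it with pix_multiply, then add the source with byte saturation (paddusb). A per-pixel scalar sketch on packed a8r8g8b8, reusing the lane helper from the pix_multiply note (both names are mine, not from the patch):

#include <stdint.h>

static uint8_t
mul_un8_sketch (uint8_t x, uint8_t a)      /* exact (x * a) / 255 */
{
    unsigned t = (unsigned)x * a + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* src + dst * (255 - srca) / 255, channels added with saturation. */
static uint32_t
over_sketch (uint32_t src, uint32_t dst)
{
    uint8_t  ia  = 0xff - (uint8_t)(src >> 24);  /* negate() of alpha */
    uint32_t out = 0;
    int      sh;

    for (sh = 0; sh < 32; sh += 8)
    {
        unsigned s = (src >> sh) & 0xff;
        unsigned d = mul_un8_sketch ((dst >> sh) & 0xff, ia);
        unsigned t = s + d;
        out |= (uint32_t)(t > 0xff ? 0xff : t) << sh;
    }
    return out;
}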
++#define over_rev_non_pre(reg1,reg2) \
++ expand_alpha(reg1,$f0) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f2, $f0, $f12 \n\t" \
++ invert_colors(reg1) \
++ pix_multiply($f8,$f2) \
++ save_to($f2) \
++ over($f2, $f0, reg2)
++
++#define in(reg1,reg2) pix_multiply(reg1,reg2)
++
++#define in_over_full_src_alpha(reg1,reg2,reg3) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f0, "#reg1", $f12 \n\t" \
++ in($f0,reg2) \
++ save_to($f0) \
++ over($f0,reg2,reg3)
++
++#define in_over(reg1,reg2,reg3,reg4) \
++ in(reg1,reg3) \
++ "mov.d $f0, $f8 \n\t" \
++ pix_multiply(reg2,reg3) \
++ "mov.d $f2, $f8 \n\t" \
++ over($f0,$f2,reg4)
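in_over chains the two primitives: both the source and its expanded alpha are scaled by the mask, and the result is composited OVER the destination, i.e. in_over(s, sa, m, d) = over(in(s, m), sa*m, d). One channel of that composition in scalar form (hypothetical names; mul is the same divide-by-255 lane as above):

static unsigned
mul (unsigned x, unsigned a)
{
    unsigned t = x * a + 0x80;
    return (t + (t >> 8)) >> 8;
}

/* One 8-bit channel of in_over: over (in (s, m), sa * m, d). */
static unsigned
in_over_channel (unsigned s, unsigned sa, unsigned m, unsigned d)
{
    unsigned si = mul (s, m);               /* in(src, mask)     */
    unsigned ai = mul (sa, m);              /* mask-scaled alpha */
    unsigned t  = si + mul (d, 255 - ai);   /* over()            */
    return t > 255 ? 255 : t;
}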
++
++
|