Diffstat (limited to 'extra/pixman/pixman-loongson2f.patch')
-rw-r--r-- | extra/pixman/pixman-loongson2f.patch | 2745 |
1 file changed, 2745 insertions, 0 deletions
diff --git a/extra/pixman/pixman-loongson2f.patch b/extra/pixman/pixman-loongson2f.patch
new file mode 100644
index 000000000..15e01cb6b
--- /dev/null
+++ b/extra/pixman/pixman-loongson2f.patch
@@ -0,0 +1,2745 @@
+diff -urN pixman//configure.ac Pixman.Loongson//configure.ac
+--- pixman//configure.ac	2010-12-25 18:46:00.018699000 +0800
++++ Pixman.Loongson//configure.ac	2010-12-25 18:39:15.298778000 +0800
+@@ -264,6 +264,43 @@
+ ])
+
+ dnl ===========================================================================
++dnl Check for Loongson SIMD
++
++have_loongson_intrinsics=no
++AC_MSG_CHECKING(whether to use Loongson SIMD intrinsics)
++
++AC_COMPILE_IFELSE([
++#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4))
++error "Need GCC >= 4.4 for Loongson SIMD compilation"
++#endif
++int main () {
++    /* Test with a loongson SIMD instruction. */
++    asm volatile ( ".set arch = loongson2f \n\t" "and \$f0, \$f0, \$f0 \n\t" : : : "cc", "memory" );
++    return 0;
++}], have_loongson_intrinsics=yes)
++
++
++AC_ARG_ENABLE(loongson,
++   [AC_HELP_STRING([--disable-loongson],
++                   [disable Loongson fast paths])],
++   [enable_loongson=$enableval], [enable_loongson=auto])
++
++if test $enable_loongson = no ; then
++   have_loongson_intrinsics=disabled
++fi
++
++if test $have_loongson_intrinsics = yes ; then
++   AC_DEFINE(USE_LS, 1, [use Loongson compiler intrinsics])
++fi
++
++AC_MSG_RESULT($have_loongson_intrinsics)
++if test $enable_loongson = yes && test $have_loongson_intrinsics = no ; then
++   AC_MSG_ERROR([Loongson intrinsics not detected])
++fi
++
++AM_CONDITIONAL(USE_LS, test $have_loongson_intrinsics = yes)
++
++dnl ===========================================================================
+ dnl Check for MMX
+
+ if test "x$MMX_CFLAGS" = "x" ; then
+diff -urN pixman//pixman/Makefile.am Pixman.Loongson//pixman/Makefile.am
+--- pixman//pixman/Makefile.am	2010-12-25 18:46:00.025027000 +0800
++++ Pixman.Loongson//pixman/Makefile.am	2010-12-25 18:39:15.303599000 +0800
+@@ -55,6 +55,19 @@
+ 	pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h
+ CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h
+
++# loongson code
++if USE_LS
++noinst_LTLIBRARIES += libpixman-ls.la
++libpixman_ls_la_SOURCES = \
++	pixman-ls.c
++libpixman_ls_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS)
++libpixman_ls_la_LIBADD = $(DEP_LIBS)
++libpixman_1_la_LDFLAGS += $(LS_LDFLAGS)
++libpixman_1_la_LIBADD += libpixman-ls.la
++
++ASM_CFLAGS_ls=$(LS_CFLAGS)
++endif
++
+ # mmx code
+ if USE_MMX
+ noinst_LTLIBRARIES += libpixman-mmx.la
+diff -urN pixman//pixman/pixman-combine-ls.c Pixman.Loongson//pixman/pixman-combine-ls.c
+--- pixman//pixman/pixman-combine-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-combine-ls.c	2010-12-25 18:39:15.344171000 +0800
+@@ -0,0 +1,911 @@
++static force_inline uint32_t
++combine (const uint32_t *src, const uint32_t *mask)
++{
++    uint32_t ssrc = *src;
++
++    if (mask)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%1,$f22)
++            load8888r(%0,$f20)
++            expand_alpha($f22,$f22)
++            pix_multiply($f20,$f22)
++            store8888r($f8,%0)
++            :"+r"(ssrc):"r"(*mask):clobber
++        );
++    }
++    return ssrc;
++}
++
++static void
++ls_combine_saturate_u (pixman_implementation_t *imp,
++                       pixman_op_t op,
++                       uint32_t * dest,
++                       const uint32_t * src,
++                       const uint32_t * mask,
++                       int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        uint32_t s = combine (src, mask);
++        uint32_t d = *dest;
++
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%1,$f22)
++            load8888r(%0,$f20)
++            :"+r"(d):"r"(s):clobber
++        );
++
++        uint32_t sa = s >> 24;
++        uint32_t da = ~d >> 24;
++
++        if (sa > da)
++        {
++            uint32_t dds = DIV_UN8 (da, sa) << 24;
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f22,$f24)
++                save_to($f22)
++                ::"r"(dds):clobber
++            );
++        }
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            pix_add($f20,$f22)
++            store8888r($f8,%0)
++            :"=r"(*dest)::clobber
++        );
++
++        ++src;
++        ++dest;
++        if (mask)
++            mask++;
++    }
++}
++static void
++ls_combine_out_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                negate($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                negate($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_out_reverse_u (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          uint32_t * dest,
++                          const uint32_t * src,
++                          const uint32_t * mask,
++                          int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f20)
++                negate($f20,$f20)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f20)
++                negate($f20,$f20)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_out_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            negate($f26,$f26)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_out_reverse_ca (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f20,$f28)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f22,$f22)
++            pix_multiply($f24,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++
++static void
++ls_combine_atop_u (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_atop_reverse_u (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end;
++
++    end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++
++static void
++ls_combine_atop_ca (pixman_implementation_t *imp,
++                    pixman_op_t op,
++                    uint32_t * dest,
++                    const uint32_t * src,
++                    const uint32_t * mask,
++                    int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f22,$f22)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_atop_reverse_ca (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            uint32_t * dest,
++                            const uint32_t * src,
++                            const uint32_t * mask,
++                            int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f26,$f26)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_xor_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                expand_alpha($f24,$f28)
++                negate($f26,$f26)
++                negate($f28,$f28)
++                pix_add_mul($f20,$f28,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++
++    }
++}
++
++static void
++ls_combine_xor_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f26)
++            expand_alpha($f20,$f28)
++            pix_multiply($f20,$f22)
++            save_to($f20)
++            pix_multiply($f22,$f28)
++            save_to($f22)
++            negate($f26,$f26)
++            negate($f22,$f22)
++            pix_add_mul($f24,$f22,$f20,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++
++static void
++ls_combine_in_reverse_u (pixman_implementation_t *imp,
++                         pixman_op_t op,
++                         uint32_t * dest,
++                         const uint32_t * src,
++                         const uint32_t * mask,
++                         int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                pix_multiply($f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f20,$f26)
++                pix_multiply($f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_in_reverse_ca (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          uint32_t * dest,
++                          const uint32_t * src,
++                          const uint32_t * mask,
++                          int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f20,$f20)
++            pix_multiply($f22,$f20)
++            save_to($f26)
++            pix_multiply($f24,$f26)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_in_u (pixman_implementation_t *imp,
++                 pixman_op_t op,
++                 uint32_t * dest,
++                 const uint32_t * src,
++                 const uint32_t * mask,
++                 int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f24)
++                expand_alpha($f24,$f24)
++                pix_multiply($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_in_ca (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            load8888r(%0,$f24)
++            expand_alpha($f24,$f24)
++            pix_multiply($f20,$f22)
++            save_to($f26)
++            pix_multiply($f26,$f24)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
++
++static void
++ls_combine_src_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%2,$f22)
++            load8888r(%1,$f20)
++            pix_multiply($f20,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++mask;
++        ++dest;
++    }
++
++}
++
++
++static void
++ls_combine_over_u (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        uint32_t ssrc = combine (src, mask);
++        uint32_t a = ssrc >> 24;
++
++        if (a == 0xff)
++        {
++            *dest = ssrc;
++        }
++        else if (ssrc)
++        {
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                expand_alpha($f20,$f24)
++                load8888r(%0,$f26)
++                over($f20,$f24,$f26)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(ssrc):clobber
++            );
++        }
++
++        ++dest;
++        ++src;
++        if (mask)
++            ++mask;
++    }
++}
++
++static void
++ls_combine_over_reverse_u (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           uint32_t * dest,
++                           const uint32_t * src,
++                           const uint32_t * mask,
++                           int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f26)
++                expand_alpha($f26,$f28)
++                over($f26,$f28,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f26)
++                expand_alpha($f26,$f28)
++                over($f26,$f28,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++
++static void
++ls_combine_over_ca (pixman_implementation_t *imp,
++                    pixman_op_t op,
++                    uint32_t * dest,
++                    const uint32_t * src,
++                    const uint32_t * mask,
++                    int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            expand_alpha($f22,$f26)
++            in_over($f22,$f26,$f24,$f20)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++
++}
++
++static void
++ls_combine_over_reverse_ca (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            uint32_t * dest,
++                            const uint32_t * src,
++                            const uint32_t * mask,
++                            int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            in($f22,$f24)
++            save_to($f22)
++            expand_alpha($f20,$f28)
++            over($f20,$f28,$f22)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++
++}
++
++static void
++ls_combine_add_u (pixman_implementation_t *imp,
++                  pixman_op_t op,
++                  uint32_t * dest,
++                  const uint32_t * src,
++                  const uint32_t * mask,
++                  int width)
++{
++    const uint32_t *end = dest + width;
++
++    while (dest < end)
++    {
++
++        if (mask)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%2,$f22)
++                load8888r(%1,$f20)
++                expand_alpha($f22,$f22)
++                pix_multiply($f20,$f22)
++                save_to($f20)
++
++                load8888r(%0,$f22)
++                pix_add($f20,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++            );
++            mask++;
++        } else {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++
++                load8888r(%0,$f22)
++                pix_add($f20,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dest):"r"(*src):clobber
++            );
++
++        }
++        ++dest;
++        ++src;
++    }
++}
++
++static void
++ls_combine_add_ca (pixman_implementation_t *imp,
++                   pixman_op_t op,
++                   uint32_t * dest,
++                   const uint32_t * src,
++                   const uint32_t * mask,
++                   int width)
++{
++    const uint32_t *end = src + width;
++
++    while (src < end)
++    {
++        __asm__ volatile (
++            ".set arch=loongson2f \n\t"
++            load8888r(%0,$f20)
++            load8888r(%1,$f22)
++            load8888r(%2,$f24)
++            pix_multiply($f22,$f24)
++            save_to($f22)
++            pix_add($f22,$f20)
++            store8888r($f8,%0)
++            :"+r"(*dest):"r"(*src),"r"(*mask):clobber
++        );
++        ++src;
++        ++dest;
++        ++mask;
++    }
++}
+diff -urN pixman//pixman/pixman-composite-ls.c Pixman.Loongson//pixman/pixman-composite-ls.c
+--- pixman//pixman/pixman-composite-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-composite-ls.c	2010-12-25 18:39:15.356667000 +0800
+@@ -0,0 +1,967 @@
++static void
++ls_composite_over_x888_8_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++
++    uint32_t *src, *src_line;
++    uint32_t *dst, *dst_line;
++    uint8_t *mask, *mask_line;
++    int src_stride, mask_stride, dst_stride;
++    uint32_t m;
++    uint32_t s, d;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    while (height--)
++    {
++        src = src_line;
++        src_line += src_stride;
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++
++        w = width;
++        while (w--)
++        {
++            m = *mask++;
++            if (m)
++            {
++                s = *src | 0xff000000;
++
++                if (m == 0xff)
++                {
++                    *dst = s;
++                }
++                else
++                {
++                    __asm__ volatile (
++                        ".set arch=loongson2f \n\t"
++                        load8888r(%0,$f20)
++                        load8888r(%1,$f22)
++                        load8888r(%2,$f24)
++                        expand_alpha($f22,$f26)
++                        expand_alpha_rev($f24,$f28)
++                        in_over($f22,$f26,$f28,$f20)
++                        store8888r($f8,%0)
++                        :"+r"(*dst):"r"(s),"r"(m):clobber
++                    );
++
++//                  __m64 sa = expand_alpha (s);
++//                  __m64 vm = expand_alpha_rev (to_m64 (m));
++//                  __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
++//                  *dst = store8888 (vdest);
++
++                }
++            }
++            src++;
++            dst++;
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_8888 (pixman_implementation_t *imp,
++                             pixman_op_t op,
++                             pixman_image_t * src_image,
++                             pixman_image_t * mask_image,
++                             pixman_image_t * dst_image,
++                             int32_t src_x,
++                             int32_t src_y,
++                             int32_t mask_x,
++                             int32_t mask_y,
++                             int32_t dest_x,
++                             int32_t dest_y,
++                             int32_t width,
++                             int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t s;
++    int dst_stride, src_stride;
++    uint8_t a;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w--)
++        {
++            s = *src;
++            a = s >> 24;
++
++            if (a == 0xff)
++            {
++                *dst = s;
++            }
++            else if (s)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%1,$f24)
++                    load8888r(%0,$f20)
++                    expand_alpha($f24,$f26)
++                    over($f24,$f26,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(*dst):"r"(*src):clobber
++                );
++            }
++            dst++;
++            src++;
++
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_n_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t mask;
++    __m64 vmask;
++    int dst_stride, src_stride;
++    int32_t w;
++    __m64 srca;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
++    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888(%1,$f24)
++        store64a($f24,%0)
++        :"=m"(vmask):"m"(mask):clobber
++    );
++
++    srca = ls_4x00ff;
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f22)
++                expand_alpha($f20,$f28)
++                in_over($f20,$f28,$f24,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(*src):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888 (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          pixman_image_t * src_image,
++                          pixman_image_t * mask_image,
++                          pixman_image_t * dst_image,
++                          int32_t src_x,
++                          int32_t src_y,
++                          int32_t mask_x,
++                          int32_t mask_y,
++                          int32_t dest_x,
++                          int32_t dest_y,
++                          int32_t width,
++                          int32_t height)
++{
++    uint32_t src;
++    uint32_t *dst_line, *dst;
++    int32_t w;
++    int dst_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64($f24,%0)
++        expand_alpha($f24,$f26)
++        store64($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f28)
++                over($f24,$f26,$f28)
++                store8888r($f8,%0)
++                :"+r"(*dst)::clobber
++            );
++
++            w--;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
++                                  pixman_op_t op,
++                                  pixman_image_t * src_image,
++                                  pixman_image_t * mask_image,
++                                  pixman_image_t * dst_image,
++                                  int32_t src_x,
++                                  int32_t src_y,
++                                  int32_t mask_x,
++                                  int32_t mask_y,
++                                  int32_t dest_x,
++                                  int32_t dest_y,
++                                  int32_t width,
++                                  int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line;
++    uint32_t *mask_line;
++    int dst_stride, mask_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64($f24,%0)
++        expand_alpha($f24,$f26)
++        store64($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        int twidth = width;
++        uint32_t *p = (uint32_t *)mask_line;
++        uint32_t *q = (uint32_t *)dst_line;
++
++        while (twidth)
++        {
++
++            if (*p)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f28)
++                    load8888r(%1,$f20)
++                    in_over($f24,$f26,$f20,$f28)
++                    store8888r($f8,%0)
++                    :"+r"(*q):"r"(*p):clobber
++                );
++            }
++            twidth--;
++            p++;
++            q++;
++        }
++
++        dst_line += dst_stride;
++        mask_line += mask_stride;
++    }
++}
++
++
++static void
++ls_composite_over_n_8_8888 (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            pixman_image_t * src_image,
++                            pixman_image_t * mask_image,
++                            pixman_image_t * dst_image,
++                            int32_t src_x,
++                            int32_t src_y,
++                            int32_t mask_x,
++                            int32_t mask_y,
++                            int32_t dest_x,
++                            int32_t dest_y,
++                            int32_t width,
++                            int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line, *dst;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++    uint64_t srcsrc;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    srcsrc = (uint64_t)src << 32 | src;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            uint32_t m = *mask;
++
++            if (m)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load32r(%1,$f22)
++                    expand_alpha_rev($f22,$f28)
++                    in_over($f24,$f26,$f28,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(*dst):"r"(m):clobber
++                );
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++
++}
++
++static void
++ls_composite_over_x888_n_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    uint32_t mask;
++    __m64 vmask;
++    int dst_stride, src_stride;
++    int32_t w;
++    __m64 srca;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
++
++    mask &= 0xff000000;
++    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%1,$f24)
++        store64a($f24,%0)
++        :"=m"(vmask):"r"(mask):clobber
++    );
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load64a(%1,$f26)
++        store64a($f26,%0)
++        :"=m"(srca):"m"(ls_4x00ff):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            uint32_t src_tmp = *src | 0xff000000;
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f22)
++                in_over($f20,$f26,$f24,$f22)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(src_tmp):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++
++static void
++ls_composite_over_8888_0565 (pixman_implementation_t *imp,
++                             pixman_op_t op,
++                             pixman_image_t * src_image,
++                             pixman_image_t * mask_image,
++                             pixman_image_t * dst_image,
++                             int32_t src_x,
++                             int32_t src_y,
++                             int32_t mask_x,
++                             int32_t mask_y,
++                             int32_t dest_x,
++                             int32_t dest_y,
++                             int32_t width,
++                             int32_t height)
++{
++    uint16_t *dst_line, *dst;
++    uint32_t d;
++    uint32_t *src_line, *src, s;
++    uint8_t a;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w--)
++        {
++            s = *src++;
++            a = s >> 24;
++            if (s)
++            {
++                if (a == 0xff)
++                {
++                    d = s;
++                }
++                else
++                {
++                    d = *dst;
++                    d = CONVERT_0565_TO_0888 (d);
++
++                    __asm__ volatile (
++                        ".set arch=loongson2f \n\t"
++                        load8888r(%1,$f24)
++                        load8888r(%0,$f20)
++                        expand_alpha($f24,$f26)
++                        over($f24,$f26,$f20)
++                        store8888r($f8,%0)
++                        :"+r"(d):"r"(s):clobber
++                    );
++
++                }
++                *dst = CONVERT_8888_TO_0565 (d);
++            }
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_0565 (pixman_implementation_t *imp,
++                          pixman_op_t op,
++                          pixman_image_t * src_image,
++                          pixman_image_t * mask_image,
++                          pixman_image_t * dst_image,
++                          int32_t src_x,
++                          int32_t src_y,
++                          int32_t mask_x,
++                          int32_t mask_y,
++                          int32_t dest_x,
++                          int32_t dest_y,
++                          int32_t width,
++                          int32_t height)
++{
++    uint32_t src;
++    uint32_t d;
++    uint16_t *dst_line, *dst;
++    int32_t w;
++    int dst_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        w = width;
++
++        while (w)
++        {
++
++            d = *dst;
++            d = CONVERT_0565_TO_0888 (d);
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%0,$f20)
++
++                over($f24,$f26,$f20)
++                store8888r($f8,%0)
++                :"+r"(d)::clobber
++            );
++
++            *dst = CONVERT_8888_TO_0565 (d);
++
++            w--;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8_0565 (pixman_implementation_t *imp,
++                            pixman_op_t op,
++                            pixman_image_t * src_image,
++                            pixman_image_t * mask_image,
++                            pixman_image_t * dst_image,
++                            int32_t src_x,
++                            int32_t src_y,
++                            int32_t mask_x,
++                            int32_t mask_y,
++                            int32_t dest_x,
++                            int32_t dest_y,
++                            int32_t width,
++                            int32_t height)
++{
++    uint32_t src, srca, m, d;
++    uint16_t *dst_line, *dst;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            m = *mask;
++            d = *dst;
++
++            if (m)
++            {
++
++                d = CONVERT_0565_TO_0888 (d);
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load32r(%1,$f22)
++                    expand_alpha_rev($f22,$f28)
++                    in_over($f24,$f26,$f28,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(d):"r"(m):clobber
++                );
++
++                *dst = CONVERT_8888_TO_0565 (d);
++
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++}
++
++static void
++ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
++                                  pixman_op_t op,
++                                  pixman_image_t * src_image,
++                                  pixman_image_t * mask_image,
++                                  pixman_image_t * dst_image,
++                                  int32_t src_x,
++                                  int32_t src_y,
++                                  int32_t mask_x,
++                                  int32_t mask_y,
++                                  int32_t dest_x,
++                                  int32_t dest_y,
++                                  int32_t width,
++                                  int32_t height)
++{
++    uint32_t src, srca, m, d;
++    uint16_t *dst_line;
++    uint32_t *mask_line;
++    int dst_stride, mask_stride;
++    __m64 vsrc, vsrca;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++        return;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++
++    while (height--)
++    {
++        int twidth = width;
++        uint32_t *p = (uint32_t *)mask_line;
++        uint16_t *q = (uint16_t *)dst_line;
++
++        while (twidth)
++        {
++
++            m = *(uint32_t *)p;
++            d = *q;
++
++            if (m)
++            {
++
++                d = CONVERT_0565_TO_0888 (d);
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load8888r(%0,$f20)
++                    load8888r(%1,$f22)
++                    in_over($f24,$f26,$f22,$f20)
++                    store8888r($f8,%0)
++                    :"+r"(d):"r"(m):clobber
++                );
++
++                *q = CONVERT_8888_TO_0565 (d);
++
++            }
++
++            twidth--;
++            p++;
++            q++;
++        }
++
++        mask_line += mask_stride;
++        dst_line += dst_stride;
++    }
++}
++static void
++ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint32_t *dst_line, *dst;
++    uint32_t *src_line, *src;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++#if 0
++    /* FIXME */
++    assert (src_image->drawable == mask_image->drawable);
++#endif
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f22)
++                load8888r(%0,$f20)
++                over_rev_non_pre($f22,$f20)
++                store8888r($f8,%0)
++                :"+r"(*dst):"r"(*src):clobber
++            );
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++static void
++ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
++                               pixman_op_t op,
++                               pixman_image_t * src_image,
++                               pixman_image_t * mask_image,
++                               pixman_image_t * dst_image,
++                               int32_t src_x,
++                               int32_t src_y,
++                               int32_t mask_x,
++                               int32_t mask_y,
++                               int32_t dest_x,
++                               int32_t dest_y,
++                               int32_t width,
++                               int32_t height)
++{
++    uint16_t *dst_line, *dst;
++    uint32_t *src_line, *src, d;
++    int dst_stride, src_stride;
++    int32_t w;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
++
++#if 0
++    /* FIXME */
++    assert (src_image->drawable == mask_image->drawable);
++#endif
++
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        src = src_line;
++        src_line += src_stride;
++        w = width;
++
++        while (w)
++        {
++
++            d = *dst;
++            d = CONVERT_0565_TO_0888 (d);
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                load8888r(%1,$f20)
++                load8888r(%0,$f24)
++                over_rev_non_pre($f20,$f24)
++                store8888r($f8,%0)
++                :"+r"(d):"r"(*src):clobber
++            );
++
++            *dst = CONVERT_8888_TO_0565 (d);
++
++            w--;
++            dst++;
++            src++;
++        }
++    }
++}
++
++static void
++ls_composite_src_n_8_8888 (pixman_implementation_t *imp,
++                           pixman_op_t op,
++                           pixman_image_t * src_image,
++                           pixman_image_t * mask_image,
++                           pixman_image_t * dst_image,
++                           int32_t src_x,
++                           int32_t src_y,
++                           int32_t mask_x,
++                           int32_t mask_y,
++                           int32_t dest_x,
++                           int32_t dest_y,
++                           int32_t width,
++                           int32_t height)
++{
++    uint32_t src, srca;
++    uint32_t *dst_line, *dst, m;
++    uint8_t *mask_line, *mask;
++    int dst_stride, mask_stride;
++    int32_t w;
++    __m64 vsrc, vsrca;
++    uint64_t srcsrc;
++
++    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
++
++    srca = src >> 24;
++    if (src == 0)
++    {
++        pixman_fill_ls (dst_image->bits.bits, dst_image->bits.rowstride,
++                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
++                        dest_x, dest_y, width, height, 0);
++        return;
++    }
++
++    srcsrc = (uint64_t)src << 32 | src;
++
++    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
++    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        load8888r(%2,$f24)
++        store64a($f24,%0)
++        expand_alpha($f24,$f26)
++        store64a($f26,%1)
++        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
++    );
++    while (height--)
++    {
++        dst = dst_line;
++        dst_line += dst_stride;
++        mask = mask_line;
++        mask_line += mask_stride;
++        w = width;
++
++        while (w)
++        {
++            m = *mask;
++
++            if (m)
++            {
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    load32r(%1,$f20)
++                    expand_alpha_rev($f20,$f28)
++                    in($f24,$f28)
++                    store8888r($f8,%0)
++                    :"=r"(*dst):"r"(m):clobber
++                );
++
++            }
++            else
++            {
++                *dst = 0;
++            }
++
++            w--;
++            mask++;
++            dst++;
++        }
++    }
++}
+diff -urN pixman//pixman/pixman-cpu.c Pixman.Loongson//pixman/pixman-cpu.c
+--- pixman//pixman/pixman-cpu.c	2010-12-25 18:46:00.073234000 +0800
++++ Pixman.Loongson//pixman/pixman-cpu.c	2010-12-25 18:39:15.360337000 +0800
+@@ -579,7 +579,9 @@
+     if (pixman_have_mmx ())
+ 	return _pixman_implementation_create_mmx ();
+ #endif
+-
++#ifdef USE_LS
++    return _pixman_implementation_create_ls ();
++#endif
+ #ifdef USE_ARM_NEON
+     if (pixman_have_arm_neon ())
+ 	return _pixman_implementation_create_arm_neon ();
+diff -urN pixman//pixman/pixman-ls.c Pixman.Loongson//pixman/pixman-ls.c
+--- pixman//pixman/pixman-ls.c	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/pixman-ls.c	2010-12-25 18:39:15.386759000 +0800
+@@ -0,0 +1,538 @@
++/*
++* Based on pixman-mmx.c
++* Implemented for loongson 2F only.
++* Free software based on GPL licence.
++* Copyright 2010 WG Ge.
++*/
++
++#ifdef HAVE_CONFIG_H
++#include <config.h>
++#endif
++#include <stdlib.h>
++#include <string.h>
++#include <math.h>
++#include <limits.h>
++#include <stdio.h>
++#include "pixman-private.h"
++#include "pixman-combine32.h"
++#include "primitive.h"
++
++#define __m64 __attribute__ ((aligned (8))) uint64_t
++#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
++#define DECLARE_ALIGNED_8(t, v, ...) DECLARE_ALIGNED(8, t, v)
++
++DECLARE_ALIGNED_8 (const uint64_t, ls_4x00ff ) = 0x00ff00ff00ff00ffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_4x0080 ) = 0x0080008000800080ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_rgb ) = 0x000001f0003f001fULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_unpack_multiplier ) = 0x0000008404100840ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_r ) = 0x000000f800000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_g ) = 0x0000000000fc0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_565_b ) = 0x00000000000000f8ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_0 ) = 0xffffffffffff0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_1 ) = 0xffffffff0000ffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_2 ) = 0xffff0000ffffffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_3 ) = 0x0000ffffffffffffULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_full_alpha ) = 0x00ff000000000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_ffff0000ffff0000 ) = 0xffff0000ffff0000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_0000ffff00000000 ) = 0x0000ffff00000000ULL;
++DECLARE_ALIGNED_8 (const uint64_t, ls_000000000000ffff ) = 0x000000000000ffffULL;
++
++
++pixman_bool_t
++pixman_fill_ls (uint32_t *bits,
++                int stride,
++                int bpp,
++                int x,
++                int y,
++                int width,
++                int height,
++                uint32_t xor)
++{
++    uint64_t fill;
++    uint32_t byte_width;
++    uint8_t *byte_line;
++
++    if (bpp != 16 && bpp != 32 && bpp != 8)
++        return FALSE;
++
++    if (bpp == 8)
++    {
++        stride = stride * (int) sizeof (uint32_t) / 1;
++        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
++        byte_width = width;
++        stride *= 1;
++        xor = (xor & 0xff) * 0x01010101;
++    }
++    else if (bpp == 16)
++    {
++        stride = stride * (int) sizeof (uint32_t) / 2;
++        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
++        byte_width = 2 * width;
++        stride *= 2;
++        xor = (xor & 0xffff) * 0x00010001;
++    }
++    else
++    {
++        stride = stride * (int) sizeof (uint32_t) / 4;
++        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
++        byte_width = 4 * width;
++        stride *= 4;
++    }
++
++    fill = ((uint64_t)xor << 32) | xor;
++
++    __asm__ volatile (
++        ".set arch=loongson2f \n\t"
++        "ldc1 $f24, %0 \n\t"
++        ::"m"(fill):"$f24"
++    );
++    while (height--)
++    {
++        int w;
++        uint8_t *d = byte_line;
++
++        byte_line += stride;
++        w = byte_width;
++
++        while (w >= 1 && ((unsigned long)d & 1))
++        {
++            *(uint8_t *)d = (xor & 0xff);
++            w--;
++            d++;
++        }
++
++        while (w >= 2 && ((unsigned long)d & 3))
++        {
++            *(uint16_t *)d = xor;
++            w -= 2;
++            d += 2;
++        }
++
++        while (w >= 4 && ((unsigned long)d & 7))
++        {
++            *(uint32_t *)d = xor;
++
++            w -= 4;
++            d += 4;
++        }
++
++        while (w >= 64)
++        {
++
++            __asm__ volatile (
++                ".set arch=loongson2f \n\t"
++                "dmfc1 $8, $f24 \n\t"
++                "sd $8 , (%0) \n\t"
++                "sd $8 , 8(%0) \n\t"
++                "sd $8 , 16(%0) \n\t"
++                "sd $8 , 24(%0) \n\t"
++                "sd $8 , 32(%0) \n\t"
++                "sd $8 , 40(%0) \n\t"
++                "sd $8 , 48(%0) \n\t"
++                "sd $8 , 56(%0) \n\t"
++                ::"r"(d):"$8","memory","$f24"
++            );
++            w -= 64;
++            d += 64;
++        }
++
++        while (w >= 4)
++        {
++            *(uint32_t *)d = xor;
++
++            w -= 4;
++            d += 4;
++        }
++        while (w >= 2)
++        {
++            *(uint16_t *)d = xor;
++            w -= 2;
++            d += 2;
++        }
++        while (w >= 1)
++        {
++            *(uint8_t *)d = (xor & 0xff);
++            w--;
++            d++;
++        }
++
++    }
++    return TRUE;
++}
++
++static pixman_bool_t
++pixman_blt_ls (uint32_t *src_bits,
++               uint32_t *dst_bits,
++               int src_stride,
++               int dst_stride,
++               int src_bpp,
++               int dst_bpp,
++               int src_x,
++               int src_y,
++               int dst_x,
++               int dst_y,
++               int width,
++               int height)
++{
++    uint8_t * src_bytes;
++    uint8_t * dst_bytes;
++    int byte_width;
++
++    if (src_bpp != dst_bpp)
++        return FALSE;
++
++    if (src_bpp == 16)
++    {
++        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
++        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
++        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
++        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
++        byte_width = 2 * width;
++        src_stride *= 2;
++        dst_stride *= 2;
++    }
++    else if (src_bpp == 32)
++    {
++        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
++        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
++        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
++        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
++        byte_width = 4 * width;
++        src_stride *= 4;
++        dst_stride *= 4;
++    }
++    else
++    {
++        return FALSE;
++    }
++
++    while (height--)
++    {
++        int w;
++        uint8_t *s = src_bytes;
++        uint8_t *d = dst_bytes;
++        src_bytes += src_stride;
++        dst_bytes += dst_stride;
++        w = byte_width;
++
++        while (w >= 2 && ((unsigned long)d & 3))
++        {
++            *(uint16_t *)d = *(uint16_t *)s;
++            w -= 2;
++            s += 2;
++            d += 2;
++        }
++
++        while (w >= 4 && ((unsigned long)d & 7))
++        {
++            *(uint32_t *)d = *(uint32_t *)s;
++
++            w -= 4;
++            s += 4;
++            d += 4;
++        }
++        if ((unsigned long)s & 7)
++        {
++            while (w >= 64)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    "uld $8 , (%1) \n\t"
++                    "uld $9 , 8(%1) \n\t"
++                    "uld $10, 16(%1) \n\t"
++                    "uld $11, 24(%1) \n\t"
++                    "sd $8 , (%0) \n\t"
++                    "sd $9 , 8(%0) \n\t"
++                    "sd $10, 16(%0) \n\t"
++                    "sd $11, 24(%0) \n\t"
++
++                    "uld $8 , 32(%1) \n\t"
++                    "uld $9 , 40(%1) \n\t"
++                    "uld $10, 48(%1) \n\t"
++                    "uld $11, 56(%1) \n\t"
++                    "sd $8 , 32(%0) \n\t"
++                    "sd $9 , 40(%0) \n\t"
++                    "sd $10, 48(%0) \n\t"
++                    "sd $11, 56(%0) \n\t"
++                    ::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
++                );
++                w -= 64;
++                s += 64;
++                d += 64;
++            }
++        }
++        else
++        {
++            while (w >= 64)
++            {
++
++                __asm__ volatile (
++                    ".set arch=loongson2f \n\t"
++                    "ld $8 , (%1) \n\t"
++                    "ld $9 , 8(%1) \n\t"
++                    "ld $10, 16(%1) \n\t"
++                    "ld $11, 24(%1) \n\t"
++                    "sd $8 , (%0) \n\t"
++                    "sd $9 , 8(%0) \n\t"
++                    "sd $10, 16(%0) \n\t"
++                    "sd $11, 24(%0) \n\t"
++
++                    "ld $8 , 32(%1) \n\t"
++                    "ld $9 , 40(%1) \n\t"
++                    "ld $10, 48(%1) \n\t"
++                    "ld $11, 56(%1) \n\t"
++                    "sd $8 , 32(%0) \n\t"
++                    "sd $9 , 40(%0) \n\t"
++                    "sd $10, 48(%0) \n\t"
++                    "sd $11, 56(%0) \n\t"
++                    ::"r"(d),"r"(s):"$8","$9","$10","$11","memory"
++                );
++                w -= 64;
++                s += 64;
++                d += 64;
++            }
++        }
++
++        while (w >= 4)
++        {
++            *(uint32_t *)d = *(uint32_t *)s;
++
++            w -= 4;
++            s += 4;
++            d += 4;
++        }
++        if (w >= 2)
++        {
++            *(uint16_t *)d = *(uint16_t *)s;
++            w -= 2;
++            s += 2;
++            d += 2;
++        }
++    }
++    return TRUE;
++}
++
++
++#include "pixman-composite-ls.c"
++#include "pixman-combine-ls.c"
++
++static pixman_bool_t
++ls_blt (pixman_implementation_t *imp,
++        uint32_t * src_bits,
++        uint32_t * dst_bits,
++        int src_stride,
++        int dst_stride,
++        int src_bpp,
++        int dst_bpp,
++        int src_x,
++        int src_y,
++        int dst_x,
++        int dst_y,
++        int width,
++        int height)
++{
++    if (!pixman_blt_ls (
++            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
++            src_x, src_y, dst_x, dst_y, width, height))
++    {
++        return _pixman_implementation_blt (
++            imp->delegate,
++            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
++            src_x, src_y, dst_x, dst_y, width, height);
++    }
++
++    return TRUE;
++}
++
++static pixman_bool_t
++ls_fill (pixman_implementation_t *imp,
++         uint32_t * bits,
++         int stride,
++         int bpp,
++         int x,
++         int y,
++         int width,
++         int height,
++         uint32_t xor)
++{
++    if (!pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor))
++    {
++        return _pixman_implementation_fill (
++            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
++    }
++
++    return TRUE;
++}
++
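The fill and blt routines above share a common shape for wide stores: advance the destination byte-, halfword-, and word-wise until it is 8-byte aligned, run a main loop that moves 64 bytes per iteration, then mirror the head in the tail. A minimal scalar sketch of that same shape, without the Loongson asm (the name fill64_sketch is hypothetical, not part of the patch, and the 1- and 2-byte head/tail steps are omitted for brevity):

#include <stdint.h>

/* Sketch only: head-align / 64-byte body / tail, as in pixman_fill_ls.
 * `fill` is the pattern already replicated into a 64-bit value. */
static void
fill64_sketch (uint8_t *d, int w, uint64_t fill)
{
    while (w >= 4 && ((uintptr_t)d & 7))   /* align head to 8 bytes */
    {
        *(uint32_t *)d = (uint32_t)fill;
        d += 4; w -= 4;
    }
    while (w >= 64)                        /* 64 bytes per iteration */
    {
        uint64_t *q = (uint64_t *)d;
        q[0] = q[1] = q[2] = q[3] = fill;
        q[4] = q[5] = q[6] = q[7] = fill;
        d += 64; w -= 64;
    }
    while (w >= 4)                         /* word tail */
    {
        *(uint32_t *)d = (uint32_t)fill;
        d += 4; w -= 4;
    }
}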
++static void
++ls_composite_copy_area (pixman_implementation_t *imp,
++                        pixman_op_t op,
++                        pixman_image_t * src_image,
++                        pixman_image_t * mask_image,
++                        pixman_image_t * dst_image,
++                        int32_t src_x,
++                        int32_t src_y,
++                        int32_t mask_x,
++                        int32_t mask_y,
++                        int32_t dest_x,
++                        int32_t dest_y,
++                        int32_t width,
++                        int32_t height)
++{
++    pixman_blt_ls (src_image->bits.bits,
++                   dst_image->bits.bits,
++                   src_image->bits.rowstride,
++                   dst_image->bits.rowstride,
++                   PIXMAN_FORMAT_BPP (src_image->bits.format),
++                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
++                   src_x, src_y, dest_x, dest_y, width, height);
++}
++
++
++static const pixman_fast_path_t ls_fast_paths[] =
++{
++
++//these are implemented so far
++#if 1
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, ls_composite_over_x888_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, ls_composite_over_x888_8_8888 ),
++#endif
++
++#if 1
++//over_8888_0565: significant perf improvement, slightly better L1, L2, 30% better RT
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, ls_composite_over_8888_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, ls_composite_over_8888_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, ls_composite_over_pixbuf_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, ls_composite_over_pixbuf_0565 ),
++
++//big improvement, some approaching 100%
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, ls_composite_over_n_8888_0565_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, ls_composite_over_n_8888_0565_ca ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, ls_composite_over_n_8_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, ls_composite_over_n_8_0565 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, ls_composite_over_n_0565 ),
++
++//unable to bench with lowlevel bench, believe it is a gain in perf
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, ls_composite_over_x888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, ls_composite_over_x888_n_8888 ),
++
++//performance regresses 30% in L1, L2, but significant improvement in RT
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, ls_composite_over_8888_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, ls_composite_over_pixbuf_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, ls_composite_over_pixbuf_8888 ),
++
++//same performance in L1, L2, but significant improvement in RT 30-40%
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, ls_composite_over_8888_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, ls_composite_over_8888_n_8888 ),
++
++//significant perf improvement 20%
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, ls_composite_over_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, ls_composite_over_n_8_8888 ),
++
++//3x perf improvement
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ),
++    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ),
++
++//significant performance boost
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, ls_composite_over_n_8888 ),
++    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, ls_composite_over_n_8888 ),
++//simple add, expect better perf in generic code
++//  PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, ls_composite_add_8888_8888 ),
++//  PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, ls_composite_add_8888_8888 ),
++
++// FIXME: copying memory is not better than generic code
++#if 0
++    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
++    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
++#endif
++
++//significant improvement
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, ls_composite_src_n_8_8888 ),
++    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, ls_composite_src_n_8_8888 ),
++
++#endif
++
++//these are not yet implemented
++
++#if 0
++
++    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, ls_composite_add_8000_8000 ),
++    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, ls_composite_add_n_8_8 ),
++    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, ls_composite_in_8_8 ),
++    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, ls_composite_in_n_8_8 ),
++#endif
++
++
++    { PIXMAN_OP_NONE },
++};
++
++pixman_implementation_t *
++_pixman_implementation_create_ls (void)
++{
++    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
++    pixman_implementation_t *imp = _pixman_implementation_create (general, ls_fast_paths);
++
++//Turned on but unable to benchmark.
++#if 1
++    imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u;
++    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u;
++    imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u;
++    imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u;
++    imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u;
++    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u;
++    imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u;
++    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u;
++    imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u;
++    imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u;
++    imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u;
++
++    imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca;
++    imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca;
++    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca;
++    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca;
++    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca;
++    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca;
++    imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca;
++    imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca;
++#endif
++
++//FIXME: blt and fill have not shown better perf than generic code
++#if 0
++    imp->blt = ls_blt;
++    imp->fill = ls_fill;
++#endif
++
++    return imp;
++}
++
+diff -urN pixman//pixman/pixman-private.h Pixman.Loongson//pixman/pixman-private.h
+--- pixman//pixman/pixman-private.h	2010-12-25 18:46:00.102841000 +0800
++++ Pixman.Loongson//pixman/pixman-private.h	2010-12-25 18:39:15.401808000 +0800
+@@ -493,6 +493,11 @@
+ pixman_implementation_t *
+ _pixman_implementation_create_fast_path (void);
+
++#ifdef USE_LS
++pixman_implementation_t *
++_pixman_implementation_create_ls (void);
++#endif
++
+ #ifdef USE_MMX
+ pixman_implementation_t *
+ _pixman_implementation_create_mmx (void);
+diff -urN pixman//pixman/primitive.h Pixman.Loongson//pixman/primitive.h
+--- pixman//pixman/primitive.h	1970-01-01 08:00:00.000000000 +0800
++++ Pixman.Loongson//pixman/primitive.h	2010-12-25 18:39:15.457084000 +0800
+@@ -0,0 +1,214 @@
++/*
++* MMX register usage protocol
++* return result: f8
++* tmp immediate f12
++* tmp register in primitive f14 f16 f18
++* tmp register in pixman f0,f4,f6,f10,f20,f22,
++* globals in function f24, f26, f28,f30
++* Exceptions for load and store:
++* load will specify dest FPR register
++* store will specify src FPR register
++* expand_alpha(_rev) implemented with GPR, dest FPR as the 2nd parameter
++*
++* Special alert: don't use return result $f8 as input, it might be overwritten
++*/
++
++
++/* primitive macros */
++
++#define clobber "$8","$9","$f0","$f2","$f8",\
++ "$f12","$f14","$f16","$f18","$f20",\
++ "$f22","$f24","$f26","$f28","$f30"
++
++#define DMTC1_IMM(regc1,imm) \
++ "dli $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++#define MTC1_IMM(regc1,imm) \
++ "li $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++
++#define save_to(reg1) "mov.d "#reg1", $f8 \n\t"
++#define zero(reg1) "xor "#reg1","#reg1","#reg1" \n\t"
++
++#define load32(sp,reg1) \
++ "ulw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32a(sp,reg1) \
++ "lw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32r(sp,reg1) \
++ "dmtc1 "#sp", "#reg1" \n\t"
++
++#define load64(sp,reg1) \
++ "uld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load64a(sp,reg1) \
++ "ld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++
++#define store32(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usw $8, "#sp" \n\t"
++
++#define store32r(reg1,sp) \
++ "dmfc1 "#sp", "#reg1" \n\t"
++
++#define store32a(reg1,sp) \
++ "swc1 "#reg1", "#sp" \n\t"
++
++#define store64(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usd $8, "#sp" \n\t"
++
++#define store64a(reg1,sp) \
++ "sdc1 "#reg1", "#sp" \n\t"
++
++#define load8888(sp,reg1) \
++ load64(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888r(sp,reg1) \
++ load32r(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888a(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888ah(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh "#reg1", "#reg1", $f12 \n\t"
++
++#define store8888(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64(reg1,sp)
++
++#define store8888r(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store32r(reg1,sp)
++
++#define store8888a(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64a(reg1,sp)
++
++#define pack8888(reg1,reg2) \
++ "packushb $f8, "#reg1","#reg2" \n\t"
++
++#define unpack8888(reg1,reg2) \
++ "punpcklbh $f8, "#reg1","#reg2" \n\t"
++
++
++#define negate(sreg,dreg) \
++ DMTC1_IMM($f12, 0x00ff00ff00ff00ff)\
++ "xor "#dreg", "#sreg", $f12 \n\t"
++
++#define pix_add(reg1,reg2) \
++ "paddusb $f8, "#reg1", "#reg2" \n\t"
++
++#define pix_multiply(reg1,reg2) \
++ "pmullh $f14, "#reg1", "#reg2" \n\t " \
++ DMTC1_IMM($f12, 0x0080008000800080) \
++ "paddush $f14, $f14, $f12 \n\t "\
++ MTC1_IMM($f12, 8) \
++ "psrlh $f16, $f14, $f12 \n\t" \
++ "paddush $f14, $f14, $f16 \n\t" \
++ "psrlh $f8, $f14, $f12 \n\t"
++
++#define pix_add_mul(reg1,reg2,reg3,reg4) \
++ pix_multiply(reg1,reg2) \
++ "mov.d $f18, $f8 \n\t" \
++ pix_multiply(reg3,reg4) \
++ pix_add($f18,$f8)
++
++#define expand_alpha(sreg,dreg) \
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
++#define expand_alpha_rev(sreg,dreg)\
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsll32 $8, $8, 16 \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
++#define expand8888(reg1,pos) expand8888_##pos(reg1)
++
++#define expand8888_0(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh $f8, "#reg1", $f12 \n\t"
++
++#define expand8888_1(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh $f8, "#reg1", $f12 \n\t"
++
++#define expandx888(reg1,pos) \
++ expand8888(reg1,pos) \
++ DMTC1_IMM($f12, 0x00ff000000000000) \
++ "or $f8, $f8, $f12 \n\t"
++
++#define invert_colors(reg1) \
++ DMTC1_IMM($f12, 0xffff0000ffff0000) \
++ "and $f14, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x000000000000ffff) \
++ "and $f16, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x0000ffff00000000) \
++ "and $f18, "#reg1", $f12 \n\t" \
++ MTC1_IMM($f12, 32) \
++ "dsll $f16, $f16, $f12 \n\t" \
++ "dsrl $f18, $f18, $f12 \n\t" \
++ "or $f14, $f14, $f16 \n\t" \
++ "or $f8, $f14, $f18 \n\t"
++
++#define over(reg1,reg2,reg3) \
++ negate(reg2,$f8) \
++ pix_multiply(reg3, $f8)\
++ pix_add(reg1, $f8)
++
++
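over() is the premultiplied Porter-Duff OVER: negate the broadcast source alpha, scale the destination by it with pix_multiply, then add the source with byte saturation (paddusb). A per-pixel scalar sketch on packed a8r8g8b8, reusing the lane helper from the pix_multiply note (both names are mine, not from the patch):

#include <stdint.h>

static uint8_t
mul_un8_sketch (uint8_t x, uint8_t a)      /* exact (x * a) / 255 */
{
    unsigned t = (unsigned)x * a + 0x80;
    return (uint8_t)((t + (t >> 8)) >> 8);
}

/* src + dst * (255 - srca) / 255, channels added with saturation. */
static uint32_t
over_sketch (uint32_t src, uint32_t dst)
{
    uint8_t  ia  = 0xff - (uint8_t)(src >> 24);  /* negate() of alpha */
    uint32_t out = 0;
    int      sh;

    for (sh = 0; sh < 32; sh += 8)
    {
        unsigned s = (src >> sh) & 0xff;
        unsigned d = mul_un8_sketch ((dst >> sh) & 0xff, ia);
        unsigned t = s + d;
        out |= (uint32_t)(t > 0xff ? 0xff : t) << sh;
    }
    return out;
}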
++#define over_rev_non_pre(reg1,reg2) \
++ expand_alpha(reg1,$f0) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f2, $f0, $f12 \n\t" \
++ invert_colors(reg1) \
++ pix_multiply($f8,$f2) \
++ save_to($f2) \
++ over($f2, $f0, reg2)
++
++#define in(reg1,reg2) pix_multiply(reg1,reg2)
++
++#define in_over_full_src_alpha(reg1,reg2,reg3) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f0, "#reg1", $f12 \n\t" \
++ in($f0,reg2) \
++ save_to($f0) \
++ over($f0,reg2,reg3)
++
++#define in_over(reg1,reg2,reg3,reg4) \
++ in(reg1,reg3) \
++ "mov.d $f0, $f8 \n\t" \
++ pix_multiply(reg2,reg3) \
++ "mov.d $f2, $f8 \n\t" \
++ over($f0,$f2,reg4)
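in_over chains the two primitives: both the source and its expanded alpha are scaled by the mask, and the result is composited OVER the destination, i.e. in_over(s, sa, m, d) = over(in(s, m), sa*m, d). One channel of that composition in scalar form (hypothetical names; mul is the same divide-by-255 lane as above):

static unsigned
mul (unsigned x, unsigned a)
{
    unsigned t = x * a + 0x80;
    return (t + (t >> 8)) >> 8;
}

/* One 8-bit channel of in_over: over (in (s, m), sa * m, d). */
static unsigned
in_over_channel (unsigned s, unsigned sa, unsigned m, unsigned d)
{
    unsigned si = mul (s, m);               /* in(src, mask)     */
    unsigned ai = mul (sa, m);              /* mask-scaled alpha */
    unsigned t  = si + mul (d, 255 - ai);   /* over()            */
    return t > 255 ? 255 : t;
}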
++
++
|