diff -urN pixman//configure.ac Pixman.Loongson//configure.ac --- pixman//configure.ac 2010-12-25 18:46:00.018699000 +0800 +++ Pixman.Loongson//configure.ac 2010-12-25 18:39:15.298778000 +0800 @@ -264,6 +264,43 @@ ]) dnl =========================================================================== +dnl Check for Loongson SIMD + +have_loongson_intrinsics=no +AC_MSG_CHECKING(whether to use Loongson SIMD intrinsics) + +AC_COMPILE_IFELSE([ +#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) +error "Need GCC >= 4.4 for Loongson SIMD compilation" +#endif +int main () { + /* Test with a loongson SIMD instruction. */ + asm volatile ( ".set arch = loongson2f \n\t" "and \$f0, \$f0, \$f0 \n\t" : : : "cc", "memory" ); + return 0; +}], have_loongson_intrinsics=yes) + + +AC_ARG_ENABLE(loongson, + [AC_HELP_STRING([--disable-loongson], + [disable Loongson fast paths])], + [enable_loongson=$enableval], [enable_loongson=auto]) + +if test $enable_loongson = no ; then + have_loongson_intrinsics=disabled +fi + +if test $have_loongson_intrinsics = yes ; then + AC_DEFINE(USE_LS, 1, [use Loongson compiler intrinsics]) +fi + +AC_MSG_RESULT($have_loongson_intrinsics) +if test $enable_loongson = yes && test $have_loongson_intrinsics = no ; then + AC_MSG_ERROR([Loongson intrinsics not detected]) +fi + +AM_CONDITIONAL(USE_LS, test $have_loongson_intrinsics = yes) + +dnl =========================================================================== dnl Check for MMX if test "x$MMX_CFLAGS" = "x" ; then diff -urN pixman//pixman/Makefile.am Pixman.Loongson//pixman/Makefile.am --- pixman//pixman/Makefile.am 2010-12-25 18:46:00.025027000 +0800 +++ Pixman.Loongson//pixman/Makefile.am 2010-12-25 18:39:15.303599000 +0800 @@ -55,6 +55,19 @@ pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h +# loongson code +if USE_LS +noinst_LTLIBRARIES += libpixman-ls.la +libpixman_ls_la_SOURCES = \ + pixman-ls.c +libpixman_ls_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS) +libpixman_ls_la_LIBADD = $(DEP_LIBS) +libpixman_1_la_LDFLAGS += $(LS_LDFLAGS) +libpixman_1_la_LIBADD += libpixman-ls.la + +ASM_CFLAGS_ls=$(LS_CFLAGS) +endif + # mmx code if USE_MMX noinst_LTLIBRARIES += libpixman-mmx.la diff -urN pixman//pixman/pixman-combine-ls.c Pixman.Loongson//pixman/pixman-combine-ls.c --- pixman//pixman/pixman-combine-ls.c 1970-01-01 08:00:00.000000000 +0800 +++ Pixman.Loongson//pixman/pixman-combine-ls.c 2010-12-25 18:39:15.344171000 +0800 @@ -0,0 +1,911 @@ +static force_inline uint32_t +combine (const uint32_t *src, const uint32_t *mask) +{ + uint32_t ssrc = *src; + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + store8888r($f8,%0) + :"+r"(ssrc):"r"(*mask):clobber + ); + } + return ssrc; +} + +static void +ls_combine_saturate_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + uint32_t s = combine (src, mask); + uint32_t d = *dest; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + :"+r"(d):"r"(s):clobber + ); + + uint32_t sa = s >> 24; + uint32_t da = ~d >> 24; + + if (sa > da) + { + uint32_t dds = DIV_UN8 (da, sa) << 24; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f24) + 
expand_alpha($f24,$f24) + pix_multiply($f22,$f24) + save_to($f22) + ::"r"(dds):clobber + ); + } + __asm__ volatile ( + ".set arch=loongson2f \n\t" + pix_add($f20,$f22) + store8888r($f8,%0) + :"=r"(*dest)::clobber + ); + + ++src; + ++dest; + if (mask) + mask++; + } +} +static void +ls_combine_out_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + negate($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + negate($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_out_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f20) + negate($f20,$f20) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f20) + negate($f20,$f20) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_out_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + negate($f26,$f26) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_out_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f20,$f28) + pix_multiply($f22,$f28) + save_to($f22) + negate($f22,$f22) + pix_multiply($f24,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + + +static void +ls_combine_atop_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + 
load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + + mask++; + }else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_atop_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end; + + end = dest + width; + + while (dest < end) + { + if (mask){ + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + } +} + + +static void +ls_combine_atop_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f22,$f22) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_atop_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f26,$f26) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_xor_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + negate($f26,$f26) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + expand_alpha($f24,$f28) + 
negate($f26,$f26) + negate($f28,$f28) + pix_add_mul($f20,$f28,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + + } +} + +static void +ls_combine_xor_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f26) + expand_alpha($f20,$f28) + pix_multiply($f20,$f22) + save_to($f20) + pix_multiply($f22,$f28) + save_to($f22) + negate($f26,$f26) + negate($f22,$f22) + pix_add_mul($f24,$f22,$f20,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + + +static void +ls_combine_in_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + } else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f20,$f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + } + ++dest; + ++src; + } +} + +static void +ls_combine_in_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f20,$f20) + pix_multiply($f22,$f20) + save_to($f26) + pix_multiply($f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} + +static void +ls_combine_in_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + } else { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_in_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + load8888r(%0,$f24) + expand_alpha($f24,$f24) + pix_multiply($f20,$f22) + save_to($f26) + pix_multiply($f26,$f24) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + } 
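
All of the *_u combiners above share one shape: combine () folds the unified mask into the source (src x mask.alpha), and the per-pixel inline asm then applies the Porter-Duff operator to four 16-bit channels at once. As a reference for what that asm computes, here is a minimal scalar sketch of the IN case (dest = src x dest.alpha); mul_un8, mul_un8x4 and scalar_combine_in_u are illustrative names only, not part of the patch (pixman-combine32.h has equivalent MUL_UN8-style macros).

/* Scalar reference for ls_combine_in_u; names are illustrative only. */
#include <stdint.h>

static inline uint8_t
mul_un8 (uint8_t a, uint8_t b)      /* rounded a*b/255, as in pix_multiply */
{
    uint16_t t = a * b + 0x80;
    return (t + (t >> 8)) >> 8;
}

static inline uint32_t
mul_un8x4 (uint32_t x, uint8_t a)   /* apply mul_un8 to all four channels */
{
    uint32_t r = 0;
    int shift;

    for (shift = 0; shift < 32; shift += 8)
        r |= (uint32_t) mul_un8 ((x >> shift) & 0xff, a) << shift;
    return r;
}

static void
scalar_combine_in_u (uint32_t *dest, const uint32_t *src,
                     const uint32_t *mask, int width)
{
    int i;

    for (i = 0; i < width; i++)
    {
        uint32_t s = src[i];

        if (mask)                                 /* unified mask: src *= mask.alpha */
            s = mul_un8x4 (s, mask[i] >> 24);

        dest[i] = mul_un8x4 (s, dest[i] >> 24);   /* IN: src * dest.alpha */
    }
}
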
+static void +ls_combine_src_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + pix_multiply($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++mask; + ++dest; + } + +} + + +static void +ls_combine_over_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + uint32_t ssrc = combine (src, mask); + uint32_t a = ssrc >> 24; + + if (a == 0xff) + { + *dest = ssrc; + } + else if (ssrc) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + expand_alpha($f20,$f24) + load8888r(%0,$f26) + over($f20,$f24,$f26) + store8888r($f8,%0) + :"+r"(*dest):"r"(ssrc):clobber + ); + } + + ++dest; + ++src; + if (mask) + ++mask; + } +} + +static void +ls_combine_over_reverse_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f26) + expand_alpha($f26,$f28) + over($f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + + load8888r(%0,$f26) + expand_alpha($f26,$f28) + over($f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + + +static void +ls_combine_over_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + expand_alpha($f22,$f26) + in_over($f22,$f26,$f24,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + +} + +static void +ls_combine_over_reverse_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + in($f22,$f24) + save_to($f22) + expand_alpha($f20,$f28) + over($f20,$f28,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } + +} + +static void +ls_combine_add_u (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = dest + width; + + while (dest < end) + { + + if (mask) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f22) + load8888r(%1,$f20) + expand_alpha($f22,$f22) + pix_multiply($f20,$f22) + save_to ($f20) + + load8888r(%0,$f22) + pix_add($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + mask++; + }else{ + __asm__ volatile ( + ".set arch=loongson2f \n\t" + 
load8888r(%1,$f20) + + load8888r(%0,$f22) + pix_add($f20,$f22) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src):clobber + ); + + } + ++dest; + ++src; + } +} + +static void +ls_combine_add_ca (pixman_implementation_t *imp, + pixman_op_t op, + uint32_t * dest, + const uint32_t * src, + const uint32_t * mask, + int width) +{ + const uint32_t *end = src + width; + + while (src < end) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + pix_multiply($f22,$f24) + save_to($f22) + pix_add($f22,$f20) + store8888r($f8,%0) + :"+r"(*dest):"r"(*src),"r"(*mask):clobber + ); + ++src; + ++dest; + ++mask; + } +} diff -urN pixman//pixman/pixman-composite-ls.c Pixman.Loongson//pixman/pixman-composite-ls.c --- pixman//pixman/pixman-composite-ls.c 1970-01-01 08:00:00.000000000 +0800 +++ Pixman.Loongson//pixman/pixman-composite-ls.c 2010-12-25 18:39:15.356667000 +0800 @@ -0,0 +1,967 @@ +static void +ls_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + + uint32_t *src, *src_line; + uint32_t *dst, *dst_line; + uint8_t *mask, *mask_line; + int src_stride, mask_stride, dst_stride; + uint32_t m; + uint32_t s, d; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + while (w--) + { + m = *mask++; + if (m) + { + s = *src | 0xff000000; + + if (m == 0xff) + { + *dst = s; + } + else + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + load8888r(%2,$f24) + expand_alpha($f22,$f26) + expand_alpha_rev($f24,$f28) + in_over($f22,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(s),"r"(m):clobber + ); + +// __m64 sa = expand_alpha (s); +// __m64 vm = expand_alpha_rev (to_m64 (m)); +// __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); +// *dst = store8888 (vdest); + + } + } + src++; + dst++; + } + } +} + + + + + +static void +ls_composite_over_8888_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t s; + int dst_stride, src_stride; + uint8_t a; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src; + a = s >> 24; + + if (a == 0xff) + { + *dst = s; + } + else if (s) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + load8888r(%0,$f20) + expand_alpha($f24,$f26) + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + } + 
dst++; + src++; + + } + } +} + + +static void +ls_composite_over_8888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + __m64 srca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888(%1,$f24) + store64a($f24,%0) + :"=m"(vmask):"m"(mask):clobber + ); + + srca = ls_4x00ff; + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f22) + expand_alpha($f20,$f28) + in_over($f20,$f28,$f24,$f22) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + + w--; + dst++; + src++; + } + } +} + +static void +ls_composite_over_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64($f24,%0) + expand_alpha($f24,$f26) + store64($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f28) + over($f24,$f26,$f28) + store8888r($f8,%0) + :"+r"(*dst)::clobber + ); + + w--; + dst++; + } + } +} + +static void +ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64($f24,%0) + expand_alpha($f24,$f26) + store64($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + int twidth = width; + uint32_t *p = (uint32_t *)mask_line; + uint32_t *q = (uint32_t *)dst_line; + + while (twidth) + { + + if (*p) + { + __asm__ volatile ( + ".set 
arch=loongson2f \n\t" + load8888r(%0,$f28) + load8888r(%1,$f20) + in_over($f24,$f26,$f20,$f28) + store8888r($f8,%0) + :"+r"(*q):"r"(*p):clobber + ); + } + twidth--; + p++; + q++; + } + + dst_line += dst_stride; + mask_line += mask_stride; + } +} + + +static void +ls_composite_over_n_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca; + uint32_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca; + uint64_t srcsrc; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + srcsrc = (uint64_t)src << 32 | src; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w) + { + uint32_t m = *mask; + + if (m) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load32r(%1,$f22) + expand_alpha_rev($f22,$f28) + in_over($f24,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(m):clobber + ); + } + + w--; + mask++; + dst++; + } + } + +} + +static void +ls_composite_over_x888_n_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + uint32_t mask; + __m64 vmask; + int dst_stride, src_stride; + int32_t w; + __m64 srca; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); + + mask &= 0xff000000; + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + store64a($f24,%0) + :"=m"(vmask):"r"(mask):clobber + ); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load64a(%1,$f26) + store64a($f26,%0) + :"=m"(srca):"m"(ls_4x00ff):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + uint32_t src_tmp = *src | 0xff000000; + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f22) + in_over($f20,$f26,$f24,$f22) + store8888r($f8,%0) + :"+r"(*dst):"r"(src_tmp):clobber + ); + + w--; + dst++; + src++; + } + } +} + + +static void +ls_composite_over_8888_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t d; + 
uint32_t *src_line, *src, s; + uint8_t a; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w--) + { + s = *src++; + a = s >> 24; + if (s) + { + if (a == 0xff) + { + d = s; + } + else + { + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f24) + load8888r(%0,$f20) + expand_alpha($f24,$f26) + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(s):clobber + ); + + + } + *dst = CONVERT_8888_TO_0565 (d); + } + dst++; + } + } +} + +static void +ls_composite_over_n_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src; + uint32_t d; + uint16_t *dst_line, *dst; + int32_t w; + int dst_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + w = width; + + while (w) + { + + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + + over($f24,$f26,$f20) + store8888r($f8,%0) + :"+r"(d)::clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + w--; + dst++; + } + } +} + +static void +ls_composite_over_n_8_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, m, d; + uint16_t *dst_line, *dst; + uint8_t *mask_line, *mask; + int dst_stride, mask_stride; + int32_t w; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + w = width; + + while (w) + { + m = *mask; + d = *dst; + + if (m) + { + + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load32r(%1,$f22) + expand_alpha_rev($f22,$f28) + in_over($f24,$f26,$f28,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(m):clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + } + + w--; + mask++; + dst++; + } + } +} + +static void +ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, + pixman_op_t op, + 
pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t src, srca, m, d; + uint16_t *dst_line; + uint32_t *mask_line; + int dst_stride, mask_stride; + __m64 vsrc, vsrca; + + src = _pixman_image_get_solid (src_image, dst_image->bits.format); + + srca = src >> 24; + if (src == 0) + return; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%2,$f24) + store64a($f24,%0) + expand_alpha($f24,$f26) + store64a($f26,%1) + :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber + ); + + while (height--) + { + int twidth = width; + uint32_t *p = (uint32_t *)mask_line; + uint16_t *q = (uint16_t *)dst_line; + + while (twidth) + { + + m = *(uint32_t *)p; + d = *q; + + if (m) + { + + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%0,$f20) + load8888r(%1,$f22) + in_over($f24,$f26,$f22,$f20) + store8888r($f8,%0) + :"+r"(d):"r"(m):clobber + ); + + *q = CONVERT_8888_TO_0565 (d); + + } + + twidth--; + p++; + q++; + } + + mask_line += mask_stride; + dst_line += dst_stride; + } +} +static void +ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *dst_line, *dst; + uint32_t *src_line, *src; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f22) + load8888r(%0,$f20) + over_rev_non_pre($f22,$f20) + store8888r($f8,%0) + :"+r"(*dst):"r"(*src):clobber + ); + + w--; + dst++; + src++; + } + } +} +static void +ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint16_t *dst_line, *dst; + uint32_t *src_line, *src, d; + int dst_stride, src_stride; + int32_t w; + + PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + +#if 0 + /* FIXME */ + assert (src_image->drawable == mask_image->drawable); +#endif + + while (height--) + { + dst = dst_line; + dst_line += dst_stride; + src = src_line; + src_line += src_stride; + w = width; + + while (w) + { + + d = *dst; + d = CONVERT_0565_TO_0888 (d); + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + load8888r(%1,$f20) + load8888r(%0,$f24) + over_rev_non_pre($f20,$f24) + store8888r($f8,%0) + :"+r"(d):"r"(*src):clobber + ); + + *dst = CONVERT_8888_TO_0565 (d); + + w--; + 
dst++;
+            src++;
+        }
+    }
+}
+
+static void
+ls_composite_src_n_8_8888 (pixman_implementation_t *imp,
+                           pixman_op_t op,
+                           pixman_image_t * src_image,
+                           pixman_image_t * mask_image,
+                           pixman_image_t * dst_image,
+                           int32_t src_x,
+                           int32_t src_y,
+                           int32_t mask_x,
+                           int32_t mask_y,
+                           int32_t dest_x,
+                           int32_t dest_y,
+                           int32_t width,
+                           int32_t height)
+{
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst, m;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+
+    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+    {
+        pixman_fill_ls (dst_image->bits.bits, dst_image->bits.rowstride,
+                        PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                        dest_x, dest_y, width, height, 0);
+        return;
+    }
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    __asm__ volatile (
+        ".set arch=loongson2f \n\t"
+        load8888r(%2,$f24)
+        store64a($f24,%0)
+        expand_alpha($f24,$f26)
+        store64a($f26,%1)
+        :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber
+    );
+    while (height--)
+    {
+        dst = dst_line;
+        dst_line += dst_stride;
+        mask = mask_line;
+        mask_line += mask_stride;
+        w = width;
+
+        while (w)
+        {
+            m = *mask;
+
+            if (m)
+            {
+                __asm__ volatile (
+                    ".set arch=loongson2f \n\t"
+                    load32r(%1,$f20)
+                    expand_alpha_rev($f20,$f28)
+                    in($f24,$f28)
+                    store8888r($f8,%0)
+                    :"=r"(*dst):"r"(m):clobber
+                );
+
+            }
+            else
+            {
+                *dst = 0;
+            }
+
+            w--;
+            mask++;
+            dst++;
+        }
+    }
+}
diff -urN pixman//pixman/pixman-cpu.c Pixman.Loongson//pixman/pixman-cpu.c
--- pixman//pixman/pixman-cpu.c 2010-12-25 18:46:00.073234000 +0800
+++ Pixman.Loongson//pixman/pixman-cpu.c 2010-12-25 18:39:15.360337000 +0800
@@ -579,7 +579,9 @@
     if (pixman_have_mmx ())
 	return _pixman_implementation_create_mmx ();
 #endif
-
+#ifdef USE_LS
+    return _pixman_implementation_create_ls ();
+#endif
 #ifdef USE_ARM_NEON
     if (pixman_have_arm_neon ())
 	return _pixman_implementation_create_arm_neon ();
diff -urN pixman//pixman/pixman-ls.c Pixman.Loongson//pixman/pixman-ls.c
--- pixman//pixman/pixman-ls.c 1970-01-01 08:00:00.000000000 +0800
+++ Pixman.Loongson//pixman/pixman-ls.c 2010-12-25 18:39:15.386759000 +0800
@@ -0,0 +1,538 @@
+/*
+* Based on pixman-mmx.c
+* Implemented for Loongson 2F only.
+* Free software under the GPL license.
+* Copyright 2010 WG Ge.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+#include
+#include
+#include
+#include
+#include
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "primitive.h"
+
+#define __m64 __attribute__ ((aligned (8))) uint64_t
+#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v
+#define DECLARE_ALIGNED_8(t, v, ...)
DECLARE_ALIGNED(8, t, v) + +DECLARE_ALIGNED_8 (const uint64_t, ls_4x00ff ) = 0x00ff00ff00ff00ffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_4x0080 ) = 0x0080008000800080ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_rgb ) = 0x000001f0003f001fULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_unpack_multiplier ) = 0x0000008404100840ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_r ) = 0x000000f800000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_g ) = 0x0000000000fc0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_565_b ) = 0x00000000000000f8ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_0 ) = 0xffffffffffff0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_1 ) = 0xffffffff0000ffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_2 ) = 0xffff0000ffffffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_mask_3 ) = 0x0000ffffffffffffULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_full_alpha ) = 0x00ff000000000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_ffff0000ffff0000 ) = 0xffff0000ffff0000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_0000ffff00000000 ) = 0x0000ffff00000000ULL; +DECLARE_ALIGNED_8 (const uint64_t, ls_000000000000ffff ) = 0x000000000000ffffULL; + + +pixman_bool_t +pixman_fill_ls (uint32_t *bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + uint64_t fill; + uint32_t byte_width; + uint8_t *byte_line; + + + + if (bpp != 16 && bpp != 32 && bpp != 8) + return FALSE; + + if (bpp == 8) + { + stride = stride * (int) sizeof (uint32_t) / 1; + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); + byte_width = width; + stride *= 1; + xor = (xor & 0xff) * 0x01010101; + } + else if (bpp == 16) + { + stride = stride * (int) sizeof (uint32_t) / 2; + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); + byte_width = 2 * width; + stride *= 2; + xor = (xor & 0xffff) * 0x00010001; + } + else + { + stride = stride * (int) sizeof (uint32_t) / 4; + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); + byte_width = 4 * width; + stride *= 4; + } + + fill = ((uint64_t)xor << 32) | xor; + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "ldc1 $f24, %0 \n\t" + ::"m"(fill):"$f24" + ); + while (height--) + { + int w; + uint8_t *d = byte_line; + + byte_line += stride; + w = byte_width; + + while (w >= 1 && ((unsigned long)d & 1)) + { + *(uint8_t *)d = (xor & 0xff); + w--; + d++; + } + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = xor; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(uint32_t *)d = xor; + + w -= 4; + d += 4; + } + + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "dmfc1 $8, $f24 \n\t" + "sd $8 , (%0) \n\t" + "sd $8 , 8(%0) \n\t" + "sd $8 , 16(%0) \n\t" + "sd $8 , 24(%0) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $8 , 40(%0) \n\t" + "sd $8 , 48(%0) \n\t" + "sd $8 , 56(%0) \n\t" + ::"r"(d):"$8","memory","$f24" + ); + w -= 64; + d += 64; + } + + while (w >= 4) + { + *(uint32_t *)d = xor; + + w -= 4; + d += 4; + } + while (w >= 2) + { + *(uint16_t *)d = xor; + w -= 2; + d += 2; + } + while (w >= 1) + { + *(uint8_t *)d = (xor & 0xff); + w--; + d++; + } + + } + return TRUE; +} + +static pixman_bool_t +pixman_blt_ls (uint32_t *src_bits, + uint32_t *dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + uint8_t * src_bytes; + uint8_t * dst_bytes; + int byte_width; + + if (src_bpp != dst_bpp) + return FALSE; + + if (src_bpp == 16) + { + src_stride = src_stride 
* (int) sizeof (uint32_t) / 2; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; + src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 2 * width; + src_stride *= 2; + dst_stride *= 2; + } + else if (src_bpp == 32) + { + src_stride = src_stride * (int) sizeof (uint32_t) / 4; + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); + byte_width = 4 * width; + src_stride *= 4; + dst_stride *= 4; + } + else + { + return FALSE; + } + + while (height--) + { + int w; + uint8_t *s = src_bytes; + uint8_t *d = dst_bytes; + src_bytes += src_stride; + dst_bytes += dst_stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + if ((unsigned long)s & 7) +{ + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "uld $8 , (%1) \n\t" + "uld $9 , 8(%1) \n\t" + "uld $10, 16(%1) \n\t" + "uld $11, 24(%1) \n\t" + "sd $8 , (%0) \n\t" + "sd $9 , 8(%0) \n\t" + "sd $10, 16(%0) \n\t" + "sd $11, 24(%0) \n\t" + + "uld $8 , 32(%1) \n\t" + "uld $9 , 40(%1) \n\t" + "uld $10, 48(%1) \n\t" + "uld $11, 56(%1) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $9 , 40(%0) \n\t" + "sd $10, 48(%0) \n\t" + "sd $11, 56(%0) \n\t" + ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" + ); + w -= 64; + s += 64; + d += 64; + } +} +else +{ + while (w >= 64) + { + + __asm__ volatile ( + ".set arch=loongson2f \n\t" + "ld $8 , (%1) \n\t" + "ld $9 , 8(%1) \n\t" + "ld $10, 16(%1) \n\t" + "ld $11, 24(%1) \n\t" + "sd $8 , (%0) \n\t" + "sd $9 , 8(%0) \n\t" + "sd $10, 16(%0) \n\t" + "sd $11, 24(%0) \n\t" + + "ld $8 , 32(%1) \n\t" + "ld $9 , 40(%1) \n\t" + "ld $10, 48(%1) \n\t" + "ld $11, 56(%1) \n\t" + "sd $8 , 32(%0) \n\t" + "sd $9 , 40(%0) \n\t" + "sd $10, 48(%0) \n\t" + "sd $11, 56(%0) \n\t" + ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" + ); + w -= 64; + s += 64; + d += 64; + } +} + + while (w >= 4) + { + *(uint32_t *)d = *(uint32_t *)s; + + w -= 4; + s += 4; + d += 4; + } + if (w >= 2) + { + *(uint16_t *)d = *(uint16_t *)s; + w -= 2; + s += 2; + d += 2; + } + } + return TRUE; +} + + +#include "pixman-composite-ls.c" +#include "pixman-combine-ls.c" + +static pixman_bool_t +ls_blt (pixman_implementation_t *imp, + uint32_t * src_bits, + uint32_t * dst_bits, + int src_stride, + int dst_stride, + int src_bpp, + int dst_bpp, + int src_x, + int src_y, + int dst_x, + int dst_y, + int width, + int height) +{ + if (!pixman_blt_ls ( + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height)) + { + return _pixman_implementation_blt ( + imp->delegate, + src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, + src_x, src_y, dst_x, dst_y, width, height); + } + + return TRUE; +} + +static pixman_bool_t +ls_fill (pixman_implementation_t *imp, + uint32_t * bits, + int stride, + int bpp, + int x, + int y, + int width, + int height, + uint32_t xor) +{ + if (!pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor)) + { + return _pixman_implementation_fill ( + imp->delegate, bits, stride, bpp, x, y, width, height, xor); + } + + return TRUE; +} + +static void +ls_composite_copy_area 
(pixman_implementation_t *imp,
+                        pixman_op_t op,
+                        pixman_image_t * src_image,
+                        pixman_image_t * mask_image,
+                        pixman_image_t * dst_image,
+                        int32_t src_x,
+                        int32_t src_y,
+                        int32_t mask_x,
+                        int32_t mask_y,
+                        int32_t dest_x,
+                        int32_t dest_y,
+                        int32_t width,
+                        int32_t height)
+{
+    pixman_blt_ls (src_image->bits.bits,
+                   dst_image->bits.bits,
+                   src_image->bits.rowstride,
+                   dst_image->bits.rowstride,
+                   PIXMAN_FORMAT_BPP (src_image->bits.format),
+                   PIXMAN_FORMAT_BPP (dst_image->bits.format),
+                   src_x, src_y, dest_x, dest_y, width, height);
+}
+
+
+static const pixman_fast_path_t ls_fast_paths[] =
+{
+
+// these are implemented so far
+#if 1
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, ls_composite_over_x888_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, ls_composite_over_x888_8_8888 ),
+#endif
+
+#if 1
+// over_8888_0565: significant perf improvement; slightly better L1, L2, 30% better RT
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, ls_composite_over_8888_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, ls_composite_over_8888_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, ls_composite_over_pixbuf_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, ls_composite_over_pixbuf_0565 ),
+
+// big improvement, some cases close to 100%
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, ls_composite_over_n_8888_0565_ca ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, ls_composite_over_n_8_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, ls_composite_over_n_8_0565 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, ls_composite_over_n_0565 ),
+
+// unable to benchmark with the low-level benchmark; believed to be a perf gain
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, ls_composite_over_x888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, ls_composite_over_x888_n_8888 ),
+
+// performance regresses 30% in L1, L2, but significant improvement in RT
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, ls_composite_over_8888_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, ls_composite_over_pixbuf_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, ls_composite_over_pixbuf_8888 ),
+
+// same performance in L1, L2, but significant improvement in RT (30-40%)
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, ls_composite_over_8888_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, ls_composite_over_8888_n_8888 ),
+
+// significant perf improvement, 20%
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, ls_composite_over_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, ls_composite_over_n_8_8888 ),
+
+// 3x perf improvement
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ),
+
+// significant performance boost
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, ls_composite_over_n_8888 ),
+    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, ls_composite_over_n_8888 ),
+// simple add, generic code is expected to perform better
+//  PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, ls_composite_add_8888_8888 ),
+//  PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, ls_composite_add_8888_8888 ),
+
+// FIXME: memory copies are not better than generic code
+#if 0
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ),
+    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ),
+#endif
+
+// significant improvement
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, ls_composite_src_n_8_8888 ),
+    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, ls_composite_src_n_8_8888 ),
+
+#endif
+
+// these are not yet implemented
+
+#if 0
+
+    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, ls_composite_add_8000_8000 ),
+    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, ls_composite_add_n_8_8 ),
+    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, ls_composite_in_8_8 ),
+    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, ls_composite_in_n_8_8 ),
+#endif
+
+
+    { PIXMAN_OP_NONE },
+};
+
+pixman_implementation_t *
+_pixman_implementation_create_ls (void)
+{
+    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
+    pixman_implementation_t *imp = _pixman_implementation_create (general, ls_fast_paths);
+
+// Turned on but unable to benchmark.
+#if 1
+    imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u;
+    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u;
+    imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u;
+    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u;
+    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u;
+    imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u;
+    imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u;
+
+    imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca;
+    imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca;
+#endif
+
+// FIXME: blt and fill have not shown better perf than generic code
+#if 0
+    imp->blt = ls_blt;
+    imp->fill = ls_fill;
+#endif
+
+    return imp;
+}
+
diff -urN pixman//pixman/pixman-private.h Pixman.Loongson//pixman/pixman-private.h
--- pixman//pixman/pixman-private.h 2010-12-25 18:46:00.102841000 +0800
+++ Pixman.Loongson//pixman/pixman-private.h 2010-12-25 18:39:15.401808000 +0800
@@ -493,6 +493,11 @@
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (void);
+#ifdef USE_LS
+pixman_implementation_t *
+_pixman_implementation_create_ls (void);
+
+#endif
 
 #ifdef USE_MMX
 pixman_implementation_t *
 _pixman_implementation_create_mmx (void);
diff -urN pixman//pixman/primitive.h Pixman.Loongson//pixman/primitive.h
--- pixman//pixman/primitive.h 1970-01-01 08:00:00.000000000 +0800
+++ Pixman.Loongson//pixman/primitive.h 2010-12-25 18:39:15.457084000 +0800
@@ -0,0 +1,214 @@
+/*
+* MMX register usage protocol
+* return result: f8
+* tmp immediate f12
+* tmp register in primitive f14 f16 f18
+* tmp register in pixman f0,f4,f6,f10,f20,f22,
+* globals in function f24, f26, f28,f30
+* Exceptions for load and store:
+* load will specify dest FPR register
+* store will specify src FPR register
+* expand_alpha(_rev) implemented with GPR, dest FPR as the 2nd parameter
+*
+* Special alert: don't use return result $f8 as input, it might be overwritten
+*/
+
+
+/* primitive macros */
+
+#define clobber "$8","$9","$f0","$f2","$f8",\
+                "$f12","$f14","$f16","$f18","$f20",\
+                "$f22","$f24","$f26","$f28","$f30"
+
+#define DMTC1_IMM(regc1,imm) \
+        "dli $8, "#imm" \n\t" \
+        "dmtc1 $8, "#regc1" \n\t"
+
+#define MTC1_IMM(regc1,imm) \
+        "li $8, "#imm" \n\t" \
+        "dmtc1 $8, "#regc1" \n\t"
+
+
+#define save_to(reg1) "mov.d "#reg1", $f8 \n\t"
+#define zero(reg1) "xor "#reg1","#reg1","#reg1" \n\t"
+
+#define load32(sp,reg1) \
+        "ulw $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load32a(sp,reg1) \
+        "lw $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load32r(sp,reg1) \
+        "dmtc1 "#sp", "#reg1" \n\t"
+
+#define load64(sp,reg1) \
+        "uld $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+
+#define load64a(sp,reg1) \
+        "ld $8, "#sp" \n\t" \
+        "dmtc1 $8, "#reg1" \n\t"
+ + +#define store32(reg1,sp) \ + "dmfc1 $8, "#reg1" \n\t" \ + "usw $8, "#sp" \n\t" + +#define store32r(reg1,sp) \ + "dmfc1 "#sp", "#reg1" \n\t" + +#define store32a(reg1,sp) \ + "swc1 "#reg1", "#sp" \n\t" + +#define store64(reg1,sp) \ + "dmfc1 $8, "#reg1" \n\t" \ + "usd $8, "#sp" \n\t" + +#define store64a(reg1,sp) \ + "sdc1 "#reg1", "#sp" \n\t" + +#define load8888(sp,reg1) \ + load64(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888r(sp,reg1) \ + load32r(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888a(sp,reg1) \ + load64a(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh "#reg1", "#reg1", $f12 \n\t" + +#define load8888ah(sp,reg1) \ + load64a(sp,reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpckhbh "#reg1", "#reg1", $f12 \n\t" + +#define store8888(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store64(reg1,sp) + +#define store8888r(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store32r(reg1,sp) + +#define store8888a(reg1,sp) \ + "xor $f12, $f12, $f12 \n\t" \ + "packushb "#reg1", "#reg1", $f12 \n\t" \ + store64a(reg1,sp) + +#define pack8888(reg1,reg2) \ + "packushb $f8, "#reg1","#reg2" \n\t" + +#define unpack8888(reg1,reg2) \ + "punpcklbh $f8, "#reg1","#reg2" \n\t" + + +#define negate(sreg,dreg) \ + DMTC1_IMM($f12, 0x00ff00ff00ff00ff)\ + "xor "#dreg", "#sreg", $f12 \n\t" + +#define pix_add(reg1,reg2) \ + "paddusb $f8, "#reg1", "#reg2" \n\t" + +#define pix_multiply(reg1,reg2) \ + "pmullh $f14, "#reg1", "#reg2" \n\t " \ + DMTC1_IMM($f12, 0x0080008000800080) \ + "paddush $f14, $f14, $f12 \n\t "\ + MTC1_IMM($f12, 8) \ + "psrlh $f16, $f14, $f12 \n\t" \ + "paddush $f14, $f14, $f16 \n\t" \ + "psrlh $f8, $f14, $f12 \n\t" + +#define pix_add_mul(reg1,reg2,reg3,reg4) \ + pix_multiply(reg1,reg2) \ + "mov.d $f18, $f8 \n\t" \ + pix_multiply(reg3,reg4) \ + pix_add($f18,$f8) + +#define expand_alpha(sreg,dreg) \ + "dmfc1 $8, "#sreg" \n\t" \ + "dsrl32 $8, $8, 16 \n\t" \ + "dsll $9, $8, 16 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dsll32 $9, $8, 0 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dmtc1 $8, "#dreg" \n\t" + +#define expand_alpha_rev(sreg,dreg)\ + "dmfc1 $8, "#sreg" \n\t" \ + "dsll32 $8, $8, 16 \n\t" \ + "dsrl32 $8, $8, 16 \n\t" \ + "dsll $9, $8, 16 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dsll32 $9, $8, 0 \n\t" \ + "or $8, $8, $9 \n\t" \ + "dmtc1 $8, "#dreg" \n\t" + +#define expand8888(reg1,pos) expand8888_##pos(reg1) + +#define expand8888_0(reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpcklbh $f8, "#reg1", $f12 \n\t" + +#define expand8888_1(reg1) \ + "xor $f12, $f12, $f12 \n\t" \ + "punpckhbh $f8, "#reg1", $f12 \n\t" + +#define expandx888(reg1,pos) \ + expand8888(reg1,pos) \ + DMTC1_IMM($f12, 0x00ff000000000000) \ + "or $f8, $f8, $f12 \n\t" + +#define invert_colors(reg1) \ + DMTC1_IMM($f12, 0xffff0000ffff0000) \ + "and $f14, "#reg1", $f12 \n\t" \ + DMTC1_IMM($f12, 0x000000000000ffff) \ + "and $f16, "#reg1", $f12 \n\t" \ + DMTC1_IMM($f12, 0x0000ffff00000000) \ + "and $f18, "#reg1", $f12 \n\t" \ + MTC1_IMM($f12, 32) \ + "dsll $f16, $f16, $f12 \n\t" \ + "dsrl $f18, $f18, $f12 \n\t" \ + "or $f14, $f14, $f16 \n\t" \ + "or $f8, $f14, $f18 \n\t" + +#define over(reg1,reg2,reg3) \ + negate(reg2,$f8) \ + pix_multiply(reg3, $f8)\ + pix_add(reg1, $f8) + + +#define over_rev_non_pre(reg1,reg2) \ + expand_alpha(reg1,$f0) \ + DMTC1_IMM($f12,0x00ff000000000000) \ + "or $f2, $f0, $f12 \n\t" \ + invert_colors(reg1) \ + pix_multiply($f8,$f2) 
\ + save_to($f2) \ + over($f2, $f0, reg2) + +#define in(reg1,reg2) pix_multiply(reg1,reg2) + +#define in_over_full_src_alpha(reg1,reg2,reg3) \ + DMTC1_IMM($f12,0x00ff000000000000) \ + "or $f0, "#reg1", $f12 \n\t" \ + in($f0,reg2) \ + save_to($f0) \ + over($f0,reg2,reg3) + +#define in_over(reg1,reg2,reg3,reg4) \ + in(reg1,reg3) \ + "mov.d $f0, $f8 \n\t" \ + pix_multiply(reg2,reg3) \ + "mov.d $f2, $f8 \n\t" \ + over($f0,$f2,reg4) + +
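
A note on the register-level tricks above: after load8888r the pixel sits in an FP register as four 16-bit channels (0A 0R 0G 0B), and expand_alpha broadcasts the alpha lane into all four lanes with GPR shifts; pix_multiply is then the usual rounded x*y/255 ((t + (t >> 8)) >> 8 with t = x*y + 0x80) applied per lane, and over()/in_over() compose these the same way as their pixman-mmx.c counterparts. A minimal scalar model of the alpha broadcast, with an illustrative function name that is not part of the patch:

#include <stdint.h>

/* Scalar model of expand_alpha: pix holds four 16-bit channels
 * (bits 63:48 = alpha); the result has that alpha in every lane. */
static inline uint64_t
expand_alpha_u64 (uint64_t pix)
{
    uint64_t a = pix >> 48;   /* dsrl32 $8, $8, 16: isolate the alpha lane */
    a |= a << 16;             /* dsll/or: alpha in lanes 0 and 1           */
    a |= a << 32;             /* dsll32/or: alpha in all four lanes        */
    return a;
}
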