author | Nicolas Reynolds <fauno@kiwwwi.com.ar> | 2011-06-22 19:50:41 -0300
committer | Nicolas Reynolds <fauno@kiwwwi.com.ar> | 2011-06-22 19:50:41 -0300
commit | 2592adab23ef6c7a48fcc14d03a383f4e3447597 (patch)
tree | 9b091571a06f82d21f29ccbe669062ce96702e0a
parent | 2d8ff6f229ba867a2fc8d1108de40bb4eb299c0d (diff)
More patches
-rw-r--r-- | extra/ffmpeg/PKGBUILD | 55
-rw-r--r-- | extra/ffmpeg/ffmpeg-loongson.patch | 1794
-rw-r--r-- | extra/koffice/PKGBUILD | 12
-rw-r--r-- | extra/pixman/PKGBUILD | 10
-rw-r--r-- | extra/pixman/pixman-loongson2f.patch | 2745
-rw-r--r-- | extra/pygobject/fix-pycairo-capi-declaration.patch | 17
-rw-r--r-- | extra/x264/PKGBUILD | 4
-rw-r--r-- | extra/xulrunner/PKGBUILD | 18
-rw-r--r-- | extra/zziplib/PKGBUILD | 2
-rw-r--r-- | libre/ffmpeg-libre/PKGBUILD | 6
-rw-r--r-- | libre/icecat/PKGBUILD | 3
11 files changed, 4649 insertions, 17 deletions
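
The mips64el support in this commit follows one PKGBUILD idiom throughout: the Loongson patch is listed in source() next to the upstream tarball, and build() applies it only when $CARCH matches, so i686 and x86_64 keep building pristine sources. Below is a minimal sketch of that pattern; "example" and example-loongson.patch are placeholders, not files in this tree (the real instances are in the ffmpeg and pixman diffs that follow).

# Sketch only: package and patch names are hypothetical.
pkgname=example
source=(http://example.org/$pkgname-$pkgver.tar.gz
        example-loongson.patch)

build() {
  cd "$srcdir/$pkgname-$pkgver"

  # Apply the Loongson patch on mips64el only; the short-circuit form
  # mirrors the PKGBUILDs below and is safe here because it is not the
  # last command in the function.
  [ "$CARCH" = "mips64el" ] && patch -Np1 -i "$srcdir/example-loongson.patch"

  ./configure --prefix=/usr
  make
}
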
diff --git a/extra/ffmpeg/PKGBUILD b/extra/ffmpeg/PKGBUILD
new file mode 100644
index 000000000..1bd7fc9b4
--- /dev/null
+++ b/extra/ffmpeg/PKGBUILD
@@ -0,0 +1,55 @@
+# $Id$
+# Maintainer : Ionut Biru <ibiru@archlinux.org>
+# Contributor: Tom Newsom <Jeepster@gmx.co.uk>
+# Contributor: Paul Mattal <paul@archlinux.org>
+
+pkgname=ffmpeg
+pkgver=20110622
+pkgrel=1
+pkgdesc="Complete and free Internet live audio and video broadcasting solution for Linux/Unix"
+arch=('i686' 'x86_64' 'mips64el')
+url="http://ffmpeg.org/"
+license=('GPL')
+depends=('bzip2' 'lame' 'sdl' 'libvorbis' 'xvidcore' 'zlib' 'x264' 'libtheora' 'opencore-amr' 'alsa-lib' 'libvdpau' 'libxfixes' 'schroedinger' 'libvpx' 'libva' 'openjpeg' 'rtmpdump')
+makedepends=('yasm' 'git')
+#git clone git://git.videolan.org/ffmpeg.git
+source=(ftp://ftp.archlinux.org/other/ffmpeg/${pkgname}-${pkgver}.tar.xz
+        ffmpeg-loongson.patch)
+md5sums=('6003afa1f87857db729d697e3ec1be36'
+         '081d03278559a351322157a441fabcf5')
+
+build() {
+  cd "$srcdir/$pkgname"
+
+  [ "$CARCH" = "mips64el" ] && patch -Np1 -i $srcdir/ffmpeg-loongson.patch
+
+  ./configure \
+    --prefix=/usr \
+    --enable-libmp3lame \
+    --enable-libvorbis \
+    --enable-libxvid \
+    --enable-libx264 \
+    --enable-libvpx \
+    --enable-libtheora \
+    --enable-postproc \
+    --enable-shared \
+    --enable-x11grab \
+    --enable-libopencore_amrnb \
+    --enable-libopencore_amrwb \
+    --enable-libschroedinger \
+    --enable-libopenjpeg \
+    --enable-librtmp \
+    --enable-gpl \
+    --enable-version3 \
+    --enable-runtime-cpudetect \
+    --disable-debug
+
+  make
+  make tools/qt-faststart
+  make doc/ff{mpeg,play,server}.1
+
+  make DESTDIR="$pkgdir" install install-man
+  install -D -m755 tools/qt-faststart "$pkgdir/usr/bin/qt-faststart"
+}
+
+# vim:set ts=2 sw=2 et:
diff --git a/extra/ffmpeg/ffmpeg-loongson.patch b/extra/ffmpeg/ffmpeg-loongson.patch
new file mode 100644
index 000000000..501eafd15
--- /dev/null
+++ b/extra/ffmpeg/ffmpeg-loongson.patch
@@ -0,0 +1,1794 @@
+diff --git a/configure b/configure
+index 25e8cef..1d6c652 100755
+--- a/configure
++++ b/configure
+@@ -230,6 +230,7 @@ Advanced options (experts only):
+   --disable-armvfp         disable ARM VFP optimizations
+   --disable-iwmmxt         disable iwmmxt optimizations
+   --disable-mmi            disable MMI optimizations
++  --disable-loongson2mmi   disable LOONGSON2 Multi-Media Instructions usage"
+   --disable-neon           disable neon optimizations
+   --disable-vis            disable VIS optimizations
+   --disable-yasm           disable use of yasm assembler
+@@ -995,6 +996,7 @@ ARCH_EXT_LIST='
+     armvfp
+     iwmmxt
+     mmi
++    loongson2mmi
+     mmx
+     mmx2
+     neon
+@@ -2862,6 +2864,7 @@ if enabled arm; then
+ fi
+ if enabled mips; then
+     echo "MMI enabled               ${mmi-no}"
++    echo "LOONGSON2MMI enabled      ${loongson2mmi-no}"
+ fi
+ if enabled ppc; then
+     echo "AltiVec enabled           ${altivec-no}"
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index add4b10..8244e51 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -1586,6 +1586,8 @@ typedef struct AVCodecContext {
+ #define FF_IDCT_SIMPLENEON    22
+ #define FF_IDCT_SIMPLEALPHA   23
+ #define FF_IDCT_BINK          24
++#define FF_IDCT_LIBMPEG2LOONGSON2 25
++#define FF_IDCT_XVIDLOONGSON2     26
+ 
+ /**
+  * slice count
+diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
+index bbfdb6a..dfc3452 100644
+--- a/libavcodec/dsputil.c
++++ b/libavcodec/dsputil.c
+@@ -4525,6 +4525,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
+     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
+     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
+     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
++
if (HAVE_LOONGSON2MMI) dsputil_init_loongson2(c, avctx); + + for(i=0; i<64; i++){ + if(!c->put_2tap_qpel_pixels_tab[0][i]) +diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h +index d1816e6..1a72ae9 100644 +--- a/libavcodec/dsputil.h ++++ b/libavcodec/dsputil.h +@@ -636,6 +636,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx); + void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx); + void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx); + void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx); ++void dsputil_init_loongson2(DSPContext* c, AVCodecContext *avctx); + + void ff_dsputil_init_dwt(DSPContext *c); + void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx); +diff --git a/libavcodec/loongson2/dsputil_loongson2.c b/libavcodec/loongson2/dsputil_loongson2.c +new file mode 100644 +index 0000000..01bd3ac +--- /dev/null ++++ b/libavcodec/loongson2/dsputil_loongson2.c +@@ -0,0 +1,221 @@ ++/* ++ * Copyright(C) 2006-2010 comcat <jiankemeng@gmail.com> ++ * ++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com> ++ * ++ */ ++ ++#include "dsputil_loongson2.h" ++#include "../simple_idct.h" ++#include "../mpegvideo.h" ++ ++//extern void ff_idct_xvid_loongson2(short *block); ++ ++extern void ff_loongson2_idct(DCTELEM *block); ++extern void ff_idct_xvid_loongson2(short *block); ++ ++static void add_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size) ++{ ++ const DCTELEM *p; ++ uint8_t *pix; ++ int i,j; ++ p = block; ++ pix = pixels; ++ i = 4; ++ j = line_size << 1; ++ __asm __volatile("xor $f14, $f14, $f14\n\t"); ++ do { ++ __asm __volatile( ++// ".set mips3 \n\t" ++ "ldc1 $f0, 0(%2) \n\t" ++ "ldc1 $f2, 8(%2) \n\t" ++ "ldc1 $f4, 16(%2) \n\t" ++ "ldc1 $f6, 24(%2) \n\t" ++ "ldc1 $f8, %0 \n\t" ++ "ldc1 $f12, %1 \n\t" ++ "mov.d $f10, $f8 \n\t" ++ ++ "punpcklbh $f8, $f8, $f14 \n\t" ++ "punpckhbh $f10, $f10, $f14\n\t" ++ ++ "paddsh $f0, $f0, $f8 \n\t" ++ "paddsh $f2, $f2, $f10 \n\t" ++ ++ "mov.d $f10, $f12 \n\t" ++ ++ "punpcklbh $f12, $f12, $f14\n\t" ++ "punpckhbh $f10, $f10, $f14\n\t" ++ ++ "paddsh $f4, $f4, $f12 \n\t" ++ "paddsh $f6, $f6, $f10 \n\t" ++ ++ "packushb $f0, $f0, $f2 \n\t" ++ "packushb $f4, $f4, $f6 \n\t" ++ ++ "sdc1 $f0, %0 \n\t" ++ "sdc1 $f4, %1 \n\t" ++// ".set mips2 \n\t" ++ :"+m"(*pix), "+m"(*(pix+line_size)) ++ :"r"(p) ++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","memory"); ++ pix += j; ++ p += 16; ++ } while (--i); ++ ++} ++ ++static void put_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size) ++{ ++ const DCTELEM *p; ++ uint8_t *pix; ++ int tmp = line_size * 3; ++ p = block; ++ pix = pixels; ++ __asm __volatile ++ ( ++// ".set mips3 \n\t" ++ //"dadd $12, $0, $0\n\t" ++ //"dadd $13, $0, $0\n\t" ++ //"dadd $14, $0, $0\n\t" ++ ++ "ldc1 $f0, 0(%3)\n\t" ++ "ldc1 $f2, 8(%3)\n\t" ++ "ldc1 $f4, 16(%3)\n\t" ++ "ldc1 $f6, 24(%3)\n\t" ++ "ldc1 $f8, 32(%3)\n\t" ++ "ldc1 $f10, 40(%3)\n\t" ++ "ldc1 $f16, 48(%3)\n\t" ++ "ldc1 $f18, 56(%3)\n\t" ++ ++ "packushb $f0, $f0, $f2\n\t" ++ "packushb $f4, $f4, $f6\n\t" ++ "packushb $f8, $f8, $f10\n\t" ++ "packushb $f16, $f16, $f18\n\t" ++ ++ "add $12, %0, %1\n\t" ++ "add $13, $12, %1\n\t" ++ "add $14, %0, %2\n\t" ++ ++ "sdc1 $f0, 0(%0)\n\t" ++ "sdc1 $f4, 0($12)\n\t" ++ "sdc1 $f8, 0($13)\n\t" ++ "sdc1 $f16, 0($14)\n\t" ++// ".set mips2\n\t" ++ : ++ :"r" (pix), "r" (line_size), "r" (tmp), "r"(p) ++ :"$12","$13","$14","$f0","$f2","$f4","$f6","$f8","$f10","$16","$18" ++ ); ++ ++ pix += line_size*4; ++ p += 32; ++ ++ __asm 
__volatile ++ ( ++// ".set mips3 \n\t" ++ ++ "dadd $12, $0, $0\n\t" ++ "dadd $13, $0, $0\n\t" ++ "dadd $14, $0, $0\n\t" ++ "lw $12, %3\n\t" ++ ++ "ldc1 $f0, 0($12)\n\t" ++ "ldc1 $f2, 8($12)\n\t" ++ "ldc1 $f4, 16($12)\n\t" ++ "ldc1 $f6, 24($12)\n\t" ++ "ldc1 $f8, 32($12)\n\t" ++ "ldc1 $f10, 40($12)\n\t" ++ "ldc1 $f16, 48($12)\n\t" ++ "ldc1 $f18, 56($12)\n\t" ++ ++ "packushb $f0, $f0, $f2\n\t" ++ "packushb $f4, $f4, $f6\n\t" ++ "packushb $f8, $f8, $f10\n\t" ++ "packushb $f16, $f16, $f18\n\t" ++ ++ "add $12, %1, %0\n\t" ++ "add $13, $12, %1\n\t" ++ "add $15, %2, %0\n\t" ++ ++ "sdc1 $f0, 0(%0)\n\t" ++ "sdc1 $f4, 0($12)\n\t" ++ ++ "sdc1 $f8, 0($13)\n\t" ++ "sdc1 $f16, 0($15)\n\t" ++// ".set mips2\n\t" ++ : ++ :"r" (pix), "r" (line_size), "r" (tmp), "m"(p) ++ :"$12","$13","$15","$f0","$f2","$f4","$f6","$f8","$f10","$16","$18","memory" ++ ); ++ ++} ++ ++/* ++void put_signed_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *pixels, int line_size) ++{ ++ ++} ++ ++ ++void ff_loongson2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) ++{ ++ ff_loongson2_idct(block); ++ put_pixels_clamped_loongson2(block, dest, line_size); ++} ++ ++void ff_loongson2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) ++{ ++ ff_loongson2_idct(block); ++ add_pixels_clamped_loongson2(block, dest, line_size); ++}*/ ++ ++static void ff_idct_xvid_loongson2_put(uint8_t *dest, int line_size, DCTELEM *block) ++{ ++ ff_idct_xvid_loongson2(block); ++ put_pixels_clamped_loongson2(block, dest, line_size); ++} ++ ++static void ff_idct_xvid_loongson2_add(uint8_t *dest, int line_size, DCTELEM *block) ++{ ++ ff_idct_xvid_loongson2(block); ++ add_pixels_clamped_loongson2(block, dest, line_size); ++} ++ ++void dsputil_init_loongson2(DSPContext *c, AVCodecContext *avctx) ++{ ++ ++ const int idct_algo = avctx->idct_algo; ++ ++/* ++#ifdef CONFIG_ENCODERS ++ const int dct_algo = avctx->dct_algo; ++ if(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_LOONGSON2) ++ c->fdct = ff_fdct_loongson2; ++#endif ++*/ ++ ++#if 0 ++ if(avctx->lowres==0) ++ { ++ if(idct_algo == FF_IDCT_LIBMPEG2LOONGSON2) ++ { ++ c->idct_add = ff_loongson2_idct_add; ++ c->idct_put = ff_loongson2_idct_put; ++ c->idct = ff_loongson2_idct; ++ } ++ else if(idct_algo == FF_IDCT_XVIDLOONGSON2) ++ { ++#endif ++ c->idct_add = ff_idct_xvid_loongson2_add; ++ c->idct_put = ff_idct_xvid_loongson2_put; ++ c->idct = ff_idct_xvid_loongson2; ++ //} ++ //} ++ ++ c->put_pixels_clamped = put_pixels_clamped_loongson2; ++ c->add_pixels_clamped = add_pixels_clamped_loongson2; ++ ++#ifdef CONFIG_ENCODERS ++ dsputil_init_pix_loongson2(c, avctx); ++#endif ++ ++} +diff --git a/libavcodec/loongson2/dsputil_loongson2.d b/libavcodec/loongson2/dsputil_loongson2.d +new file mode 100644 +index 0000000..808f0a3 +--- /dev/null ++++ b/libavcodec/loongson2/dsputil_loongson2.d +@@ -0,0 +1,18 @@ ++libavcodec/loongson2/dsputil_loongson2.o: \ ++ libavcodec/loongson2/dsputil_loongson2.c \ ++ libavcodec/loongson2/dsputil_loongson2.h libavcodec/dsputil.h \ ++ libavutil/intreadwrite.h config.h libavutil/bswap.h \ ++ libavutil/attributes.h libavutil/common.h libavutil/intmath.h \ ++ libavutil/mem.h libavutil/internal.h libavutil/timer.h libavutil/libm.h \ ++ libavutil/mips/intreadwrite.h libavcodec/avcodec.h libavutil/avutil.h \ ++ libavutil/error.h libavutil/avutil.h libavutil/mathematics.h \ ++ libavutil/rational.h libavutil/intfloat_readwrite.h libavutil/log.h \ ++ libavutil/pixfmt.h libavutil/avconfig.h \ ++ libavcodec/loongson2/../simple_idct.h libavcodec/loongson2/../dsputil.h \ ++ 
libavcodec/loongson2/../mpegvideo.h libavcodec/loongson2/../get_bits.h \ ++ libavutil/bswap.h libavutil/common.h libavutil/log.h \ ++ libavcodec/loongson2/../mathops.h libavcodec/loongson2/../mips/mathops.h \ ++ libavcodec/loongson2/../put_bits.h libavcodec/loongson2/../ratecontrol.h \ ++ libavcodec/loongson2/../eval.h libavcodec/loongson2/../parser.h \ ++ libavcodec/loongson2/../avcodec.h libavcodec/loongson2/../mpeg12data.h \ ++ libavutil/rational.h libavcodec/loongson2/../rl.h +diff --git a/libavcodec/loongson2/dsputil_loongson2.h b/libavcodec/loongson2/dsputil_loongson2.h +new file mode 100644 +index 0000000..87c7bd9 +--- /dev/null ++++ b/libavcodec/loongson2/dsputil_loongson2.h +@@ -0,0 +1,3 @@ ++#include "libavcodec/dsputil.h" ++ ++void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx); +diff --git a/libavcodec/loongson2/dsputil_loongson2.o b/libavcodec/loongson2/dsputil_loongson2.o +new file mode 100644 +index 0000000..fca0b55 +Binary files /dev/null and b/libavcodec/loongson2/dsputil_loongson2.o differ +diff --git a/libavcodec/loongson2/idct_loongson2.c b/libavcodec/loongson2/idct_loongson2.c +new file mode 100644 +index 0000000..539cab5 +--- /dev/null ++++ b/libavcodec/loongson2/idct_loongson2.c +@@ -0,0 +1,336 @@ ++/* ++ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> ++ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> ++ * ++ * Copyright (c) 2007-2010 comcat <jiankemeng@gmail.com>. ++ * ++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com> ++ * ++ * Based on i386 ++ */ ++ ++#include "libavutil/common.h" ++#include "dsputil_loongson2.h" ++ ++ ++#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) ++ ++ ++#define ROW_SHIFT 11 ++#define COL_SHIFT 6 ++ ++#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT))) ++#define rounder(bias) {round (bias), round (bias)} ++ ++ ++ ++#define loongson2_table(c1,c2,c3,c4,c5,c6,c7) { c4,c2,-c4,-c2, \ ++ c4,c6,c4,c6, \ ++ c1,c3,-c1,-c5,\ ++ c5,c7,c3,-c7, \ ++ c4,-c6,c4,-c6, \ ++ -c4,c2,c4,-c2, \ ++ c5,-c1,c3,-c1, \ ++ c7,c3,c7,-c5 } ++ ++ ++static inline void loongson2_row_head(int16_t * const row, const int offset, ++ const int16_t * const table) ++{ ++ __asm__ volatile( ++// ".set\tmips3\n" ++ ".set noreorder\n" ++ "ldc1 $f6,%0\n" ++ "ldc1 $f14,%1\n" ++ "ldc1 $f2,%2\n" ++ "ldc1 $f8,%3\n" ++ "dli $12,%4\n" ++ "dmtc1 $12,$f16\n" ++ "mov.d $f4,$f6\n" ++ "mov.d $f10,$f14\n" ++ "pmaddhw $f2,$f2,$f4\n" ++ "pshufh $f6,$f6,$f16\n" ++ ".set reorder\n" ++// ".set\tmips0\n" ++ : ++ :"m"(*(row+offset)),"m"(*(row+offset+4)),"m"(*table),"m"(*(table+4)),"i"(0x4e) ++ :"$f2","$f4","$f6","$f8","$f10","$f14","$f16","$12" ++ ); ++} ++ ++ ++static inline void loongson2_row(const int16_t * const table, ++ const int32_t * const rounder) ++{ ++ __asm__ volatile ( ++// ".set\tmips3\n" ++ ".set\tnoreorder\n" ++ "ldc1 $f0,%0\n" ++ "pmaddhw $f8,$f8,$f6\n" ++ "ldc1 $f16,%1\n" ++ "dli $13,%8\n" ++ "ldc1 $f20,%2\n" ++ "pmaddhw $f0,$f0,$f14\n" ++ "ldc1 $f22,%3\n" ++ "pmaddhw $f4,$f4,$f16\n" ++ "paddw $f2,$f2,$f22\n" ++ "ldc1 $f22,%4\n" ++ "dmtc1 $13,$f16\n" ++ "paddw $f2,$f2,$f8\n" ++ "pmaddhw $f14,$f14,$f22\n" ++ "mov.d $f8,$f2\n" ++ "pshufh $f10,$f10,$f16\n" ++ "ldc1 $f22,%3\n" ++ "pmaddhw $f20,$f20,$f10\n" ++ "ldc1 $f16,%5\n" ++ "paddw $f4,$f4,$f22\n" ++ "paddw $f0,$f0,$f20\n" ++ "dli $12,%6\n" ++ "pmaddhw $f6,$f6,$f16\n" ++ "psubw $f2,$f2,$f0\n" ++ "ldc1 $f16,%7\n" ++ "paddw $f0,$f0,$f8\n" ++ "paddw $f4,$f4,$f6\n" ++ "pmaddhw $f10,$f10,$f16\n" ++ "mov.d $f8,$f4\n" ++ "dmtc1 $12,$f16\n" ++ "paddw 
$f14,$f14,$f10\n" ++ "psraw $f2,$f2,$f16\n" ++ "psraw $f0,$f0,$f16\n" ++ "paddw $f4,$f4,$f14\n" ++ "psubw $f8,$f8,$f14\n" ++ ".set\treorder\n" ++// ".set\tmips0\n" ++ : ++ :"m"(*(table+8)),"m"(*(table+16)),"m"(*(table+12)),"m"(*rounder),"m"(*(table+24)),"m"(*(table+20)),"i"(ROW_SHIFT),"m"(*(table+16)),"i"(0x4e) ++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f14","$f16","$f20","$f22","$12","$13","memory" ++ ); ++} ++ ++static inline void loongson2_row_tail(int16_t * const row, const int store) ++{ ++ __asm__ volatile ( ++// ".set\tmips3\n" ++ ".set\tnoreorder\n" ++ "dli $12,%2\n" ++ "dmtc1 $12,$f16\n" ++ "psraw $f4,$f4,$f16\n" ++ "psraw $f8,$f8,$f16\n" ++ "packsswh $f0,$f0,$f4\n" ++ "packsswh $f8,$f8,$f2\n" ++ "sdc1 $f0,%0\n" ++ "dli $13,%3\n" ++ "dmtc1 $13,$f22\n" ++ "pshufh $f8,$f8,$f22\n" ++ "sdc1 $f8,%1\n" ++ ".set\treorder\n" ++// ".set\tmips0\n" ++ :"=m"(*(row+store)),"=m"(*(row+store+4)) ++ :"i"(ROW_SHIFT),"i"(0xb1) ++ :"$f0","$f2","$f4","$f6","$f8","$f16","$f22","$12","$13","memory" ++ ); ++} ++ ++static inline void loongson2_row_mid(int16_t * const row, const int store, ++ const int offset, ++ const int16_t * const table) ++{ ++ __asm__ volatile ( ++// ".set\tmips3\n" ++ ".set\tnoreorder\n" ++ "ldc1 $f6,%2\n" ++ "dli $12,%3\n" ++ "dmtc1 $12,$f16\n" ++ "psraw $f4,$f4,$f16\n" ++ "ldc1 $f14,%4\n" ++ "psraw $f8,$f8,$f16\n" ++ "packsswh $f0,$f0,$f4\n" ++ "mov.d $f10,$f14\n" ++ "packsswh $f8,$f8,$f2\n" ++ "mov.d $f4,$f6\n" ++ "sdc1 $f0,%0\n" ++ "dli $13,%5\n" ++ "dmtc1 $13,$f22\n" ++ "pshufh $f8,$f8,$f22\n" ++ "ldc1 $f2,%6\n" ++ "sdc1 $f8,%1\n" ++ "pmaddhw $f2,$f2,$f4\n" ++ "ldc1 $f8,%7\n" ++ "dli $12,%8\n" ++ "dmtc1 $12,$f16\n" ++ "pshufh $f6,$f6,$f16\n" ++ ".set\treorder\n" ++// ".set\tmips0\n" ++ :"=m"(*(row+store)),"=m"(*(row+store+4)) ++ : "m"(*(row+offset)),"i"(ROW_SHIFT),"m"(*(row+offset+4)),"i"(0xb1),"m"(*table),"m"(*(table+4)),"i"(0x4e) ++ :"$f0","$f2","$f4","$f6","$f8","$f10","$14","$f16","$f22","$12","$13","memory" ++ ); ++} ++ ++static inline void idct_col(int16_t * const col, const int offset) ++{ ++#define T1 13036 ++#define T2 27146 ++#define T3 43790 ++#define C4 23170 ++ static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; ++ static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; ++ static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; ++ static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; ++ ++ __asm__ volatile ( ++// ".set\tmips3\n" ++ ".set\tnoreorder\n" ++ "ldc1 $f4,%8\n" ++ "ldc1 $f0,%9\n" ++ "mov.d $f6,$f4\n" ++ "ldc1 $f8,%10\n" ++ "pmulhh $f4,$f4,$f0\n" ++ "ldc1 $f14,%11\n" ++ "pmulhh $f6,$f6,$f8\n" ++ "ldc1 $f10,%12\n" ++ "mov.d $f20,$f14\n" ++ "ldc1 $f2,%13\n" ++ "psubsh $f4,$f4,$f8\n" ++ "ldc1 $f8,%14\n" ++ "pmulhh $f14,$f14,$f2\n" ++ "paddsh $f0,$f0,$f6\n" ++ "pmulhh $f20,$f20,$f10\n" ++ "mov.d $f6,$f8\n" ++ "paddsh $f14,$f14,$f2\n" ++ "ldc1 $f16,%15\n" ++ "pmulhh $f8,$f8,$f16\n" ++ "paddsh $f20,$f20,$f10\n" ++ "psubsh $f14,$f14,$f10\n" ++ "paddsh $f20,$f20,$f2\n" ++ "ldc1 $f2,%16\n" ++ "mov.d $f10,$f4\n" ++ "pmulhh $f6,$f6,$f2\n" ++ "psubsh $f4,$f4,$f14\n" ++ "psubsh $f8,$f8,$f2\n" ++ "paddsh $f14,$f14,$f10\n" ++ "sdc1 $f4,%0\n" ++ "mov.d $f10,$f0\n" ++ "ldc1 $f22,%15\n" ++ "paddsh $f6,$f6,$f22\n" ++ "paddsh $f10,$f10,$f20\n" ++ "psubsh $f0,$f0,$f20\n" ++ "mov.d $f20,$f0\n" ++ "ldc1 $f2,%17\n" ++ "paddsh $f0,$f0,$f14\n" ++ "ldc1 $f4,%18\n" ++ "psubsh $f20,$f20,$f14\n" ++ "sdc1 $f10,%1\n" ++ "pmulhh $f0,$f0,$f4\n" ++ "mov.d $f10,$f8\n" ++ "pmulhh $f20,$f20,$f4\n" ++ "ldc1 $f14,%19\n" ++ "mov.d $f4,$f2\n" ++ "psubsh $f2,$f2,$f14\n" ++ "paddsh 
$f4,$f4,$f14\n" ++ "paddsh $f8,$f8,$f2\n" ++ "mov.d $f14,$f4\n" ++ "psubsh $f2,$f2,$f10\n" ++ "paddsh $f14,$f14,$f6\n" ++ "paddsh $f0,$f0,$f0\n" ++ "psubsh $f4,$f4,$f6\n" ++ "paddsh $f20,$f20,$f20\n" ++ "mov.d $f6,$f2\n" ++ "mov.d $f10,$f8\n" ++ "paddsh $f2,$f2,$f20\n" ++ "dli $12,%20\n" ++ "dmtc1 $12,$f16\n" ++ "psrah $f2,$f2,$f16\n" ++ "paddsh $f8,$f8,$f0\n" ++ "psrah $f8,$f8,$f16\n" ++ "psubsh $f10,$f10,$f0\n" ++ "ldc1 $f0,%12\n" ++ "psubsh $f6,$f6,$f20\n" ++ "psrah $f10,$f10,$f16\n" ++ "mov.d $f20,$f14\n" ++ "sdc1 $f8,%2\n" ++ "psrah $f6,$f6,$f16\n" ++ "sdc1 $f2,%3\n" ++ "paddsh $f14,$f14,$f0\n" ++ "ldc1 $f8,%13\n" ++ "psubsh $f20,$f20,$f0\n" ++ "psrah $f14,$f14,$f16\n" ++ "mov.d $f2,$f4\n" ++ "sdc1 $f6,%1\n" ++ "psubsh $f2,$f2,$f8\n" ++ "psrah $f20,$f20,$f16\n" ++ "paddsh $f8,$f8,$f4\n" ++ "sdc1 $f14,%4\n" ++ "psrah $f2,$f2,$f16\n" ++ "sdc1 $f10,%5\n" ++ "psrah $f8,$f8,$f16\n" ++ "sdc1 $f20,%6\n" ++ "sdc1 $f2,%7\n" ++ "sdc1 $f8,%0\n" ++ ".set\treorder\n" ++// ".set\tmips0\n" ++ :"=m"(*(col+offset+3*8)),"=m"(*(col+offset+5*8)),"=m"(*(col+offset+1*8)),"=m"(*(col+offset+2*8)),"=m"(*(col+offset+0*8)),"=m"(*(col+offset+6*8)),"=m"(*(col+offset+7*8)),"=m"(*(col+offset+4*8)) ++ :"m"(*_T1),"m"(*(col+offset+1*8)),"m"(*(col+offset+7*8)),"m"(*_T3),"m"(*(col+offset+5*8)),"m"(*(col+offset+3*8)),"m"(*_T2),"m"(*(col+offset+2*8)),"m"(*(col+offset+6*8)),"m"(*(col+offset+0*8)),"m"(*_C4),"m"(*(col+offset+4*8)),"i"(COL_SHIFT) ++ :"$f0","$f2","$f4","$f6","$f8","$f10","$14","$f16","$20","$f22","$12","memory" ++ ); ++} ++ ++static const int32_t rounder0[] ATTR_ALIGN(8) = ++ rounder ((1 << (COL_SHIFT - 1)) - 0.5); ++static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); ++static const int32_t rounder1[] ATTR_ALIGN(8) = ++ rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ ++static const int32_t rounder7[] ATTR_ALIGN(8) = ++ rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ ++static const int32_t rounder2[] ATTR_ALIGN(8) = ++ rounder (0.60355339059); /* C2 * (C6+C2)/2 */ ++static const int32_t rounder6[] ATTR_ALIGN(8) = ++ rounder (-0.25); /* C2 * (C6-C2)/2 */ ++static const int32_t rounder3[] ATTR_ALIGN(8) = ++ rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ ++static const int32_t rounder5[] ATTR_ALIGN(8) = ++ rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ ++ ++ ++#undef COL_SHIFT ++#undef ROW_SHIFT ++ ++ ++#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ ++inline void idct (int16_t * const block) \ ++{ \ ++ static const int16_t table04[] ATTR_ALIGN(16) = \ ++ table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ ++ static const int16_t table17[] ATTR_ALIGN(16) = \ ++ table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ ++ static const int16_t table26[] ATTR_ALIGN(16) = \ ++ table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ ++ static const int16_t table35[] ATTR_ALIGN(16) = \ ++ table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ ++ \ ++ idct_row_head (block, 0*8, table04); \ ++ idct_row (table04, rounder0); \ ++ idct_row_mid (block, 0*8, 4*8, table04); \ ++ idct_row (table04, rounder4); \ ++ idct_row_mid (block, 4*8, 1*8, table17); \ ++ idct_row (table17, rounder1); \ ++ idct_row_mid (block, 1*8, 7*8, table17); \ ++ idct_row (table17, rounder7); \ ++ idct_row_mid (block, 7*8, 2*8, table26); \ ++ idct_row (table26, rounder2); \ ++ idct_row_mid (block, 2*8, 6*8, table26); \ ++ idct_row (table26, rounder6); \ ++ idct_row_mid (block, 6*8, 3*8, table35); \ ++ idct_row (table35, rounder3); \ ++ idct_row_mid (block, 3*8, 5*8, table35); \ ++ idct_row 
(table35, rounder5); \ ++ idct_row_tail (block, 5*8); \ ++ \ ++ idct_col (block, 0); \ ++ idct_col (block, 4); \ ++} ++ ++void ff_loongson2_idct(DCTELEM *block); ++ ++declare_idct (ff_loongson2_idct, loongson2_table, ++ loongson2_row_head, loongson2_row, loongson2_row_tail, loongson2_row_mid) +diff --git a/libavcodec/loongson2/idct_loongson2_xvid.c b/libavcodec/loongson2/idct_loongson2_xvid.c +new file mode 100644 +index 0000000..4a1ee1e +--- /dev/null ++++ b/libavcodec/loongson2/idct_loongson2_xvid.c +@@ -0,0 +1,301 @@ ++/* ++ * XVID MPEG-4 VIDEO CODEC ++ * ++ * Copyright(C) 2006-2010 comcat <jiankemeng@gmail.com> ++ * ++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com> ++ * ++ * Based on i386 ++ * ++ */ ++ ++ ++#include <inttypes.h> ++#include "../avcodec.h" ++ ++void ff_idct_xvid_loongson2(short *block); ++ ++//============================================================================= ++// Macros and other preprocessor constants ++//============================================================================= ++ ++#define BITS_INV_ACC 5 // 4 or 5 for IEEE ++#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 ++#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 ++#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) ++#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) ++#define RND_INV_CORR (RND_INV_COL - 1) ++ ++#define BITS_FRW_ACC 3 // 2 or 3 for accuracy ++#define SHIFT_FRW_COL BITS_FRW_ACC ++#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) ++#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) ++ ++ ++//----------------------------------------------------------------------------- ++// Various memory constants (trigonometric values or rounding values) ++//----------------------------------------------------------------------------- ++ ++static const int16_t tg_1_16[4*4] attribute_used __attribute__ ((aligned(8))) = { ++ 13036,13036,13036,13036, // tg * (2<<16) + 0.5 ++ 27146,27146,27146,27146, // tg * (2<<16) + 0.5 ++ -21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 ++ 23170,23170,23170,23170}; // cos * (2<<15) + 0.5 ++ ++static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) = { ++ 65536,65536, ++ 3597,3597, ++ 2260,2260, ++ 1203,1203, ++ 0,0, ++ 120,120, ++ 512,512, ++ 512,512}; ++ ++ ++// Table for rows 0,4 - constants are multiplied by cos_4_16 ++static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8))) = { ++ 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 ++ 21407,8867,8867,-21407, // w07 w05 w03 w01 ++ 16384,-16384,16384,16384, // w14 w12 w10 w08 ++ -8867,21407,-21407,-8867, // w15 w13 w11 w09 ++ 22725,12873,19266,-22725, // w22 w20 w18 w16 ++ 19266,4520,-4520,-12873, // w23 w21 w19 w17 ++ 12873,4520,4520,19266, // w30 w28 w26 w24 ++ -22725,19266,-12873,-22725, // w31 w29 w27 w25 ++// Table for rows 1,7 - constants are multiplied by cos_1_16 ++ 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 ++ 29692,12299,12299,-29692, // w07 w05 w03 w01 ++ 22725,-22725,22725,22725, // w14 w12 w10 w08 ++ -12299,29692,-29692,-12299, // w15 w13 w11 w09 ++ 31521,17855,26722,-31521, // w22 w20 w18 w16 ++ 26722,6270,-6270,-17855, // w23 w21 w19 w17 ++ 17855,6270,6270,26722, // w30 w28 w26 w24 ++ -31521,26722,-17855,-31521, // w31 w29 w27 w25 ++// Table for rows 2,6 - constants are multiplied by cos_2_16 ++ 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 ++ 27969,11585,11585,-27969, // w07 w05 w03 w01 ++ 21407,-21407,21407,21407, // w14 w12 w10 w08 ++ -11585,27969,-27969,-11585, // w15 w13 w11 w09 ++ 29692,16819,25172,-29692, // w22 w20 w18 w16 ++ 
25172,5906,-5906,-16819, // w23 w21 w19 w17 ++ 16819,5906,5906,25172, // w30 w28 w26 w24 ++ -29692,25172,-16819,-29692, // w31 w29 w27 w25 ++// Table for rows 3,5 - constants are multiplied by cos_3_16 ++ 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 ++ 25172,10426,10426,-25172, // w07 w05 w03 w01 ++ 19266,-19266,19266,19266, // w14 w12 w10 w08 ++ -10426,25172,-25172,-10426, // w15 w13 w11 w09 ++ 26722,15137,22654,-26722, // w22 w20 w18 w16 ++ 22654,5315,-5315,-15137, // w23 w21 w19 w17 ++ 15137,5315,5315,22654, // w30 w28 w26 w24 ++ -26722,22654,-15137,-26722, // w31 w29 w27 w25 ++}; ++ ++ ++// %3 for rows 0,4 - constants are multiplied by cos_4_16 ++static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8))) = { ++ 16384,21407,16384,8867, // movq-> w05 w04 w01 w00 ++ 16384,8867,-16384,-21407, // w07 w06 w03 w02 ++ 16384,-8867,16384,-21407, // w13 w12 w09 w08 ++ -16384,21407,16384,-8867, // w15 w14 w11 w10 ++ 22725,19266,19266,-4520, // w21 w20 w17 w16 ++ 12873,4520,-22725,-12873, // w23 w22 w19 w18 ++ 12873,-22725,4520,-12873, // w29 w28 w25 w24 ++ 4520,19266,19266,-22725, // w31 w30 w27 w26 ++// %3 for rows 1,7 - constants are multiplied by cos_1_16 ++ 22725,29692,22725,12299, // movq-> w05 w04 w01 w00 ++ 22725,12299,-22725,-29692, // w07 w06 w03 w02 ++ 22725,-12299,22725,-29692, // w13 w12 w09 w08 ++ -22725,29692,22725,-12299, // w15 w14 w11 w10 ++ 31521,26722,26722,-6270, // w21 w20 w17 w16 ++ 17855,6270,-31521,-17855, // w23 w22 w19 w18 ++ 17855,-31521,6270,-17855, // w29 w28 w25 w24 ++ 6270,26722,26722,-31521, // w31 w30 w27 w26 ++// %3 for rows 2,6 - constants are multiplied by cos_2_16 ++ 21407,27969,21407,11585, // movq-> w05 w04 w01 w00 ++ 21407,11585,-21407,-27969, // w07 w06 w03 w02 ++ 21407,-11585,21407,-27969, // w13 w12 w09 w08 ++ -21407,27969,21407,-11585, // w15 w14 w11 w10 ++ 29692,25172,25172,-5906, // w21 w20 w17 w16 ++ 16819,5906,-29692,-16819, // w23 w22 w19 w18 ++ 16819,-29692,5906,-16819, // w29 w28 w25 w24 ++ 5906,25172,25172,-29692, // w31 w30 w27 w26 ++// %3 for rows 3,5 - constants are multiplied by cos_3_16 ++ 19266,25172,19266,10426, // movq-> w05 w04 w01 w00 ++ 19266,10426,-19266,-25172, // w07 w06 w03 w02 ++ 19266,-10426,19266,-25172, // w13 w12 w09 w08 ++ -19266,25172,19266,-10426, // w15 w14 w11 w10 ++ 26722,22654,22654,-5315, // w21 w20 w17 w16 ++ 15137,5315,-26722,-15137, // w23 w22 w19 w18 ++ 15137,-26722,5315,-15137, // w29 w28 w25 w24 ++ 5315,22654,22654,-26722, // w31 w30 w27 w26 ++}; ++ ++ ++ ++#define DCT_8_INV_ROW_LOONGSON2(A1,A2,A3,A4)\ ++ "ldc1 $f0, " #A1 " \n\t"/* 0 ; x3 x2 x1 x0*/\ ++ "ldc1 $f2, 8+" #A1 " \n\t"/* 1 ; x7 x6 x5 x4*/\ ++ "mov.d $f4, $f0 \n\t"/* 2 ; x3 x2 x1 x0*/\ ++ "ldc1 $f6, " #A3 " \n\t"/* 3 ; w05 w04 w01 w00*/\ ++ "li $12, 0x88 \n\t"\ ++ "dmtc1 $12, $f16 \n\t"\ ++ "pshufh $f0, $f0, $f16 \n\t"/* x2 x0 x2 x0*/\ ++ "ldc1 $f8, 8+" #A3 " \n\t"/* 4 ; w07 w06 w03 w02*/\ ++ "mov.d $f10, $f2 \n\t"/* 5 ; x7 x6 x5 x4*/\ ++ "pmaddhw $f6, $f6, $f0 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ ++ "ldc1 $f12, 32+" #A3 " \n\t"/* 6 ; w21 w20 w17 w16*/\ ++ "pshufh $f2, $f2, $f16 \n\t"/* x6 x4 x6 x4*/\ ++ "pmaddhw $f8, $f8, $f2 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ ++ "li $12, 0xdd \n\t"\ ++ "dmtc1 $12, $f16 \n\t"\ ++ "ldc1 $f14, 40+" #A3 " \n\t"/* 7 ; w23 w22 w19 w18*/\ ++ "pshufh $f4, $f4, $f16 \n\t"/* x3 x1 x3 x1*/\ ++ "pmaddhw $f12, $f12, $f4 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ ++ "ldc1 $f18, " #A4 " \n\t" \ ++ "ldc1 $f20, 16+" #A3 " \n\t" \ ++ "ldc1 $f22, 24+" #A3 " \n\t" \ ++ "ldc1 $f24, 48+" #A3 " \n\t" \ ++ 
"ldc1 $f26, 56+" #A3 " \n\t" \ ++ "pshufh $f10, $f10, $f16 \n\t"/* x7 x5 x7 x5*/\ ++ "pmaddhw $f14, $f14, $f10 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ ++ "paddw $f6, $f6, $f18 \n\t"/* +%4*/\ ++ "pmaddhw $f0, $f0, $f20 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ ++ "paddw $f6, $f6, $f8 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ ++ "pmaddhw $f2, $f2, $f22 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ ++ "mov.d $f8, $f6 \n\t"/* 4 ; a1 a0*/\ ++ "li $12, 11 \n\t"\ ++ "dmtc1 $12, $f16 \n\t"\ ++ "pmaddhw $f4, $f4, $f24 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ ++ "paddw $f12, $f12, $f14 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ ++ "pmaddhw $f10, $f10, $f26 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ ++ "paddw $f6, $f6, $f12 \n\t"/* a1+b1 a0+b0*/\ ++ "paddw $f0, $f0, $f18 \n\t"/* +%4*/\ ++ "psraw $f6, $f6, $f16 \n\t"/* y1=a1+b1 y0=a0+b0*/\ ++ "paddw $f0, $f0, $f2 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ ++ "psubw $f8, $f8, $f12 \n\t"/* 6 ; a1-b1 a0-b0*/\ ++ "mov.d $f14, $f0 \n\t"/* 7 ; a3 a2*/\ ++ "paddw $f4, $f4, $f10 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ ++ "paddw $f0, $f0, $f4 \n\t"/* a3+b3 a2+b2*/\ ++ "psraw $f8, $f8, $f16 \n\t"/* y6=a1-b1 y7=a0-b0*/\ ++ "psubw $f14, $f14, $f4 \n\t"/* 2 ; a3-b3 a2-b2*/\ ++ "psraw $f0, $f0, $f16 \n\t"/* y3=a3+b3 y2=a2+b2*/\ ++ "psraw $f14, $f14, $f16 \n\t"/* y4=a3-b3 y5=a2-b2*/\ ++ "li $12, 0xb1 \n\t"\ ++ "dmtc1 $12, $f20 \n\t"\ ++ "packsswh $f6, $f6, $f0 \n\t"/* 0 ; y3 y2 y1 y0*/\ ++ "packsswh $f14, $f14, $f8 \n\t"/* 4 ; y6 y7 y4 y5*/\ ++ "sdc1 $f6, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ ++ "pshufh $f14, $f14, $f20 \n\t"/* y7 y6 y5 y4*/\ ++ "sdc1 $f14, 8 +" #A2 " \n\t"/* 7 ; save y7 y6 y5 y4*/\ ++ ++ ++#define DCT_8_INV_COL(A1,A2)\ ++ "ldc1 $f0, 2*8(%3) \n\t"/* */\ ++ "ldc1 $f6, 16*3+" #A1 " \n\t"/* x3 */\ ++ "mov.d $f2, $f0 \n\t"/* tg_3_16*/\ ++ "ldc1 $f10, 16*5+" #A1 " \n\t"/* x5 */\ ++ "pmulhh $f0, $f0, $f6 \n\t"/* x3*(tg_3_16-1)*/\ ++ "ldc1 $f8, (%3) \n\t"\ ++ "pmulhh $f2, $f2, $f10 \n\t"/* x5*(tg_3_16-1)*/\ ++ "ldc1 $f14, 16*7+" #A1 " \n\t"/* x7 */\ ++ "mov.d $f4, $f8 \n\t"/* tg_1_16*/\ ++ "ldc1 $f12, 16*1+" #A1 " \n\t"/* x1 */\ ++ "pmulhh $f8, $f8, $f14 \n\t"/* x7*tg_1_16*/\ ++ "paddsh $f0, $f0, $f6 \n\t"/* x3*tg_3_16*/\ ++ "pmulhh $f4, $f4, $f12 \n\t"/* x1*tg_1_16*/\ ++ "paddsh $f2, $f2, $f6 \n\t"/* x3+x5*(tg_3_16-1)*/\ ++ "psubsh $f0, $f0, $f10 \n\t"/* x3*tg_3_16-x5 = tm35*/\ ++ "ldc1 $f6, 3*8(%3) \n\t"\ ++ "paddsh $f2, $f2, $f10 \n\t"/* x3+x5*tg_3_16 = tp35*/\ ++ "paddsh $f8, $f8, $f12 \n\t"/* x1+tg_1_16*x7 = tp17*/\ ++ "psubsh $f4, $f4, $f14 \n\t"/* x1*tg_1_16-x7 = tm17*/\ ++ "mov.d $f10, $f8 \n\t"/* tp17*/\ ++ "mov.d $f12, $f4 \n\t"/* tm17*/\ ++ "paddsh $f10, $f10, $f2 \n\t"/* tp17+tp35 = b0*/\ ++ "psubsh $f12, $f12, $f0 \n\t"/* tm17-tm35 = b3*/\ ++ "psubsh $f8, $f8, $f2 \n\t"/* tp17-tp35 = t1*/\ ++ "paddsh $f4, $f4, $f0 \n\t"/* tm17+tm35 = t2*/\ ++ "ldc1 $f14, 1*8(%3) \n\t"\ ++ "mov.d $f2, $f8 \n\t"/* t1*/\ ++ "sdc1 $f10, 3*16+" #A2 " \n\t"/* save b0*/\ ++ "paddsh $f2, $f2, $f4 \n\t"/* t1+t2*/\ ++ "sdc1 $f12, 5*16+" #A2 " \n\t"/* save b3*/\ ++ "psubsh $f8, $f8, $f4 \n\t"/* t1-t2*/\ ++ "ldc1 $f10, 2*16+" #A1 " \n\t"\ ++ "mov.d $f0, $f14 \n\t"/* tg_2_16*/\ ++ "ldc1 $f12, 6*16+" #A1 " \n\t"\ ++ "pmulhh $f0, $f0, $f10 \n\t"/* x2*tg_2_16*/\ ++ "pmulhh $f14, $f14, $f12 \n\t"/* x6*tg_2_16*/\ ++ "pmulhh $f2, $f2, $f6 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ ++ "ldc1 $f4, 0*16+" #A1 " \n\t"\ ++ "pmulhh $f8, $f8, $f6 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ ++ "psubsh $f0, $f0, $f12 \n\t"/* t2*tg_2_16-x6 = tm26*/\ ++ "mov.d $f6, $f4 \n\t"/* x0*/\ ++ "ldc1 $f12, 4*16+" #A1 " \n\t"\ ++ 
"paddsh $f14, $f14, $f10 \n\t"/* x2+x6*tg_2_16 = tp26*/\ ++ "paddsh $f4, $f4, $f12 \n\t"/* x0+x4 = tp04*/\ ++ "psubsh $f6, $f6, $f12 \n\t"/* x0-x4 = tm04*/\ ++ "mov.d $f10, $f4 \n\t"/* tp04*/\ ++ "mov.d $f12, $f6 \n\t"/* tm04*/\ ++ "psubsh $f4, $f4, $f14 \n\t"/* tp04-tp26 = a3*/\ ++ "paddsh $f6, $f6, $f0 \n\t"/* tm04+tm26 = a1*/\ ++ "paddsh $f2, $f2, $f2 \n\t"/* b1*/\ ++ "paddsh $f8, $f8, $f8 \n\t"/* b2*/\ ++ "paddsh $f10, $f10, $f14 \n\t"/* tp04+tp26 = a0*/\ ++ "psubsh $f12, $f12, $f0 \n\t"/* tm04-tm26 = a2*/\ ++ "li $12, 6 \n\t"\ ++ "dmtc1 $12, $f18 \n\t"\ ++ "mov.d $f14, $f6 \n\t"/* a1*/\ ++ "mov.d $f0, $f12 \n\t"/* a2*/\ ++ "paddsh $f6, $f6, $f2 \n\t"/* a1+b1*/\ ++ "paddsh $f12, $f12, $f8 \n\t"/* a2+b2*/\ ++ "psrah $f6, $f6, $f18 \n\t"/* dst1*/\ ++ "psubsh $f14, $f14, $f2 \n\t"/* a1-b1*/\ ++ "psrah $f12, $f12, $f18 \n\t"/* dst2*/\ ++ "psubsh $f0, $f0, $f8 \n\t"/* a2-b2*/\ ++ "ldc1 $f2, 3*16+" #A2 " \n\t"/* load b0*/\ ++ "psrah $f14, $f14, $f18 \n\t"/* dst6*/\ ++ "mov.d $f8, $f10 \n\t"/* a0*/\ ++ "psrah $f0, $f0, $f18 \n\t"/* dst5*/\ ++ "sdc1 $f6, 1*16+" #A2 " \n\t"\ ++ "paddsh $f10, $f10, $f2 \n\t"/* a0+b0*/\ ++ "sdc1 $f12, 2*16+" #A2 " \n\t"\ ++ "psubsh $f8, $f8, $f2 \n\t"/* a0-b0*/\ ++ "ldc1 $f6, 5*16+" #A2 " \n\t"/* load b3*/\ ++ "psrah $f10, $f10, $f18 \n\t"/* dst0*/\ ++ "mov.d $f12, $f4 \n\t"/* a3*/\ ++ "psrah $f8, $f8, $f18 \n\t"/* dst7*/\ ++ "sdc1 $f0, 5*16+" #A2 " \n\t"\ ++ "paddsh $f4, $f4, $f6 \n\t"/* a3+b3*/\ ++ "sdc1 $f14, 6*16+" #A2 " \n\t"\ ++ "psubsh $f12, $f12, $f6 \n\t"/* a3-b3*/\ ++ "sdc1 $f10, 0*16+" #A2 " \n\t"\ ++ "psrah $f4, $f4, $f18 \n\t"/* dst3*/\ ++ "sdc1 $f8, 7*16+" #A2 " \n\t"\ ++ "psrah $f12, $f12, $f18 \n\t"/* dst4*/\ ++ "sdc1 $f4, 3*16+" #A2 " \n\t"\ ++ "sdc1 $f12, 4*16+" #A2 " \n\t" ++ ++ ++ ++void ff_idct_xvid_loongson2(short *block){ ++ __asm__ volatile( ++ //# Process each row ++ DCT_8_INV_ROW_LOONGSON2(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) ++ DCT_8_INV_ROW_LOONGSON2(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) ++ DCT_8_INV_ROW_LOONGSON2(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) ++ DCT_8_INV_ROW_LOONGSON2(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) ++ DCT_8_INV_ROW_LOONGSON2(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) ++ DCT_8_INV_ROW_LOONGSON2(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) ++ DCT_8_INV_ROW_LOONGSON2(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) ++ DCT_8_INV_ROW_LOONGSON2(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) ++ ++ //# Process the columns (4 at a time) ++ DCT_8_INV_COL(0(%0), 0(%0)) ++ DCT_8_INV_COL(8(%0), 8(%0)) ++ : ++ : "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16) ++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f18","$f16","$20","$22","$24","$26"); ++} ++ +diff --git a/libavcodec/loongson2/motion_est_loongson2.c b/libavcodec/loongson2/motion_est_loongson2.c +new file mode 100644 +index 0000000..bb67290 +--- /dev/null ++++ b/libavcodec/loongson2/motion_est_loongson2.c +@@ -0,0 +1,365 @@ ++/* ++ * Loongson2E MMI optimized motion estimation ++ * Copyright (c) 2007 comcat <jiankemeng@gmail.com>. 
++ * ++ * based on Michael Niedermayer <michaelni@gmx.at> ++ * ++ */ ++ ++#include "dsputil_loongson2.h" ++#include "../avcodec.h" ++ ++static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ ++ 0x0000000000000000ULL, ++ 0x0001000100010001ULL, ++ 0x0002000200020002ULL, ++}; ++ ++static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; ++ ++static inline void sad8_1_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h) ++{ ++ long len= -(stride*h); ++ __asm__ volatile( ++ ++// ".set mips3 \n\t" ++ ".align 4 \n\t" ++ ++ "move $8, %0 \n\t" ++ "move $21, %1 \n\t" ++ "move $22, %2 \n\t" ++ "move $23, %3 \n\t" ++ ++ "1: \n\t" ++ ++ "add $9, $8, $21 \n\t" ++ "add $10, $8, $22 \n\t" ++ ++ "uld $11, ($9) \n\t" ++ "dmtc1 $11, $f0 \n\t" ++ ++ "uld $12, ($9) \n\t" ++ "dmtc1 $12, $f4 \n\t" ++ ++ "pasubub $f10, $f0, $f4 \n\t" ++ "biadd $f0, $f10 \n\t" ++ ++ "add $8, $8, $23 \n\t" ++ ++ "add $9, $8, $21 \n\t" ++ "add $10, $8, $22 \n\t" ++ ++ "uld $11, ($9) \n\t" ++ "dmtc1 $11, $f2 \n\t" ++ ++ "uld $12, ($10) \n\t" ++ "dmtc1 $12, $f6 \n\t" ++ ++ "pasubub $f16, $f2, $f6 \n\t" ++ "biadd $f6, $f16 \n\t" ++ ++ "paddh $f0, $f0, $f6 \n\t" ++ ++ "paddh $f12, $f12, $f0 \n\t" ++ ++ "bltz $8, 1b \n\t" ++ "add $8, $8, $23 \n\t" ++ ++ : "+r" (len) ++ : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride) ++ : "$8", "$9", "$10", "$21", "$22", "$23", "$f0", "$f2", "$f4", "$f6", "$f10", "$f16" ++ ); ++} ++ ++static inline void sad8_2_loongson2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) ++{ ++ long len= -(stride*h); ++ __asm__ volatile( ++ ++// ".set mips3 \n\t" ++ ".align 4 \n\t" ++ ++ "move $8, %0 \n\t" ++ ++ "1: \n\t" ++ "add $9, $8, %1 \n\t" ++ "add $10, $8, %2 \n\t" ++ "add $11, $8, %3 \n\t" ++ ++ "uld $12, ($9) \n\t" ++ "dmtc1 $12, $f0 \n\t" ++ "uld $13, ($10) \n\t" ++ "dmtc1 $13, $f4 \n\t" ++ ++ "pavgb $f0, $f0, $f4 \n\t" ++ ++ "uld $12, ($11) \n\t" ++ "dmtc1 $12, $f4 \n\t" ++ ++ "pasubub $f10, $f0, $f4 \n\t" ++ "biadd $f0, $f10 \n\t" ++ ++ "add $8, $8, %4 \n\t" ++ ++ "add $9, $8, %1 \n\t" ++ "add $10, $8, %2 \n\t" ++ "add $11, $8, %3 \n\t" ++ ++ "uld $12, ($9) \n\t" ++ "dmtc1 $12, $f2 \n\t" ++ "uld $13, ($10) \n\t" ++ "dmtc1 $13, $f6 \n\t" ++ ++ "pavgb $f6, $f6, $f2 \n\t" ++ ++ "uld $12, ($11) \n\t" ++ "dmtc1 $12, $f2 \n\t" ++ ++ "pasubub $f16, $f6, $f2 \n\t" ++ "biadd $f6, $f16 \n\t" ++ ++ "paddh $f0, $f0, $f6 \n\t" ++ "paddh $f12, $f12, $f0 \n\t" ++ ++ "bltz $8, 1b \n\t" ++ "add $8, $8, %4 \n\t" ++ : "+r" (len) ++ : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride) ++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f10", "$f16" ++ ); ++} ++ ++static inline void sad8_4_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h) ++{ ++ long len= -(stride*h); ++ __asm__ volatile( ++ ++ ++// ".set mips3 \n\t" ++ ".align 4 \n\t" ++ ++ "ldc1 $f10, "MANGLE(bone)" \n\t" ++ ++ "move $8, %0 \n\t" ++ ++ "1: \n\t" ++ "add $9, $8, %1 \n\t" ++ "add $10, $8, %2 \n\t" ++ "add $11, $8, %3 \n\t" ++ ++ "uld $12, ($9) \n\t" ++ "dmtc1 $12, $f0 \n\t" ++ ++ "uld $13, ($10) \n\t" ++ "dmtc1 $13, $f4 \n\t" ++ ++ "uld $12, 1($9) \n\t" ++ "dmtc1 $12, $f2 \n\t" ++ ++ "uld $13, 1($10) \n\t" ++ "dmtc1 $13, $f6 \n\t" ++ ++ "pavgb $f0, $f0, $f4 \n\t" ++ "pavgb $f6, $f6, $f2 \n\t" ++ ++ "psubusb $f6, $f6, $f10 \n\t" ++ "pavgb $f0, $f0, $f6 \n\t" ++ ++ "uld $13, 1($11) \n\t" ++ "dmtc1 $13, $f4 \n\t" ++ ++ "pasubub $f16, $f0, $f4 \n\t" ++ "biadd $f0, $f16 \n\t" ++ ++ "add $8, $8, %4 \n\t" ++ ++ "add $9, $8, %1 \n\t" ++ "add $10, 
$8, %2 \n\t" ++ "add $11, $8, %3 \n\t" ++ ++ "uld $12, ($9) \n\t" ++ "dmtc1 $12, $f2 \n\t" ++ "uld $13, ($10) \n\t" ++ "dmtc1 $12, $f6 \n\t" ++ "uld $12, 1($9) \n\t" ++ "dmtc1 $12, $f4 \n\t" ++ "uld $13, 1($10) \n\t" ++ "dmtc1 $12, $f8 \n\t" ++ ++ "pavgb $f2, $f2, $f6 \n\t" ++ "pavgb $f4, $f4, $f8 \n\t" ++ ++ "psubusb $f4, $f4, $f10 \n\t" ++ "pavgb $f4, $f4, $f2 \n\t" ++ ++ "uld $13, ($11) \n\t" ++ "dmtc1 $13, $f2 \n\t" ++ ++ "pasubub $f18, $f4, $f2 \n\t" ++ "biadd $f4, $f18 \n\t" ++ ++ "paddh $f0, $f0, $f4 \n\t" ++ "paddh $f12, $f12, $f0 \n\t" ++ ++ "bltz $8, 1b \n\t" ++ "add $8, $8, %4 \n\t" ++ : "+r" (len) ++ : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride) ++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f16", "$f18" ++ ); ++} ++ ++static inline int sum_loongson2(void) ++{ ++ int ret; ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "dmfc1 %0, $f12 \n\t" ++ : "=r" (ret) ++ ); ++ return ret; ++} ++ ++ ++static int sad8_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ assert(h==8); ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ : ++ ); ++ ++ sad8_1_loongson2(blk1, blk2, stride, 8); ++ ++ return sum_loongson2(); ++} ++ ++static int sad8_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ assert(h==8); ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[1]) ++ ); ++ ++ sad8_2_loongson2(blk1, blk1+1, blk2, stride, 8); ++ ++ return sum_loongson2(); ++} ++ ++static int sad8_y2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ assert(h==8); ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[1]) ++ ); ++ ++ sad8_2_loongson2(blk1, blk1+stride, blk2, stride, 8); ++ ++ return sum_loongson2(); ++} ++ ++static int sad8_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ assert(h==8); ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[2]) ++ ); ++ ++ sad8_4_loongson2(blk1, blk2, stride, 8); ++ ++ return sum_loongson2(); ++} ++ ++static int sad16_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t":); ++ ++ sad8_1_loongson2(blk1 , blk2 , stride, h); ++ sad8_1_loongson2(blk1+8, blk2+8, stride, h); ++ ++ return sum_loongson2(); ++} ++ ++static int sad16_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[1]) ++ ); ++ ++ sad8_2_loongson2(blk1 , blk1+1, blk2 , stride, h); ++ sad8_2_loongson2(blk1+8, blk1+9, blk2+8, stride, h); ++ ++ return sum_loongson2(); ++} ++ ++static int sad16_y2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[1]) ++ ); ++ ++ sad8_2_loongson2(blk1 , blk1+stride, blk2 , stride, h); ++ sad8_2_loongson2(blk1+8, blk1+stride+8,blk2+8, stride, h); ++ ++ return sum_loongson2(); ++} ++ ++static 
int sad16_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) ++{ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "xor $f14, $f14, $f14 \n\t" ++ "xor $f12, $f12, $f12 \n\t" ++ "ldc1 $f10, %0 \n\t" ++ :: "m"(round_tab[2]) ++ ); ++ ++ sad8_4_loongson2(blk1 , blk2 , stride, h); ++ sad8_4_loongson2(blk1+8, blk2+8, stride, h); ++ ++ return sum_loongson2(); ++} ++ ++ ++void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx) ++{ ++ c->pix_abs[0][0] = sad16_loongson2; ++ c->pix_abs[0][1] = sad16_x2_loongson2; ++ c->pix_abs[0][2] = sad16_y2_loongson2; ++ c->pix_abs[0][3] = sad16_xy2_loongson2; ++ c->pix_abs[1][0] = sad8_loongson2; ++ c->pix_abs[1][1] = sad8_x2_loongson2; ++ c->pix_abs[1][2] = sad8_y2_loongson2; ++ c->pix_abs[1][3] = sad8_xy2_loongson2; ++ ++ c->sad[0]= sad16_loongson2; ++ c->sad[1]= sad8_loongson2; ++} +diff --git a/libavcodec/loongson2/mpegvideo_loongson2.c b/libavcodec/loongson2/mpegvideo_loongson2.c +new file mode 100644 +index 0000000..18d070a +--- /dev/null ++++ b/libavcodec/loongson2/mpegvideo_loongson2.c +@@ -0,0 +1,385 @@ ++/* ++ * The simplest mpeg encoder (well, it was the simplest!) ++ * Copyright (c) 2007-2010 comcat <jiankemeng@gmail.com>. ++ * ++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com> ++ * ++ * Based on i386 ++ */ ++ ++#include "dsputil_loongson2.h" ++#include "../mpegvideo.h" ++#include "../avcodec.h" ++ ++extern uint8_t zigzag_direct_noperm[64]; ++extern uint16_t inv_zigzag_direct16[64]; ++ ++static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; ++static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; ++ ++ ++static void dct_unquantize_h263_intra_loongson2(MpegEncContext *s, ++ DCTELEM *block, int n, int qscale) ++{ ++ long level, qmul, qadd, nCoeffs; ++ ++ qmul = qscale << 1; ++ ++ assert(s->block_last_index[n]>=0 || s->h263_aic); ++ if (!s->h263_aic) { ++ if (n < 4) ++ level = block[0] * s->y_dc_scale; ++ else ++ level = block[0] * s->c_dc_scale; ++ qadd = (qscale - 1) | 1; ++ }else{ ++ qadd = 0; ++ level= block[0]; ++ } ++ if(s->ac_pred) ++ nCoeffs=63; ++ else ++ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; ++ ++ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "xor $f12, $f12, $f12 \n\t" ++ "lwc1 $f12, %1 \n\t" ++ ++ "xor $f10, $f10, $f10 \n\t" ++ ++ "packsswh $f12, $f12, $f12 \n\t" ++ ++ "lwc1 $f10, %2 \n\t" ++ ++ "packsswh $f10, $f10, $f10 \n\t" ++ ++ "packsswh $f12, $f12, $f12 \n\t" ++ ++ "xor $f14, $f14, $f14 \n\t" ++ ++ "packsswh $f10, $f10, $f10 \n\t" ++ ++ "xor $f8, $f8, $f8 \n\t" ++ ++ "psubh $f14, $f14, $f10 \n\t" ++ ++ ++ "1: \n\t" ++ "add $12, %0, %3 \n\t" ++ ++ "ldc1 $f0, ($12) \n\t" ++ ++ "ldc1 $f2, 8($12) \n\t" ++ ++ "mov.d $f4, $f0 \n\t" ++ "mov.d $f6, $f2 \n\t" ++ ++ "pmullh $f0, $f0, $f12 \n\t" ++ "pmullh $f2, $f2, $f12 \n\t" ++ ++ "pcmpgth $f4, $f4, $f8 \n\t" ++ "pcmpgth $f6, $f6, $f8 \n\t" ++ ++ "xor $f0, $f0, $f4 \n\t" ++ "xor $f2, $f2, $f6 \n\t" ++ ++ ++ "paddh $f0, $f0, $f14 \n\t" ++ ++ "paddh $f2, $f2, $f14 \n\t" ++ ++ ++ "xor $f4, $f4, $f0 \n\t" ++ ++ "xor $f6, $f6, $f2 \n\t" ++ ++ ++ "pcmpeqh $f0, $f0, $f14 \n\t" ++ ++ "pcmpeqh $f2, $f2, $f14 \n\t" ++ ++ ++ "pandn $f0, $f0, $f4 \n\t" ++ ++ "pandn $f2, $f2, $f6 \n\t" ++ ++ ++ "sdc1 $f0, ($12) \n\t" ++ ++ "sdc1 $f2, 8($12) \n\t" ++ ++ ++ "addiu %3, %3, 16 \n\t" ++ ++ "blez %3, 1b \n\t" ++ "nop \n\t" ++ ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs)) ++ : "memory" ++ ); ++ block[0]= level; ++} ++ ++ ++static 
void dct_unquantize_h263_inter_loongson2(MpegEncContext *s, ++ DCTELEM *block, int n, int qscale) ++{ ++ long qmul, qadd, nCoeffs; ++ ++ qmul = qscale << 1; ++ qadd = (qscale - 1) | 1; ++ ++ assert(s->block_last_index[n]>=0 || s->h263_aic); ++ ++ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; ++ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "xor $f12, $f12, $f12 \n\t" ++ "lwc1 $f12, %1 \n\t" ++ ++ "xor $f10, $f10, $f10 \n\t" ++ ++ "packsswh $f12, $f12, $f12 \n\t" ++ ++ "lwc1 $f10, %2 \n\t" ++ ++ "packsswh $f10, $f10, $f10 \n\t" ++ ++ "xor $f14, $f14, $f14 \n\t" ++ ++ "packsswh $f12, $f12, $f12 \n\t" ++ ++ "packsswh $f10, $f10, $f10 \n\t" ++ ++ "xor $f8, $f8, $f8 \n\t" ++ ++ "psubh $f14, $f14, $f10 \n\t" ++ ++ ++ "1: \n\t" ++ "add $12, %0, %3 \n\t" ++ ++ "ldc1 $f0, ($12) \n\t" ++ ++ "ldc1 $f2, 8($12) \n\t" ++ ++ "mov.d $f4, $f0 \n\t" ++ "mov.d $f6, $f2 \n\t" ++ ++ "pmullh $f0, $f0, $f12 \n\t" ++ ++ "pmullh $f2, $f2, $f12 \n\t" ++ ++ "pcmpgth $f4, $f4, $f8 \n\t" ++ ++ "pcmpgth $f6, $f6, $f8 \n\t" ++ ++ "xor $f0, $f0, $f4 \n\t" ++ ++ "xor $f2, $f2, $f6 \n\t" ++ ++ "paddh $f0, $f0, $f14 \n\t" ++ ++ "paddh $f2, $f2, $f14 \n\t" ++ ++ "xor $f4, $f4, $f0 \n\t" ++ ++ "xor $f6, $f6, $f2 \n\t" ++ ++ "pcmpeqh $f0, $f0, $f14 \n\t" ++ ++ "pcmpeqh $f2, $f2, $f14 \n\t" ++ ++ "pandn $f0, $f0, $f4 \n\t" ++ ++ "pandn $f2, $f2, $f6 \n\t" ++ ++ "sdc1 $f0, ($12) \n\t" ++ ++ "sdc1 $f2, 8($12) \n\t" ++ ++ ++ "addiu %3, %3, 16 \n\t" ++ ++ "blez %3, 1b \n\t" ++ "nop \n\t" ++ ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs)) ++ : "memory" ++ ); ++} ++ ++ ++/* draw the edges of width 'w' of an image of size width, height ++ this mmx version can only handle w==8 || w==16 */ ++ ++static void draw_edges_loongson2(uint8_t *buf, int wrap, int width, int height, int w) ++{ ++ uint8_t *ptr, *last_line; ++ int i; ++ ++ last_line = buf + (height - 1) * wrap; ++ ++ ptr = buf; ++ if(w==8) ++ { ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "move $9, %0 \n\t" ++ ++ "1: \n\t" ++ ++ "xor $f0, $f0, $f0 \n\t" ++ "lwc1 $f0, ($9) \n\t" ++ ++ "punpcklbh $f0, $f0, $f0 \n\t" ++ ++ "add $12, $9, %2 \n\t" ++ ++ "punpcklhw $f0, $f0, $f0 \n\t" ++ ++ "punpcklwd $f0, $f0, $f0 \n\t" ++ ++ "ldc1 $f2, -8($12) \n\t" ++ ++ "sdc1 $f0, -8($9) \n\t" ++ ++ "punpckhbh $f2, $f2, $f2 \n\t" ++ ++ "add $9, $9, %1 \n\t" ++ ++ "punpckhhw $f2, $f2, $f2 \n\t" ++ ++ "sub $13, $9, %3 \n\t" ++ ++ "punpckhwd $f2, $f2, $f2 \n\t" ++ ++ "bltz $13, 1b \n\t" ++ ++ "sdc1 $f2, ($12) \n\t" ++ ++ : "+r" (ptr) ++ : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ++ : "$9", "$13", "$12", "$f2", "$f0" ++ ); ++ } ++ else ++ { ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "move $8, %0 \n\t" ++ ++ "1: \n\t" ++ ++ "xor $f0, $f0, $f0 \n\t" ++ "lwc1 $f0, ($8) \n\t" ++ ++ "punpcklbh $f0, $f0, $f0 \n\t" ++ "punpcklhw $f0, $f0, $f0 \n\t" ++ "punpcklwd $f0, $f0, $f0 \n\t" ++ ++ "sdc1 $f0, -8($8) \n\t" ++ "sdc1 $f0, -16($8) \n\t" ++ ++ "add $15, $8, %2 \n\t" ++ "ldc1 $f2, -8($15) \n\t" ++ ++ "punpckhbh $f2, $f2, $f2 \n\t" ++ "punpckhhw $f2, $f2, $f2 \n\t" ++ "punpckhwd $f2, $f2, $f2 \n\t" ++ ++ "sdc1 $f2, ($15) \n\t" ++ "sdc1 $f2, 8($15) \n\t" ++ ++ "add $8, $8, %1 \n\t" ++ ++ "sub $16, $8, %3 \n\t" ++ "bltz $16, 1b \n\t" ++ "nop \n\t" ++ : "+r" (ptr) ++ : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height) ++ : "$8", "$15", "$16", "$f0", "$f2" ++ ); ++ } ++ ++ for(i=0;i<w;i+=4) { ++ ++ ptr= buf - (i + 1) * wrap - w; ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ "move $8, %0 \n\t" ++ ++ "1: \n\t" ++ ++ "add $9, $8, %1 
\n\t" ++ "ldc1 $f0, ($9) \n\t" ++ ++ "add $10, $8, %2 \n\t" ++ "add $11, $10, %2 \n\t" ++ "add $12, $8, %3 \n\t" ++ ++ "sdc1 $f0, ($8) \n\t" ++ "sdc1 $f0, ($10) \n\t" ++ "sdc1 $f0, ($11) \n\t" ++ "sdc1 $f0, ($12) \n\t" ++ ++ "addiu $8, $8, 8 \n\t" ++ ++ "sub $13, $8, %4 \n\t" ++ ++ "bltz $13, 1b \n\t" ++ "nop \n\t" ++ ++ : "+r" (ptr) ++ : "r" (((long)buf - (long)ptr - w)), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (*(ptr+width+2*w)) ++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0" ++ ); ++ ++ ptr= last_line + (i + 1) * wrap - w; ++ ++ __asm__ volatile( ++// ".set mips3 \n\t" ++ ++ "move $9, %0 \n\t" ++ ++ "1: \n\t" ++ ++ "add $10, $9, %1 \n\t" ++ "ldc1 $f0, ($10) \n\t" ++ ++ "add $11, $9, %2 \n\t" ++ "add $12, $11, %2 \n\t" ++ "add $13, $9, %3 \n\t" ++ ++ "sdc1 $f0, ($9) \n\t" ++ "sdc1 $f0, ($11) \n\t" ++ "sdc1 $f0, ($12) \n\t" ++ "sdc1 $f0, ($13) \n\t" ++ ++ "addiu $9, $9, 8 \n\t" ++ ++ "sub $14, $9, %4 \n\t" ++ ++ "bltz $14, 1b \n\t" ++ "nop \n\t" ++ : "+r" (ptr) ++ : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w) ++ : "$9", "$10", "$11", "$12", "$13", "$14", "$f0" ++ ++ ); ++ } ++} ++ ++void MPV_common_init_loongson2(MpegEncContext *s) ++{ ++ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_loongson2; ++ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_loongson2; ++ ++// draw_edges = draw_edges_loongson2; ++ ++} +diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile +index 3f4da68..73e4d56 100644 +--- a/libavcodec/mips/Makefile ++++ b/libavcodec/mips/Makefile +@@ -1,3 +1,9 @@ + OBJS-$(HAVE_MMI) += ps2/dsputil_mmi.o \ + ps2/idct_mmi.o \ + ps2/mpegvideo_mmi.o \ ++ ++OBJS-$(HAVE_LOONGSON2MMI) += loongson2/idct_loongson2.o \ ++ loongson2/dsputil_loongson2.o \ ++ loongson2/idct_loongson2_xvid.o \ ++ loongson2/mpegvideo_loongson2.o \ ++ loongson2/motion_est_loongson2.o +diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c +index b47ff9a..af92552 100644 +--- a/libavcodec/mpegvideo.c ++++ b/libavcodec/mpegvideo.c +@@ -176,6 +176,9 @@ av_cold int ff_dct_common_init(MpegEncContext *s) + #elif ARCH_BFIN + MPV_common_init_bfin(s); + #endif ++#ifdef HAVE_LOONGSON2MMI ++ MPV_common_init_loongson2(s); ++#endif + + /* load & permutate scantables + note: only wmv uses different ones +diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h +index 5302be9..8d09906 100644 +--- a/libavcodec/mpegvideo.h ++++ b/libavcodec/mpegvideo.h +@@ -689,6 +689,7 @@ int MPV_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size, + void MPV_common_init_mmx(MpegEncContext *s); + void MPV_common_init_axp(MpegEncContext *s); + void MPV_common_init_mlib(MpegEncContext *s); ++void MPV_common_init_loongson2(MpegEncContext *s); + void MPV_common_init_mmi(MpegEncContext *s); + void MPV_common_init_arm(MpegEncContext *s); + void MPV_common_init_altivec(MpegEncContext *s); +diff --git a/libavcodec/options.c b/libavcodec/options.c +index 7ca1062..c05b3f4 100644 +--- a/libavcodec/options.c.orig 2011-06-22 12:52:11.584428161 -0300 ++++ b/libavcodec/options.c 2011-06-22 12:52:25.003143367 -0300 +@@ -219,6 +219,8 @@ static const AVOption options[]={ + {"simple", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"simplemmx", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, + {"libmpeg2mmx", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_LIBMPEG2MMX }, INT_MIN, INT_MAX, V|E|D, "idct"}, ++{"libmpeg2loongson2", NULL, 0, FF_OPT_TYPE_CONST, 
FF_IDCT_LIBMPEG2LOONGSON2, INT_MIN, INT_MAX, V|E|D, "idct"},
++{"xvidloongson2", NULL, 0, FF_OPT_TYPE_CONST, FF_IDCT_XVIDLOONGSON2, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"ps2", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_PS2 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"mlib", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_MLIB }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"arm", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},
diff --git a/extra/koffice/PKGBUILD b/extra/koffice/PKGBUILD
index 1884dd92c..c13780881 100644
--- a/extra/koffice/PKGBUILD
+++ b/extra/koffice/PKGBUILD
@@ -38,18 +38,23 @@ pkgname=(
 )
 pkgver=2.3.3
 pkgrel=4
-arch=('i686' 'x86_64')
+arch=('i686' 'x86_64' 'mips64el')
 url='http://koffice.kde.org'
 license=('GPL' 'LGPL' 'FDL')
 makedepends=('pkg-config' 'cmake' 'automoc4' 'boost' 'eigen' 'gsl' 'lcms'
   'glew' 'qimageblitz' 'kdepimlibs' 'pstoedit' 'poppler-qt' 'libwpd' 'libwpg'
   'opengtl' 'kdegraphics-libs')
+[ "$CARCH" = "mips64el" ] && \
+makedepends=('pkg-config' 'cmake' 'automoc4' 'boost' 'eigen' 'gsl' 'lcms'
+  'glew' 'qimageblitz' 'kdepimlibs' 'pstoedit' 'poppler-qt' 'libwpd'
+  'libwpg' 'kdegraphics-libs' 'libgsf')
 groups=('koffice')
 source=("http://download.kde.org/stable/${pkgbase}-${pkgver}/${pkgbase}-${pkgver}.tar.bz2"
   'kde4-koffice-libwpg02.patch' 'gcc46.patch')
 sha256sums=('31ba0d98c0d29c7b8ab97efdeb6c618b82177b2b0ec85da088178254da43c099'
   '69106deb4081d71b5bd8f2e4f5af67ca689e4ce9f2bb49c11dbce5fb3409d612'
   'e095c0b2bbedf41da6535a68b2275464dafd3f194566028d0135322f596e4739')
+options=(!distcc)
 
 build() {
   cd "${srcdir}/${pkgbase}-${pkgver}"
@@ -60,6 +65,7 @@ build() {
   cd "${srcdir}"
   mkdir build
   cd build
+  CXX="g++" \
   cmake ../${pkgbase}-${pkgver} \
     -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_SKIP_RPATH=ON \
@@ -216,6 +222,10 @@ package_koffice-krita(){
   depends=('hicolor-icon-theme' 'glew' 'qimageblitz' 'koffice-libs' \
     'koffice-templates' 'koffice-plugins' 'poppler-qt' 'shared-mime-info' \
     'openexr' 'opengtl')
+[ "$CARCH" = "mips64el" ] && \
+  depends=('hicolor-icon-theme' 'glew' 'qimageblitz' 'koffice-libs' \
+    'koffice-templates' 'koffice-plugins' 'poppler-qt' 'shared-mime-info' \
+    'openexr')
 optdepends=('koffice-filters: import/export filters')
 install=krita.install
 cd "${srcdir}/build/krita"
diff --git a/extra/pixman/PKGBUILD b/extra/pixman/PKGBUILD
index bf392b456..3c842ff4b 100644
--- a/extra/pixman/PKGBUILD
+++ b/extra/pixman/PKGBUILD
@@ -4,18 +4,22 @@ pkgname=pixman
 pkgver=0.22.0
-pkgrel=1
+pkgrel=2
 pkgdesc="Pixman library"
 arch=(i686 x86_64 'mips64el')
 url="http://xorg.freedesktop.org"
 license=('custom')
 depends=('glibc')
 options=('!libtool')
-source=(http://xorg.freedesktop.org/releases/individual/lib/${pkgname}-${pkgver}.tar.bz2)
-sha1sums=('d24ea233755d7dce9f0d93136ad99fba8d4e4fa0')
+source=(http://xorg.freedesktop.org/releases/individual/lib/${pkgname}-${pkgver}.tar.bz2
+        pixman-loongson2f.patch)
+sha1sums=('d24ea233755d7dce9f0d93136ad99fba8d4e4fa0'
+         'ce4d69ea341f21fdc30f6d401ee479cd3571dab3')
 
 build() {
   cd "${srcdir}/${pkgname}-${pkgver}"
+  [ "$CARCH" = "mips64el" ] && patch -Np1 -i $srcdir/pixman-loongson2f.patch
+  autoreconf -i
   ./configure --prefix=/usr --disable-static
   make
 }
diff --git a/extra/pixman/pixman-loongson2f.patch b/extra/pixman/pixman-loongson2f.patch
new file mode 100644
index 000000000..15e01cb6b
--- /dev/null
+++ b/extra/pixman/pixman-loongson2f.patch
@@ -0,0 +1,2745 @@
+diff -urN pixman//configure.ac Pixman.Loongson//configure.ac
+--- pixman//configure.ac 2010-12-25 18:46:00.018699000 +0800
++++
Pixman.Loongson//configure.ac 2010-12-25 18:39:15.298778000 +0800 +@@ -264,6 +264,43 @@ + ]) + + dnl =========================================================================== ++dnl Check for Loongson SIMD ++ ++have_loongson_intrinsics=no ++AC_MSG_CHECKING(whether to use Loongson SIMD intrinsics) ++ ++AC_COMPILE_IFELSE([ ++#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 4)) ++error "Need GCC >= 4.4 for Loongson SIMD compilation" ++#endif ++int main () { ++ /* Test with a loongson SIMD instruction. */ ++ asm volatile ( ".set arch = loongson2f \n\t" "and \$f0, \$f0, \$f0 \n\t" : : : "cc", "memory" ); ++ return 0; ++}], have_loongson_intrinsics=yes) ++ ++ ++AC_ARG_ENABLE(loongson, ++ [AC_HELP_STRING([--disable-loongson], ++ [disable Loongson fast paths])], ++ [enable_loongson=$enableval], [enable_loongson=auto]) ++ ++if test $enable_loongson = no ; then ++ have_loongson_intrinsics=disabled ++fi ++ ++if test $have_loongson_intrinsics = yes ; then ++ AC_DEFINE(USE_LS, 1, [use Loongson compiler intrinsics]) ++fi ++ ++AC_MSG_RESULT($have_loongson_intrinsics) ++if test $enable_loongson = yes && test $have_loongson_intrinsics = no ; then ++ AC_MSG_ERROR([Loongson intrinsics not detected]) ++fi ++ ++AM_CONDITIONAL(USE_LS, test $have_loongson_intrinsics = yes) ++ ++dnl =========================================================================== + dnl Check for MMX + + if test "x$MMX_CFLAGS" = "x" ; then +diff -urN pixman//pixman/Makefile.am Pixman.Loongson//pixman/Makefile.am +--- pixman//pixman/Makefile.am 2010-12-25 18:46:00.025027000 +0800 ++++ Pixman.Loongson//pixman/Makefile.am 2010-12-25 18:39:15.303599000 +0800 +@@ -55,6 +55,19 @@ + pixman-combine.h.template solaris-hwcap.mapfile pixman-x64-mmx-emulation.h + CLEANFILES = pixman-combine32.c pixman-combine64.c pixman-combine32.h pixman-combine64.h + ++# loongson code ++if USE_LS ++noinst_LTLIBRARIES += libpixman-ls.la ++libpixman_ls_la_SOURCES = \ ++ pixman-ls.c ++libpixman_ls_la_CFLAGS = $(DEP_CFLAGS) $(LS_CFLAGS) ++libpixman_ls_la_LIBADD = $(DEP_LIBS) ++libpixman_1_la_LDFLAGS += $(LS_LDFLAGS) ++libpixman_1_la_LIBADD += libpixman-ls.la ++ ++ASM_CFLAGS_ls=$(LS_CFLAGS) ++endif ++ + # mmx code + if USE_MMX + noinst_LTLIBRARIES += libpixman-mmx.la +diff -urN pixman//pixman/pixman-combine-ls.c Pixman.Loongson//pixman/pixman-combine-ls.c +--- pixman//pixman/pixman-combine-ls.c 1970-01-01 08:00:00.000000000 +0800 ++++ Pixman.Loongson//pixman/pixman-combine-ls.c 2010-12-25 18:39:15.344171000 +0800 +@@ -0,0 +1,911 @@ ++static force_inline uint32_t ++combine (const uint32_t *src, const uint32_t *mask) ++{ ++ uint32_t ssrc = *src; ++ ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f22) ++ load8888r(%0,$f20) ++ expand_alpha($f22,$f22)
++ pix_multiply($f20,$f22)
++ store8888r($f8,%0) ++ :"+r"(ssrc):"r"(*mask):clobber ++ ); ++ } ++ return ssrc; ++} ++ ++static void ++ls_combine_saturate_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ uint32_t s = combine (src, mask); ++ uint32_t d = *dest; ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f22) ++ load8888r(%0,$f20) ++ :"+r"(d):"r"(s):clobber ++ ); ++ ++ uint32_t sa = s >> 24; ++ uint32_t da = ~d >> 24; ++ ++ if (sa > da) ++ { ++ uint32_t dds = DIV_UN8 (da, sa) << 24; ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ pix_multiply($f22,$f24) ++ save_to($f22) ++ ::"r"(dds):clobber ++ ); ++ } ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ pix_add($f20,$f22) ++ store8888r($f8,%0) ++ :"=r"(*dest)::clobber ++ ); ++ ++ ++src; ++ ++dest; ++ if (mask) ++ mask++; ++ } ++} ++static void ++ls_combine_out_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ negate($f24,$f24) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++ mask++; ++ }else { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ negate($f24,$f24) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++static void ++ls_combine_out_reverse_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f20) ++ negate($f20,$f20) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++ mask++; ++ }else{ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f20) ++ negate($f20,$f20) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ } ++ ++dest; ++ ++src; ++ ++ } ++} ++ ++static void ++ls_combine_out_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f26) ++ negate($f26,$f26) ++ pix_multiply($f20,$f22) ++ save_to($f20) ++ pix_multiply($f20,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++static void ++ls_combine_out_reverse_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, 
++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f28) ++ pix_multiply($f22,$f28) ++ save_to($f22) ++ negate($f22,$f22) ++ pix_multiply($f24,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++ ++static void ++ls_combine_atop_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f26,$f26) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++ mask++; ++ }else { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f26,$f26) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ } ++ ++dest; ++ ++src; ++ ++ } ++} ++ ++static void ++ls_combine_atop_reverse_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end; ++ ++ end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask){ ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f28,$f28) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ }else{ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f28,$f28) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++ ++static void ++ls_combine_atop_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f26) ++ expand_alpha($f20,$f28) ++ pix_multiply($f20,$f22) ++ save_to($f20) ++ pix_multiply($f22,$f28) ++ save_to($f22) ++ negate($f22,$f22) ++ pix_add_mul($f24,$f22,$f20,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++static void ++ls_combine_atop_reverse_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f26) ++ 
expand_alpha($f20,$f28) ++ pix_multiply($f20,$f22) ++ save_to($f20) ++ pix_multiply($f22,$f28) ++ save_to($f22) ++ negate($f26,$f26) ++ pix_add_mul($f24,$f22,$f20,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++static void ++ls_combine_xor_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f26,$f26) ++ negate($f28,$f28) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ }else{ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ expand_alpha($f24,$f28) ++ negate($f26,$f26) ++ negate($f28,$f28) ++ pix_add_mul($f20,$f28,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ } ++ ++dest; ++ ++src; ++ ++ } ++} ++ ++static void ++ls_combine_xor_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f26) ++ expand_alpha($f20,$f28) ++ pix_multiply($f20,$f22) ++ save_to($f20) ++ pix_multiply($f22,$f28) ++ save_to($f22) ++ negate($f26,$f26) ++ negate($f22,$f22) ++ pix_add_mul($f24,$f22,$f20,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++ ++static void ++ls_combine_in_reverse_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ pix_multiply($f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ } else { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f26) ++ pix_multiply($f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++static void ++ls_combine_in_reverse_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f20,$f20) ++ pix_multiply($f22,$f20) ++ save_to($f26) ++ pix_multiply($f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} ++ ++static void ++ls_combine_in_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ 
const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ } else { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ pix_multiply($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++static void ++ls_combine_in_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ expand_alpha($f24,$f24) ++ pix_multiply($f20,$f22) ++ save_to($f26) ++ pix_multiply($f26,$f24) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++ } ++static void ++ls_combine_src_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ pix_multiply($f20,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++mask; ++ ++dest; ++ } ++ ++} ++ ++ ++static void ++ls_combine_over_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ ++ uint32_t ssrc = combine (src, mask); ++ uint32_t a = ssrc >> 24; ++ ++ if (a == 0xff) ++ { ++ *dest = ssrc; ++ } ++ else if (ssrc) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ expand_alpha($f20,$f24) ++ load8888r(%0,$f26) ++ over($f20,$f24,$f26) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(ssrc):clobber ++ ); ++ } ++ ++ ++dest; ++ ++src; ++ if (mask) ++ ++mask; ++ } ++} ++ ++static void ++ls_combine_over_reverse_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f26) ++ expand_alpha($f26,$f28) ++ over($f26,$f28,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ }else{ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f26) ++ expand_alpha($f26,$f28) ++ over($f26,$f28,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++ ++static void ++ls_combine_over_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + 
width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load8888r(%1,$f22) ++ load8888r(%2,$f24) ++ expand_alpha($f22,$f26) ++ in_over($f22,$f26,$f24,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++ ++} ++ ++static void ++ls_combine_over_reverse_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load8888r(%1,$f22) ++ load8888r(%2,$f24) ++ in($f22,$f24) ++ save_to($f22) ++ expand_alpha($f20,$f28) ++ over($f20,$f28,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++ ++} ++ ++static void ++ls_combine_add_u (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = dest + width; ++ ++ while (dest < end) ++ { ++ ++ if (mask) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f22) ++ load8888r(%1,$f20) ++ expand_alpha($f22,$f22) ++ pix_multiply($f20,$f22) ++ save_to ($f20) ++ ++ load8888r(%0,$f22) ++ pix_add($f20,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ mask++; ++ }else{ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ ++ load8888r(%0,$f22) ++ pix_add($f20,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src):clobber ++ ); ++ ++ } ++ ++dest; ++ ++src; ++ } ++} ++ ++static void ++ls_combine_add_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ uint32_t * dest, ++ const uint32_t * src, ++ const uint32_t * mask, ++ int width) ++{ ++ const uint32_t *end = src + width; ++ ++ while (src < end) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load8888r(%1,$f22) ++ load8888r(%2,$f24) ++ pix_multiply($f22,$f24) ++ save_to($f22) ++ pix_add($f22,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dest):"r"(*src),"r"(*mask):clobber ++ ); ++ ++src; ++ ++dest; ++ ++mask; ++ } ++} +diff -urN pixman//pixman/pixman-composite-ls.c Pixman.Loongson//pixman/pixman-composite-ls.c +--- pixman//pixman/pixman-composite-ls.c 1970-01-01 08:00:00.000000000 +0800 ++++ Pixman.Loongson//pixman/pixman-composite-ls.c 2010-12-25 18:39:15.356667000 +0800 +@@ -0,0 +1,967 @@ ++static void ++ls_composite_over_x888_8_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ ++ uint32_t *src, *src_line; ++ uint32_t *dst, *dst_line; ++ uint8_t *mask, *mask_line; ++ int src_stride, mask_stride, dst_stride; ++ uint32_t m; ++ uint32_t s, d; ++ int32_t w; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ ++ while (height--) ++ { ++ src = src_line; ++ src_line += src_stride; ++ dst = dst_line; ++ dst_line += dst_stride; ++ mask = mask_line; ++ mask_line += mask_stride; ++ ++ w = width; ++ while (w--) ++ { ++ m = *mask++; ++ if (m) ++ { 
++ s = *src | 0xff000000; ++ ++ if (m == 0xff) ++ { ++ *dst = s; ++ } ++ else ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load8888r(%1,$f22) ++ load8888r(%2,$f24) ++ expand_alpha($f22,$f26) ++ expand_alpha_rev($f24,$f28) ++ in_over($f22,$f26,$f28,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(s),"r"(m):clobber ++ ); ++ ++// __m64 sa = expand_alpha (s); ++// __m64 vm = expand_alpha_rev (to_m64 (m)); ++// __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); ++// *dst = store8888 (vdest); ++ ++ } ++ } ++ src++; ++ dst++; ++ } ++ } ++} ++ ++ ++ ++ ++ ++static void ++ls_composite_over_8888_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t *dst_line, *dst; ++ uint32_t *src_line, *src; ++ uint32_t s; ++ int dst_stride, src_stride; ++ uint8_t a; ++ int32_t w; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w--) ++ { ++ s = *src; ++ a = s >> 24; ++ ++ if (a == 0xff) ++ { ++ *dst = s; ++ } ++ else if (s) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f24) ++ load8888r(%0,$f20) ++ expand_alpha($f24,$f26) ++ over($f24,$f26,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(*src):clobber ++ ); ++ } ++ dst++; ++ src++; ++ ++ } ++ } ++} ++ ++ ++static void ++ls_composite_over_8888_n_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t *dst_line, *dst; ++ uint32_t *src_line, *src; ++ uint32_t mask; ++ __m64 vmask; ++ int dst_stride, src_stride; ++ int32_t w; ++ __m64 srca; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ ++ mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); ++ mask = mask | mask >> 8 | mask >> 16 | mask >> 24; ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888(%1,$f24) ++ store64a($f24,%0) ++ :"=m"(vmask):"m"(mask):clobber ++ ); ++ ++ srca = ls_4x00ff; ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ load8888r(%0,$f22) ++ expand_alpha($f20,$f28) ++ in_over($f20,$f28,$f24,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(*src):clobber ++ ); ++ ++ w--; ++ dst++; ++ src++; ++ } ++ } ++} ++ ++static void ++ls_composite_over_n_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src; ++ uint32_t *dst_line, *dst; ++ int32_t w; ++ int dst_stride; 
++ __m64 vsrc, vsrca; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ if (src == 0) ++ return; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64($f24,%0) ++ expand_alpha($f24,$f26) ++ store64($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ w = width; ++ ++ while (w) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f28) ++ over($f24,$f26,$f28) ++ store8888r($f8,%0) ++ :"+r"(*dst)::clobber ++ ); ++ ++ w--; ++ dst++; ++ } ++ } ++} ++ ++static void ++ls_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src, srca; ++ uint32_t *dst_line; ++ uint32_t *mask_line; ++ int dst_stride, mask_stride; ++ __m64 vsrc, vsrca; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ srca = src >> 24; ++ if (src == 0) ++ return; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64($f24,%0) ++ expand_alpha($f24,$f26) ++ store64($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ int twidth = width; ++ uint32_t *p = (uint32_t *)mask_line; ++ uint32_t *q = (uint32_t *)dst_line; ++ ++ while (twidth) ++ { ++ ++ if (*p) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f28) ++ load8888r(%1,$f20) ++ in_over($f24,$f26,$f20,$f28) ++ store8888r($f8,%0) ++ :"+r"(*q):"r"(*p):clobber ++ ); ++ } ++ twidth--; ++ p++; ++ q++; ++ } ++ ++ dst_line += dst_stride; ++ mask_line += mask_stride; ++ } ++} ++ ++ ++static void ++ls_composite_over_n_8_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src, srca; ++ uint32_t *dst_line, *dst; ++ uint8_t *mask_line, *mask; ++ int dst_stride, mask_stride; ++ int32_t w; ++ __m64 vsrc, vsrca; ++ uint64_t srcsrc; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ srca = src >> 24; ++ if (src == 0) ++ return; ++ ++ srcsrc = (uint64_t)src << 32 | src; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64a($f24,%0) ++ expand_alpha($f24,$f26) ++ store64a($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ mask = mask_line; ++ mask_line += mask_stride; ++ w = width; ++ ++ while (w) ++ { ++ uint32_t m = *mask; ++ ++ if (m) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load32r(%1,$f22) ++ expand_alpha_rev($f22,$f28) ++ 
in_over($f24,$f26,$f28,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(m):clobber ++ ); ++ } ++ ++ w--; ++ mask++; ++ dst++; ++ } ++ } ++ ++} ++ ++static void ++ls_composite_over_x888_n_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t *dst_line, *dst; ++ uint32_t *src_line, *src; ++ uint32_t mask; ++ __m64 vmask; ++ int dst_stride, src_stride; ++ int32_t w; ++ __m64 srca; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); ++ ++ mask &= 0xff000000; ++ mask = mask | mask >> 8 | mask >> 16 | mask >> 24; ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f24) ++ store64a($f24,%0) ++ :"=m"(vmask):"r"(mask):clobber ++ ); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load64a(%1,$f26) ++ store64a($f26,%0) ++ :"=m"(srca):"m"(ls_4x00ff):clobber ++ ); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w) ++ { ++ uint32_t src_tmp = *src | 0xff000000; ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ load8888r(%0,$f22) ++ in_over($f20,$f26,$f24,$f22) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(src_tmp):clobber ++ ); ++ ++ w--; ++ dst++; ++ src++; ++ } ++ } ++} ++ ++ ++static void ++ls_composite_over_8888_0565 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint16_t *dst_line, *dst; ++ uint32_t d; ++ uint32_t *src_line, *src, s; ++ uint8_t a; ++ int dst_stride, src_stride; ++ int32_t w; ++ ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w--) ++ { ++ s = *src++; ++ a = s >> 24; ++ if (s) ++ { ++ if (a == 0xff) ++ { ++ d = s; ++ } ++ else ++ { ++ d = *dst; ++ d = CONVERT_0565_TO_0888 (d); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f24) ++ load8888r(%0,$f20) ++ expand_alpha($f24,$f26) ++ over($f24,$f26,$f20) ++ store8888r($f8,%0) ++ :"+r"(d):"r"(s):clobber ++ ); ++ ++ ++ } ++ *dst = CONVERT_8888_TO_0565 (d); ++ } ++ dst++; ++ } ++ } ++} ++ ++static void ++ls_composite_over_n_0565 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src; ++ uint32_t d; ++ uint16_t *dst_line, *dst; ++ int32_t w; ++ int dst_stride; ++ __m64 vsrc, vsrca; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ if (src == 0) ++ return; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, 
dst_stride, dst_line, 1); ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64a($f24,%0) ++ expand_alpha($f24,$f26) ++ store64a($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ w = width; ++ ++ while (w) ++ { ++ ++ d = *dst; ++ d = CONVERT_0565_TO_0888 (d); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ ++ over($f24,$f26,$f20) ++ store8888r($f8,%0) ++ :"+r"(d)::clobber ++ ); ++ ++ *dst = CONVERT_8888_TO_0565 (d); ++ ++ w--; ++ dst++; ++ } ++ } ++} ++ ++static void ++ls_composite_over_n_8_0565 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src, srca, m, d; ++ uint16_t *dst_line, *dst; ++ uint8_t *mask_line, *mask; ++ int dst_stride, mask_stride; ++ int32_t w; ++ __m64 vsrc, vsrca; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ srca = src >> 24; ++ if (src == 0) ++ return; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64a($f24,%0) ++ expand_alpha($f24,$f26) ++ store64a($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ mask = mask_line; ++ mask_line += mask_stride; ++ w = width; ++ ++ while (w) ++ { ++ m = *mask; ++ d = *dst; ++ ++ if (m) ++ { ++ ++ d = CONVERT_0565_TO_0888 (d); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load32r(%1,$f22) ++ expand_alpha_rev($f22,$f28) ++ in_over($f24,$f26,$f28,$f20) ++ store8888r($f8,%0) ++ :"+r"(d):"r"(m):clobber ++ ); ++ ++ *dst = CONVERT_8888_TO_0565 (d); ++ ++ } ++ ++ w--; ++ mask++; ++ dst++; ++ } ++ } ++} ++ ++static void ++ls_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src, srca, m, d; ++ uint16_t *dst_line; ++ uint32_t *mask_line; ++ int dst_stride, mask_stride; ++ __m64 vsrc, vsrca; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ srca = src >> 24; ++ if (src == 0) ++ return; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64a($f24,%0) ++ expand_alpha($f24,$f26) ++ store64a($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ ++ while (height--) ++ { ++ int twidth = width; ++ uint32_t *p = (uint32_t *)mask_line; ++ uint16_t *q = (uint16_t *)dst_line; ++ ++ while (twidth) ++ { ++ ++ m = *(uint32_t *)p; ++ d = *q; ++ ++ if (m) ++ { ++ ++ d = CONVERT_0565_TO_0888 (d); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%0,$f20) ++ load8888r(%1,$f22) ++ in_over($f24,$f26,$f22,$f20) ++ store8888r($f8,%0) ++ 
:"+r"(d):"r"(m):clobber ++ ); ++ ++ *q = CONVERT_8888_TO_0565 (d); ++ ++ } ++ ++ twidth--; ++ p++; ++ q++; ++ } ++ ++ mask_line += mask_stride; ++ dst_line += dst_stride; ++ } ++} ++static void ++ls_composite_over_pixbuf_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t *dst_line, *dst; ++ uint32_t *src_line, *src; ++ int dst_stride, src_stride; ++ int32_t w; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ ++#if 0 ++ /* FIXME */ ++ assert (src_image->drawable == mask_image->drawable); ++#endif ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f22) ++ load8888r(%0,$f20) ++ over_rev_non_pre($f22,$f20) ++ store8888r($f8,%0) ++ :"+r"(*dst):"r"(*src):clobber ++ ); ++ ++ w--; ++ dst++; ++ src++; ++ } ++ } ++} ++static void ++ls_composite_over_pixbuf_0565 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint16_t *dst_line, *dst; ++ uint32_t *src_line, *src, d; ++ int dst_stride, src_stride; ++ int32_t w; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); ++ ++#if 0 ++ /* FIXME */ ++ assert (src_image->drawable == mask_image->drawable); ++#endif ++ ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ src = src_line; ++ src_line += src_stride; ++ w = width; ++ ++ while (w) ++ { ++ ++ d = *dst; ++ d = CONVERT_0565_TO_0888 (d); ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load8888r(%1,$f20) ++ load8888r(%0,$f24) ++ over_rev_non_pre($f20,$f24) ++ store8888r($f8,%0) ++ :"+r"(d):"r"(*src):clobber ++ ); ++ ++ *dst = CONVERT_8888_TO_0565 (d); ++ ++ w--; ++ dst++; ++ src++; ++ } ++ } ++} ++ ++static void ++ls_composite_src_n_8_8888 (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ uint32_t src, srca; ++ uint32_t *dst_line, *dst, m; ++ uint8_t *mask_line, *mask; ++ int dst_stride, mask_stride; ++ int32_t w; ++ __m64 vsrc, vsrca; ++ uint64_t srcsrc; ++ ++ src = _pixman_image_get_solid (src_image, dst_image->bits.format); ++ ++ srca = src >> 24; ++ if (src == 0) ++ { ++ pixman_fill_ls (dst_image->bits.bits, dst_image->bits.rowstride, ++ PIXMAN_FORMAT_BPP (dst_image->bits.format), ++ dest_x, dest_y, width, height, 0); ++ return; ++ } ++ ++ srcsrc = (uint64_t)src << 32 | src; ++ ++ PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); ++ PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); ++ ++ __asm__ volatile ( ++ ".set 
arch=loongson2f \n\t" ++ load8888r(%2,$f24) ++ store64a($f24,%0) ++ expand_alpha($f24,$f26) ++ store64a($f26,%1) ++ :"=m"(vsrc), "=m"(vsrca):"r"(src):clobber ++ ); ++ while (height--) ++ { ++ dst = dst_line; ++ dst_line += dst_stride; ++ mask = mask_line; ++ mask_line += mask_stride; ++ w = width; ++ ++ while (w) ++ { ++ m = *mask; ++ ++ if (m) ++ { ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ load32r(%1,$f20) ++ expand_alpha_rev($f20,$f28) ++ in($f24,$f28) ++ store8888r($f8,%0) ++ :"=r"(*dst):"r"(m):clobber ++ ); ++ ++ } ++ else ++ { ++ *dst = 0; ++ } ++ ++ w--; ++ mask++; ++ dst++; ++ } ++ } ++} +diff -urN pixman//pixman/pixman-cpu.c Pixman.Loongson//pixman/pixman-cpu.c +--- pixman//pixman/pixman-cpu.c 2010-12-25 18:46:00.073234000 +0800 ++++ Pixman.Loongson//pixman/pixman-cpu.c 2010-12-25 18:39:15.360337000 +0800 +@@ -579,7 +579,9 @@ + if (pixman_have_mmx ()) + return _pixman_implementation_create_mmx (); + #endif +- ++#ifdef USE_LS ++ return _pixman_implementation_create_ls (); ++#endif + #ifdef USE_ARM_NEON + if (pixman_have_arm_neon ()) + return _pixman_implementation_create_arm_neon (); +diff -urN pixman//pixman/pixman-ls.c Pixman.Loongson//pixman/pixman-ls.c +--- pixman//pixman/pixman-ls.c 1970-01-01 08:00:00.000000000 +0800 ++++ Pixman.Loongson//pixman/pixman-ls.c 2010-12-25 18:39:15.386759000 +0800 +@@ -0,0 +1,538 @@ ++/* ++* Based on pixman-mmx.c ++* Implemented for loongson 2F only. ++* Free software based on GPL licence. ++* Copyright 2010 WG Ge. ++*/ ++ ++#ifdef HAVE_CONFIG_H ++#include <config.h> ++#endif ++#include <stdlib.h> ++#include <string.h> ++#include <math.h> ++#include <limits.h> ++#include <stdio.h> ++#include "pixman-private.h" ++#include "pixman-combine32.h" ++#include "primitive.h" ++ ++#define __m64 __attribute__ ((aligned (8))) uint64_t ++#define DECLARE_ALIGNED(n,t,v) t __attribute__ ((aligned (n))) v ++#define DECLARE_ALIGNED_8(t, v, ...) 
DECLARE_ALIGNED(8, t, v) ++ ++DECLARE_ALIGNED_8 (const uint64_t, ls_4x00ff ) = 0x00ff00ff00ff00ffULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_4x0080 ) = 0x0080008000800080ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_565_rgb ) = 0x000001f0003f001fULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_565_unpack_multiplier ) = 0x0000008404100840ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_565_r ) = 0x000000f800000000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_565_g ) = 0x0000000000fc0000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_565_b ) = 0x00000000000000f8ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_0 ) = 0xffffffffffff0000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_1 ) = 0xffffffff0000ffffULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_2 ) = 0xffff0000ffffffffULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_mask_3 ) = 0x0000ffffffffffffULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_full_alpha ) = 0x00ff000000000000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_ffff0000ffff0000 ) = 0xffff0000ffff0000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_0000ffff00000000 ) = 0x0000ffff00000000ULL; ++DECLARE_ALIGNED_8 (const uint64_t, ls_000000000000ffff ) = 0x000000000000ffffULL; ++ ++ ++pixman_bool_t ++pixman_fill_ls (uint32_t *bits, ++ int stride, ++ int bpp, ++ int x, ++ int y, ++ int width, ++ int height, ++ uint32_t xor) ++{ ++ uint64_t fill; ++ uint32_t byte_width; ++ uint8_t *byte_line; ++ ++ ++ ++ if (bpp != 16 && bpp != 32 && bpp != 8) ++ return FALSE; ++ ++ if (bpp == 8) ++ { ++ stride = stride * (int) sizeof (uint32_t) / 1; ++ byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); ++ byte_width = width; ++ stride *= 1; ++ xor = (xor & 0xff) * 0x01010101; ++ } ++ else if (bpp == 16) ++ { ++ stride = stride * (int) sizeof (uint32_t) / 2; ++ byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); ++ byte_width = 2 * width; ++ stride *= 2; ++ xor = (xor & 0xffff) * 0x00010001; ++ } ++ else ++ { ++ stride = stride * (int) sizeof (uint32_t) / 4; ++ byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); ++ byte_width = 4 * width; ++ stride *= 4; ++ } ++ ++ fill = ((uint64_t)xor << 32) | xor; ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ "ldc1 $f24, %0 \n\t" ++ ::"m"(fill):"$f24" ++ ); ++ while (height--) ++ { ++ int w; ++ uint8_t *d = byte_line; ++ ++ byte_line += stride; ++ w = byte_width; ++ ++ while (w >= 1 && ((unsigned long)d & 1)) ++ { ++ *(uint8_t *)d = (xor & 0xff); ++ w--; ++ d++; ++ } ++ ++ while (w >= 2 && ((unsigned long)d & 3)) ++ { ++ *(uint16_t *)d = xor; ++ w -= 2; ++ d += 2; ++ } ++ ++ while (w >= 4 && ((unsigned long)d & 7)) ++ { ++ *(uint32_t *)d = xor; ++ ++ w -= 4; ++ d += 4; ++ } ++ ++ while (w >= 64) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ "dmfc1 $8, $f24 \n\t" ++ "sd $8 , (%0) \n\t" ++ "sd $8 , 8(%0) \n\t" ++ "sd $8 , 16(%0) \n\t" ++ "sd $8 , 24(%0) \n\t" ++ "sd $8 , 32(%0) \n\t" ++ "sd $8 , 40(%0) \n\t" ++ "sd $8 , 48(%0) \n\t" ++ "sd $8 , 56(%0) \n\t" ++ ::"r"(d):"$8","memory","$f24" ++ ); ++ w -= 64; ++ d += 64; ++ } ++ ++ while (w >= 4) ++ { ++ *(uint32_t *)d = xor; ++ ++ w -= 4; ++ d += 4; ++ } ++ while (w >= 2) ++ { ++ *(uint16_t *)d = xor; ++ w -= 2; ++ d += 2; ++ } ++ while (w >= 1) ++ { ++ *(uint8_t *)d = (xor & 0xff); ++ w--; ++ d++; ++ } ++ ++ } ++ return TRUE; ++} ++ ++static pixman_bool_t ++pixman_blt_ls (uint32_t *src_bits, ++ uint32_t *dst_bits, ++ int src_stride, ++ int dst_stride, ++ int src_bpp, ++ int dst_bpp, ++ int src_x, ++ int src_y, ++ int dst_x, ++ int dst_y, ++ int width, ++ int height) ++{ ++ 
uint8_t * src_bytes; ++ uint8_t * dst_bytes; ++ int byte_width; ++ ++ if (src_bpp != dst_bpp) ++ return FALSE; ++ ++ if (src_bpp == 16) ++ { ++ src_stride = src_stride * (int) sizeof (uint32_t) / 2; ++ dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; ++ src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); ++ dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); ++ byte_width = 2 * width; ++ src_stride *= 2; ++ dst_stride *= 2; ++ } ++ else if (src_bpp == 32) ++ { ++ src_stride = src_stride * (int) sizeof (uint32_t) / 4; ++ dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; ++ src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); ++ dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); ++ byte_width = 4 * width; ++ src_stride *= 4; ++ dst_stride *= 4; ++ } ++ else ++ { ++ return FALSE; ++ } ++ ++ while (height--) ++ { ++ int w; ++ uint8_t *s = src_bytes; ++ uint8_t *d = dst_bytes; ++ src_bytes += src_stride; ++ dst_bytes += dst_stride; ++ w = byte_width; ++ ++ while (w >= 2 && ((unsigned long)d & 3)) ++ { ++ *(uint16_t *)d = *(uint16_t *)s; ++ w -= 2; ++ s += 2; ++ d += 2; ++ } ++ ++ while (w >= 4 && ((unsigned long)d & 7)) ++ { ++ *(uint32_t *)d = *(uint32_t *)s; ++ ++ w -= 4; ++ s += 4; ++ d += 4; ++ } ++ if ((unsigned long)s & 7) ++{ ++ while (w >= 64) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ "uld $8 , (%1) \n\t" ++ "uld $9 , 8(%1) \n\t" ++ "uld $10, 16(%1) \n\t" ++ "uld $11, 24(%1) \n\t" ++ "sd $8 , (%0) \n\t" ++ "sd $9 , 8(%0) \n\t" ++ "sd $10, 16(%0) \n\t" ++ "sd $11, 24(%0) \n\t" ++ ++ "uld $8 , 32(%1) \n\t" ++ "uld $9 , 40(%1) \n\t" ++ "uld $10, 48(%1) \n\t" ++ "uld $11, 56(%1) \n\t" ++ "sd $8 , 32(%0) \n\t" ++ "sd $9 , 40(%0) \n\t" ++ "sd $10, 48(%0) \n\t" ++ "sd $11, 56(%0) \n\t" ++ ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" ++ ); ++ w -= 64; ++ s += 64; ++ d += 64; ++ } ++} ++else ++{ ++ while (w >= 64) ++ { ++ ++ __asm__ volatile ( ++ ".set arch=loongson2f \n\t" ++ "ld $8 , (%1) \n\t" ++ "ld $9 , 8(%1) \n\t" ++ "ld $10, 16(%1) \n\t" ++ "ld $11, 24(%1) \n\t" ++ "sd $8 , (%0) \n\t" ++ "sd $9 , 8(%0) \n\t" ++ "sd $10, 16(%0) \n\t" ++ "sd $11, 24(%0) \n\t" ++ ++ "ld $8 , 32(%1) \n\t" ++ "ld $9 , 40(%1) \n\t" ++ "ld $10, 48(%1) \n\t" ++ "ld $11, 56(%1) \n\t" ++ "sd $8 , 32(%0) \n\t" ++ "sd $9 , 40(%0) \n\t" ++ "sd $10, 48(%0) \n\t" ++ "sd $11, 56(%0) \n\t" ++ ::"r"(d),"r"(s):"$8","$9","$10","$11","memory" ++ ); ++ w -= 64; ++ s += 64; ++ d += 64; ++ } ++} ++ ++ while (w >= 4) ++ { ++ *(uint32_t *)d = *(uint32_t *)s; ++ ++ w -= 4; ++ s += 4; ++ d += 4; ++ } ++ if (w >= 2) ++ { ++ *(uint16_t *)d = *(uint16_t *)s; ++ w -= 2; ++ s += 2; ++ d += 2; ++ } ++ } ++ return TRUE; ++} ++ ++ ++#include "pixman-composite-ls.c" ++#include "pixman-combine-ls.c" ++ ++static pixman_bool_t ++ls_blt (pixman_implementation_t *imp, ++ uint32_t * src_bits, ++ uint32_t * dst_bits, ++ int src_stride, ++ int dst_stride, ++ int src_bpp, ++ int dst_bpp, ++ int src_x, ++ int src_y, ++ int dst_x, ++ int dst_y, ++ int width, ++ int height) ++{ ++ if (!pixman_blt_ls ( ++ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, ++ src_x, src_y, dst_x, dst_y, width, height)) ++ { ++ return _pixman_implementation_blt ( ++ imp->delegate, ++ src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, ++ src_x, src_y, dst_x, dst_y, width, height); ++ } ++ ++ return TRUE; ++} ++ ++static pixman_bool_t ++ls_fill (pixman_implementation_t *imp, ++ uint32_t * bits, ++ int 
stride, ++ int bpp, ++ int x, ++ int y, ++ int width, ++ int height, ++ uint32_t xor) ++{ ++ if (!pixman_fill_ls (bits, stride, bpp, x, y, width, height, xor)) ++ { ++ return _pixman_implementation_fill ( ++ imp->delegate, bits, stride, bpp, x, y, width, height, xor); ++ } ++ ++ return TRUE; ++} ++
++static void ++ls_composite_copy_area (pixman_implementation_t *imp, ++ pixman_op_t op, ++ pixman_image_t * src_image, ++ pixman_image_t * mask_image, ++ pixman_image_t * dst_image, ++ int32_t src_x, ++ int32_t src_y, ++ int32_t mask_x, ++ int32_t mask_y, ++ int32_t dest_x, ++ int32_t dest_y, ++ int32_t width, ++ int32_t height) ++{ ++ pixman_blt_ls (src_image->bits.bits, ++ dst_image->bits.bits, ++ src_image->bits.rowstride, ++ dst_image->bits.rowstride, ++ PIXMAN_FORMAT_BPP (src_image->bits.format), ++ PIXMAN_FORMAT_BPP (dst_image->bits.format), ++ src_x, src_y, dest_x, dest_y, width, height); ++} ++ ++ ++static const pixman_fast_path_t ls_fast_paths[] = ++{ ++ ++//these are implemented so far ++#if 1 ++ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, ls_composite_over_x888_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, ls_composite_over_x888_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, ls_composite_over_x888_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, ls_composite_over_x888_8_8888 ), ++#endif ++ ++#if 1 ++//over_8888_0565 significant perf improvement, slight better L1, L2, 30% better RT ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, ls_composite_over_8888_0565 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, ls_composite_over_8888_0565 ), ++ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, ls_composite_over_pixbuf_0565 ), ++ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, ls_composite_over_pixbuf_0565 ), ++ ++//big improvement some closing 100% ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, ls_composite_over_n_8888_0565_ca ), ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, ls_composite_over_n_8888_0565_ca ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, ls_composite_over_n_8_0565 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, ls_composite_over_n_8_0565 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, ls_composite_over_n_0565 ), ++ ++//ubalbe to bench with lowlevel bench, believe it is a gain in perf ++ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, ls_composite_over_x888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, ls_composite_over_x888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, ls_composite_over_x888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, ls_composite_over_x888_n_8888 ), ++ ++//performance regress 30% in L1,L2, but significant improvement in RT ++ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, ls_composite_over_8888_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, ls_composite_over_8888_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, ls_composite_over_8888_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, ls_composite_over_8888_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, ls_composite_over_pixbuf_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, ls_composite_over_pixbuf_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, ls_composite_over_pixbuf_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, ls_composite_over_pixbuf_8888 ), ++ ++//same performance in L1,L2, but significant improvement in RT 30-40% ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, ls_composite_over_8888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, ls_composite_over_8888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, 
ls_composite_over_8888_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, ls_composite_over_8888_n_8888 ), ++ ++//significant perf improvement 20% ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, ls_composite_over_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, ls_composite_over_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, ls_composite_over_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, ls_composite_over_n_8_8888 ), ++ ++//3 times perf improvements ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, ls_composite_over_n_8888_8888_ca ), ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, ls_composite_over_n_8888_8888_ca ), ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, ls_composite_over_n_8888_8888_ca ), ++ PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, ls_composite_over_n_8888_8888_ca ), ++ ++//significant performance boost ++ PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, ls_composite_over_n_8888 ), ++ PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, ls_composite_over_n_8888 ), ++//simple add, expect better perf in generic code ++// PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, ls_composite_add_8888_8888 ), ++// PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, ls_composite_add_8888_8888 ), ++ ++// FIXME: Copy memory are not better than geneic code ++#if 0 ++ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, ls_composite_copy_area ), ++ PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, ls_composite_copy_area ), ++#endif ++ ++//significant improvement ++ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, ls_composite_src_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, ls_composite_src_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, ls_composite_src_n_8_8888 ), ++ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, ls_composite_src_n_8_8888 ), ++ ++#endif ++ ++//these are not yet implemented ++ ++#if 0 ++ ++ PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, ls_composite_add_8000_8000 ), ++ PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, ls_composite_add_n_8_8 ), ++ PIXMAN_STD_FAST_PATH (IN, a8, null, a8, ls_composite_in_8_8 ), ++ PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, ls_composite_in_n_8_8 ), ++#endif ++ ++ ++ { PIXMAN_OP_NONE }, ++}; ++ ++pixman_implementation_t * ++_pixman_implementation_create_ls (void) ++{ ++ pixman_implementation_t *general = _pixman_implementation_create_fast_path (); ++ pixman_implementation_t *imp = _pixman_implementation_create (general, ls_fast_paths); ++ ++//Turned on but unable to benchmark. 
++#if 1 ++ imp->combine_32[PIXMAN_OP_OVER] = ls_combine_over_u; ++ imp->combine_32[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_u; ++ imp->combine_32[PIXMAN_OP_IN] = ls_combine_in_u; ++ imp->combine_32[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_u; ++ imp->combine_32[PIXMAN_OP_OUT] = ls_combine_out_u; ++ imp->combine_32[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_u; ++ imp->combine_32[PIXMAN_OP_ATOP] = ls_combine_atop_u; ++ imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_u; ++ imp->combine_32[PIXMAN_OP_XOR] = ls_combine_xor_u; ++ imp->combine_32[PIXMAN_OP_ADD] = ls_combine_add_u; ++ imp->combine_32[PIXMAN_OP_SATURATE] = ls_combine_saturate_u; ++ ++ imp->combine_32_ca[PIXMAN_OP_SRC] = ls_combine_src_ca; ++ imp->combine_32_ca[PIXMAN_OP_OVER] = ls_combine_over_ca; ++ imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = ls_combine_over_reverse_ca; ++ imp->combine_32_ca[PIXMAN_OP_IN] = ls_combine_in_ca; ++ imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = ls_combine_in_reverse_ca; ++ imp->combine_32_ca[PIXMAN_OP_OUT] = ls_combine_out_ca; ++ imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = ls_combine_out_reverse_ca; ++ imp->combine_32_ca[PIXMAN_OP_ATOP] = ls_combine_atop_ca; ++ imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = ls_combine_atop_reverse_ca; ++ imp->combine_32_ca[PIXMAN_OP_XOR] = ls_combine_xor_ca; ++ imp->combine_32_ca[PIXMAN_OP_ADD] = ls_combine_add_ca; ++#endif ++ ++//FIXME blt and fill not shown better perf than geneic code ++#if 0 ++ imp->blt = ls_blt; ++ imp->fill = ls_fill; ++#endif ++ ++ return imp; ++} ++ +diff -urN pixman//pixman/pixman-private.h Pixman.Loongson//pixman/pixman-private.h +--- pixman//pixman/pixman-private.h 2010-12-25 18:46:00.102841000 +0800 ++++ Pixman.Loongson//pixman/pixman-private.h 2010-12-25 18:39:15.401808000 +0800 +@@ -493,6 +493,11 @@ + pixman_implementation_t * + _pixman_implementation_create_fast_path (void); + ++#ifdef USE_LS ++pixman_implementation_t * ++_pixman_implementation_create_ls (void); ++#endif ++ + #ifdef USE_MMX + pixman_implementation_t * + _pixman_implementation_create_mmx (void); +diff -urN pixman//pixman/primitive.h Pixman.Loongson//pixman/primitive.h +--- pixman//pixman/primitive.h 1970-01-01 08:00:00.000000000 +0800 ++++ Pixman.Loongson//pixman/primitive.h 2010-12-25 18:39:15.457084000 +0800 +@@ -0,0 +1,214 @@ ++/*
++* MMX register usage protocol
++* return result: f8
++* tmp immediate f12
++* tmp register in primitive f14 f16 f18
++* tmp register in pixman f0, f4, f6, f10, f20, f22
++* globals in function f24, f26, f28, f30
++* Exceptions for load and store:
++* load will specify dest FPR register
++* store will specify src FPR register
++* expand_alpha(_rev) implemented with GPR, dest FPR as the 2nd parameter
++*
++* Special alert: don't use return result $f8 as input, it might be overwritten
++*/
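++
++/* Hypothetical usage sketch (illustrative only -- the exact operand
++ * syntax depends on the caller): each primitive below expands to asm
++ * text, so a fast path chains them inside one __asm__ block and copies
++ * $f8 aside with save_to before the next primitive clobbers it:
++ *
++ *     __asm__ volatile (
++ *         load8888(0(%0), $f24)      // src, unpacked to 16-bit lanes
++ *         expand_alpha($f24, $f26)   // src alpha replicated across lanes
++ *         load8888(0(%1), $f28)      // dst
++ *         over($f24, $f26, $f28)     // result lands in $f8
++ *         save_to($f24)              // rescue $f8 before storing
++ *         store8888($f24, 0(%1))
++ *         : : "r" (src), "r" (dst) : clobber);
++ */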
++
++
++/*primitive macros */
++
++#define clobber "$8","$9","$f0","$f2","$f8",\
++ "$f12","$f14","$f16","$f18","$f20",\
++ "$f22","$f24","$f26","$f28","$f30"
++
++#define DMTC1_IMM(regc1,imm) \
++ "dli $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++#define MTC1_IMM(regc1,imm) \
++ "li $8, "#imm" \n\t" \
++ "dmtc1 $8, "#regc1" \n\t"
++
++
++#define save_to(reg1) "mov.d "#reg1", $f8 \n\t"
++#define zero(reg1) "xor "#reg1","#reg1","#reg1" \n\t"
++
++#define load32(sp,reg1) \
++ "ulw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32a(sp,reg1) \
++ "lw $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load32r(sp,reg1) \
++ "dmtc1 "#sp", "#reg1" \n\t"
++
++#define load64(sp,reg1) \
++ "uld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++#define load64a(sp,reg1) \
++ "ld $8, "#sp" \n\t" \
++ "dmtc1 $8, "#reg1" \n\t"
++
++
++#define store32(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usw $8, "#sp" \n\t"
++
++#define store32r(reg1,sp) \
++ "dmfc1 "#sp", "#reg1" \n\t"
++
++#define store32a(reg1,sp) \
++ "swc1 "#reg1", "#sp" \n\t"
++
++#define store64(reg1,sp) \
++ "dmfc1 $8, "#reg1" \n\t" \
++ "usd $8, "#sp" \n\t"
++
++#define store64a(reg1,sp) \
++ "sdc1 "#reg1", "#sp" \n\t"
++
++#define load8888(sp,reg1) \
++ load64(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888r(sp,reg1) \
++ load32r(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888a(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh "#reg1", "#reg1", $f12 \n\t"
++
++#define load8888ah(sp,reg1) \
++ load64a(sp,reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh "#reg1", "#reg1", $f12 \n\t"
++
++#define store8888(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64(reg1,sp)
++
++#define store8888r(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store32r(reg1,sp)
++
++#define store8888a(reg1,sp) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "packushb "#reg1", "#reg1", $f12 \n\t" \
++ store64a(reg1,sp)
++
++#define pack8888(reg1,reg2) \
++ "packushb $f8, "#reg1","#reg2" \n\t"
++
++#define unpack8888(reg1,reg2) \
++ "punpcklbh $f8, "#reg1","#reg2" \n\t"
++
++
++#define negate(sreg,dreg) \
++ DMTC1_IMM($f12, 0x00ff00ff00ff00ff)\
++ "xor "#dreg", "#sreg", $f12 \n\t"
++
++#define pix_add(reg1,reg2) \
++ "paddusb $f8, "#reg1", "#reg2" \n\t"
++
++#define pix_multiply(reg1,reg2) \
++ "pmullh $f14, "#reg1", "#reg2" \n\t " \
++ DMTC1_IMM($f12, 0x0080008000800080) \
++ "paddush $f14, $f14, $f12 \n\t "\
++ MTC1_IMM($f12, 8) \
++ "psrlh $f16, $f14, $f12 \n\t" \
++ "paddush $f14, $f14, $f16 \n\t" \
++ "psrlh $f8, $f14, $f12 \n\t"
++
++#define pix_add_mul(reg1,reg2,reg3,reg4) \
++ pix_multiply(reg1,reg2) \
++ "mov.d $f18, $f8 \n\t" \
++ pix_multiply(reg3,reg4) \
++ pix_add($f18,$f8)
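++
++/* pix_add_mul: per lane, mul_un8(r1,r2) + mul_un8(r3,r4) with unsigned
++ * saturation (lane values never exceed 0xff, so paddusb is safe). */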
++
++#define expand_alpha(sreg,dreg) \
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
++#define expand_alpha_rev(sreg,dreg)\
++ "dmfc1 $8, "#sreg" \n\t" \
++ "dsll32 $8, $8, 16 \n\t" \
++ "dsrl32 $8, $8, 16 \n\t" \
++ "dsll $9, $8, 16 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dsll32 $9, $8, 0 \n\t" \
++ "or $8, $8, $9 \n\t" \
++ "dmtc1 $8, "#dreg" \n\t"
++
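++/* Both expanders broadcast one 16-bit lane across the register:
++ * expand_alpha takes the top lane (the alpha of an unpacked pixel),
++ * expand_alpha_rev the bottom one. A scalar C sketch of the shifts:
++ *
++ *     uint64_t expand_alpha_c (uint64_t p)   // p = 0x00AA00RR00GG00BB
++ *     {
++ *         uint64_t a = p >> 48;              // isolate the alpha lane
++ *         a |= a << 16;
++ *         a |= a << 32;
++ *         return a;                          // 0x00AA00AA00AA00AA
++ *     }
++ */
++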
++#define expand8888(reg1,pos) expand8888_##pos(reg1)
++
++#define expand8888_0(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpcklbh $f8, "#reg1", $f12 \n\t"
++
++#define expand8888_1(reg1) \
++ "xor $f12, $f12, $f12 \n\t" \
++ "punpckhbh $f8, "#reg1", $f12 \n\t"
++
++#define expandx888(reg1,pos) \
++ expand8888(reg1,pos) \
++ DMTC1_IMM($f12, 0x00ff000000000000) \
++ "or $f8, $f8, $f12 \n\t"
++
++#define invert_colors(reg1) \
++ DMTC1_IMM($f12, 0xffff0000ffff0000) \
++ "and $f14, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x000000000000ffff) \
++ "and $f16, "#reg1", $f12 \n\t" \
++ DMTC1_IMM($f12, 0x0000ffff00000000) \
++ "and $f18, "#reg1", $f12 \n\t" \
++ MTC1_IMM($f12, 32) \
++ "dsll $f16, $f16, $f12 \n\t" \
++ "dsrl $f18, $f18, $f12 \n\t" \
++ "or $f14, $f14, $f16 \n\t" \
++ "or $f8, $f14, $f18 \n\t"
++
++#define over(reg1,reg2,reg3) \
++ negate(reg2,$f8) \
++ pix_multiply(reg3, $f8)\
++ pix_add(reg1, $f8)
++
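++/* over is Porter-Duff OVER for premultiplied pixels; per channel, with
++ * the illustrative mul_un8 helper from above:
++ *
++ *     dst = src + mul_un8 (dst, 255 - srca);
++ */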
++
++#define over_rev_non_pre(reg1,reg2) \
++ expand_alpha(reg1,$f0) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f2, $f0, $f12 \n\t" \
++ invert_colors(reg1) \
++ pix_multiply($f8,$f2) \
++ save_to($f2) \
++ over($f2, $f0, reg2)
++
++#define in(reg1,reg2) pix_multiply(reg1,reg2)
++
++#define in_over_full_src_alpha(reg1,reg2,reg3) \
++ DMTC1_IMM($f12,0x00ff000000000000) \
++ "or $f0, "#reg1", $f12 \n\t" \
++ in($f0,reg2) \
++ save_to($f0) \
++ over($f0,reg2,reg3)
++
++#define in_over(reg1,reg2,reg3,reg4) \
++ in(reg1,reg3) \
++ "mov.d $f0, $f8 \n\t" \
++ pix_multiply(reg2,reg3) \
++ "mov.d $f2, $f8 \n\t" \
++ over($f0,$f2,reg4)
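++
++/* in_over first masks both the source and its alpha (IN), then
++ * composites the result OVER the destination; roughly, per channel:
++ *
++ *     tmp  = mul_un8 (src,  mask);
++ *     tmpa = mul_un8 (srca, mask);
++ *     dst  = tmp + mul_un8 (dst, 255 - tmpa);
++ */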
++
++
diff --git a/extra/pygobject/fix-pycairo-capi-declaration.patch
new file mode 100644
index 000000000..1f0364c7a
--- /dev/null
+++ b/extra/pygobject/fix-pycairo-capi-declaration.patch
@@ -0,0 +1,17 @@
+--- pygobject-2.28.4.orig/gi/pygi-foreign-cairo.c	2011-04-18 17:36:47.000000000 +0200
++++ pygobject-2.28.4/gi/pygi-foreign-cairo.c	2011-06-21 20:05:11.015628222 +0200
+@@ -26,12 +26,12 @@
+ 
+ #if PY_VERSION_HEX < 0x03000000
+ #include <pycairo.h>
++static Pycairo_CAPI_t *Pycairo_CAPI;
+ #else
+ #include <pycairo/py3cairo.h>
++#define Pycairo_IMPORT import_cairo()
+ #endif
+ 
+-Pycairo_CAPI_t *Pycairo_CAPI;
+-
+ #include "pygi-foreign.h"
+ 
+ #include <pyglib-python-compat.h>
diff --git a/extra/x264/PKGBUILD
index 178359631..48aa32925 100644
--- a/extra/x264/PKGBUILD
+++ b/extra/x264/PKGBUILD
@@ -18,7 +18,9 @@ md5sums=('7579aff8166a974a1b293cd18b9ead92')
 build() {
   cd "$srcdir/$pkgname-snapshot-$pkgver-2245"
 
-  ./configure --enable-shared
+  [ "$CARCH" = "mips64el" ] && extra="--enable-pic"
+
+  ./configure --enable-shared $extra
 
   make
   make DESTDIR="$pkgdir" \
diff --git a/extra/xulrunner/PKGBUILD
index 8156af39a..1196fd176 100644
--- a/extra/xulrunner/PKGBUILD
+++ b/extra/xulrunner/PKGBUILD
@@ -22,24 +22,24 @@ options=('!emptydirs')
 build() {
   cd "${srcdir}/icecat-${_ffoxver}/"
-  cp "${srcdir}/mozconfig" .mozconfig
+#  cp "${srcdir}/mozconfig" .mozconfig
 
   #fix libdir/sdkdir - fedora
-  patch -Np1 -i "${srcdir}/mozilla-pkgconfig.patch"
+#  patch -Np1 -i "${srcdir}/mozilla-pkgconfig.patch"
 
   #Force installation to the same path for every version
-  patch -Np1 -i "${srcdir}/xulrunner-version.patch"
+#  patch -Np1 -i "${srcdir}/xulrunner-version.patch"
 
   #https://bugzilla.mozilla.org/show_bug.cgi?id=620931
-  patch -Np1 -i "${srcdir}/xulrunner-omnijar.patch"
+#  patch -Np1 -i "${srcdir}/xulrunner-omnijar.patch"
 
   #https://bugzilla.mozilla.org/show_bug.cgi?id=494163
-  patch -Np1 -i "${srcdir}/port_gnomevfs_to_gio.patch"
+#  patch -Np1 -i "${srcdir}/port_gnomevfs_to_gio.patch"
 
-  [[ "$CARCH" == "mips64el" ]] && {
-    echo "ac_add_options --disable-ipc" >> .mozconfig
-    patch -Np0 -i "${srcdir}/mips.patch"
-  }
+#  [[ "$CARCH" == "mips64el" ]] && {
+#    echo "ac_add_options --disable-ipc" >> .mozconfig
+#    patch -Np0 -i "${srcdir}/mips.patch"
+#  }
 
   unset CFLAGS
   unset CXXFLAGS
diff --git a/extra/zziplib/PKGBUILD
index 4783fc5ee..73d728632 100644
--- a/extra/zziplib/PKGBUILD
+++ b/extra/zziplib/PKGBUILD
@@ -7,7 +7,7 @@ pkgname=zziplib
 pkgver=0.13.60
 pkgrel=1
 pkgdesc="A lightweight library that offers the ability to easily extract data from files archived in a single zip file"
-arch=('i686' 'x86_64')
+arch=('i686' 'x86_64' 'mips64el')
 url="http://zziplib.sourceforge.net"
 license=('LGPL' 'MPL')
 depends=('zlib')
diff --git a/libre/ffmpeg-libre/PKGBUILD
index 41c332dca..40bd3a579 100644
--- a/libre/ffmpeg-libre/PKGBUILD
+++ b/libre/ffmpeg-libre/PKGBUILD
@@ -14,8 +14,10 @@ license=('GPL')
 depends=('bzip2' 'lame' 'sdl' 'libvorbis' 'xvidcore' 'zlib' 'x264' 'libtheora' 'opencore-amr' 'alsa-lib' 'libvdpau' 'libxfixes' 'schroedinger' 'libvpx' 'libva' 'openjpeg')
 makedepends=('yasm' 'git')
 #git clone git://git.videolan.org/ffmpeg.git
-source=(ftp://ftp.archlinux.org/other/ffmpeg/ffmpeg-${pkgver}.tar.xz)
-md5sums=('dd682a876a496b9f9ae8afb3b3b70389')
+source=(ftp://ftp.archlinux.org/other/ffmpeg/ffmpeg-${pkgver}.tar.xz
+        ffmpeg-loongson.patch)
+md5sums=('dd682a876a496b9f9ae8afb3b3b70389'
+         'a178dab43d73388543689df4828fb2d2')
 #source=(http://ffmpeg.org/releases//releases/ffmpeg-${pkgver}.tar.bz2)
 provides=("ffmpeg=$pkgver")
 conflicts=('ffmpeg')
diff --git a/libre/icecat/PKGBUILD
index 89f6755fb..b18691b41 100644
--- a/libre/icecat/PKGBUILD
+++ b/libre/icecat/PKGBUILD
@@ -65,6 +65,9 @@ build() {
   msg2 "Starting build..."
 
   cp "${srcdir}/mozconfig" .mozconfig
+
+  [ "$CARCH" = "mips64el" ] && echo "ac_add_options --disable-ipc" >> .mozconfig
+
   unset CFLAGS
   unset CXXFLAGS