summaryrefslogtreecommitdiff
path: root/extra/ffmpeg/ffmpeg-loongson.patch
diff options
context:
space:
mode:
Diffstat (limited to 'extra/ffmpeg/ffmpeg-loongson.patch')
-rw-r--r--extra/ffmpeg/ffmpeg-loongson.patch1794
1 files changed, 1794 insertions, 0 deletions
diff --git a/extra/ffmpeg/ffmpeg-loongson.patch b/extra/ffmpeg/ffmpeg-loongson.patch
new file mode 100644
index 000000000..501eafd15
--- /dev/null
+++ b/extra/ffmpeg/ffmpeg-loongson.patch
@@ -0,0 +1,1794 @@
+diff --git a/configure b/configure
+index 25e8cef..1d6c652 100755
+--- a/configure
++++ b/configure
+@@ -230,6 +230,7 @@ Advanced options (experts only):
+ --disable-armvfp disable ARM VFP optimizations
+ --disable-iwmmxt disable iwmmxt optimizations
+ --disable-mmi disable MMI optimizations
++ --disable-loongson2mmi disable LOONGSON2 Multi-Media Instructions usage
+ --disable-neon disable neon optimizations
+ --disable-vis disable VIS optimizations
+ --disable-yasm disable use of yasm assembler
+@@ -995,6 +996,7 @@ ARCH_EXT_LIST='
+ armvfp
+ iwmmxt
+ mmi
++ loongson2mmi
+ mmx
+ mmx2
+ neon
+@@ -2862,6 +2864,7 @@ if enabled arm; then
+ fi
+ if enabled mips; then
+ echo "MMI enabled ${mmi-no}"
++ echo "LOONGSON2MMI enabled ${loongson2mmi-no}"
+ fi
+ if enabled ppc; then
+ echo "AltiVec enabled ${altivec-no}"
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index add4b10..8244e51 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -1586,6 +1586,8 @@ typedef struct AVCodecContext {
+ #define FF_IDCT_SIMPLENEON 22
+ #define FF_IDCT_SIMPLEALPHA 23
+ #define FF_IDCT_BINK 24
++#define FF_IDCT_LIBMPEG2LOONGSON2 25
++#define FF_IDCT_XVIDLOONGSON2 26
+
+ /**
+ * slice count
+diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c
+index bbfdb6a..dfc3452 100644
+--- a/libavcodec/dsputil.c
++++ b/libavcodec/dsputil.c
+@@ -4525,6 +4525,7 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
+ if (HAVE_MMI) dsputil_init_mmi (c, avctx);
+ if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
+ if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
++ if (HAVE_LOONGSON2MMI) dsputil_init_loongson2(c, avctx);
+
+ for(i=0; i<64; i++){
+ if(!c->put_2tap_qpel_pixels_tab[0][i])
+diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h
+index d1816e6..1a72ae9 100644
+--- a/libavcodec/dsputil.h
++++ b/libavcodec/dsputil.h
+@@ -636,6 +636,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
+ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
+ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
+ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
++void dsputil_init_loongson2(DSPContext* c, AVCodecContext *avctx);
+
+ void ff_dsputil_init_dwt(DSPContext *c);
+ void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
+diff --git a/libavcodec/loongson2/dsputil_loongson2.c b/libavcodec/loongson2/dsputil_loongson2.c
+new file mode 100644
+index 0000000..01bd3ac
+--- /dev/null
++++ b/libavcodec/loongson2/dsputil_loongson2.c
+@@ -0,0 +1,221 @@
++/*
++ * Copyright(C) 2006-2010 comcat <jiankemeng@gmail.com>
++ *
++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com>
++ *
++ */
++
++#include "dsputil_loongson2.h"
++#include "../simple_idct.h"
++#include "../mpegvideo.h"
++
++//extern void ff_idct_xvid_loongson2(short *block);
++
++extern void ff_loongson2_idct(DCTELEM *block);
++extern void ff_idct_xvid_loongson2(short *block);
++
++/**
++ * Add an 8x8 block of 16-bit coefficients to 8-bit pixels, clamping to 0..255.
++ *
++ * @param block     64 coefficients; 16 (two pixel rows) are consumed per pass
++ * @param pixels    top-left destination pixel, updated in place
++ * @param line_size byte distance between consecutive pixel rows
++ */
++static void add_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size)
++{
++    const DCTELEM *p = block;
++    uint8_t *pix = pixels;
++    int i;
++
++    /* Four passes of two pixel rows each. NOTE(review): ldc1/sdc1 presumably need 8-byte aligned rows - confirm. */
++    for (i = 0; i < 4; i++) {
++        __asm __volatile(
++            /* $f14 is the zero operand for the unpacks. Clear it inside the statement that uses and
++               clobbers it, not in a separate asm whose effect the compiler knows nothing about. */
++            "xor $f14, $f14, $f14 \n\t"
++            "ldc1 $f0, 0(%2) \n\t"
++            "ldc1 $f2, 8(%2) \n\t"
++            "ldc1 $f4, 16(%2) \n\t"
++            "ldc1 $f6, 24(%2) \n\t"
++            "ldc1 $f8, %0 \n\t"
++            "ldc1 $f12, %1 \n\t"
++            "mov.d $f10, $f8 \n\t"
++            /* first row: widen the pixel bytes to halfwords and add the coefficients */
++            "punpcklbh $f8, $f8, $f14 \n\t"
++            "punpckhbh $f10, $f10, $f14\n\t"
++            "paddsh $f0, $f0, $f8 \n\t"
++            "paddsh $f2, $f2, $f10 \n\t"
++            /* second row: same steps on the pixels loaded from pix + line_size */
++            "mov.d $f10, $f12 \n\t"
++            "punpcklbh $f12, $f12, $f14\n\t"
++            "punpckhbh $f10, $f10, $f14\n\t"
++            "paddsh $f4, $f4, $f12 \n\t"
++            "paddsh $f6, $f6, $f10 \n\t"
++            /* pack both rows back to bytes and store them */
++            "packushb $f0, $f0, $f2 \n\t"
++            "packushb $f4, $f4, $f6 \n\t"
++            "sdc1 $f0, %0 \n\t"
++            "sdc1 $f4, %1 \n\t"
++            :"+m"(*pix), "+m"(*(pix+line_size))
++            :"r"(p)
++            :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","memory");
++        pix += line_size << 1;
++        p += 16;
++    }
++}
++
++/**
++ * Store an 8x8 block of 16-bit coefficients as 8-bit pixels, clamping to 0..255.
++ *
++ * The block is converted by two identical asm statements, each packing four
++ * rows (32 coefficients) and storing them to four consecutive pixel rows.
++ *
++ * @param block     64 coefficients, 8 per pixel row
++ * @param pixels    top-left destination pixel
++ * @param line_size byte distance between consecutive pixel rows
++ */
++static void put_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *restrict pixels, int line_size)
++{
++    const DCTELEM *p = block;
++    uint8_t *pix = pixels;
++    int tmp = line_size * 3; /* byte offset of the fourth row within each half */
++
++    /* Rows 0-3. NOTE(review): ldc1/sdc1 presumably need 8-byte aligned
++       addresses - confirm against the callers. */
++    __asm __volatile
++    (
++        /* load four rows of eight coefficients */
++        "ldc1 $f0, 0(%3)\n\t"
++        "ldc1 $f2, 8(%3)\n\t"
++        "ldc1 $f4, 16(%3)\n\t"
++        "ldc1 $f6, 24(%3)\n\t"
++        "ldc1 $f8, 32(%3)\n\t"
++        "ldc1 $f10, 40(%3)\n\t"
++        "ldc1 $f16, 48(%3)\n\t"
++        "ldc1 $f18, 56(%3)\n\t"
++
++        "packushb $f0, $f0, $f2\n\t"
++        "packushb $f4, $f4, $f6\n\t"
++        "packushb $f8, $f8, $f10\n\t"
++        "packushb $f16, $f16, $f18\n\t"
++
++        /* $12/$13/$14 = addresses of rows 1/2/3. NOTE(review): 32-bit add - confirm the target ABI has 32-bit pointers. */
++        "add $12, %0, %1\n\t"
++        "add $13, $12, %1\n\t"
++        "add $14, %0, %2\n\t"
++
++        "sdc1 $f0, 0(%0)\n\t"
++        "sdc1 $f4, 0($12)\n\t"
++        "sdc1 $f8, 0($13)\n\t"
++        "sdc1 $f16, 0($14)\n\t"
++        :
++        :"r" (pix), "r" (line_size), "r" (tmp), "r"(p)
++        /* the FP registers written are $f16/$f18 (not integer $16/$18); memory is read and written through pointers */
++        :"$12","$13","$14","$f0","$f2","$f4","$f6","$f8","$f10","$f16","$f18","memory"
++    );
++
++    pix += line_size*4;
++    p += 32;
++
++    /* Rows 4-7: same sequence; p is passed in a register rather than reloaded from memory with a 32-bit lw. */
++    __asm __volatile
++    (
++        "ldc1 $f0, 0(%3)\n\t"
++        "ldc1 $f2, 8(%3)\n\t"
++        "ldc1 $f4, 16(%3)\n\t"
++        "ldc1 $f6, 24(%3)\n\t"
++        "ldc1 $f8, 32(%3)\n\t"
++        "ldc1 $f10, 40(%3)\n\t"
++        "ldc1 $f16, 48(%3)\n\t"
++        "ldc1 $f18, 56(%3)\n\t"
++
++        "packushb $f0, $f0, $f2\n\t"
++        "packushb $f4, $f4, $f6\n\t"
++        "packushb $f8, $f8, $f10\n\t"
++        "packushb $f16, $f16, $f18\n\t"
++
++        "add $12, %0, %1\n\t"
++        "add $13, $12, %1\n\t"
++        "add $14, %0, %2\n\t"
++
++        "sdc1 $f0, 0(%0)\n\t"
++        "sdc1 $f4, 0($12)\n\t"
++        "sdc1 $f8, 0($13)\n\t"
++        "sdc1 $f16, 0($14)\n\t"
++        :
++        :"r" (pix), "r" (line_size), "r" (tmp), "r"(p)
++        :"$12","$13","$14","$f0","$f2","$f4","$f6","$f8","$f10","$f16","$f18","memory"
++    );
++
++}
++
++/*
++void put_signed_pixels_clamped_loongson2(const DCTELEM *block, uint8_t *pixels, int line_size)
++{
++
++}
++
++
++void ff_loongson2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
++{
++ ff_loongson2_idct(block);
++ put_pixels_clamped_loongson2(block, dest, line_size);
++}
++
++void ff_loongson2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
++{
++ ff_loongson2_idct(block);
++ add_pixels_clamped_loongson2(block, dest, line_size);
++}*/
++/* Run ff_idct_xvid_loongson2() on block in place, then store the result to dest via put_pixels_clamped_loongson2(). */
++static void ff_idct_xvid_loongson2_put(uint8_t *dest, int line_size, DCTELEM *block)
++{
++ ff_idct_xvid_loongson2(block);
++ put_pixels_clamped_loongson2(block, dest, line_size);
++}
++/* Run ff_idct_xvid_loongson2() on block in place, then add the result onto dest via add_pixels_clamped_loongson2(). */
++static void ff_idct_xvid_loongson2_add(uint8_t *dest, int line_size, DCTELEM *block)
++{
++ ff_idct_xvid_loongson2(block);
++ add_pixels_clamped_loongson2(block, dest, line_size);
++}
++
++/**
++ * Install the Loongson2 MMI routines into a DSPContext.
++ *
++ * The clamped 8x8 store/add helpers are always installed. The IDCT entry
++ * points are replaced only when decoding at full resolution and the caller
++ * left the choice open (FF_IDCT_AUTO) or asked for FF_IDCT_XVIDLOONGSON2.
++ *
++ * @param c     context whose function pointers are overridden
++ * @param avctx codec context supplying idct_algo and lowres
++ */
++void dsputil_init_loongson2(DSPContext *c, AVCodecContext *avctx)
++{
++    const int idct_algo = avctx->idct_algo;
++
++    /*
++     * This IDCT is a fixed 8x8 transform, so it must not replace the reduced
++     * transforms selected for lowres != 0, nor an algorithm the caller asked
++     * for explicitly; the pointers used to be overridden unconditionally.
++     */
++    if (avctx->lowres == 0 &&
++        (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_XVIDLOONGSON2)) {
++        c->idct_add = ff_idct_xvid_loongson2_add;
++        c->idct_put = ff_idct_xvid_loongson2_put;
++        c->idct     = ff_idct_xvid_loongson2;
++    }
++    /* FF_IDCT_LIBMPEG2LOONGSON2 is not handled: its put/add wrappers are commented out in this file. */
++
++    c->put_pixels_clamped = put_pixels_clamped_loongson2;
++    c->add_pixels_clamped = add_pixels_clamped_loongson2;
++
++    /*
++     * config.h defines CONFIG_ENCODERS as 0 or 1, so #ifdef was always
++     * true; test the value instead.
++     */
++#if CONFIG_ENCODERS
++    dsputil_init_pix_loongson2(c, avctx);
++#endif
++
++}
+diff --git a/libavcodec/loongson2/dsputil_loongson2.d b/libavcodec/loongson2/dsputil_loongson2.d
+new file mode 100644
+index 0000000..808f0a3
+--- /dev/null
++++ b/libavcodec/loongson2/dsputil_loongson2.d
+@@ -0,0 +1,18 @@
++libavcodec/loongson2/dsputil_loongson2.o: \
++ libavcodec/loongson2/dsputil_loongson2.c \
++ libavcodec/loongson2/dsputil_loongson2.h libavcodec/dsputil.h \
++ libavutil/intreadwrite.h config.h libavutil/bswap.h \
++ libavutil/attributes.h libavutil/common.h libavutil/intmath.h \
++ libavutil/mem.h libavutil/internal.h libavutil/timer.h libavutil/libm.h \
++ libavutil/mips/intreadwrite.h libavcodec/avcodec.h libavutil/avutil.h \
++ libavutil/error.h libavutil/avutil.h libavutil/mathematics.h \
++ libavutil/rational.h libavutil/intfloat_readwrite.h libavutil/log.h \
++ libavutil/pixfmt.h libavutil/avconfig.h \
++ libavcodec/loongson2/../simple_idct.h libavcodec/loongson2/../dsputil.h \
++ libavcodec/loongson2/../mpegvideo.h libavcodec/loongson2/../get_bits.h \
++ libavutil/bswap.h libavutil/common.h libavutil/log.h \
++ libavcodec/loongson2/../mathops.h libavcodec/loongson2/../mips/mathops.h \
++ libavcodec/loongson2/../put_bits.h libavcodec/loongson2/../ratecontrol.h \
++ libavcodec/loongson2/../eval.h libavcodec/loongson2/../parser.h \
++ libavcodec/loongson2/../avcodec.h libavcodec/loongson2/../mpeg12data.h \
++ libavutil/rational.h libavcodec/loongson2/../rl.h
+diff --git a/libavcodec/loongson2/dsputil_loongson2.h b/libavcodec/loongson2/dsputil_loongson2.h
+new file mode 100644
+index 0000000..87c7bd9
+--- /dev/null
++++ b/libavcodec/loongson2/dsputil_loongson2.h
+@@ -0,0 +1,3 @@
++#include "libavcodec/dsputil.h"
++
++void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx);
+diff --git a/libavcodec/loongson2/dsputil_loongson2.o b/libavcodec/loongson2/dsputil_loongson2.o
+new file mode 100644
+index 0000000..fca0b55
+Binary files /dev/null and b/libavcodec/loongson2/dsputil_loongson2.o differ
+diff --git a/libavcodec/loongson2/idct_loongson2.c b/libavcodec/loongson2/idct_loongson2.c
+new file mode 100644
+index 0000000..539cab5
+--- /dev/null
++++ b/libavcodec/loongson2/idct_loongson2.c
+@@ -0,0 +1,336 @@
++/*
++ * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
++ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
++ *
++ * Copyright (c) 2007-2010 comcat <jiankemeng@gmail.com>.
++ *
++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com>
++ *
++ * Based on i386
++ */
++
++#include "libavutil/common.h"
++#include "dsputil_loongson2.h"
++
++
++#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
++
++
++#define ROW_SHIFT 11
++#define COL_SHIFT 6
++
++#define round(bias) ((int)(((bias)+0.5) * (1<<ROW_SHIFT)))
++#define rounder(bias) {round (bias), round (bias)}
++
++
++
++#define loongson2_table(c1,c2,c3,c4,c5,c6,c7) { c4,c2,-c4,-c2, \
++ c4,c6,c4,c6, \
++ c1,c3,-c1,-c5,\
++ c5,c7,c3,-c7, \
++ c4,-c6,c4,-c6, \
++ -c4,c2,c4,-c2, \
++ c5,-c1,c3,-c1, \
++ c7,c3,c7,-c5 }
++
++/* Start a row pass: load row[offset..offset+7] and table[0..7] and begin the multiply-adds. No outputs: values are left in FP registers for loongson2_row(). NOTE(review): relies on registers surviving between asm statements. */
++static inline void loongson2_row_head(int16_t * const row, const int offset,
++ const int16_t * const table)
++{
++ __asm__ volatile(
++// ".set\tmips3\n"
++ ".set noreorder\n"
++ "ldc1 $f6,%0\n" /* $f6 = row[offset .. offset+3] */
++ "ldc1 $f14,%1\n" /* $f14 = row[offset+4 .. offset+7] */
++ "ldc1 $f2,%2\n" /* $f2 = table[0..3] */
++ "ldc1 $f8,%3\n" /* $f8 = table[4..7] */
++ "dli $12,%4\n" /* 0x4e: selector for the pshufh below */
++ "dmtc1 $12,$f16\n"
++ "mov.d $f4,$f6\n" /* keep unshuffled copies in $f4 and $f10 */
++ "mov.d $f10,$f14\n"
++ "pmaddhw $f2,$f2,$f4\n"
++ "pshufh $f6,$f6,$f16\n"
++ ".set reorder\n"
++// ".set\tmips0\n"
++ :
++ :"m"(*(row+offset)),"m"(*(row+offset+4)),"m"(*table),"m"(*(table+4)),"i"(0x4e)
++ :"$f2","$f4","$f6","$f8","$f10","$f14","$f16","$12"
++ );
++}
++
++/* Middle of a row pass: multiply-accumulate the register values left by row_head/row_mid with table[8..31], add *rounder, and shift $f2/$f0 by ROW_SHIFT ($f4/$f8 are shifted by row_mid/row_tail). */
++static inline void loongson2_row(const int16_t * const table,
++ const int32_t * const rounder)
++{
++ __asm__ volatile (
++// ".set\tmips3\n"
++ ".set\tnoreorder\n"
++ "ldc1 $f0,%0\n"
++ "pmaddhw $f8,$f8,$f6\n"
++ "ldc1 $f16,%1\n"
++ "dli $13,%8\n"
++ "ldc1 $f20,%2\n"
++ "pmaddhw $f0,$f0,$f14\n"
++ "ldc1 $f22,%3\n"
++ "pmaddhw $f4,$f4,$f16\n"
++ "paddw $f2,$f2,$f22\n"
++ "ldc1 $f22,%4\n"
++ "dmtc1 $13,$f16\n"
++ "paddw $f2,$f2,$f8\n"
++ "pmaddhw $f14,$f14,$f22\n"
++ "mov.d $f8,$f2\n"
++ "pshufh $f10,$f10,$f16\n"
++ "ldc1 $f22,%3\n"
++ "pmaddhw $f20,$f20,$f10\n"
++ "ldc1 $f16,%5\n"
++ "paddw $f4,$f4,$f22\n"
++ "paddw $f0,$f0,$f20\n"
++ "dli $12,%6\n"
++ "pmaddhw $f6,$f6,$f16\n"
++ "psubw $f2,$f2,$f0\n"
++ "ldc1 $f16,%7\n" /* table[28..31]; this operand used to repeat *(table+16), leaving the last table row unused */
++ "paddw $f0,$f0,$f8\n"
++ "paddw $f4,$f4,$f6\n"
++ "pmaddhw $f10,$f10,$f16\n"
++ "mov.d $f8,$f4\n"
++ "dmtc1 $12,$f16\n"
++ "paddw $f14,$f14,$f10\n"
++ "psraw $f2,$f2,$f16\n"
++ "psraw $f0,$f0,$f16\n"
++ "paddw $f4,$f4,$f14\n"
++ "psubw $f8,$f8,$f14\n"
++ ".set\treorder\n"
++// ".set\tmips0\n"
++ :
++ :"m"(*(table+8)),"m"(*(table+16)),"m"(*(table+12)),"m"(*rounder),"m"(*(table+24)),"m"(*(table+20)),"i"(ROW_SHIFT),"m"(*(table+28)),"i"(0x4e)
++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f14","$f16","$f20","$f22","$12","$13","memory"
++ );
++}
++/* Finish the last row pass: shift the two remaining sums by ROW_SHIFT, pack the 32-bit results to 16 bits and store them to row[store..store+7]. Inputs are the FP registers left by loongson2_row(). */
++static inline void loongson2_row_tail(int16_t * const row, const int store)
++{
++ __asm__ volatile (
++// ".set\tmips3\n"
++ ".set\tnoreorder\n"
++ "dli $12,%2\n" /* ROW_SHIFT */
++ "dmtc1 $12,$f16\n"
++ "psraw $f4,$f4,$f16\n"
++ "psraw $f8,$f8,$f16\n"
++ "packsswh $f0,$f0,$f4\n"
++ "packsswh $f8,$f8,$f2\n"
++ "sdc1 $f0,%0\n" /* row[store .. store+3] */
++ "dli $13,%3\n" /* 0xb1: selector for the pshufh below */
++ "dmtc1 $13,$f22\n"
++ "pshufh $f8,$f8,$f22\n"
++ "sdc1 $f8,%1\n" /* row[store+4 .. store+7] */
++ ".set\treorder\n"
++// ".set\tmips0\n"
++ :"=m"(*(row+store)),"=m"(*(row+store+4))
++ :"i"(ROW_SHIFT),"i"(0xb1)
++ :"$f0","$f2","$f4","$f6","$f8","$f16","$f22","$12","$13","memory"
++ );
++}
++/* Finish the current row (shift, pack, store to row[store..store+7]) while starting the next one (load row[offset..offset+7] and table[0..7]), interleaved. Values are passed to loongson2_row() in FP registers. */
++static inline void loongson2_row_mid(int16_t * const row, const int store,
++ const int offset,
++ const int16_t * const table)
++{
++ __asm__ volatile (
++// ".set\tmips3\n"
++ ".set\tnoreorder\n"
++ "ldc1 $f6,%2\n" /* next row: row[offset .. offset+3] */
++ "dli $12,%3\n" /* ROW_SHIFT */
++ "dmtc1 $12,$f16\n"
++ "psraw $f4,$f4,$f16\n"
++ "ldc1 $f14,%4\n" /* next row: row[offset+4 .. offset+7] */
++ "psraw $f8,$f8,$f16\n"
++ "packsswh $f0,$f0,$f4\n"
++ "mov.d $f10,$f14\n"
++ "packsswh $f8,$f8,$f2\n"
++ "mov.d $f4,$f6\n"
++ "sdc1 $f0,%0\n" /* finished row: row[store .. store+3] */
++ "dli $13,%5\n"
++ "dmtc1 $13,$f22\n"
++ "pshufh $f8,$f8,$f22\n"
++ "ldc1 $f2,%6\n"
++ "sdc1 $f8,%1\n" /* finished row: row[store+4 .. store+7] */
++ "pmaddhw $f2,$f2,$f4\n"
++ "ldc1 $f8,%7\n"
++ "dli $12,%8\n"
++ "dmtc1 $12,$f16\n"
++ "pshufh $f6,$f6,$f16\n"
++ ".set\treorder\n"
++// ".set\tmips0\n"
++ :"=m"(*(row+store)),"=m"(*(row+store+4))
++ : "m"(*(row+offset)),"i"(ROW_SHIFT),"m"(*(row+offset+4)),"i"(0xb1),"m"(*table),"m"(*(table+4)),"i"(0x4e)
++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f14","$f16","$f22","$12","$13","memory" /* $f14 is the register written above, not integer $14 */
++ );
++}
++/* Column pass over four adjacent columns: reads col[offset + k*8 .. +3] for k = 0..7, combines them with the T1/T2/T3/C4 constants, shifts by COL_SHIFT and writes the eight rows back in place. */
++static inline void idct_col(int16_t * const col, const int offset)
++{
++#define T1 13036
++#define T2 27146
++#define T3 43790
++#define C4 23170
++ static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
++ static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
++ static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; /* 43790 does not fit a short; NOTE(review): presumably relies on wrapping to -21746 */
++ static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
++
++ __asm__ volatile (
++// ".set\tmips3\n"
++ ".set\tnoreorder\n"
++ "ldc1 $f4,%8\n"
++ "ldc1 $f0,%9\n"
++ "mov.d $f6,$f4\n"
++ "ldc1 $f8,%10\n"
++ "pmulhh $f4,$f4,$f0\n"
++ "ldc1 $f14,%11\n"
++ "pmulhh $f6,$f6,$f8\n"
++ "ldc1 $f10,%12\n"
++ "mov.d $f20,$f14\n"
++ "ldc1 $f2,%13\n"
++ "psubsh $f4,$f4,$f8\n"
++ "ldc1 $f8,%14\n"
++ "pmulhh $f14,$f14,$f2\n"
++ "paddsh $f0,$f0,$f6\n"
++ "pmulhh $f20,$f20,$f10\n"
++ "mov.d $f6,$f8\n"
++ "paddsh $f14,$f14,$f2\n"
++ "ldc1 $f16,%15\n"
++ "pmulhh $f8,$f8,$f16\n"
++ "paddsh $f20,$f20,$f10\n"
++ "psubsh $f14,$f14,$f10\n"
++ "paddsh $f20,$f20,$f2\n"
++ "ldc1 $f2,%16\n"
++ "mov.d $f10,$f4\n"
++ "pmulhh $f6,$f6,$f2\n"
++ "psubsh $f4,$f4,$f14\n"
++ "psubsh $f8,$f8,$f2\n"
++ "paddsh $f14,$f14,$f10\n"
++ "sdc1 $f4,%0\n" /* row 3 slot used as scratch; reloaded through %13 below */
++ "mov.d $f10,$f0\n"
++ "ldc1 $f22,%15\n"
++ "paddsh $f6,$f6,$f22\n"
++ "paddsh $f10,$f10,$f20\n"
++ "psubsh $f0,$f0,$f20\n"
++ "mov.d $f20,$f0\n"
++ "ldc1 $f2,%17\n"
++ "paddsh $f0,$f0,$f14\n"
++ "ldc1 $f4,%18\n"
++ "psubsh $f20,$f20,$f14\n"
++ "sdc1 $f10,%1\n" /* row 5 slot used as scratch; reloaded through %12 below */
++ "pmulhh $f0,$f0,$f4\n"
++ "mov.d $f10,$f8\n"
++ "pmulhh $f20,$f20,$f4\n"
++ "ldc1 $f14,%19\n"
++ "mov.d $f4,$f2\n"
++ "psubsh $f2,$f2,$f14\n"
++ "paddsh $f4,$f4,$f14\n"
++ "paddsh $f8,$f8,$f2\n"
++ "mov.d $f14,$f4\n"
++ "psubsh $f2,$f2,$f10\n"
++ "paddsh $f14,$f14,$f6\n"
++ "paddsh $f0,$f0,$f0\n"
++ "psubsh $f4,$f4,$f6\n"
++ "paddsh $f20,$f20,$f20\n"
++ "mov.d $f6,$f2\n"
++ "mov.d $f10,$f8\n"
++ "paddsh $f2,$f2,$f20\n"
++ "dli $12,%20\n" /* COL_SHIFT */
++ "dmtc1 $12,$f16\n"
++ "psrah $f2,$f2,$f16\n"
++ "paddsh $f8,$f8,$f0\n"
++ "psrah $f8,$f8,$f16\n"
++ "psubsh $f10,$f10,$f0\n"
++ "ldc1 $f0,%12\n"
++ "psubsh $f6,$f6,$f20\n"
++ "psrah $f10,$f10,$f16\n"
++ "mov.d $f20,$f14\n"
++ "sdc1 $f8,%2\n"
++ "psrah $f6,$f6,$f16\n"
++ "sdc1 $f2,%3\n"
++ "paddsh $f14,$f14,$f0\n"
++ "ldc1 $f8,%13\n"
++ "psubsh $f20,$f20,$f0\n"
++ "psrah $f14,$f14,$f16\n"
++ "mov.d $f2,$f4\n"
++ "sdc1 $f6,%1\n"
++ "psubsh $f2,$f2,$f8\n"
++ "psrah $f20,$f20,$f16\n"
++ "paddsh $f8,$f8,$f4\n"
++ "sdc1 $f14,%4\n"
++ "psrah $f2,$f2,$f16\n"
++ "sdc1 $f10,%5\n"
++ "psrah $f8,$f8,$f16\n"
++ "sdc1 $f20,%6\n"
++ "sdc1 $f2,%7\n"
++ "sdc1 $f8,%0\n"
++ ".set\treorder\n"
++// ".set\tmips0\n"
++ :"=m"(*(col+offset+3*8)),"=m"(*(col+offset+5*8)),"=m"(*(col+offset+1*8)),"=m"(*(col+offset+2*8)),"=m"(*(col+offset+0*8)),"=m"(*(col+offset+6*8)),"=m"(*(col+offset+7*8)),"=m"(*(col+offset+4*8))
++ :"m"(*_T1),"m"(*(col+offset+1*8)),"m"(*(col+offset+7*8)),"m"(*_T3),"m"(*(col+offset+5*8)),"m"(*(col+offset+3*8)),"m"(*_T2),"m"(*(col+offset+2*8)),"m"(*(col+offset+6*8)),"m"(*(col+offset+0*8)),"m"(*_C4),"m"(*(col+offset+4*8)),"i"(COL_SHIFT)
++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f14","$f16","$f20","$f22","$12","memory" /* $f14/$f20 are the registers written above, not integer $14/$20 */
++ );
++}
++
++static const int32_t rounder0[] ATTR_ALIGN(8) =
++ rounder ((1 << (COL_SHIFT - 1)) - 0.5);
++static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
++static const int32_t rounder1[] ATTR_ALIGN(8) =
++ rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */
++static const int32_t rounder7[] ATTR_ALIGN(8) =
++ rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */
++static const int32_t rounder2[] ATTR_ALIGN(8) =
++ rounder (0.60355339059); /* C2 * (C6+C2)/2 */
++static const int32_t rounder6[] ATTR_ALIGN(8) =
++ rounder (-0.25); /* C2 * (C6-C2)/2 */
++static const int32_t rounder3[] ATTR_ALIGN(8) =
++ rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */
++static const int32_t rounder5[] ATTR_ALIGN(8) =
++ rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */
++
++
++#undef COL_SHIFT
++#undef ROW_SHIFT
++
++/* Define an IDCT entry point: eight row passes chained head/row/mid/.../tail (rows visited in the order 0,4,1,7,2,6,3,5, two rows per coefficient table), then two column passes of four columns each. */
++#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
++inline void idct (int16_t * const block) \
++{ \
++ static const int16_t table04[] ATTR_ALIGN(16) = \
++ table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
++ static const int16_t table17[] ATTR_ALIGN(16) = \
++ table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
++ static const int16_t table26[] ATTR_ALIGN(16) = \
++ table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
++ static const int16_t table35[] ATTR_ALIGN(16) = \
++ table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
++ \
++ idct_row_head (block, 0*8, table04); \
++ idct_row (table04, rounder0); \
++ idct_row_mid (block, 0*8, 4*8, table04); \
++ idct_row (table04, rounder4); \
++ idct_row_mid (block, 4*8, 1*8, table17); \
++ idct_row (table17, rounder1); \
++ idct_row_mid (block, 1*8, 7*8, table17); \
++ idct_row (table17, rounder7); \
++ idct_row_mid (block, 7*8, 2*8, table26); \
++ idct_row (table26, rounder2); \
++ idct_row_mid (block, 2*8, 6*8, table26); \
++ idct_row (table26, rounder6); \
++ idct_row_mid (block, 6*8, 3*8, table35); \
++ idct_row (table35, rounder3); \
++ idct_row_mid (block, 3*8, 5*8, table35); \
++ idct_row (table35, rounder5); \
++ idct_row_tail (block, 5*8); \
++ \
++ idct_col (block, 0); \
++ idct_col (block, 4); \
++}
++
++void ff_loongson2_idct(DCTELEM *block);
++
++declare_idct (ff_loongson2_idct, loongson2_table,
++ loongson2_row_head, loongson2_row, loongson2_row_tail, loongson2_row_mid)
+diff --git a/libavcodec/loongson2/idct_loongson2_xvid.c b/libavcodec/loongson2/idct_loongson2_xvid.c
+new file mode 100644
+index 0000000..4a1ee1e
+--- /dev/null
++++ b/libavcodec/loongson2/idct_loongson2_xvid.c
+@@ -0,0 +1,301 @@
++/*
++ * XVID MPEG-4 VIDEO CODEC
++ *
++ * Copyright(C) 2006-2010 comcat <jiankemeng@gmail.com>
++ *
++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com>
++ *
++ * Based on i386
++ *
++ */
++
++
++#include <inttypes.h>
++#include "../avcodec.h"
++
++void ff_idct_xvid_loongson2(short *block);
++
++//=============================================================================
++// Macros and other preprocessor constants
++//=============================================================================
++
++#define BITS_INV_ACC 5 // 4 or 5 for IEEE
++#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
++#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
++#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
++#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
++#define RND_INV_CORR (RND_INV_COL - 1)
++
++#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
++#define SHIFT_FRW_COL BITS_FRW_ACC
++#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
++#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
++
++
++//-----------------------------------------------------------------------------
++// Various memory constants (trigonometric values or rounding values)
++//-----------------------------------------------------------------------------
++
++static const int16_t tg_1_16[4*4] attribute_used __attribute__ ((aligned(8))) = {
++ 13036,13036,13036,13036, // tg_1_16: tan(1*pi/16) * (1<<16), rounded
++ 27146,27146,27146,27146, // tg_2_16: tan(2*pi/16) * (1<<16), rounded
++ -21746,-21746,-21746,-21746, // tg_3_16 - 1: (tan(3*pi/16) - 1) * (1<<16), rounded
++ 23170,23170,23170,23170}; // ocos_4_16: cos(4*pi/16) * (1<<15)
++/* Per-row rounding terms: the pair at byte offset 8*k is added to the sums of row k before the right shift by 11 in DCT_8_INV_ROW_LOONGSON2. */
++static const int32_t rounder_0[2*8] attribute_used __attribute__ ((aligned(8))) = {
++ 65536,65536,
++ 3597,3597,
++ 2260,2260,
++ 1203,1203,
++ 0,0,
++ 120,120,
++ 512,512,
++ 512,512};
++
++
++// Table for rows 0,4 - constants are multiplied by cos_4_16
++static const int16_t tab_i_04_mmx[32*4] attribute_used __attribute__ ((aligned(8))) = {
++ 16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
++ 21407,8867,8867,-21407, // w07 w05 w03 w01
++ 16384,-16384,16384,16384, // w14 w12 w10 w08
++ -8867,21407,-21407,-8867, // w15 w13 w11 w09
++ 22725,12873,19266,-22725, // w22 w20 w18 w16
++ 19266,4520,-4520,-12873, // w23 w21 w19 w17
++ 12873,4520,4520,19266, // w30 w28 w26 w24
++ -22725,19266,-12873,-22725, // w31 w29 w27 w25
++// Table for rows 1,7 - constants are multiplied by cos_1_16
++ 22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
++ 29692,12299,12299,-29692, // w07 w05 w03 w01
++ 22725,-22725,22725,22725, // w14 w12 w10 w08
++ -12299,29692,-29692,-12299, // w15 w13 w11 w09
++ 31521,17855,26722,-31521, // w22 w20 w18 w16
++ 26722,6270,-6270,-17855, // w23 w21 w19 w17
++ 17855,6270,6270,26722, // w30 w28 w26 w24
++ -31521,26722,-17855,-31521, // w31 w29 w27 w25
++// Table for rows 2,6 - constants are multiplied by cos_2_16
++ 21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
++ 27969,11585,11585,-27969, // w07 w05 w03 w01
++ 21407,-21407,21407,21407, // w14 w12 w10 w08
++ -11585,27969,-27969,-11585, // w15 w13 w11 w09
++ 29692,16819,25172,-29692, // w22 w20 w18 w16
++ 25172,5906,-5906,-16819, // w23 w21 w19 w17
++ 16819,5906,5906,25172, // w30 w28 w26 w24
++ -29692,25172,-16819,-29692, // w31 w29 w27 w25
++// Table for rows 3,5 - constants are multiplied by cos_3_16
++ 19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
++ 25172,10426,10426,-25172, // w07 w05 w03 w01
++ 19266,-19266,19266,19266, // w14 w12 w10 w08
++ -10426,25172,-25172,-10426, // w15 w13 w11 w09
++ 26722,15137,22654,-26722, // w22 w20 w18 w16
++ 22654,5315,-5315,-15137, // w23 w21 w19 w17
++ 15137,5315,5315,22654, // w30 w28 w26 w24
++ -26722,22654,-15137,-26722, // w31 w29 w27 w25
++};
++
++
++// %3 for rows 0,4 - constants are multiplied by cos_4_16
++static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8))) = {
++ 16384,21407,16384,8867, // movq-> w05 w04 w01 w00
++ 16384,8867,-16384,-21407, // w07 w06 w03 w02
++ 16384,-8867,16384,-21407, // w13 w12 w09 w08
++ -16384,21407,16384,-8867, // w15 w14 w11 w10
++ 22725,19266,19266,-4520, // w21 w20 w17 w16
++ 12873,4520,-22725,-12873, // w23 w22 w19 w18
++ 12873,-22725,4520,-12873, // w29 w28 w25 w24
++ 4520,19266,19266,-22725, // w31 w30 w27 w26
++// %3 for rows 1,7 - constants are multiplied by cos_1_16
++ 22725,29692,22725,12299, // movq-> w05 w04 w01 w00
++ 22725,12299,-22725,-29692, // w07 w06 w03 w02
++ 22725,-12299,22725,-29692, // w13 w12 w09 w08
++ -22725,29692,22725,-12299, // w15 w14 w11 w10
++ 31521,26722,26722,-6270, // w21 w20 w17 w16
++ 17855,6270,-31521,-17855, // w23 w22 w19 w18
++ 17855,-31521,6270,-17855, // w29 w28 w25 w24
++ 6270,26722,26722,-31521, // w31 w30 w27 w26
++// %3 for rows 2,6 - constants are multiplied by cos_2_16
++ 21407,27969,21407,11585, // movq-> w05 w04 w01 w00
++ 21407,11585,-21407,-27969, // w07 w06 w03 w02
++ 21407,-11585,21407,-27969, // w13 w12 w09 w08
++ -21407,27969,21407,-11585, // w15 w14 w11 w10
++ 29692,25172,25172,-5906, // w21 w20 w17 w16
++ 16819,5906,-29692,-16819, // w23 w22 w19 w18
++ 16819,-29692,5906,-16819, // w29 w28 w25 w24
++ 5906,25172,25172,-29692, // w31 w30 w27 w26
++// %3 for rows 3,5 - constants are multiplied by cos_3_16
++ 19266,25172,19266,10426, // movq-> w05 w04 w01 w00
++ 19266,10426,-19266,-25172, // w07 w06 w03 w02
++ 19266,-10426,19266,-25172, // w13 w12 w09 w08
++ -19266,25172,19266,-10426, // w15 w14 w11 w10
++ 26722,22654,22654,-5315, // w21 w20 w17 w16
++ 15137,5315,-26722,-15137, // w23 w22 w19 w18
++ 15137,-26722,5315,-15137, // w29 w28 w25 w24
++ 5315,22654,22654,-26722, // w31 w30 w27 w26
++};
++
++
++
++#define DCT_8_INV_ROW_LOONGSON2(A1,A2,A3,A4)\
++ "ldc1 $f0, " #A1 " \n\t"/* 0 ; x3 x2 x1 x0*/\
++ "ldc1 $f2, 8+" #A1 " \n\t"/* 1 ; x7 x6 x5 x4*/\
++ "mov.d $f4, $f0 \n\t"/* 2 ; x3 x2 x1 x0*/\
++ "ldc1 $f6, " #A3 " \n\t"/* 3 ; w05 w04 w01 w00*/\
++ "li $12, 0x88 \n\t"\
++ "dmtc1 $12, $f16 \n\t"\
++ "pshufh $f0, $f0, $f16 \n\t"/* x2 x0 x2 x0*/\
++ "ldc1 $f8, 8+" #A3 " \n\t"/* 4 ; w07 w06 w03 w02*/\
++ "mov.d $f10, $f2 \n\t"/* 5 ; x7 x6 x5 x4*/\
++ "pmaddhw $f6, $f6, $f0 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
++ "ldc1 $f12, 32+" #A3 " \n\t"/* 6 ; w21 w20 w17 w16*/\
++ "pshufh $f2, $f2, $f16 \n\t"/* x6 x4 x6 x4*/\
++ "pmaddhw $f8, $f8, $f2 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
++ "li $12, 0xdd \n\t"\
++ "dmtc1 $12, $f16 \n\t"\
++ "ldc1 $f14, 40+" #A3 " \n\t"/* 7 ; w23 w22 w19 w18*/\
++ "pshufh $f4, $f4, $f16 \n\t"/* x3 x1 x3 x1*/\
++ "pmaddhw $f12, $f12, $f4 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
++ "ldc1 $f18, " #A4 " \n\t" \
++ "ldc1 $f20, 16+" #A3 " \n\t" \
++ "ldc1 $f22, 24+" #A3 " \n\t" \
++ "ldc1 $f24, 48+" #A3 " \n\t" \
++ "ldc1 $f26, 56+" #A3 " \n\t" \
++ "pshufh $f10, $f10, $f16 \n\t"/* x7 x5 x7 x5*/\
++ "pmaddhw $f14, $f14, $f10 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
++ "paddw $f6, $f6, $f18 \n\t"/* +%4*/\
++ "pmaddhw $f0, $f0, $f20 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
++ "paddw $f6, $f6, $f8 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
++ "pmaddhw $f2, $f2, $f22 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
++ "mov.d $f8, $f6 \n\t"/* 4 ; a1 a0*/\
++ "li $12, 11 \n\t"\
++ "dmtc1 $12, $f16 \n\t"\
++ "pmaddhw $f4, $f4, $f24 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
++ "paddw $f12, $f12, $f14 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
++ "pmaddhw $f10, $f10, $f26 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
++ "paddw $f6, $f6, $f12 \n\t"/* a1+b1 a0+b0*/\
++ "paddw $f0, $f0, $f18 \n\t"/* +%4*/\
++ "psraw $f6, $f6, $f16 \n\t"/* y1=a1+b1 y0=a0+b0*/\
++ "paddw $f0, $f0, $f2 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
++ "psubw $f8, $f8, $f12 \n\t"/* 6 ; a1-b1 a0-b0*/\
++ "mov.d $f14, $f0 \n\t"/* 7 ; a3 a2*/\
++ "paddw $f4, $f4, $f10 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
++ "paddw $f0, $f0, $f4 \n\t"/* a3+b3 a2+b2*/\
++ "psraw $f8, $f8, $f16 \n\t"/* y6=a1-b1 y7=a0-b0*/\
++ "psubw $f14, $f14, $f4 \n\t"/* 2 ; a3-b3 a2-b2*/\
++ "psraw $f0, $f0, $f16 \n\t"/* y3=a3+b3 y2=a2+b2*/\
++ "psraw $f14, $f14, $f16 \n\t"/* y4=a3-b3 y5=a2-b2*/\
++ "li $12, 0xb1 \n\t"\
++ "dmtc1 $12, $f20 \n\t"\
++ "packsswh $f6, $f6, $f0 \n\t"/* 0 ; y3 y2 y1 y0*/\
++ "packsswh $f14, $f14, $f8 \n\t"/* 4 ; y6 y7 y4 y5*/\
++ "sdc1 $f6, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
++ "pshufh $f14, $f14, $f20 \n\t"/* y7 y6 y5 y4*/\
++ "sdc1 $f14, 8 +" #A2 " \n\t"/* 7 ; save y7 y6 y5 y4*/\
++
++
++#define DCT_8_INV_COL(A1,A2)\
++ "ldc1 $f0, 2*8(%3) \n\t"/* */\
++ "ldc1 $f6, 16*3+" #A1 " \n\t"/* x3 */\
++ "mov.d $f2, $f0 \n\t"/* tg_3_16*/\
++ "ldc1 $f10, 16*5+" #A1 " \n\t"/* x5 */\
++ "pmulhh $f0, $f0, $f6 \n\t"/* x3*(tg_3_16-1)*/\
++ "ldc1 $f8, (%3) \n\t"\
++ "pmulhh $f2, $f2, $f10 \n\t"/* x5*(tg_3_16-1)*/\
++ "ldc1 $f14, 16*7+" #A1 " \n\t"/* x7 */\
++ "mov.d $f4, $f8 \n\t"/* tg_1_16*/\
++ "ldc1 $f12, 16*1+" #A1 " \n\t"/* x1 */\
++ "pmulhh $f8, $f8, $f14 \n\t"/* x7*tg_1_16*/\
++ "paddsh $f0, $f0, $f6 \n\t"/* x3*tg_3_16*/\
++ "pmulhh $f4, $f4, $f12 \n\t"/* x1*tg_1_16*/\
++ "paddsh $f2, $f2, $f6 \n\t"/* x3+x5*(tg_3_16-1)*/\
++ "psubsh $f0, $f0, $f10 \n\t"/* x3*tg_3_16-x5 = tm35*/\
++ "ldc1 $f6, 3*8(%3) \n\t"\
++ "paddsh $f2, $f2, $f10 \n\t"/* x3+x5*tg_3_16 = tp35*/\
++ "paddsh $f8, $f8, $f12 \n\t"/* x1+tg_1_16*x7 = tp17*/\
++ "psubsh $f4, $f4, $f14 \n\t"/* x1*tg_1_16-x7 = tm17*/\
++ "mov.d $f10, $f8 \n\t"/* tp17*/\
++ "mov.d $f12, $f4 \n\t"/* tm17*/\
++ "paddsh $f10, $f10, $f2 \n\t"/* tp17+tp35 = b0*/\
++ "psubsh $f12, $f12, $f0 \n\t"/* tm17-tm35 = b3*/\
++ "psubsh $f8, $f8, $f2 \n\t"/* tp17-tp35 = t1*/\
++ "paddsh $f4, $f4, $f0 \n\t"/* tm17+tm35 = t2*/\
++ "ldc1 $f14, 1*8(%3) \n\t"\
++ "mov.d $f2, $f8 \n\t"/* t1*/\
++ "sdc1 $f10, 3*16+" #A2 " \n\t"/* save b0*/\
++ "paddsh $f2, $f2, $f4 \n\t"/* t1+t2*/\
++ "sdc1 $f12, 5*16+" #A2 " \n\t"/* save b3*/\
++ "psubsh $f8, $f8, $f4 \n\t"/* t1-t2*/\
++ "ldc1 $f10, 2*16+" #A1 " \n\t"\
++ "mov.d $f0, $f14 \n\t"/* tg_2_16*/\
++ "ldc1 $f12, 6*16+" #A1 " \n\t"\
++ "pmulhh $f0, $f0, $f10 \n\t"/* x2*tg_2_16*/\
++ "pmulhh $f14, $f14, $f12 \n\t"/* x6*tg_2_16*/\
++ "pmulhh $f2, $f2, $f6 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
++ "ldc1 $f4, 0*16+" #A1 " \n\t"\
++ "pmulhh $f8, $f8, $f6 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
++ "psubsh $f0, $f0, $f12 \n\t"/* t2*tg_2_16-x6 = tm26*/\
++ "mov.d $f6, $f4 \n\t"/* x0*/\
++ "ldc1 $f12, 4*16+" #A1 " \n\t"\
++ "paddsh $f14, $f14, $f10 \n\t"/* x2+x6*tg_2_16 = tp26*/\
++ "paddsh $f4, $f4, $f12 \n\t"/* x0+x4 = tp04*/\
++ "psubsh $f6, $f6, $f12 \n\t"/* x0-x4 = tm04*/\
++ "mov.d $f10, $f4 \n\t"/* tp04*/\
++ "mov.d $f12, $f6 \n\t"/* tm04*/\
++ "psubsh $f4, $f4, $f14 \n\t"/* tp04-tp26 = a3*/\
++ "paddsh $f6, $f6, $f0 \n\t"/* tm04+tm26 = a1*/\
++ "paddsh $f2, $f2, $f2 \n\t"/* b1*/\
++ "paddsh $f8, $f8, $f8 \n\t"/* b2*/\
++ "paddsh $f10, $f10, $f14 \n\t"/* tp04+tp26 = a0*/\
++ "psubsh $f12, $f12, $f0 \n\t"/* tm04-tm26 = a2*/\
++ "li $12, 6 \n\t"\
++ "dmtc1 $12, $f18 \n\t"\
++ "mov.d $f14, $f6 \n\t"/* a1*/\
++ "mov.d $f0, $f12 \n\t"/* a2*/\
++ "paddsh $f6, $f6, $f2 \n\t"/* a1+b1*/\
++ "paddsh $f12, $f12, $f8 \n\t"/* a2+b2*/\
++ "psrah $f6, $f6, $f18 \n\t"/* dst1*/\
++ "psubsh $f14, $f14, $f2 \n\t"/* a1-b1*/\
++ "psrah $f12, $f12, $f18 \n\t"/* dst2*/\
++ "psubsh $f0, $f0, $f8 \n\t"/* a2-b2*/\
++ "ldc1 $f2, 3*16+" #A2 " \n\t"/* load b0*/\
++ "psrah $f14, $f14, $f18 \n\t"/* dst6*/\
++ "mov.d $f8, $f10 \n\t"/* a0*/\
++ "psrah $f0, $f0, $f18 \n\t"/* dst5*/\
++ "sdc1 $f6, 1*16+" #A2 " \n\t"\
++ "paddsh $f10, $f10, $f2 \n\t"/* a0+b0*/\
++ "sdc1 $f12, 2*16+" #A2 " \n\t"\
++ "psubsh $f8, $f8, $f2 \n\t"/* a0-b0*/\
++ "ldc1 $f6, 5*16+" #A2 " \n\t"/* load b3*/\
++ "psrah $f10, $f10, $f18 \n\t"/* dst0*/\
++ "mov.d $f12, $f4 \n\t"/* a3*/\
++ "psrah $f8, $f8, $f18 \n\t"/* dst7*/\
++ "sdc1 $f0, 5*16+" #A2 " \n\t"\
++ "paddsh $f4, $f4, $f6 \n\t"/* a3+b3*/\
++ "sdc1 $f14, 6*16+" #A2 " \n\t"\
++ "psubsh $f12, $f12, $f6 \n\t"/* a3-b3*/\
++ "sdc1 $f10, 0*16+" #A2 " \n\t"\
++ "psrah $f4, $f4, $f18 \n\t"/* dst3*/\
++ "sdc1 $f8, 7*16+" #A2 " \n\t"\
++ "psrah $f12, $f12, $f18 \n\t"/* dst4*/\
++ "sdc1 $f4, 3*16+" #A2 " \n\t"\
++ "sdc1 $f12, 4*16+" #A2 " \n\t"
++
++
++
++void ff_idct_xvid_loongson2(short *block){
++ __asm__ volatile(
++ /* Row pass: every 16-byte row of block is transformed in place (input and output operands are the same offset from %0). */
++ DCT_8_INV_ROW_LOONGSON2(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
++ DCT_8_INV_ROW_LOONGSON2(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
++ DCT_8_INV_ROW_LOONGSON2(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
++ DCT_8_INV_ROW_LOONGSON2(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
++ DCT_8_INV_ROW_LOONGSON2(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
++ DCT_8_INV_ROW_LOONGSON2(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
++ DCT_8_INV_ROW_LOONGSON2(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
++ DCT_8_INV_ROW_LOONGSON2(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
++ /* Column pass, four columns (8 bytes) per macro expansion, also in place. */
++ /* Clobbers: DCT_8_INV_COL loads $12 with "li" and stores through %0 with sdc1, hence "$12" and "memory". NOTE(review): "$20".."$26" name GPRs; confirm whether "$f20".."$f26" was intended. */
++ DCT_8_INV_COL(0(%0), 0(%0))
++ DCT_8_INV_COL(8(%0), 8(%0))
++ :
++ : "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)
++ :"$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f18","$f16","$20","$22","$24","$26","$12","memory");
++}
++
+diff --git a/libavcodec/loongson2/motion_est_loongson2.c b/libavcodec/loongson2/motion_est_loongson2.c
+new file mode 100644
+index 0000000..bb67290
+--- /dev/null
++++ b/libavcodec/loongson2/motion_est_loongson2.c
+@@ -0,0 +1,365 @@
++/*
++ * Loongson2E MMI optimized motion estimation
++ * Copyright (c) 2007 comcat <jiankemeng@gmail.com>.
++ *
++ * based on Michael Niedermayer <michaelni@gmx.at>
++ *
++ */
++
++#include "dsputil_loongson2.h"
++#include "../avcodec.h"
++
++static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ /* entry i holds the value i in each of its four 16-bit lanes */
++ 0x0000000000000000ULL,
++ 0x0001000100010001ULL,
++ 0x0002000200020002ULL,
++};
++
++static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; /* 0x01 in every byte; loaded through MANGLE(bone) by sad8_4_loongson2 */
++
++static inline void sad8_1_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
++{
++ long len= -(stride*h); /* negative byte offset of the first row; the pointer operands below are biased by -len */
++ __asm__ volatile(
++ /* Adds the SAD of an 8-byte-wide block (blk1 vs blk2) to the running total in $f12, two rows per loop pass. */
++// ".set mips3 \n\t"
++ ".align 4 \n\t"
++
++ "move $8, %0 \n\t"
++ "move $21, %1 \n\t"
++ "move $22, %2 \n\t"
++ "move $23, %3 \n\t"
++
++ "1: \n\t"
++
++ "add $9, $8, $21 \n\t"
++ "add $10, $8, $22 \n\t"
++
++ "uld $11, ($9) \n\t"
++ "dmtc1 $11, $f0 \n\t"
++
++ "uld $12, ($10) \n\t" /* blk2 row; this used to reload ($9), comparing blk1 with itself */
++ "dmtc1 $12, $f4 \n\t"
++
++ "pasubub $f10, $f0, $f4 \n\t"
++ "biadd $f0, $f10 \n\t"
++
++ "add $8, $8, $23 \n\t"
++
++ "add $9, $8, $21 \n\t"
++ "add $10, $8, $22 \n\t"
++
++ "uld $11, ($9) \n\t"
++ "dmtc1 $11, $f2 \n\t"
++
++ "uld $12, ($10) \n\t"
++ "dmtc1 $12, $f6 \n\t"
++
++ "pasubub $f16, $f2, $f6 \n\t"
++ "biadd $f6, $f16 \n\t"
++
++ "paddh $f0, $f0, $f6 \n\t"
++
++ "paddh $f12, $f12, $f0 \n\t"
++ /* NOTE(review): the add after bltz advances every pass only if it occupies the branch delay slot - confirm the assembler mode. */
++ "bltz $8, 1b \n\t"
++ "add $8, $8, $23 \n\t"
++
++ : "+r" (len)
++ : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
++ : "$8", "$9", "$10", "$11", "$12", "$21", "$22", "$23", "$f0", "$f2", "$f4", "$f6", "$f10", "$f12", "$f16"
++ );
++}
++
++static inline void sad8_2_loongson2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
++{
++ long len= -(stride*h); /* negative byte offset of the first row; the pointer operands below are biased by -len */
++ __asm__ volatile(
++ /* Adds to $f12 the SAD between pavgb(blk1a, blk1b) and blk2 for an 8-byte-wide block, two rows per loop pass. */
++// ".set mips3 \n\t"
++ ".align 4 \n\t"
++
++ "move $8, %0 \n\t"
++
++ "1: \n\t"
++ "add $9, $8, %1 \n\t"
++ "add $10, $8, %2 \n\t"
++ "add $11, $8, %3 \n\t"
++
++ "uld $12, ($9) \n\t"
++ "dmtc1 $12, $f0 \n\t"
++ "uld $13, ($10) \n\t"
++ "dmtc1 $13, $f4 \n\t"
++
++ "pavgb $f0, $f0, $f4 \n\t"
++
++ "uld $12, ($11) \n\t"
++ "dmtc1 $12, $f4 \n\t"
++
++ "pasubub $f10, $f0, $f4 \n\t"
++ "biadd $f0, $f10 \n\t"
++
++ "add $8, $8, %4 \n\t"
++
++ "add $9, $8, %1 \n\t"
++ "add $10, $8, %2 \n\t"
++ "add $11, $8, %3 \n\t"
++
++ "uld $12, ($9) \n\t"
++ "dmtc1 $12, $f2 \n\t"
++ "uld $13, ($10) \n\t"
++ "dmtc1 $13, $f6 \n\t"
++
++ "pavgb $f6, $f6, $f2 \n\t"
++
++ "uld $12, ($11) \n\t"
++ "dmtc1 $12, $f2 \n\t"
++
++ "pasubub $f16, $f6, $f2 \n\t"
++ "biadd $f6, $f16 \n\t"
++
++ "paddh $f0, $f0, $f6 \n\t"
++ "paddh $f12, $f12, $f0 \n\t"
++ /* $f12 is written above, so it is listed as a clobber below. */
++ "bltz $8, 1b \n\t"
++ "add $8, $8, %4 \n\t"
++ : "+r" (len)
++ : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f10", "$f12", "$f16"
++ );
++}
++
++static inline void sad8_4_loongson2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
++{
++ long len= -(stride*h); /* negative byte offset of the first row; the pointer operands below are biased by -len */
++ __asm__ volatile(
++ /* Adds to $f12 the SAD between blk2 and a four-tap average of blk1 (rows y and y+1, byte offsets 0 and 1), */
++ /* built from nested pavgb steps with "bone" subtracted in between; two rows are handled per loop pass. */
++// ".set mips3 \n\t"
++ ".align 4 \n\t"
++
++ "ldc1 $f10, "MANGLE(bone)" \n\t"
++
++ "move $8, %0 \n\t"
++
++ "1: \n\t"
++ "add $9, $8, %1 \n\t"
++ "add $10, $8, %2 \n\t"
++ "add $11, $8, %3 \n\t"
++
++ "uld $12, ($9) \n\t"
++ "dmtc1 $12, $f0 \n\t"
++
++ "uld $13, ($10) \n\t"
++ "dmtc1 $13, $f4 \n\t"
++
++ "uld $12, 1($9) \n\t"
++ "dmtc1 $12, $f2 \n\t"
++
++ "uld $13, 1($10) \n\t"
++ "dmtc1 $13, $f6 \n\t"
++
++ "pavgb $f0, $f0, $f4 \n\t"
++ "pavgb $f6, $f6, $f2 \n\t"
++
++ "psubusb $f6, $f6, $f10 \n\t"
++ "pavgb $f0, $f0, $f6 \n\t"
++
++ "uld $13, ($11) \n\t" /* blk2 is read at offset 0, as in the second row; this used to be 1($11) */
++ "dmtc1 $13, $f4 \n\t"
++
++ "pasubub $f16, $f0, $f4 \n\t"
++ "biadd $f0, $f16 \n\t"
++
++ "add $8, $8, %4 \n\t"
++
++ "add $9, $8, %1 \n\t"
++ "add $10, $8, %2 \n\t"
++ "add $11, $8, %3 \n\t"
++
++ "uld $12, ($9) \n\t"
++ "dmtc1 $12, $f2 \n\t"
++ "uld $13, ($10) \n\t"
++ "dmtc1 $13, $f6 \n\t" /* move the value just loaded into $13 (was $12, which dropped the next-row sample) */
++ "uld $12, 1($9) \n\t"
++ "dmtc1 $12, $f4 \n\t"
++ "uld $13, 1($10) \n\t"
++ "dmtc1 $13, $f8 \n\t" /* move the value just loaded into $13 (was $12) */
++
++ "pavgb $f2, $f2, $f6 \n\t"
++ "pavgb $f4, $f4, $f8 \n\t"
++
++ "psubusb $f4, $f4, $f10 \n\t"
++ "pavgb $f4, $f4, $f2 \n\t"
++
++ "uld $13, ($11) \n\t"
++ "dmtc1 $13, $f2 \n\t"
++
++ "pasubub $f18, $f4, $f2 \n\t"
++ "biadd $f4, $f18 \n\t"
++
++ "paddh $f0, $f0, $f4 \n\t"
++ "paddh $f12, $f12, $f0 \n\t"
++ /* $f12 is written above, so it is listed as a clobber below. */
++ "bltz $8, 1b \n\t"
++ "add $8, $8, %4 \n\t"
++ : "+r" (len)
++ : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f16", "$f18"
++ );
++}
++
++static inline int sum_loongson2(void)
++{
++ int ret; /* receives the contents of $f12 */
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ /* Reads back $f12, the register the sad8_*_loongson2 helpers accumulate into; it is not an operand, so the compiler does not track it. */
++ "dmfc1 %0, $f12 \n\t"
++ : "=r" (ret)
++ );
++ return ret;
++}
++
++
++static int sad8_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ assert(h==8);
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ ::: "$f12", "$f14"
++ );
++ /* $f12 was zeroed above; sad8_1_loongson2 adds the 8x8 SAD into it and sum_loongson2 reads it back. */
++ sad8_1_loongson2(blk1, blk2, stride, 8);
++
++ return sum_loongson2();
++}
++
++static int sad8_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ assert(h==8);
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ /* NOTE(review): sad8_2_loongson2 overwrites $f10 before reading it, so this load looks unused - confirm. */
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[1]) : "$f10", "$f12", "$f14"
++ );
++ /* 8x8 SAD of blk2 against the average of blk1 and blk1+1; the total is accumulated in $f12. */
++ sad8_2_loongson2(blk1, blk1+1, blk2, stride, 8);
++
++ return sum_loongson2();
++}
++
++static int sad8_y2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ assert(h==8);
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ /* NOTE(review): sad8_2_loongson2 overwrites $f10 before reading it, so this load looks unused - confirm. */
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[1]) : "$f10", "$f12", "$f14"
++ );
++ /* 8x8 SAD of blk2 against the average of blk1 and blk1+stride; the total is accumulated in $f12. */
++ sad8_2_loongson2(blk1, blk1+stride, blk2, stride, 8);
++
++ return sum_loongson2();
++}
++
++static int sad8_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ assert(h==8);
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[2]) : "$f10", "$f12", "$f14"
++ );
++ /* sad8_4_loongson2 reloads $f10 with "bone" on entry, so the round_tab value loaded above is replaced there. */
++ sad8_4_loongson2(blk1, blk2, stride, 8);
++
++ return sum_loongson2();
++}
++
++static int sad16_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"::: "$f12", "$f14");
++ /* A 16-wide block is handled as two 8-wide halves that both accumulate into $f12. */
++ sad8_1_loongson2(blk1 , blk2 , stride, h);
++ sad8_1_loongson2(blk1+8, blk2+8, stride, h);
++
++ return sum_loongson2();
++}
++
++static int sad16_x2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[1]) : "$f10", "$f12", "$f14"
++ );
++ /* Two 8-wide halves, each comparing blk2 with the average of blk1 and its right-hand neighbour; both add into $f12. */
++ sad8_2_loongson2(blk1 , blk1+1, blk2 , stride, h);
++ sad8_2_loongson2(blk1+8, blk1+9, blk2+8, stride, h);
++
++ return sum_loongson2();
++}
++
++static int sad16_y2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[1]) : "$f10", "$f12", "$f14"
++ );
++ /* Two 8-wide halves, each comparing blk2 with the average of blk1 and the row below it; both add into $f12. */
++ sad8_2_loongson2(blk1 , blk1+stride, blk2 , stride, h);
++ sad8_2_loongson2(blk1+8, blk1+stride+8,blk2+8, stride, h);
++
++ return sum_loongson2();
++}
++
++static int sad16_xy2_loongson2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
++{
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "xor $f14, $f14, $f14 \n\t"
++ "xor $f12, $f12, $f12 \n\t"
++ "ldc1 $f10, %0 \n\t"
++ :: "m"(round_tab[2]) : "$f10", "$f12", "$f14"
++ );
++ /* Two 8-wide halves through sad8_4_loongson2 (which reloads $f10 with "bone"); both add into $f12. */
++ sad8_4_loongson2(blk1 , blk2 , stride, h);
++ sad8_4_loongson2(blk1+8, blk2+8, stride, h);
++
++ return sum_loongson2();
++}
++
++
++void dsputil_init_pix_loongson2(DSPContext* c, AVCodecContext *avctx)
++{
++ c->sad[1]= sad8_loongson2; /* sad[] gets the same plain comparators as pix_abs[.][0] */
++ c->sad[0]= sad16_loongson2;
++ /* pix_abs[size][mode]: size 0 uses the sad16_* routines, size 1 the sad8_* ones; modes 0..3 are plain, x2, y2, xy2. */
++ c->pix_abs[1][0] = sad8_loongson2;
++ c->pix_abs[1][1] = sad8_x2_loongson2;
++ c->pix_abs[1][2] = sad8_y2_loongson2;
++ c->pix_abs[1][3] = sad8_xy2_loongson2;
++ c->pix_abs[0][0] = sad16_loongson2;
++ c->pix_abs[0][1] = sad16_x2_loongson2;
++ c->pix_abs[0][2] = sad16_y2_loongson2;
++ c->pix_abs[0][3] = sad16_xy2_loongson2;
++}
+diff --git a/libavcodec/loongson2/mpegvideo_loongson2.c b/libavcodec/loongson2/mpegvideo_loongson2.c
+new file mode 100644
+index 0000000..18d070a
+--- /dev/null
++++ b/libavcodec/loongson2/mpegvideo_loongson2.c
+@@ -0,0 +1,385 @@
++/*
++ * The simplest mpeg encoder (well, it was the simplest!)
++ * Copyright (c) 2007-2010 comcat <jiankemeng@gmail.com>.
++ *
++ * Optimized for Loongson2 CPUs by comcat <jiankemeng@gmail.com>
++ *
++ * Based on i386
++ */
++
++#include "dsputil_loongson2.h"
++#include "../mpegvideo.h"
++#include "../avcodec.h"
++
++extern uint8_t zigzag_direct_noperm[64]; /* declared here but not referenced by the functions in this file */
++extern uint16_t inv_zigzag_direct16[64]; /* declared here but not referenced by the functions in this file */
++
++static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; /* all bits set; not referenced by the functions in this file */
++static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 1 in each 16-bit lane; not referenced by the functions in this file */
++
++
++static void dct_unquantize_h263_intra_loongson2(MpegEncContext *s,
++ DCTELEM *block, int n, int qscale)
++{
++ long level, qmul, qadd, nCoeffs; /* level: DC value written back to block[0] after the asm loop */
++
++ qmul = qscale << 1;
++
++ assert(s->block_last_index[n]>=0 || s->h263_aic);
++ if (!s->h263_aic) {
++ if (n < 4)
++ level = block[0] * s->y_dc_scale;
++ else
++ level = block[0] * s->c_dc_scale;
++ qadd = (qscale - 1) | 1;
++ }else{
++ qadd = 0;
++ level= block[0];
++ }
++ if(s->ac_pred)
++ nCoeffs=63;
++ else
++ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
++ /* Setup: qmul is broadcast into $f12, -qadd into $f14 and zero into $f8 (presumably one copy per 16-bit lane). */
++ /* Loop: eight coefficients per pass are scaled by qmul and adjusted with qadd in place, from block[0] upwards. */
++ __asm__ volatile(
++// ".set mips3 \n\t"
++
++ "xor $f12, $f12, $f12 \n\t"
++ "lwc1 $f12, %1 \n\t"
++
++ "xor $f10, $f10, $f10 \n\t"
++
++ "packsswh $f12, $f12, $f12 \n\t"
++
++ "lwc1 $f10, %2 \n\t"
++
++ "packsswh $f10, $f10, $f10 \n\t"
++
++ "packsswh $f12, $f12, $f12 \n\t"
++
++ "xor $f14, $f14, $f14 \n\t"
++
++ "packsswh $f10, $f10, $f10 \n\t"
++
++ "xor $f8, $f8, $f8 \n\t"
++
++ "psubh $f14, $f14, $f10 \n\t"
++ /* Count in scratch $13: %3 is an input-only operand and must not be modified by the asm. */
++ "move $13, %3 \n\t"
++ "1: \n\t"
++ "add $12, %0, $13 \n\t"
++
++ "ldc1 $f0, ($12) \n\t"
++
++ "ldc1 $f2, 8($12) \n\t"
++
++ "mov.d $f4, $f0 \n\t"
++ "mov.d $f6, $f2 \n\t"
++
++ "pmullh $f0, $f0, $f12 \n\t"
++ "pmullh $f2, $f2, $f12 \n\t"
++
++ "pcmpgth $f4, $f4, $f8 \n\t"
++ "pcmpgth $f6, $f6, $f8 \n\t"
++
++ "xor $f0, $f0, $f4 \n\t"
++ "xor $f2, $f2, $f6 \n\t"
++
++
++ "paddh $f0, $f0, $f14 \n\t"
++
++ "paddh $f2, $f2, $f14 \n\t"
++
++
++ "xor $f4, $f4, $f0 \n\t"
++
++ "xor $f6, $f6, $f2 \n\t"
++
++
++ "pcmpeqh $f0, $f0, $f14 \n\t"
++
++ "pcmpeqh $f2, $f2, $f14 \n\t"
++
++
++ "pandn $f0, $f0, $f4 \n\t"
++
++ "pandn $f2, $f2, $f6 \n\t"
++
++
++ "sdc1 $f0, ($12) \n\t"
++
++ "sdc1 $f2, 8($12) \n\t"
++
++
++ "addiu $13, $13, 16 \n\t"
++
++ "blez $13, 1b \n\t"
++ "nop \n\t"
++ ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs))
++ : "memory", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
++ );
++ block[0]= level;
++}
++
++
++static void dct_unquantize_h263_inter_loongson2(MpegEncContext *s,
++ DCTELEM *block, int n, int qscale)
++{
++ long qmul, qadd, nCoeffs;
++
++ qmul = qscale << 1;
++ qadd = (qscale - 1) | 1;
++
++ assert(s->block_last_index[n]>=0 || s->h263_aic);
++
++ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
++ /* Same loop as the intra variant, but block[0] is processed like every other coefficient. */
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ /* Setup: qmul is broadcast into $f12, -qadd into $f14 and zero into $f8 (presumably one copy per 16-bit lane). */
++ "xor $f12, $f12, $f12 \n\t"
++ "lwc1 $f12, %1 \n\t"
++
++ "xor $f10, $f10, $f10 \n\t"
++
++ "packsswh $f12, $f12, $f12 \n\t"
++
++ "lwc1 $f10, %2 \n\t"
++
++ "packsswh $f10, $f10, $f10 \n\t"
++
++ "xor $f14, $f14, $f14 \n\t"
++
++ "packsswh $f12, $f12, $f12 \n\t"
++
++ "packsswh $f10, $f10, $f10 \n\t"
++
++ "xor $f8, $f8, $f8 \n\t"
++
++ "psubh $f14, $f14, $f10 \n\t"
++ /* Count in scratch $13: %3 is an input-only operand and must not be modified by the asm. */
++ "move $13, %3 \n\t"
++ "1: \n\t"
++ "add $12, %0, $13 \n\t"
++
++ "ldc1 $f0, ($12) \n\t"
++
++ "ldc1 $f2, 8($12) \n\t"
++
++ "mov.d $f4, $f0 \n\t"
++ "mov.d $f6, $f2 \n\t"
++
++ "pmullh $f0, $f0, $f12 \n\t"
++
++ "pmullh $f2, $f2, $f12 \n\t"
++
++ "pcmpgth $f4, $f4, $f8 \n\t"
++
++ "pcmpgth $f6, $f6, $f8 \n\t"
++
++ "xor $f0, $f0, $f4 \n\t"
++
++ "xor $f2, $f2, $f6 \n\t"
++
++ "paddh $f0, $f0, $f14 \n\t"
++
++ "paddh $f2, $f2, $f14 \n\t"
++
++ "xor $f4, $f4, $f0 \n\t"
++
++ "xor $f6, $f6, $f2 \n\t"
++
++ "pcmpeqh $f0, $f0, $f14 \n\t"
++
++ "pcmpeqh $f2, $f2, $f14 \n\t"
++
++ "pandn $f0, $f0, $f4 \n\t"
++
++ "pandn $f2, $f2, $f6 \n\t"
++
++ "sdc1 $f0, ($12) \n\t"
++
++ "sdc1 $f2, 8($12) \n\t"
++
++
++ "addiu $13, $13, 16 \n\t"
++
++ "blez $13, 1b \n\t"
++ "nop \n\t"
++ ::"r" (block+nCoeffs), "m"(qmul), "m" (qadd), "r" (2*(-nCoeffs))
++ : "memory", "$12", "$13", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14"
++ );
++}
++
++
++/* draw the edges of width 'w' of an image of size width, height
++ this Loongson2 version can only handle w==8 || w==16 */
++
++static void draw_edges_loongson2(uint8_t *buf, int wrap, int width, int height, int w)
++{
++ uint8_t *ptr, *last_line;
++ int i;
++
++ last_line = buf + (height - 1) * wrap;
++ /* Left/right borders: per row, the first pixel is replicated to the left of buf and the last pixel to the right of buf+width. */
++ ptr = buf;
++ if(w==8)
++ {
++ __asm__ volatile(
++// ".set mips3 \n\t"
++
++ "move $9, %0 \n\t"
++
++ "1: \n\t"
++
++ "xor $f0, $f0, $f0 \n\t"
++ "lwc1 $f0, ($9) \n\t"
++
++ "punpcklbh $f0, $f0, $f0 \n\t"
++
++ "add $12, $9, %2 \n\t"
++
++ "punpcklhw $f0, $f0, $f0 \n\t"
++
++ "punpcklwd $f0, $f0, $f0 \n\t"
++
++ "ldc1 $f2, -8($12) \n\t"
++
++ "sdc1 $f0, -8($9) \n\t"
++
++ "punpckhbh $f2, $f2, $f2 \n\t"
++
++ "add $9, $9, %1 \n\t"
++
++ "punpckhhw $f2, $f2, $f2 \n\t"
++
++ "sub $13, $9, %3 \n\t"
++
++ "punpckhwd $f2, $f2, $f2 \n\t"
++
++ "bltz $13, 1b \n\t"
++ /* NOTE(review): this store runs on every pass only if it sits in the branch delay slot - confirm the assembler mode. */
++ "sdc1 $f2, ($12) \n\t"
++
++ : "+r" (ptr)
++ : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
++ : "$9", "$13", "$12", "$f2", "$f0", "memory"
++ );
++ }
++ else
++ {
++ __asm__ volatile(
++// ".set mips3 \n\t"
++
++ "move $8, %0 \n\t"
++
++ "1: \n\t"
++
++ "xor $f0, $f0, $f0 \n\t"
++ "lwc1 $f0, ($8) \n\t"
++
++ "punpcklbh $f0, $f0, $f0 \n\t"
++ "punpcklhw $f0, $f0, $f0 \n\t"
++ "punpcklwd $f0, $f0, $f0 \n\t"
++
++ "sdc1 $f0, -8($8) \n\t"
++ "sdc1 $f0, -16($8) \n\t"
++
++ "add $15, $8, %2 \n\t"
++ "ldc1 $f2, -8($15) \n\t"
++
++ "punpckhbh $f2, $f2, $f2 \n\t"
++ "punpckhhw $f2, $f2, $f2 \n\t"
++ "punpckhwd $f2, $f2, $f2 \n\t"
++
++ "sdc1 $f2, ($15) \n\t"
++ "sdc1 $f2, 8($15) \n\t"
++
++ "add $8, $8, %1 \n\t"
++
++ "sub $16, $8, %3 \n\t"
++ "bltz $16, 1b \n\t"
++ "nop \n\t"
++ : "+r" (ptr)
++ : "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
++ : "$8", "$15", "$16", "$f0", "$f2", "memory"
++ );
++ }
++ /* Top/bottom borders: each pass copies the first (then the last) picture row into four border rows, 8 bytes at a time. */
++ for(i=0;i<w;i+=4) {
++
++ ptr= buf - (i + 1) * wrap - w;
++ __asm__ volatile(
++// ".set mips3 \n\t"
++ "move $8, %0 \n\t"
++
++ "1: \n\t"
++
++ "add $9, $8, %1 \n\t"
++ "ldc1 $f0, ($9) \n\t"
++
++ "add $10, $8, %2 \n\t"
++ "add $11, $10, %2 \n\t"
++ "add $12, $8, %3 \n\t"
++
++ "sdc1 $f0, ($8) \n\t"
++ "sdc1 $f0, ($10) \n\t"
++ "sdc1 $f0, ($11) \n\t"
++ "sdc1 $f0, ($12) \n\t"
++
++ "addiu $8, $8, 8 \n\t"
++
++ "sub $13, $8, %4 \n\t"
++
++ "bltz $13, 1b \n\t"
++ "nop \n\t"
++ /* %4 is the end ADDRESS of the row; it used to be *(ptr+width+2*w), i.e. a pixel value. */
++ : "+r" (ptr)
++ : "r" (((long)buf - (long)ptr - w)), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
++ : "$8", "$9", "$10", "$11", "$12", "$13", "$f0", "memory"
++ );
++
++ ptr= last_line + (i + 1) * wrap - w;
++
++ __asm__ volatile(
++// ".set mips3 \n\t"
++
++ "move $9, %0 \n\t"
++
++ "1: \n\t"
++
++ "add $10, $9, %1 \n\t"
++ "ldc1 $f0, ($10) \n\t"
++
++ "add $11, $9, %2 \n\t"
++ "add $12, $11, %2 \n\t"
++ "add $13, $9, %3 \n\t"
++
++ "sdc1 $f0, ($9) \n\t"
++ "sdc1 $f0, ($11) \n\t"
++ "sdc1 $f0, ($12) \n\t"
++ "sdc1 $f0, ($13) \n\t"
++
++ "addiu $9, $9, 8 \n\t"
++
++ "sub $14, $9, %4 \n\t"
++
++ "bltz $14, 1b \n\t"
++ "nop \n\t"
++ : "+r" (ptr)
++ : "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
++ : "$9", "$10", "$11", "$12", "$13", "$14", "$f0", "memory"
++
++ );
++ }
++}
++
++void MPV_common_init_loongson2(MpegEncContext *s)
++{
++ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_loongson2;
++ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_loongson2;
++ /* Only the two H.263 dequantizers are installed; the draw_edges hook below is left disabled. */
++// draw_edges = draw_edges_loongson2;
++
++}
+diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
+index 3f4da68..73e4d56 100644
+--- a/libavcodec/mips/Makefile
++++ b/libavcodec/mips/Makefile
+@@ -1,3 +1,9 @@
+ OBJS-$(HAVE_MMI) += ps2/dsputil_mmi.o \
+ ps2/idct_mmi.o \
+ ps2/mpegvideo_mmi.o \
++
++OBJS-$(HAVE_LOONGSON2MMI) += loongson2/idct_loongson2.o \
++ loongson2/dsputil_loongson2.o \
++ loongson2/idct_loongson2_xvid.o \
++ loongson2/mpegvideo_loongson2.o \
++ loongson2/motion_est_loongson2.o
+diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
+index b47ff9a..af92552 100644
+--- a/libavcodec/mpegvideo.c
++++ b/libavcodec/mpegvideo.c
+@@ -176,6 +176,9 @@ av_cold int ff_dct_common_init(MpegEncContext *s)
+ #elif ARCH_BFIN
+ MPV_common_init_bfin(s);
+ #endif
++#if HAVE_LOONGSON2MMI /* HAVE_* macros are always defined (0 or 1), so test the value rather than existence */
++ MPV_common_init_loongson2(s);
++#endif
+
+ /* load & permutate scantables
+ note: only wmv uses different ones
+diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
+index 5302be9..8d09906 100644
+--- a/libavcodec/mpegvideo.h
++++ b/libavcodec/mpegvideo.h
+@@ -689,6 +689,7 @@ int MPV_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size,
+ void MPV_common_init_mmx(MpegEncContext *s);
+ void MPV_common_init_axp(MpegEncContext *s);
+ void MPV_common_init_mlib(MpegEncContext *s);
++void MPV_common_init_loongson2(MpegEncContext *s);
+ void MPV_common_init_mmi(MpegEncContext *s);
+ void MPV_common_init_arm(MpegEncContext *s);
+ void MPV_common_init_altivec(MpegEncContext *s);
+diff --git a/libavcodec/options.c b/libavcodec/options.c
+index 7ca1062..c05b3f4 100644
+--- a/libavcodec/options.c.orig 2011-06-22 12:52:11.584428161 -0300
++++ b/libavcodec/options.c 2011-06-22 12:52:25.003143367 -0300
+@@ -219,6 +219,8 @@ static const AVOption options[]={
+ {"simple", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLE }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"simplemmx", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_SIMPLEMMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"libmpeg2mmx", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_LIBMPEG2MMX }, INT_MIN, INT_MAX, V|E|D, "idct"},
++{"libmpeg2loongson2", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_LIBMPEG2LOONGSON2 }, INT_MIN, INT_MAX, V|E|D, "idct"},
++{"xvidloongson2", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_XVIDLOONGSON2 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"ps2", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_PS2 }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"mlib", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_MLIB }, INT_MIN, INT_MAX, V|E|D, "idct"},
+ {"arm", NULL, 0, FF_OPT_TYPE_CONST, {.dbl = FF_IDCT_ARM }, INT_MIN, INT_MAX, V|E|D, "idct"},