• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

libavcodec/x86/idct_sse2_xvid.c

Go to the documentation of this file.
00001 /*
00002  * XVID MPEG-4 VIDEO CODEC
00003  * - SSE2 inverse discrete cosine transform -
00004  *
00005  * Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
00006  *
00007  * Conversion to gcc syntax with modifications
00008  * by Alexander Strange <astrange@ithinksw.com>
00009  *
00010  * Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
00011  *
00012  * This file is part of FFmpeg.
00013  *
00014  * Vertical pass is an implementation of the scheme:
00015  *  Loeffler C., Ligtenberg A., and Moschytz C.S.:
00016  *  Practical Fast 1D DCT Algorithm with Eleven Multiplications,
00017  *  Proc. ICASSP 1989, 988-991.
00018  *
00019  * Horizontal pass is a double 4x4 vector/matrix multiplication,
00020  * (see also Intel's Application Note 922:
00021  *  http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
00022  *  Copyright (C) 1999 Intel Corporation)
00023  *
00024  * More details at http://skal.planet-d.net/coding/dct.html
00025  *
00026  * FFmpeg is free software; you can redistribute it and/or
00027  * modify it under the terms of the GNU Lesser General Public
00028  * License as published by the Free Software Foundation; either
00029  * version 2.1 of the License, or (at your option) any later version.
00030  *
00031  * FFmpeg is distributed in the hope that it will be useful,
00032  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00033  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00034  * Lesser General Public License for more details.
00035  *
00036  * You should have received a copy of the GNU Lesser General Public License
00037  * along with FFmpeg; if not, write to the Free Software Foundation,
00038  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00039  */
00040 
00041 #include "libavcodec/dsputil.h"
00042 #include "idct_xvid.h"
00043 
/* X8(v) expands to its argument repeated eight times, comma-separated. It is
 * used below to fill the eight-element constant initializers. */
00049 #define X8(x)     x,x,x,x,x,x,x,x
00050 
/* Named shift amounts for the row and column passes.
 * NOTE(review): neither macro is referenced in the code shown in this
 * listing; iMTX_MULT hard-codes "$11" and the iLLM_PASS macros hard-code
 * "$6", so those literals must be kept in sync by hand if these change. */
00051 #define ROW_SHIFT 11
00052 #define COL_SHIFT 6
00053 
/* Constants referenced from the inline assembly through MANGLE().
 * tan1, tan2, tan3 and sqrt2 each hold one 16-bit value replicated eight
 * times and are used as pmulhw multipliers in the iLLM_PASS macros, i.e. they
 * read as fractions scaled by 2^16.
 *  - 43790 does not fit in int16_t; per its comment it stands for
 *    tan(3pi/16)-1, and the assembly adds the multiplicand back after the
 *    multiply.
 *  - sqrt2 holds half of 1/sqrt(2); the assembly doubles the product
 *    afterwards.
 * m127 is eight bytes of 127, loaded into mm0 as the bias for
 * TEST_ONE_ROW/TEST_TWO_ROWS.
 * DECLARE_ASM_CONST is defined elsewhere; its first argument is presumably
 * the alignment in bytes -- TODO confirm. */
00054 DECLARE_ASM_CONST(16, int16_t, tan1[]) = {X8(13036)}; // tan( pi/16)
00055 DECLARE_ASM_CONST(16, int16_t, tan2[]) = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
00056 DECLARE_ASM_CONST(16, int16_t, tan3[]) = {X8(43790)}; // tan(3pi/16)-1
00057 DECLARE_ASM_CONST(16, int16_t, sqrt2[])= {X8(23170)}; // 0.5/sqrt(2)
00058 DECLARE_ASM_CONST(8,  uint8_t, m127[]) = {X8(127)};
00059 
/* Row-pass coefficient table: 32 16-bit words that iMTX_MULT reads as four
 * 16-byte slices (byte offsets 0, 16, 32, 48) as pmaddwd operands.
 * ff_idct_xvid_sse2 passes this table for rows 0 and 4.
 * Entries written as 0x8000 and above are presumably meant as negative
 * values once stored in int16_t. */
00060 DECLARE_ASM_CONST(16, int16_t, iTab1[]) = {
00061  0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00062  0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00063  0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00064  0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00065 };
00066 
/* Row-pass coefficient table with the same layout as iTab1 (four 16-byte
 * slices consumed by iMTX_MULT). ff_idct_xvid_sse2 passes this table for
 * rows 1 and 7. */
00067 DECLARE_ASM_CONST(16, int16_t, iTab2[]) = {
00068  0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00069  0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00070  0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00071  0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00072 };
00073 
/* Row-pass coefficient table with the same layout as iTab1 (four 16-byte
 * slices consumed by iMTX_MULT). ff_idct_xvid_sse2 passes this table for
 * rows 2 and 6. */
00074 DECLARE_ASM_CONST(16, int16_t, iTab3[]) = {
00075  0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00076  0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00077  0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00078  0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00079 };
00080 
/* Row-pass coefficient table with the same layout as iTab1 (four 16-byte
 * slices consumed by iMTX_MULT). ff_idct_xvid_sse2 passes this table for
 * rows 3 and 5. */
00081 DECLARE_ASM_CONST(16, int16_t, iTab4[]) = {
00082  0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00083  0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00084  0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00085  0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00086 };
00087 
/* Rounding terms for the row pass: six groups of four identical 32-bit
 * values, one 16-byte group per line. ff_idct_xvid_sse2 selects a group with
 * walkenIdctRounders + k*16 and adds it with paddd (see ROUND):
 *   rows 0, 1, 2, 3 use groups 0, 1, 2, 3;
 *   row 4 passes "#" instead of a rounder;
 *   row 5 uses group 4; rows 6 and 7 both use group 5. */
00088 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders[]) = {
00089  65536, 65536, 65536, 65536,
00090   3597,  3597,  3597,  3597,
00091   2260,  2260,  2260,  2260,
00092   1203,  1203,  1203,  1203,
00093    120,   120,   120,   120,
00094    512,   512,   512,   512
00095 };
00096 
00097 // Temporary storage before the column pass
/* The odd rows (1, 3, 5, 7) are held in fixed XMM registers on every
 * architecture; the even rows are mapped in the ARCH_X86_64 block below. */
00098 #define ROW1 "%%xmm6"
00099 #define ROW3 "%%xmm4"
00100 #define ROW5 "%%xmm5"
00101 #define ROW7 "%%xmm7"
00102 
/* CLEAR_ODD(r): zero register r by XOR-ing it with itself.
 * PUT_ODD(dst): copy xmm2 (where iMTX_MULT leaves its packed result) into
 * dst; selector 0x1B reverses the order of the four high words, the low four
 * words are copied unchanged. */
00103 #define CLEAR_ODD(r) "pxor  "r","r" \n\t"
00104 #define PUT_ODD(dst) "pshufhw  $0x1B, %%xmm2, "dst"   \n\t"
00105 
/* Storage mapping for the even rows and scratch registers.
 *
 * x86-64: rows 0/2/4/6 live in xmm8-xmm11, so ROWn and REGn are the same
 * register, even rows are cleared/stored exactly like odd rows, and
 * MOV_32_ONLY expands to "#", which turns the rest of that assembly line into
 * a comment (GNU as x86 syntax -- confirm for other assemblers).
 *
 * 32-bit x86: only xmm0-xmm7 are used, so the even rows stay in the block's
 * own memory (ROWn is an address relative to %0) and REGn names the register
 * they are loaded into by MOV_32_ONLY ("movdqa "). CLEAR_EVEN is empty and
 * PUT_EVEN shuffles xmm2 in place before storing it to memory. XMMS and TAN1
 * share xmm2 here, which is why the iLLM_PASS macros spill TAN1 to (%0). */
00106 #if ARCH_X86_64
00107 
00108 # define ROW0 "%%xmm8"
00109 # define REG0 ROW0
00110 # define ROW2 "%%xmm9"
00111 # define REG2 ROW2
00112 # define ROW4 "%%xmm10"
00113 # define REG4 ROW4
00114 # define ROW6 "%%xmm11"
00115 # define REG6 ROW6
00116 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00117 # define PUT_EVEN(dst) PUT_ODD(dst)
00118 # define XMMS "%%xmm12"
00119 # define MOV_32_ONLY "#"
00120 # define SREG2 REG2
00121 # define TAN3 "%%xmm13"
00122 # define TAN1 "%%xmm14"
00123 
00124 #else
00125 
00126 # define ROW0 "(%0)"
00127 # define REG0 "%%xmm4"
00128 # define ROW2 "2*16(%0)"
00129 # define REG2 "%%xmm4"
00130 # define ROW4 "4*16(%0)"
00131 # define REG4 "%%xmm6"
00132 # define ROW6 "6*16(%0)"
00133 # define REG6 "%%xmm6"
00134 # define CLEAR_EVEN(r)
00135 # define PUT_EVEN(dst) \
00136     "pshufhw  $0x1B, %%xmm2, %%xmm2   \n\t" \
00137     "movdqa          %%xmm2, "dst"    \n\t"
00138 # define XMMS "%%xmm2"
00139 # define MOV_32_ONLY "movdqa "
00140 # define SREG2 "%%xmm7"
00141 # define TAN3 "%%xmm0"
00142 # define TAN1 "%%xmm2"
00143 
00144 #endif
00145 
/* ROUND(x) produces the start of a "paddd <symbol x>" instruction without a
 * destination; iMTX_MULT appends ", %%xmm0" to complete it. MANGLE() is
 * defined elsewhere and presumably yields the assembler-level symbol
 * reference -- TODO confirm. */
00146 #define ROUND(x) "paddd   "MANGLE(x)
00147 
/* JZ(reg, to): test the 32-bit register against itself and jump to label
 * `to` when it is zero. */
00148 #define JZ(reg, to)                         \
00149     "testl     "reg","reg"            \n\t" \
00150     "jz        "to"                   \n\t"
00151 
/* JNZ(reg, to): same test, but jump to label `to` when reg is non-zero. */
00152 #define JNZ(reg, to)                        \
00153     "testl     "reg","reg"            \n\t" \
00154     "jnz       "to"                   \n\t"
00155 
/* TEST_ONE_ROW(src, reg, clear): emit `clear`, then set `reg` to a non-zero
 * value iff the 16-byte row at `src` contains any non-zero byte.
 * The two 8-byte halves of the row are OR-ed into mm1, mm0 is added to every
 * byte with unsigned saturation, and pmovmskb gathers the byte sign bits.
 * This relies on mm0 holding 127 in every byte (ff_idct_xvid_sse2 loads m127
 * into it): 0+127 leaves the top bit clear, anything else sets it.
 * Overwrites mm1. */
00156 #define TEST_ONE_ROW(src, reg, clear)       \
00157     clear                                   \
00158     "movq     "src", %%mm1            \n\t" \
00159     "por    8+"src", %%mm1            \n\t" \
00160     "paddusb  %%mm0, %%mm1            \n\t" \
00161     "pmovmskb %%mm1, "reg"            \n\t"
00162 
/* TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): two interleaved
 * copies of the TEST_ONE_ROW check. Emits clear1 and clear2, then sets reg1
 * (resp. reg2) to non-zero iff the 16-byte row at row1 (resp. row2) has any
 * non-zero byte. Expects mm0 to hold 127 in every byte; overwrites mm1 and
 * mm2. */
00163 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00164     clear1                                  \
00165     clear2                                  \
00166     "movq     "row1", %%mm1           \n\t" \
00167     "por    8+"row1", %%mm1           \n\t" \
00168     "movq     "row2", %%mm2           \n\t" \
00169     "por    8+"row2", %%mm2           \n\t" \
00170     "paddusb   %%mm0, %%mm1           \n\t" \
00171     "paddusb   %%mm0, %%mm2           \n\t" \
00172     "pmovmskb  %%mm1, "reg1"          \n\t" \
00173     "pmovmskb  %%mm2, "reg2"          \n\t"
00174 
/* iMTX_MULT(src, table, rounder, put): row pass for one 8-coefficient row.
 *
 *   src     - 16-byte aligned operand holding the row (loaded with movdqa)
 *   table   - symbol of a 64-byte coefficient table (iTab1..iTab4); read at
 *             byte offsets 0, 16, 32 and 48 by pmaddwd
 *   rounder - instruction text lacking its destination, e.g. ROUND(sym); the
 *             macro appends ", %%xmm0". Passing "#" instead turns that line
 *             into an assembler comment, i.e. no rounding term is added.
 *   put     - PUT_ODD()/PUT_EVEN() text that stores the result from xmm2
 *
 * The four pmaddwd products are summed pairwise, the two partial sums are
 * combined by an add/subtract pair, both halves are shifted right
 * arithmetically by 11 and packed with signed saturation into xmm2 (sums in
 * the low four words, differences in the high four).
 * Overwrites xmm0-xmm3. Ends by defining local label "1:", which is the
 * target of a preceding JZ(..., "1f") that skips the whole macro for an
 * all-zero row. */
00176 #define iMTX_MULT(src, table, rounder, put) \
00177     "movdqa        "src", %%xmm3      \n\t" \
00178     "movdqa       %%xmm3, %%xmm0      \n\t" \
00179     "pshufd   $0x11, %%xmm3, %%xmm1   \n\t" /* 4602 */ \
00180     "punpcklqdq   %%xmm0, %%xmm0      \n\t" /* 0246 */ \
00181     "pmaddwd     "table", %%xmm0      \n\t" \
00182     "pmaddwd  16+"table", %%xmm1      \n\t" \
00183     "pshufd   $0xBB, %%xmm3, %%xmm2   \n\t" /* 5713 */ \
00184     "punpckhqdq   %%xmm3, %%xmm3      \n\t" /* 1357 */ \
00185     "pmaddwd  32+"table", %%xmm2      \n\t" \
00186     "pmaddwd  48+"table", %%xmm3      \n\t" \
00187     "paddd        %%xmm1, %%xmm0      \n\t" \
00188     "paddd        %%xmm3, %%xmm2      \n\t" \
00189     rounder",     %%xmm0              \n\t" \
00190     "movdqa       %%xmm2, %%xmm3      \n\t" \
00191     "paddd        %%xmm0, %%xmm2      \n\t" \
00192     "psubd        %%xmm3, %%xmm0      \n\t" \
00193     "psrad           $11, %%xmm2      \n\t" \
00194     "psrad           $11, %%xmm0      \n\t" \
00195     "packssdw     %%xmm0, %%xmm2      \n\t" \
00196     put                                     \
00197     "1:                               \n\t"
00198 
/* iLLM_HEAD: load the tan3 and tan1 constant vectors into the TAN3 and TAN1
 * registers, as expected on entry by iLLM_PASS and iLLM_PASS_SPARSE. */
00199 #define iLLM_HEAD                           \
00200     "movdqa   "MANGLE(tan3)", "TAN3"  \n\t" \
00201     "movdqa   "MANGLE(tan1)", "TAN1"  \n\t" \
00202 
00203 
/* iLLM_PASS(dct): full column pass over all eight rows.
 *
 * Inputs: odd rows in xmm6 (ROW1), xmm4 (ROW3), xmm5 (ROW5), xmm7 (ROW7);
 * even rows in ROW0/ROW2/ROW4/ROW6 (registers on x86-64, memory at %0 on
 * 32-bit, where the MOV_32_ONLY lines load them into REGn); TAN3 and TAN1
 * preloaded by iLLM_HEAD.
 *
 * Uses saturating 16-bit adds/subtracts and pmulhw with the tan1/tan2/tan3
 * and sqrt2 constants, shifts every result right arithmetically by 6 and
 * stores the eight output rows with movdqa at dct + k*16, k = 0..7.
 *
 * On 32-bit builds TAN1 is temporarily spilled to (%0) and reloaded later,
 * because XMMS is the same register as TAN1 there; both MOV_32_ONLY lines
 * are assembler comments on x86-64.
 * `dct` is the bare operand text (e.g. "%0"); it must be 16-byte aligned. */
00204 #define iLLM_PASS(dct)                      \
00205     "movdqa   "TAN3", %%xmm1          \n\t" \
00206     "movdqa   "TAN1", %%xmm3          \n\t" \
00207     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00208     "pmulhw   %%xmm5, %%xmm1          \n\t" \
00209     "paddsw   %%xmm4, "TAN3"          \n\t" \
00210     "paddsw   %%xmm5, %%xmm1          \n\t" \
00211     "psubsw   %%xmm5, "TAN3"          \n\t" \
00212     "paddsw   %%xmm4, %%xmm1          \n\t" \
00213     "pmulhw   %%xmm7, %%xmm3          \n\t" \
00214     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00215     "paddsw   %%xmm6, %%xmm3          \n\t" \
00216     "psubsw   %%xmm7, "TAN1"          \n\t" \
00217     "movdqa   %%xmm3, %%xmm7          \n\t" \
00218     "movdqa   "TAN1", %%xmm6          \n\t" \
00219     "psubsw   %%xmm1, %%xmm3          \n\t" \
00220     "psubsw   "TAN3", "TAN1"          \n\t" \
00221     "paddsw   %%xmm7, %%xmm1          \n\t" \
00222     "paddsw   %%xmm6, "TAN3"          \n\t" \
00223     "movdqa   %%xmm3, %%xmm6          \n\t" \
00224     "psubsw   "TAN3", %%xmm3          \n\t" \
00225     "paddsw   %%xmm6, "TAN3"          \n\t" \
00226     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00227     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00228     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00229     "paddsw   "TAN3", "TAN3"          \n\t" \
00230     "paddsw   %%xmm3, %%xmm3          \n\t" \
00231     "movdqa   "MANGLE(tan2)", %%xmm7  \n\t" \
00232     MOV_32_ONLY ROW2", "REG2"         \n\t" \
00233     MOV_32_ONLY ROW6", "REG6"         \n\t" \
00234     "movdqa   %%xmm7, %%xmm5          \n\t" \
00235     "pmulhw   "REG6", %%xmm7          \n\t" \
00236     "pmulhw   "REG2", %%xmm5          \n\t" \
00237     "paddsw   "REG2", %%xmm7          \n\t" \
00238     "psubsw   "REG6", %%xmm5          \n\t" \
00239     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00240     MOV_32_ONLY ROW4", "REG4"         \n\t" \
00241     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00242     "movdqa   "REG0", "XMMS"          \n\t" \
00243     "psubsw   "REG4", "REG0"          \n\t" \
00244     "paddsw   "XMMS", "REG4"          \n\t" \
00245     "movdqa   "REG4", "XMMS"          \n\t" \
00246     "psubsw   %%xmm7, "REG4"          \n\t" \
00247     "paddsw   "XMMS", %%xmm7          \n\t" \
00248     "movdqa   "REG0", "XMMS"          \n\t" \
00249     "psubsw   %%xmm5, "REG0"          \n\t" \
00250     "paddsw   "XMMS", %%xmm5          \n\t" \
00251     "movdqa   %%xmm5, "XMMS"          \n\t" \
00252     "psubsw   "TAN3", %%xmm5          \n\t" \
00253     "paddsw   "XMMS", "TAN3"          \n\t" \
00254     "movdqa   "REG0", "XMMS"          \n\t" \
00255     "psubsw   %%xmm3, "REG0"          \n\t" \
00256     "paddsw   "XMMS", %%xmm3          \n\t" \
00257     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00258     "psraw        $6, %%xmm5          \n\t" \
00259     "psraw        $6, "REG0"          \n\t" \
00260     "psraw        $6, "TAN3"          \n\t" \
00261     "psraw        $6, %%xmm3          \n\t" \
00262     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00263     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00264     "movdqa   "REG0", 5*16("dct")     \n\t" \
00265     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00266     "movdqa   %%xmm7, %%xmm0          \n\t" \
00267     "movdqa   "REG4", %%xmm4          \n\t" \
00268     "psubsw   %%xmm1, %%xmm7          \n\t" \
00269     "psubsw   "TAN1", "REG4"          \n\t" \
00270     "paddsw   %%xmm0, %%xmm1          \n\t" \
00271     "paddsw   %%xmm4, "TAN1"          \n\t" \
00272     "psraw        $6, %%xmm1          \n\t" \
00273     "psraw        $6, %%xmm7          \n\t" \
00274     "psraw        $6, "TAN1"          \n\t" \
00275     "psraw        $6, "REG4"          \n\t" \
00276     "movdqa   %%xmm1, ("dct")         \n\t" \
00277     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00278     "movdqa   "REG4", 4*16("dct")     \n\t" \
00279     "movdqa   %%xmm7, 7*16("dct")     \n\t"
00280 
/* iLLM_PASS_SPARSE(dct): reduced column pass that only takes rows 0-3 as
 * input. ff_idct_xvid_sse2 reaches it only when the tests for rows 4, 5, 6
 * and 7 all came back zero.
 *
 * Inputs: xmm6 (ROW1), xmm4 (ROW3), ROW0 and ROW2 (loaded into REG0/SREG2 by
 * the MOV_32_ONLY lines on 32-bit builds), with TAN3 and TAN1 preloaded by
 * iLLM_HEAD.
 *
 * Like iLLM_PASS it shifts every result right arithmetically by 6 and stores
 * all eight output rows with movdqa at dct + k*16, k = 0..7, and on 32-bit
 * builds it spills TAN1 to (%0) while XMMS (the same register) is in use.
 * `dct` is the bare operand text (e.g. "%0"); it must be 16-byte aligned. */
00282 #define iLLM_PASS_SPARSE(dct)               \
00283     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00284     "paddsw   %%xmm4, "TAN3"          \n\t" \
00285     "movdqa   %%xmm6, %%xmm3          \n\t" \
00286     "pmulhw   %%xmm6, "TAN1"          \n\t" \
00287     "movdqa   %%xmm4, %%xmm1          \n\t" \
00288     "psubsw   %%xmm1, %%xmm3          \n\t" \
00289     "paddsw   %%xmm6, %%xmm1          \n\t" \
00290     "movdqa   "TAN1", %%xmm6          \n\t" \
00291     "psubsw   "TAN3", "TAN1"          \n\t" \
00292     "paddsw   %%xmm6, "TAN3"          \n\t" \
00293     "movdqa   %%xmm3, %%xmm6          \n\t" \
00294     "psubsw   "TAN3", %%xmm3          \n\t" \
00295     "paddsw   %%xmm6, "TAN3"          \n\t" \
00296     "movdqa   "MANGLE(sqrt2)", %%xmm4 \n\t" \
00297     "pmulhw   %%xmm4, %%xmm3          \n\t" \
00298     "pmulhw   %%xmm4, "TAN3"          \n\t" \
00299     "paddsw   "TAN3", "TAN3"          \n\t" \
00300     "paddsw   %%xmm3, %%xmm3          \n\t" \
00301     "movdqa   "MANGLE(tan2)", %%xmm5  \n\t" \
00302     MOV_32_ONLY ROW2", "SREG2"        \n\t" \
00303     "pmulhw   "SREG2", %%xmm5         \n\t" \
00304     MOV_32_ONLY ROW0", "REG0"         \n\t" \
00305     "movdqa   "REG0", %%xmm6          \n\t" \
00306     "psubsw   "SREG2", %%xmm6         \n\t" \
00307     "paddsw   "REG0", "SREG2"         \n\t" \
00308     MOV_32_ONLY"  "TAN1", (%0)        \n\t" \
00309     "movdqa   "REG0", "XMMS"          \n\t" \
00310     "psubsw   %%xmm5, "REG0"          \n\t" \
00311     "paddsw   "XMMS", %%xmm5          \n\t" \
00312     "movdqa   %%xmm5, "XMMS"          \n\t" \
00313     "psubsw   "TAN3", %%xmm5          \n\t" \
00314     "paddsw   "XMMS", "TAN3"          \n\t" \
00315     "movdqa   "REG0", "XMMS"          \n\t" \
00316     "psubsw   %%xmm3, "REG0"          \n\t" \
00317     "paddsw   "XMMS", %%xmm3          \n\t" \
00318     MOV_32_ONLY"  (%0), "TAN1"        \n\t" \
00319     "psraw        $6, %%xmm5          \n\t" \
00320     "psraw        $6, "REG0"          \n\t" \
00321     "psraw        $6, "TAN3"          \n\t" \
00322     "psraw        $6, %%xmm3          \n\t" \
00323     "movdqa   "TAN3", 1*16("dct")     \n\t" \
00324     "movdqa   %%xmm3, 2*16("dct")     \n\t" \
00325     "movdqa   "REG0", 5*16("dct")     \n\t" \
00326     "movdqa   %%xmm5, 6*16("dct")     \n\t" \
00327     "movdqa   "SREG2", %%xmm0         \n\t" \
00328     "movdqa   %%xmm6, %%xmm4          \n\t" \
00329     "psubsw   %%xmm1, "SREG2"         \n\t" \
00330     "psubsw   "TAN1", %%xmm6          \n\t" \
00331     "paddsw   %%xmm0, %%xmm1          \n\t" \
00332     "paddsw   %%xmm4, "TAN1"          \n\t" \
00333     "psraw        $6, %%xmm1          \n\t" \
00334     "psraw        $6, "SREG2"         \n\t" \
00335     "psraw        $6, "TAN1"          \n\t" \
00336     "psraw        $6, %%xmm6          \n\t" \
00337     "movdqa   %%xmm1, ("dct")         \n\t" \
00338     "movdqa   "TAN1", 3*16("dct")     \n\t" \
00339     "movdqa   %%xmm6, 4*16("dct")     \n\t" \
00340     "movdqa   "SREG2", 7*16("dct")    \n\t"
00341 
/**
 * In-place 8x8 inverse DCT of the coefficient block at `block`.
 *
 * The assembly reads and writes eight 16-byte rows at block + k*16
 * (k = 0..7) with movdqa, so `block` must be 16-byte aligned and provide
 * 64 16-bit values.
 *
 * Flow: rows 0-2 always go through the row pass (iMTX_MULT); row 3 is
 * skipped when it tests as all-zero. Rows 4, 5, 6 and 7 are then tested into
 * ecx, eax, edx and esi. If all four are zero, the cheaper
 * iLLM_PASS_SPARSE column pass runs; otherwise the remaining row passes are
 * executed starting at the first non-zero row and the full iLLM_PASS runs.
 *
 * NOTE(review): the asm uses mm0-mm2 and a number of xmm registers that do
 * not appear in the clobber list, and no emms is issued in this function --
 * confirm that callers and the build configuration account for this.
 */
00342 inline void ff_idct_xvid_sse2(short *block)
00343 {
00344     __asm__ volatile(
    /* mm0 = 127 in every byte: the bias expected by the TEST_* macros. */
00345     "movq     "MANGLE(m127)", %%mm0                              \n\t"
00346     iMTX_MULT("(%0)",     MANGLE(iTab1), ROUND(walkenIdctRounders),      PUT_EVEN(ROW0))
00347     iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00348     iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00349 
    /* eax = row 3 non-zero?, ecx = row 4 non-zero?; a zero row 3 jumps to the
     * "1:" label at the end of the following iMTX_MULT, leaving ROW3 cleared. */
00350     TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
00351     JZ("%%eax", "1f")
00352     iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00353 
    /* eax = row 5, edx = row 6, esi = row 7 (non-zero flags). */
00354     TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00355     TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
00356     iLLM_HEAD
00357     ASMALIGN(4)
    /* Enter the row-pass chain at the first non-zero row among 4..7; if none
     * is non-zero, fall through to the sparse column pass. */
00358     JNZ("%%ecx", "2f")
00359     JNZ("%%eax", "3f")
00360     JNZ("%%edx", "4f")
00361     JNZ("%%esi", "5f")
00362     iLLM_PASS_SPARSE("%0")
00363     "jmp 6f                                                      \n\t"
00364     "2:                                                          \n\t"
    /* Row 4 is the only row transformed without a rounding term ("#"). */
00365     iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
00366     "3:                                                          \n\t"
00367     iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00368     JZ("%%edx", "1f")
00369     "4:                                                          \n\t"
00370     iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00371     JZ("%%esi", "1f")
00372     "5:                                                          \n\t"
00373     iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
    /* On 32-bit builds TAN3/TAN1 are xmm0/xmm2, which iMTX_MULT overwrites,
     * so the constants have to be reloaded before the column pass. */
00374 #if !ARCH_X86_64
00375     iLLM_HEAD
00376 #endif
00377     iLLM_PASS("%0")
00378     "6:                                                          \n\t"
00379     : "+r"(block)
00380     :
00381     : "%eax", "%ecx", "%edx", "%esi", "memory");
00382 }
00383 
/**
 * Run ff_idct_xvid_sse2() on `block` in place, then hand the result to
 * put_pixels_clamped_mmx() together with `dest` and `line_size`.
 *
 * put_pixels_clamped_mmx() is defined elsewhere; presumably it clamps the
 * samples to 8 bits and stores them to `dest` with a row stride of
 * `line_size` -- TODO confirm against its declaration.
 */
00384 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00385 {
00386     ff_idct_xvid_sse2(block);
00387     put_pixels_clamped_mmx(block, dest, line_size);
00388 }
00389 
/**
 * Run ff_idct_xvid_sse2() on `block` in place, then hand the result to
 * add_pixels_clamped_mmx() together with `dest` and `line_size`.
 *
 * add_pixels_clamped_mmx() is defined elsewhere; presumably it adds the
 * samples to the pixels already in `dest` with clamping to 8 bits --
 * TODO confirm against its declaration.
 */
00390 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00391 {
00392     ff_idct_xvid_sse2(block);
00393     add_pixels_clamped_mmx(block, dest, line_size);
00394 }

Generated on Sat Feb 16 2013 09:23:14 for ffmpeg by  doxygen 1.7.1