libavcodec/x86/h264dsp_mmx.c

/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil_mmx.h"

DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1  ) = 0x0103010301030103ULL;
DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3  ) = 0x0307030703070307ULL;

/***********************************/
/* IDCT */

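/*
 * SUMSUB_BADC: two butterflies in parallel; in terms of the old values:
 *   a' = a+b   b' = b-a   c' = c+d   d' = d-c
 * The doubling trick (paddw x,x then psubw the sum) avoids needing a
 * scratch register.
 */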
#define SUMSUB_BADC( a, b, c, d ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#d", "#c" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "paddw "#d", "#d" \n\t"\
    "psubw "#a", "#b" \n\t"\
    "psubw "#c", "#d" \n\t"

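/*
 * SUMSUBD2_AB: the odd-part butterfly of the 4-point transform, with t as
 * scratch; in terms of the old values:
 *   b' = a + (b>>1)   a' = (a>>1) - b
 */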
#define SUMSUBD2_AB( a, b, t ) \
    "movq  "#b", "#t" \n\t"\
    "psraw  $1 , "#b" \n\t"\
    "paddw "#a", "#b" \n\t"\
    "psraw  $1 , "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

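/*
 * IDCT4_1D: one 1-D pass of the H.264 4x4 inverse transform on four packed
 * rows/columns. In scalar terms, for inputs (b0,b1,b2,b3):
 *   z0 = b0 + b2           z1 = b0 - b2
 *   z2 = (b1>>1) - b3      z3 = b1 + (b3>>1)
 *   out = { z0+z3, z1+z2, z1-z2, z0-z3 }
 * SUMSUB_BA forms z0/z1, SUMSUBD2_AB forms z3/z2, SUMSUB_BADC combines them.
 */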
#define IDCT4_1D( s02, s13, d02, d13, t ) \
    SUMSUB_BA  ( s02, d02 )\
    SUMSUBD2_AB( s13, d13, t )\
    SUMSUB_BADC( d13, s02, s13, d02 )

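/*
 * STORE_DIFF_4P: final rounding shift and add-to-prediction for four pixels:
 * p >>= 6 (the +32 bias was added earlier via ff_pw_32), the four bytes at
 * (%0) are widened through z (which must be zero), added with signed
 * saturation and packed back, i.e. dst[i] = clip_uint8(dst[i] + (p[i]>>6)).
 */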
#define STORE_DIFF_4P( p, t, z ) \
    "psraw      $6,     "#p" \n\t"\
    "movd       (%0),   "#t" \n\t"\
    "punpcklbw "#z",    "#t" \n\t"\
    "paddsw    "#t",    "#p" \n\t"\
    "packuswb  "#z",    "#p" \n\t"\
    "movd      "#p",    (%0) \n\t"

static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    /* Load dct coeffs */
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
    :: "r"(block) );

    __asm__ volatile(
        /* mm1=s02+s13  mm2=s02-s13  mm4=d02+d13  mm0=d02-d13 */
        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )

        "movq      %0,    %%mm6 \n\t"
        /* in: 1,4,0,2  out: 1,2,3,0 */
        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )

        "paddw     %%mm6, %%mm3 \n\t"

        /* mm2=s02+s13  mm3=s02-s13  mm4=d02+d13  mm1=d02-d13 */
        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )

        "pxor %%mm7, %%mm7    \n\t"
    :: "m"(ff_pw_32));

    __asm__ volatile(
    STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
        "add %1, %0             \n\t"
    STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
        : "+r"(dst)
        : "r" ((x86_reg)stride)
    );
}

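#if 0
/* Editor's note: an illustrative scalar equivalent of ff_h264_idct_add_mmx
 * (a sketch in the spirit of ff_h264_idct_add_c from libavcodec/h264idct.c,
 * not part of the original file and kept out of the build; av_clip_uint8()
 * is the libavutil clamp). */
static void h264_idct_add_sketch(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    int16_t tmp[16];

    block[0] += 32;                          /* rounding bias for >>6 below */

    for (i = 0; i < 4; i++) {                /* first 1-D pass (columns) */
        const int z0 =  block[i + 4*0]       +  block[i + 4*2];
        const int z1 =  block[i + 4*0]       -  block[i + 4*2];
        const int z2 = (block[i + 4*1] >> 1) -  block[i + 4*3];
        const int z3 =  block[i + 4*1]       + (block[i + 4*3] >> 1);
        tmp[i + 4*0] = z0 + z3;
        tmp[i + 4*1] = z1 + z2;
        tmp[i + 4*2] = z1 - z2;
        tmp[i + 4*3] = z0 - z3;
    }
    for (i = 0; i < 4; i++) {                /* second pass + add to dst */
        const int z0 =  tmp[4*i + 0]       +  tmp[4*i + 2];
        const int z1 =  tmp[4*i + 0]       -  tmp[4*i + 2];
        const int z2 = (tmp[4*i + 1] >> 1) -  tmp[4*i + 3];
        const int z3 =  tmp[4*i + 1]       + (tmp[4*i + 3] >> 1);
        dst[0] = av_clip_uint8(dst[0] + ((z0 + z3) >> 6));
        dst[1] = av_clip_uint8(dst[1] + ((z1 + z2) >> 6));
        dst[2] = av_clip_uint8(dst[2] + ((z1 - z2) >> 6));
        dst[3] = av_clip_uint8(dst[3] + ((z0 - z3) >> 6));
        dst += stride;
    }
}
#endif

/*
 * One vertical pass of the 8-point inverse transform, applied to four
 * columns at a time (each movq holds the same four columns of one of the
 * eight rows; rows are 8 words = 16 bytes apart). The eight results are
 * left in mm0..mm7 for the caller to transpose or scale and store.
 */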
static inline void h264_idct8_1d(int16_t *block)
{
    __asm__ volatile(
        "movq 112(%0), %%mm7  \n\t"
        "movq  80(%0), %%mm0  \n\t"
        "movq  48(%0), %%mm3  \n\t"
        "movq  16(%0), %%mm5  \n\t"

        "movq   %%mm0, %%mm4  \n\t"
        "movq   %%mm5, %%mm1  \n\t"
        "psraw  $1,    %%mm4  \n\t"
        "psraw  $1,    %%mm1  \n\t"
        "paddw  %%mm0, %%mm4  \n\t"
        "paddw  %%mm5, %%mm1  \n\t"
        "paddw  %%mm7, %%mm4  \n\t"
        "paddw  %%mm0, %%mm1  \n\t"
        "psubw  %%mm5, %%mm4  \n\t"
        "paddw  %%mm3, %%mm1  \n\t"

        "psubw  %%mm3, %%mm5  \n\t"
        "psubw  %%mm3, %%mm0  \n\t"
        "paddw  %%mm7, %%mm5  \n\t"
        "psubw  %%mm7, %%mm0  \n\t"
        "psraw  $1,    %%mm3  \n\t"
        "psraw  $1,    %%mm7  \n\t"
        "psubw  %%mm3, %%mm5  \n\t"
        "psubw  %%mm7, %%mm0  \n\t"

        "movq   %%mm4, %%mm3  \n\t"
        "movq   %%mm1, %%mm7  \n\t"
        "psraw  $2,    %%mm1  \n\t"
        "psraw  $2,    %%mm3  \n\t"
        "paddw  %%mm5, %%mm3  \n\t"
        "psraw  $2,    %%mm5  \n\t"
        "paddw  %%mm0, %%mm1  \n\t"
        "psraw  $2,    %%mm0  \n\t"
        "psubw  %%mm4, %%mm5  \n\t"
        "psubw  %%mm0, %%mm7  \n\t"

        "movq  32(%0), %%mm2  \n\t"
        "movq  96(%0), %%mm6  \n\t"
        "movq   %%mm2, %%mm4  \n\t"
        "movq   %%mm6, %%mm0  \n\t"
        "psraw  $1,    %%mm4  \n\t"
        "psraw  $1,    %%mm6  \n\t"
        "psubw  %%mm0, %%mm4  \n\t"
        "paddw  %%mm2, %%mm6  \n\t"

        "movq    (%0), %%mm2  \n\t"
        "movq  64(%0), %%mm0  \n\t"
        SUMSUB_BA( %%mm0, %%mm2 )
        SUMSUB_BA( %%mm6, %%mm0 )
        SUMSUB_BA( %%mm4, %%mm2 )
        SUMSUB_BA( %%mm7, %%mm6 )
        SUMSUB_BA( %%mm5, %%mm4 )
        SUMSUB_BA( %%mm3, %%mm2 )
        SUMSUB_BA( %%mm1, %%mm0 )
        :: "r"(block)
    );
}

static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    int16_t __attribute__ ((aligned(8))) b2[64];

    block[0] += 32;

    for(i=0; i<2; i++){
        DECLARE_ALIGNED_8(uint64_t, tmp);

        h264_idct8_1d(block+4*i);

        __asm__ volatile(
            "movq   %%mm7,    %0   \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,  8(%1)  \n\t"
            "movq   %%mm6, 24(%1)  \n\t"
            "movq   %%mm7, 40(%1)  \n\t"
            "movq   %%mm4, 56(%1)  \n\t"
            "movq    %0,    %%mm7  \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq   %%mm7,   (%1)  \n\t"
            "movq   %%mm1, 16(%1)  \n\t"
            "movq   %%mm0, 32(%1)  \n\t"
            "movq   %%mm3, 48(%1)  \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        h264_idct8_1d(b2+4*i);

        __asm__ volatile(
            "psraw     $6, %%mm7  \n\t"
            "psraw     $6, %%mm6  \n\t"
            "psraw     $6, %%mm5  \n\t"
            "psraw     $6, %%mm4  \n\t"
            "psraw     $6, %%mm3  \n\t"
            "psraw     $6, %%mm2  \n\t"
            "psraw     $6, %%mm1  \n\t"
            "psraw     $6, %%mm0  \n\t"

            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm5,  16(%0)  \n\t"
            "movq   %%mm3,  32(%0)  \n\t"
            "movq   %%mm1,  48(%0)  \n\t"
            "movq   %%mm0,  64(%0)  \n\t"
            "movq   %%mm2,  80(%0)  \n\t"
            "movq   %%mm4,  96(%0)  \n\t"
            "movq   %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    add_pixels_clamped_mmx(b2, dst, stride);
}

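/*
 * STORE_DIFF_8P: as STORE_DIFF_4P, but for a full 8-pixel row in an XMM
 * register; d is the destination memory operand, t scratch, z must be zero.
 */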
#define STORE_DIFF_8P( p, d, t, z )\
        "movq       "#d", "#t" \n"\
        "psraw       $6,  "#p" \n"\
        "punpcklbw  "#z", "#t" \n"\
        "paddsw     "#t", "#p" \n"\
        "packuswb   "#p", "#p" \n"\
        "movq       "#p", "#d" \n"

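/*
 * 8-point 1-D transform with each row held in one XMM register (a..h).
 * Rows 0 and 4 are loaded from the block (%1) inside the macro, so only
 * the other six rows need to be live on entry; a and e start as scratch.
 */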
#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
        "movdqa     "#c", "#a" \n"\
        "movdqa     "#g", "#e" \n"\
        "psraw       $1,  "#c" \n"\
        "psraw       $1,  "#g" \n"\
        "psubw      "#e", "#c" \n"\
        "paddw      "#a", "#g" \n"\
        "movdqa     "#b", "#e" \n"\
        "psraw       $1,  "#e" \n"\
        "paddw      "#b", "#e" \n"\
        "paddw      "#d", "#e" \n"\
        "paddw      "#f", "#e" \n"\
        "movdqa     "#f", "#a" \n"\
        "psraw       $1,  "#a" \n"\
        "paddw      "#f", "#a" \n"\
        "paddw      "#h", "#a" \n"\
        "psubw      "#b", "#a" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#d", "#f" \n"\
        "paddw      "#h", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "psraw       $1,  "#d" \n"\
        "psraw       $1,  "#h" \n"\
        "psubw      "#d", "#b" \n"\
        "psubw      "#h", "#f" \n"\
        "movdqa     "#e", "#d" \n"\
        "movdqa     "#a", "#h" \n"\
        "psraw       $2,  "#d" \n"\
        "psraw       $2,  "#h" \n"\
        "paddw      "#f", "#d" \n"\
        "paddw      "#b", "#h" \n"\
        "psraw       $2,  "#f" \n"\
        "psraw       $2,  "#b" \n"\
        "psubw      "#f", "#e" \n"\
        "psubw      "#a", "#b" \n"\
        "movdqa 0x00(%1), "#a" \n"\
        "movdqa 0x40(%1), "#f" \n"\
        SUMSUB_BA(f, a)\
        SUMSUB_BA(g, f)\
        SUMSUB_BA(c, a)\
        SUMSUB_BA(e, g)\
        SUMSUB_BA(b, c)\
        SUMSUB_BA(h, a)\
        SUMSUB_BA(d, f)

static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movdqa   0x10(%1), %%xmm1 \n"
        "movdqa   0x20(%1), %%xmm2 \n"
        "movdqa   0x30(%1), %%xmm3 \n"
        "movdqa   0x50(%1), %%xmm5 \n"
        "movdqa   0x60(%1), %%xmm6 \n"
        "movdqa   0x70(%1), %%xmm7 \n"
        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
        "paddw          %4, %%xmm4 \n"
        "movdqa     %%xmm4, 0x00(%1) \n"
        "movdqa     %%xmm2, 0x40(%1) \n"
        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
        "movdqa     %%xmm6, 0x60(%1) \n"
        "movdqa     %%xmm7, 0x70(%1) \n"
        "pxor       %%xmm7, %%xmm7 \n"
        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
        "lea     (%0,%2,4), %0 \n"
        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
        "movdqa   0x60(%1), %%xmm0 \n"
        "movdqa   0x70(%1), %%xmm1 \n"
        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
        :"+r"(dst)
        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
    );
}

static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
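    /* dc may be negative, but paddusb/psubusb saturate: keep +dc in mm0 and
     * -dc in mm1 as unsigned bytes (one of the two packs to zero), then
     * dst = sat(sat(dst + mm0) - mm1) clips dst+dc to [0,255] either way. */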
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd          %0, %%mm2 \n\t"
        "movd          %1, %%mm3 \n\t"
        "movd          %2, %%mm4 \n\t"
        "movd          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movd       %%mm2, %0    \n\t"
        "movd       %%mm3, %1    \n\t"
        "movd       %%mm4, %2    \n\t"
        "movd       %%mm5, %3    \n\t"
        :"+m"(*(uint32_t*)(dst+0*stride)),
         "+m"(*(uint32_t*)(dst+1*stride)),
         "+m"(*(uint32_t*)(dst+2*stride)),
         "+m"(*(uint32_t*)(dst+3*stride))
    );
}

static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    int y;
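    /* same saturating +dc/-dc trick as ff_h264_idct_dc_add_mmx2 above,
     * applied to 8-pixel-wide rows, four rows per iteration */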
    __asm__ volatile(
        "movd          %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor       %%mm1, %%mm1 \n\t"
        "psubw      %%mm0, %%mm1 \n\t"
        "packuswb   %%mm0, %%mm0 \n\t"
        "packuswb   %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    for(y=2; y--; dst += 4*stride){
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint64_t*)(dst+0*stride)),
         "+m"(*(uint64_t*)(dst+1*stride)),
         "+m"(*(uint64_t*)(dst+2*stride)),
         "+m"(*(uint64_t*)(dst+3*stride))
    );
    }
}

//FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
static const uint8_t scan8[16 + 2*4]={
 4+1*8, 5+1*8, 4+2*8, 5+2*8,
 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 1+1*8, 2+1*8,
 1+2*8, 2+2*8,
 1+4*8, 2+4*8,
 1+5*8, 2+5*8,
};

static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ])
            ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        if(nnzc[ scan8[i] ])
            ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}


static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ] || block[i*16])
            ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i++){
        if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx    (dst + block_offset[i], block + i*16, stride);
        else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct8_add_mmx    (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=4){
        int nnz = nnzc[ scan8[i] ];
        if(nnz){
            if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
            else                      ff_h264_idct8_add_sse2   (dst + block_offset[i], block + i*16, stride);
        }
    }
}

static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i++){
        if(nnzc[ scan8[i] ] || block[i*16])
            ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i++){
        if(nnzc[ scan8[i] ])
            ff_h264_idct_add_mmx    (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
        else if(block[i*16])
            ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}

#if CONFIG_GPL && HAVE_YASM
static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movd             %0, %%mm0 \n\t"   //  0 0 X D
        "punpcklwd        %1, %%mm0 \n\t"   //  x X d D
        "paddsw           %2, %%mm0 \n\t"
        "psraw            $6, %%mm0 \n\t"
        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d D D
        "pxor          %%mm1, %%mm1 \n\t"   //  0 0 0 0
        "psubw         %%mm0, %%mm1 \n\t"   // -d-d-D-D
        "packuswb      %%mm1, %%mm0 \n\t"   // -d-d-D-D d d D D
        "pshufw $0xFA, %%mm0, %%mm1 \n\t"   // -d-d-d-d-D-D-D-D
        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d d d D D D D
        ::"m"(block[ 0]),
          "m"(block[16]),
          "m"(ff_pw_32)
    );
    __asm__ volatile(
        "movq          %0, %%mm2 \n\t"
        "movq          %1, %%mm3 \n\t"
        "movq          %2, %%mm4 \n\t"
        "movq          %3, %%mm5 \n\t"
        "paddusb    %%mm0, %%mm2 \n\t"
        "paddusb    %%mm0, %%mm3 \n\t"
        "paddusb    %%mm0, %%mm4 \n\t"
        "paddusb    %%mm0, %%mm5 \n\t"
        "psubusb    %%mm1, %%mm2 \n\t"
        "psubusb    %%mm1, %%mm3 \n\t"
        "psubusb    %%mm1, %%mm4 \n\t"
        "psubusb    %%mm1, %%mm5 \n\t"
        "movq       %%mm2, %0    \n\t"
        "movq       %%mm3, %1    \n\t"
        "movq       %%mm4, %2    \n\t"
        "movq       %%mm5, %3    \n\t"
        :"+m"(*(uint64_t*)(dst+0*stride)),
         "+m"(*(uint64_t*)(dst+1*stride)),
         "+m"(*(uint64_t*)(dst+2*stride)),
         "+m"(*(uint64_t*)(dst+3*stride))
    );
}

extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);

static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=2)
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
}

static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=0; i<16; i+=2){
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
        else if(block[i*16]|block[i*16+16])
            ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
    }
}

static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int i;
    for(i=16; i<16+8; i+=2){
        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
            ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
        else if(block[i*16]|block[i*16+16])
            ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
    }
}
#endif

/***********************************/
/* deblocking */

// out: o = sat(|x-y| - a), i.e. nonzero where |x-y| > a
// clobbers: t
#define DIFF_GT_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "por      "#t", "#o"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"

// out: o = 0xFF where |x-y| <= a, 0x00 where |x-y| > a
// (note the inverted sense relative to DIFF_GT_MMX; callers and their
// usage comments rely on this)
// clobbers: t
#define DIFF_GT2_MMX(x,y,a,o,t)\
    "movq     "#y", "#t"  \n\t"\
    "movq     "#x", "#o"  \n\t"\
    "psubusb  "#x", "#t"  \n\t"\
    "psubusb  "#y", "#o"  \n\t"\
    "psubusb  "#a", "#t"  \n\t"\
    "psubusb  "#a", "#o"  \n\t"\
    "pcmpeqb  "#t", "#o"  \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1
// out: mm5=beta-1, mm7=mask (0xFF where
//      |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta)
// clobbers: mm4,mm6
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb  %%mm4, %%mm4      \n\t"\
    "packuswb  %%mm5, %%mm5      \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) /* |p0-q0| > alpha-1 */\
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) /* |p1-p0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) /* |q1-q0| > beta-1 */\
    "por       %%mm4, %%mm7      \n\t"\
    "pxor      %%mm6, %%mm6      \n\t"\
    "pcmpeqb   %%mm6, %%mm7      \n\t"

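/*
 * H264_DEBLOCK_P0_Q0 evaluates the standard-filter update
 *   delta = av_clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
 *   p0' = av_clip_uint8(p0 + delta),  q0' = av_clip_uint8(q0 - delta)
 * branchlessly, with pavgb doing the rounded halvings and saturating
 * adds/subtracts doing the +-tc clip (mm7 holds tc&mask).
 */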
// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask)
// out: mm1=p0' mm2=q0'
// clobbers: mm0,3-6
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
        "movq    %%mm1              , %%mm5 \n\t"\
        "pxor    %%mm2              , %%mm5 \n\t" /* p0^q0*/\
        "pand    "#pb_01"           , %%mm5 \n\t" /* (p0^q0)&1*/\
        "pcmpeqb %%mm4              , %%mm4 \n\t"\
        "pxor    %%mm4              , %%mm3 \n\t"\
        "pavgb   %%mm0              , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\
        "pavgb   "MANGLE(ff_pb_3)"  , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\
        "pxor    %%mm1              , %%mm4 \n\t"\
        "pavgb   %%mm2              , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\
        "pavgb   %%mm5              , %%mm3 \n\t"\
        "paddusb %%mm4              , %%mm3 \n\t" /* d+128+33*/\
        "movq    "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
        "psubusb %%mm3              , %%mm6 \n\t"\
        "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
        "pminub  %%mm7              , %%mm6 \n\t"\
        "pminub  %%mm7              , %%mm3 \n\t"\
        "psubusb %%mm6              , %%mm1 \n\t"\
        "psubusb %%mm3              , %%mm2 \n\t"\
        "paddusb %%mm3              , %%mm1 \n\t"\
        "paddusb %%mm6              , %%mm2 \n\t"

// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
// clobbers: q2, tmp, tc0
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
        "movq     %%mm1,  "#tmp"   \n\t"\
        "pavgb    %%mm2,  "#tmp"   \n\t"\
        "pavgb    "#tmp", "#q2"    \n\t" /* avg(p2,avg(p0,q0)) */\
        "pxor   "q2addr", "#tmp"   \n\t"\
        "pand     %8,     "#tmp"   \n\t" /* (p2^avg(p0,q0))&1 */\
        "psubusb  "#tmp", "#q2"    \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
        "movq     "#p1",  "#tmp"   \n\t"\
        "psubusb  "#tc0", "#tmp"   \n\t"\
        "paddusb  "#p1",  "#tc0"   \n\t"\
        "pmaxub   "#tmp", "#q2"    \n\t"\
        "pminub   "#tc0", "#q2"    \n\t"\
        "movq     "#q2",  "q1addr" \n\t"

static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    DECLARE_ALIGNED_8(uint64_t, tmp0[2]);

    __asm__ volatile(
        "movq    (%1,%3), %%mm0    \n\t" //p1
        "movq    (%1,%3,2), %%mm1  \n\t" //p0
        "movq    (%2),    %%mm2    \n\t" //q0
        "movq    (%2,%3), %%mm3    \n\t" //q1
        H264_DEBLOCK_MASK(%6, %7)

        "movd      %5,    %%mm4    \n\t"
        "punpcklbw %%mm4, %%mm4    \n\t"
        "punpcklwd %%mm4, %%mm4    \n\t"
        "pcmpeqb   %%mm3, %%mm3    \n\t"
        "movq      %%mm4, %%mm6    \n\t"
        "pcmpgtb   %%mm3, %%mm4    \n\t"
        "movq      %%mm6, 8+%0     \n\t"
        "pand      %%mm4, %%mm7    \n\t"
        "movq      %%mm7, %0       \n\t"

        /* filter p1 */
        "movq     (%1),   %%mm3    \n\t" //p2
        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1
        "pand     %%mm7,  %%mm6    \n\t" // mask & |p2-p0|<beta
        "pand     8+%0,   %%mm7    \n\t" // mask & tc0
        "movq     %%mm7,  %%mm4    \n\t"
        "psubb    %%mm6,  %%mm7    \n\t"
        "pand     %%mm4,  %%mm6    \n\t" // mask & |p2-p0|<beta & tc0
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)

        /* filter q1 */
        "movq    (%2,%3,2), %%mm4  \n\t" //q2
        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1
        "pand     %0,     %%mm6    \n\t"
        "movq     8+%0,   %%mm5    \n\t" // could be merged with the pand below, but that is slower
        "pand     %%mm6,  %%mm5    \n\t"
        "psubb    %%mm6,  %%mm7    \n\t"
        "movq    (%2,%3), %%mm3    \n\t"
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)

        /* filter p0, q0 */
        H264_DEBLOCK_P0_Q0(%8, unused)
        "movq      %%mm1, (%1,%3,2) \n\t"
        "movq      %%mm2, (%2)      \n\t"

        : "=m"(*tmp0)
        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
          "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
          "m"(ff_bone)
    );
}

static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
}
static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    // also, it only needs to transpose 6x8
    DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
    int i;
    for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
        if((tc0[0] & tc0[1]) < 0)
            continue;
        transpose4x4(trans,       pix-4,          8, stride);
        transpose4x4(trans  +4*8, pix,            8, stride);
        transpose4x4(trans+4,     pix-4+4*stride, 8, stride);
        transpose4x4(trans+4+4*8, pix  +4*stride, 8, stride);
        h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
        transpose4x4(pix-2,          trans  +2*8, stride, 8);
        transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
    }
}

static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    __asm__ volatile(
        "movq    (%0),    %%mm0     \n\t" //p1
        "movq    (%0,%2), %%mm1     \n\t" //p0
        "movq    (%1),    %%mm2     \n\t" //q0
        "movq    (%1,%2), %%mm3     \n\t" //q1
        H264_DEBLOCK_MASK(%4, %5)
        "movd      %3,    %%mm6     \n\t"
        "punpcklbw %%mm6, %%mm6     \n\t"
        "pand      %%mm6, %%mm7     \n\t" // mm7 = tc&mask
        H264_DEBLOCK_P0_Q0(%6, %7)
        "movq      %%mm1, (%0,%2)   \n\t"
        "movq      %%mm2, (%1)      \n\t"

        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "r"(*(uint32_t*)tc0),
           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
    );
}

static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}

static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}

// p0 = (p0 + q1 + 2*p1 + 2) >> 2
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq    "#p0", %%mm4  \n\t"\
    "pxor    "#q1", %%mm4  \n\t"\
    "pand   "#one", %%mm4  \n\t" /* mm4 = (p0^q1)&1 */\
    "pavgb   "#q1", "#p0"  \n\t"\
    "psubusb %%mm4, "#p0"  \n\t"\
    "pavgb   "#p1", "#p0"  \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */

static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
{
    __asm__ volatile(
        "movq    (%0),    %%mm0     \n\t"
        "movq    (%0,%2), %%mm1     \n\t"
        "movq    (%1),    %%mm2     \n\t"
        "movq    (%1,%2), %%mm3     \n\t"
        H264_DEBLOCK_MASK(%3, %4)
        "movq    %%mm1,   %%mm5     \n\t"
        "movq    %%mm2,   %%mm6     \n\t"
        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5) //p0'
        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5) //q0'
        "psubb   %%mm5,   %%mm1     \n\t"
        "psubb   %%mm6,   %%mm2     \n\t"
        "pand    %%mm7,   %%mm1     \n\t"
        "pand    %%mm7,   %%mm2     \n\t"
        "paddb   %%mm5,   %%mm1     \n\t"
        "paddb   %%mm6,   %%mm2     \n\t"
        "movq    %%mm1,   (%0,%2)   \n\t"
        "movq    %%mm2,   (%1)      \n\t"
        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "m"(alpha1), "m"(beta1), "m"(ff_bone)
    );
}

static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}

static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    //FIXME: could cut some load/stores by merging transpose with filter
    DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
    transpose4x4(trans, pix-2, 8, stride);
    transpose4x4(trans+4, pix-2+4*stride, 8, stride);
    h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
    transpose4x4(pix-2, trans, stride, 8);
    transpose4x4(pix-2+4*stride, trans+4, stride, 8);
}

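/* Computes the boundary strengths bS used by the loop filter on non-intra
 * macroblocks: for each 4-pixel edge, bS = 2 if either side has nonzero
 * coefficients, else 1 if the two sides use different reference frames or
 * their motion vectors differ by 4 or more in quarter-pel units, else 0.
 * In field mode the vertical mv threshold is halved via ff_pb_3_1 and
 * ff_pb_7_3. */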
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    int dir;
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movq %0, %%mm6 \n\t"
        "movq %1, %%mm5 \n\t"
        "movq %2, %%mm4 \n\t"
        ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm5 \n\t"
            "movq %1, %%mm4 \n\t"
            ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
        );

    // could do a special case for dir==0 && edges==1, but it only reduces the
    // average filter time by 1.2%
    for( dir=1; dir>=0; dir-- ) {
        const int d_idx = dir ? -8 : -1;
        const int mask_mv = dir ? mask_mv1 : mask_mv0;
        DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
        int b_idx, edge, l;
        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
            __asm__ volatile(
                "pand %0, %%mm0 \n\t"
                ::"m"(mask_dir)
            );
            if(!(mask_mv & edge)) {
                __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
                for( l = bidir; l >= 0; l-- ) {
                    __asm__ volatile(
                        "movd %0, %%mm1 \n\t"
                        "punpckldq %1, %%mm1 \n\t"
                        "movq %%mm1, %%mm2 \n\t"
                        "psrlw $7, %%mm2 \n\t"
                        "pand %%mm6, %%mm2 \n\t"
                        "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1
                        "punpckldq %%mm1, %%mm2 \n\t"
                        "pcmpeqb %%mm2, %%mm1 \n\t"
                        "paddb %%mm6, %%mm1 \n\t"
                        "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
                        "por %%mm1, %%mm0 \n\t"

                        "movq %2, %%mm1 \n\t"
                        "movq %3, %%mm2 \n\t"
                        "psubw %4, %%mm1 \n\t"
                        "psubw %5, %%mm2 \n\t"
                        "packsswb %%mm2, %%mm1 \n\t"
                        "paddb %%mm5, %%mm1 \n\t"
                        "pminub %%mm4, %%mm1 \n\t"
                        "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
                        "por %%mm1, %%mm0 \n\t"
                        ::"m"(ref[l][b_idx]),
                          "m"(ref[l][b_idx+d_idx]),
                          "m"(mv[l][b_idx][0]),
                          "m"(mv[l][b_idx+2][0]),
                          "m"(mv[l][b_idx+d_idx][0]),
                          "m"(mv[l][b_idx+d_idx+2][0])
                    );
                }
            }
            __asm__ volatile(
                "movd %0, %%mm1 \n\t"
                "por  %1, %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
                ::"m"(nnz[b_idx]),
                  "m"(nnz[b_idx+d_idx])
            );
            __asm__ volatile(
                "pcmpeqw %%mm7, %%mm0 \n\t"
                "pcmpeqw %%mm7, %%mm0 \n\t" // intentional double (x==0): mm0 = (mm0 != 0)
                "psrlw $15, %%mm0 \n\t" // nonzero -> 1
                "psrlw $14, %%mm1 \n\t"
                "movq %%mm0, %%mm2 \n\t"
                "por %%mm1, %%mm2 \n\t"
                "psrlw $1, %%mm1 \n\t"
                "pandn %%mm2, %%mm1 \n\t"
                "movq %%mm1, %0 \n\t"
                :"=m"(*bS[dir][edge])
                ::"memory"
            );
        }
        edges = 4;
        step = 1;
    }
    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

/***********************************/
/* motion compensation */

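/*
 * QPEL_H264V_MM: one output row of the 6-tap (1,-5,20,20,-5,1) H.264
 * interpolation filter.  A..F hold six consecutive source rows as words:
 *   T = (A + F + 20*(C+D) - 5*(B+E) + 16) >> 5
 * with %4 = ff_pw_5 and %5 = ff_pw_16 in the callers; the result is packed
 * to bytes and written through OP, while the next source row is loaded
 * into F and the src/dst pointers are advanced as a side effect.
 */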
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw %4, "#T"            \n\t"\
        "paddw %5, "#A"             \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "psraw $5, "#T"             \n\t"\
        "packuswb "#T", "#T"        \n\t"\
        OP(T, (%1), A, d)\
        "add %3, %1                 \n\t"

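/*
 * QPEL_H264HV_MM: as QPEL_H264V_MM, but the 16-bit intermediate
 *   T = A + F + 20*(C+D) - 5*(B+E) + 16
 * is stored unshifted at OF(%1) in the tmp buffer for the second
 * (horizontal) pass; here %3 = ff_pw_5 and %4 = ff_pw_16.
 */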
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
        "mov"#q" "#C", "#T"         \n\t"\
        "mov"#d" (%0), "#F"         \n\t"\
        "paddw "#D", "#T"           \n\t"\
        "psllw $2, "#T"             \n\t"\
        "paddw %4, "#A"             \n\t"\
        "psubw "#B", "#T"           \n\t"\
        "psubw "#E", "#T"           \n\t"\
        "punpcklbw "#Z", "#F"       \n\t"\
        "pmullw %3, "#T"            \n\t"\
        "paddw "#F", "#A"           \n\t"\
        "add %2, %0                 \n\t"\
        "paddw "#A", "#T"           \n\t"\
        "mov"#q" "#T", "#OF"(%1)    \n\t"

#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)


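/* QPEL_H264 expands to the whole family of quarter-pel lowpass helpers
 * (4- and 8-wide horizontal, vertical and 2-D hv variants) for one
 * OPNAME/OP pair; OP is the store macro, writing either directly (put)
 * or averaging with the destination (avg). */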
00940 #define QPEL_H264(OPNAME, OP, MMX)\
00941 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00942     int h=4;\
00943 \
00944     __asm__ volatile(\
00945         "pxor %%mm7, %%mm7          \n\t"\
00946         "movq %5, %%mm4             \n\t"\
00947         "movq %6, %%mm5             \n\t"\
00948         "1:                         \n\t"\
00949         "movd  -1(%0), %%mm1        \n\t"\
00950         "movd    (%0), %%mm2        \n\t"\
00951         "movd   1(%0), %%mm3        \n\t"\
00952         "movd   2(%0), %%mm0        \n\t"\
00953         "punpcklbw %%mm7, %%mm1     \n\t"\
00954         "punpcklbw %%mm7, %%mm2     \n\t"\
00955         "punpcklbw %%mm7, %%mm3     \n\t"\
00956         "punpcklbw %%mm7, %%mm0     \n\t"\
00957         "paddw %%mm0, %%mm1         \n\t"\
00958         "paddw %%mm3, %%mm2         \n\t"\
00959         "movd  -2(%0), %%mm0        \n\t"\
00960         "movd   3(%0), %%mm3        \n\t"\
00961         "punpcklbw %%mm7, %%mm0     \n\t"\
00962         "punpcklbw %%mm7, %%mm3     \n\t"\
00963         "paddw %%mm3, %%mm0         \n\t"\
00964         "psllw $2, %%mm2            \n\t"\
00965         "psubw %%mm1, %%mm2         \n\t"\
00966         "pmullw %%mm4, %%mm2        \n\t"\
00967         "paddw %%mm5, %%mm0         \n\t"\
00968         "paddw %%mm2, %%mm0         \n\t"\
00969         "psraw $5, %%mm0            \n\t"\
00970         "packuswb %%mm0, %%mm0      \n\t"\
00971         OP(%%mm0, (%1),%%mm6, d)\
00972         "add %3, %0                 \n\t"\
00973         "add %4, %1                 \n\t"\
00974         "decl %2                    \n\t"\
00975         " jnz 1b                    \n\t"\
00976         : "+a"(src), "+c"(dst), "+g"(h)\
00977         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00978         : "memory"\
00979     );\
00980 }\
00981 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
00982     int h=4;\
00983     __asm__ volatile(\
00984         "pxor %%mm7, %%mm7          \n\t"\
00985         "movq %0, %%mm4             \n\t"\
00986         "movq %1, %%mm5             \n\t"\
00987         :: "m"(ff_pw_5), "m"(ff_pw_16)\
00988     );\
00989     do{\
00990     __asm__ volatile(\
00991         "movd  -1(%0), %%mm1        \n\t"\
00992         "movd    (%0), %%mm2        \n\t"\
00993         "movd   1(%0), %%mm3        \n\t"\
00994         "movd   2(%0), %%mm0        \n\t"\
00995         "punpcklbw %%mm7, %%mm1     \n\t"\
00996         "punpcklbw %%mm7, %%mm2     \n\t"\
00997         "punpcklbw %%mm7, %%mm3     \n\t"\
00998         "punpcklbw %%mm7, %%mm0     \n\t"\
00999         "paddw %%mm0, %%mm1         \n\t"\
01000         "paddw %%mm3, %%mm2         \n\t"\
01001         "movd  -2(%0), %%mm0        \n\t"\
01002         "movd   3(%0), %%mm3        \n\t"\
01003         "punpcklbw %%mm7, %%mm0     \n\t"\
01004         "punpcklbw %%mm7, %%mm3     \n\t"\
01005         "paddw %%mm3, %%mm0         \n\t"\
01006         "psllw $2, %%mm2            \n\t"\
01007         "psubw %%mm1, %%mm2         \n\t"\
01008         "pmullw %%mm4, %%mm2        \n\t"\
01009         "paddw %%mm5, %%mm0         \n\t"\
01010         "paddw %%mm2, %%mm0         \n\t"\
01011         "movd   (%2), %%mm3         \n\t"\
01012         "psraw $5, %%mm0            \n\t"\
01013         "packuswb %%mm0, %%mm0      \n\t"\
01014         PAVGB" %%mm3, %%mm0         \n\t"\
01015         OP(%%mm0, (%1),%%mm6, d)\
01016         "add %4, %0                 \n\t"\
01017         "add %4, %1                 \n\t"\
01018         "add %3, %2                 \n\t"\
01019         : "+a"(src), "+c"(dst), "+d"(src2)\
01020         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
01021         : "memory"\
01022     );\
01023     }while(--h);\
01024 }\
01025 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01026     src -= 2*srcStride;\
01027     __asm__ volatile(\
01028         "pxor %%mm7, %%mm7          \n\t"\
01029         "movd (%0), %%mm0           \n\t"\
01030         "add %2, %0                 \n\t"\
01031         "movd (%0), %%mm1           \n\t"\
01032         "add %2, %0                 \n\t"\
01033         "movd (%0), %%mm2           \n\t"\
01034         "add %2, %0                 \n\t"\
01035         "movd (%0), %%mm3           \n\t"\
01036         "add %2, %0                 \n\t"\
01037         "movd (%0), %%mm4           \n\t"\
01038         "add %2, %0                 \n\t"\
01039         "punpcklbw %%mm7, %%mm0     \n\t"\
01040         "punpcklbw %%mm7, %%mm1     \n\t"\
01041         "punpcklbw %%mm7, %%mm2     \n\t"\
01042         "punpcklbw %%mm7, %%mm3     \n\t"\
01043         "punpcklbw %%mm7, %%mm4     \n\t"\
01044         QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
01045         QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
01046         QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
01047         QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
01048          \
01049         : "+a"(src), "+c"(dst)\
01050         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01051         : "memory"\
01052     );\
01053 }\
01054 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
01055     int h=4;\
01056     int w=3;\
01057     src -= 2*srcStride+2;\
01058     while(w--){\
01059         __asm__ volatile(\
01060             "pxor %%mm7, %%mm7      \n\t"\
01061             "movd (%0), %%mm0       \n\t"\
01062             "add %2, %0             \n\t"\
01063             "movd (%0), %%mm1       \n\t"\
01064             "add %2, %0             \n\t"\
01065             "movd (%0), %%mm2       \n\t"\
01066             "add %2, %0             \n\t"\
01067             "movd (%0), %%mm3       \n\t"\
01068             "add %2, %0             \n\t"\
01069             "movd (%0), %%mm4       \n\t"\
01070             "add %2, %0             \n\t"\
01071             "punpcklbw %%mm7, %%mm0 \n\t"\
01072             "punpcklbw %%mm7, %%mm1 \n\t"\
01073             "punpcklbw %%mm7, %%mm2 \n\t"\
01074             "punpcklbw %%mm7, %%mm3 \n\t"\
01075             "punpcklbw %%mm7, %%mm4 \n\t"\
01076             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
01077             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
01078             QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
01079             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
01080              \
01081             : "+a"(src)\
01082             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01083             : "memory"\
01084         );\
01085         tmp += 4;\
01086         src += 4 - 9*srcStride;\
01087     }\
01088     tmp -= 3*4;\
01089     __asm__ volatile(\
01090         "1:                         \n\t"\
01091         "movq     (%0), %%mm0       \n\t"\
01092         "paddw  10(%0), %%mm0       \n\t"\
01093         "movq    2(%0), %%mm1       \n\t"\
01094         "paddw   8(%0), %%mm1       \n\t"\
01095         "movq    4(%0), %%mm2       \n\t"\
01096         "paddw   6(%0), %%mm2       \n\t"\
01097         "psubw %%mm1, %%mm0         \n\t"/*a-b   (abccba)*/\
01098         "psraw $2, %%mm0            \n\t"/*(a-b)/4 */\
01099         "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
01100         "paddsw %%mm2, %%mm0        \n\t"\
01101         "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
01102         "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
01103         "psraw $6, %%mm0            \n\t"\
01104         "packuswb %%mm0, %%mm0      \n\t"\
01105         OP(%%mm0, (%1),%%mm7, d)\
01106         "add $24, %0                \n\t"\
01107         "add %3, %1                 \n\t"\
01108         "decl %2                    \n\t"\
01109         " jnz 1b                    \n\t"\
01110         : "+a"(tmp), "+c"(dst), "+g"(h)\
01111         : "S"((x86_reg)dstStride)\
01112         : "memory"\
01113     );\
01114 }\
01115 \
01116 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01117     int h=8;\
01118     __asm__ volatile(\
01119         "pxor %%mm7, %%mm7          \n\t"\
01120         "movq %5, %%mm6             \n\t"\
01121         "1:                         \n\t"\
01122         "movq    (%0), %%mm0        \n\t"\
01123         "movq   1(%0), %%mm2        \n\t"\
01124         "movq %%mm0, %%mm1          \n\t"\
01125         "movq %%mm2, %%mm3          \n\t"\
01126         "punpcklbw %%mm7, %%mm0     \n\t"\
01127         "punpckhbw %%mm7, %%mm1     \n\t"\
01128         "punpcklbw %%mm7, %%mm2     \n\t"\
01129         "punpckhbw %%mm7, %%mm3     \n\t"\
01130         "paddw %%mm2, %%mm0         \n\t"\
01131         "paddw %%mm3, %%mm1         \n\t"\
01132         "psllw $2, %%mm0            \n\t"\
01133         "psllw $2, %%mm1            \n\t"\
01134         "movq   -1(%0), %%mm2       \n\t"\
01135         "movq    2(%0), %%mm4       \n\t"\
01136         "movq %%mm2, %%mm3          \n\t"\
01137         "movq %%mm4, %%mm5          \n\t"\
01138         "punpcklbw %%mm7, %%mm2     \n\t"\
01139         "punpckhbw %%mm7, %%mm3     \n\t"\
01140         "punpcklbw %%mm7, %%mm4     \n\t"\
01141         "punpckhbw %%mm7, %%mm5     \n\t"\
01142         "paddw %%mm4, %%mm2         \n\t"\
01143         "paddw %%mm3, %%mm5         \n\t"\
01144         "psubw %%mm2, %%mm0         \n\t"\
01145         "psubw %%mm5, %%mm1         \n\t"\
01146         "pmullw %%mm6, %%mm0        \n\t"\
01147         "pmullw %%mm6, %%mm1        \n\t"\
01148         "movd   -2(%0), %%mm2       \n\t"\
01149         "movd    7(%0), %%mm5       \n\t"\
01150         "punpcklbw %%mm7, %%mm2     \n\t"\
01151         "punpcklbw %%mm7, %%mm5     \n\t"\
01152         "paddw %%mm3, %%mm2         \n\t"\
01153         "paddw %%mm5, %%mm4         \n\t"\
01154         "movq %6, %%mm5             \n\t"\
01155         "paddw %%mm5, %%mm2         \n\t"\
01156         "paddw %%mm5, %%mm4         \n\t"\
01157         "paddw %%mm2, %%mm0         \n\t"\
01158         "paddw %%mm4, %%mm1         \n\t"\
01159         "psraw $5, %%mm0            \n\t"\
01160         "psraw $5, %%mm1            \n\t"\
01161         "packuswb %%mm1, %%mm0      \n\t"\
01162         OP(%%mm0, (%1),%%mm5, q)\
01163         "add %3, %0                 \n\t"\
01164         "add %4, %1                 \n\t"\
01165         "decl %2                    \n\t"\
01166         " jnz 1b                    \n\t"\
01167         : "+a"(src), "+c"(dst), "+g"(h)\
01168         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
01169         : "memory"\
01170     );\
01171 }\
01172 \
01173 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01174     int h=8;\
01175     __asm__ volatile(\
01176         "pxor %%mm7, %%mm7          \n\t"\
01177         "movq %0, %%mm6             \n\t"\
01178         :: "m"(ff_pw_5)\
01179     );\
01180     do{\
01181     __asm__ volatile(\
01182         "movq    (%0), %%mm0        \n\t"\
01183         "movq   1(%0), %%mm2        \n\t"\
01184         "movq %%mm0, %%mm1          \n\t"\
01185         "movq %%mm2, %%mm3          \n\t"\
01186         "punpcklbw %%mm7, %%mm0     \n\t"\
01187         "punpckhbw %%mm7, %%mm1     \n\t"\
01188         "punpcklbw %%mm7, %%mm2     \n\t"\
01189         "punpckhbw %%mm7, %%mm3     \n\t"\
01190         "paddw %%mm2, %%mm0         \n\t"\
01191         "paddw %%mm3, %%mm1         \n\t"\
01192         "psllw $2, %%mm0            \n\t"\
01193         "psllw $2, %%mm1            \n\t"\
01194         "movq   -1(%0), %%mm2       \n\t"\
01195         "movq    2(%0), %%mm4       \n\t"\
01196         "movq %%mm2, %%mm3          \n\t"\
01197         "movq %%mm4, %%mm5          \n\t"\
01198         "punpcklbw %%mm7, %%mm2     \n\t"\
01199         "punpckhbw %%mm7, %%mm3     \n\t"\
01200         "punpcklbw %%mm7, %%mm4     \n\t"\
01201         "punpckhbw %%mm7, %%mm5     \n\t"\
01202         "paddw %%mm4, %%mm2         \n\t"\
01203         "paddw %%mm3, %%mm5         \n\t"\
01204         "psubw %%mm2, %%mm0         \n\t"\
01205         "psubw %%mm5, %%mm1         \n\t"\
01206         "pmullw %%mm6, %%mm0        \n\t"\
01207         "pmullw %%mm6, %%mm1        \n\t"\
01208         "movd   -2(%0), %%mm2       \n\t"\
01209         "movd    7(%0), %%mm5       \n\t"\
01210         "punpcklbw %%mm7, %%mm2     \n\t"\
01211         "punpcklbw %%mm7, %%mm5     \n\t"\
01212         "paddw %%mm3, %%mm2         \n\t"\
01213         "paddw %%mm5, %%mm4         \n\t"\
01214         "movq %5, %%mm5             \n\t"\
01215         "paddw %%mm5, %%mm2         \n\t"\
01216         "paddw %%mm5, %%mm4         \n\t"\
01217         "paddw %%mm2, %%mm0         \n\t"\
01218         "paddw %%mm4, %%mm1         \n\t"\
01219         "psraw $5, %%mm0            \n\t"\
01220         "psraw $5, %%mm1            \n\t"\
01221         "movq (%2), %%mm4           \n\t"\
01222         "packuswb %%mm1, %%mm0      \n\t"\
01223         PAVGB" %%mm4, %%mm0         \n\t"\
01224         OP(%%mm0, (%1),%%mm5, q)\
01225         "add %4, %0                 \n\t"\
01226         "add %4, %1                 \n\t"\
01227         "add %3, %2                 \n\t"\
01228         : "+a"(src), "+c"(dst), "+d"(src2)\
01229         : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
01230           "m"(ff_pw_16)\
01231         : "memory"\
01232     );\
01233     }while(--h);\
01234 }\
01235 \
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
        QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
        QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
        QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
        QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
        QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
        QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
     );\
     if(h==16){\
        __asm__ volatile(\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
            QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
            QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
            QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
            QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
            QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
            \
           : "+a"(src), "+c"(dst)\
           : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
           : "memory"\
        );\
     }\
     src += 4-(h+5)*srcStride;\
     dst += 4-h*dstStride;\
   }\
}\
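/* First pass of the 2-D (hv) filter: applies the vertical 6-tap filter and\
 * stores the raw 16-bit sums into tmp, apparently with the +16 rounding term\
 * already folded in and no shift yet. Rows in tmp are 24 int16s (48 bytes)\
 * apart; the loop covers 4-column strips, producing size+8 columns so the\
 * later horizontal pass has the size+5 it needs. */\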
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
    int w = (size+8)>>2;\
    src -= 2*srcStride+2;\
    while(w--){\
        __asm__ volatile(\
            "pxor %%mm7, %%mm7      \n\t"\
            "movd (%0), %%mm0       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm1       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm2       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm3       \n\t"\
            "add %2, %0             \n\t"\
            "movd (%0), %%mm4       \n\t"\
            "add %2, %0             \n\t"\
            "punpcklbw %%mm7, %%mm0 \n\t"\
            "punpcklbw %%mm7, %%mm1 \n\t"\
            "punpcklbw %%mm7, %%mm2 \n\t"\
            "punpcklbw %%mm7, %%mm3 \n\t"\
            "punpcklbw %%mm7, %%mm4 \n\t"\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
            QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
            QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
            QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
            QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
            QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
            QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
            : "+a"(src)\
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
        if(size==16){\
            __asm__ volatile(\
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
                QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
                QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
                QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
                QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
                QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
                QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
                : "+a"(src)\
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
                : "memory"\
            );\
        }\
        tmp += 4;\
        src += 4 - (size+5)*srcStride;\
    }\
}\
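/* Second pass of the hv filter: runs the 6-tap filter horizontally over the\
 * 16-bit intermediates in tmp. The tap weights are factored as\
 *   20*(c+d) - 5*(b+e) + (a+f)\
 * and evaluated with staged psraw shifts (2+2+6 = 10 bits total) so the\
 * arithmetic stays inside 16-bit lanes; with the rounding folded in by the\
 * first pass this is believed to be effectively (sum + 512) >> 10. */\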
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int w = size>>4;\
    do{\
    int h = size;\
    __asm__ volatile(\
        "1:                         \n\t"\
        "movq     (%0), %%mm0       \n\t"\
        "movq    8(%0), %%mm3       \n\t"\
        "movq    2(%0), %%mm1       \n\t"\
        "movq   10(%0), %%mm4       \n\t"\
        "paddw   %%mm4, %%mm0       \n\t"\
        "paddw   %%mm3, %%mm1       \n\t"\
        "paddw  18(%0), %%mm3       \n\t"\
        "paddw  16(%0), %%mm4       \n\t"\
        "movq    4(%0), %%mm2       \n\t"\
        "movq   12(%0), %%mm5       \n\t"\
        "paddw   6(%0), %%mm2       \n\t"\
        "paddw  14(%0), %%mm5       \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "psubw %%mm1, %%mm0         \n\t"\
        "psubw %%mm4, %%mm3         \n\t"\
        "paddsw %%mm2, %%mm0        \n\t"\
        "paddsw %%mm5, %%mm3        \n\t"\
        "psraw $2, %%mm0            \n\t"\
        "psraw $2, %%mm3            \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm5, %%mm3         \n\t"\
        "psraw $6, %%mm0            \n\t"\
        "psraw $6, %%mm3            \n\t"\
        "packuswb %%mm3, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm7, q)\
        "add $48, %0                \n\t"\
        "add %3, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(tmp), "+c"(dst), "+g"(h)\
        : "S"((x86_reg)dstStride)\
        : "memory"\
    );\
    tmp += 8 - size*24;\
    dst += 8 - size*dstStride;\
    }while(w--);\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
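/* Note: the l2 wrapper below advances src by dstStride; this appears to rely\
 * on every caller passing equal source and destination strides (the l2 entry\
 * points take no separate srcStride parameter). */\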
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 16);\
}\
\
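/* The pixels*_l2_shift5 helpers average two half-pel planes for the mc12 and\
 * mc32 quarter-pel positions: the vertical-only intermediates (still 16-bit\
 * and unshifted, hence the psraw $5) against the already-packed hv result.\
 * Offsets 24/48/72 reflect the 24-int16 row pitch of tmp; the 4-wide version\
 * ignores its h argument and presumably always handles h == 4. */\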
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq    24(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        "lea  (%0,%3,2), %0             \n\t"\
        "lea  (%2,%4,2), %2             \n\t"\
        "movq    48(%1), %%mm0          \n\t"\
        "movq    72(%1), %%mm1          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "packuswb %%mm0, %%mm0          \n\t"\
        "packuswb %%mm1, %%mm1          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm1          \n\t"\
        OP(%%mm0, (%2),    %%mm4, d)\
        OP(%%mm1, (%2,%4), %%mm5, d)\
        :"+a"(src8), "+c"(src16), "+d"(dst)\
        :"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
        :"memory");\
}\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    do{\
    __asm__ volatile(\
        "movq      (%1), %%mm0          \n\t"\
        "movq     8(%1), %%mm1          \n\t"\
        "movq    48(%1), %%mm2          \n\t"\
        "movq  8+48(%1), %%mm3          \n\t"\
        "psraw      $5,  %%mm0          \n\t"\
        "psraw      $5,  %%mm1          \n\t"\
        "psraw      $5,  %%mm2          \n\t"\
        "psraw      $5,  %%mm3          \n\t"\
        "packuswb %%mm1, %%mm0          \n\t"\
        "packuswb %%mm3, %%mm2          \n\t"\
        PAVGB"     (%0), %%mm0          \n\t"\
        PAVGB"  (%0,%3), %%mm2          \n\t"\
        OP(%%mm0, (%2), %%mm5, q)\
        OP(%%mm2, (%2,%4), %%mm5, q)\
        ::"a"(src8), "c"(src16), "d"(dst),\
          "r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
        :"memory");\
        src8 += 2L*src8Stride;\
        src16 += 48;\
        dst += 2L*dstStride;\
    }while(h-=2);\
}\
static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst  , src16  , src8  , dstStride, src8Stride, h);\
    OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\


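/* On x86-64 there are enough XMM registers to run the 16-wide horizontal
 * l2 filter in a single pass, keeping both 8-pixel halves and the constants
 * live in xmm0-xmm15; 32-bit builds fall back to two 8-wide calls below. */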
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=16;\
    __asm__ volatile(\
        "pxor %%xmm15, %%xmm15      \n\t"\
        "movdqa %6, %%xmm14         \n\t"\
        "movdqa %7, %%xmm13         \n\t"\
        "1:                         \n\t"\
        "lddqu    3(%0), %%xmm1     \n\t"\
        "lddqu   -5(%0), %%xmm7     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm15, %%xmm1  \n\t"\
        "punpcklbw %%xmm15, %%xmm0  \n\t"\
        "punpcklbw %%xmm15, %%xmm7  \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm0, %%xmm6     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm0, %%xmm8     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm0, %%xmm9     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "movdqa  %%xmm0, %%xmm10    \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $6, %%xmm7, %%xmm10\n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $8, %%xmm7, %%xmm9 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "palignr $10,%%xmm7, %%xmm8 \n\t"\
        "paddw   %%xmm1, %%xmm5     \n\t"\
        "paddw   %%xmm0, %%xmm10    \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $12,%%xmm7, %%xmm6 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "palignr $14,%%xmm7, %%xmm0 \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm8, %%xmm6     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "paddw   %%xmm9, %%xmm0     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psllw   $2,     %%xmm6     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "psubw   %%xmm0, %%xmm6     \n\t"\
        "paddw   %%xmm13,%%xmm5     \n\t"\
        "paddw   %%xmm13,%%xmm10    \n\t"\
        "pmullw  %%xmm14,%%xmm2     \n\t"\
        "pmullw  %%xmm14,%%xmm6     \n\t"\
        "lddqu   (%2),   %%xmm3     \n\t"\
        "paddw   %%xmm5, %%xmm2     \n\t"\
        "paddw   %%xmm10,%%xmm6     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "psraw   $5,     %%xmm6     \n\t"\
        "packuswb %%xmm2,%%xmm6     \n\t"\
        "pavgb   %%xmm3, %%xmm6     \n\t"\
        OP(%%xmm6, (%1), %%xmm4, dqa)\
        "add %5, %0                 \n\t"\
        "add %5, %1                 \n\t"\
        "add %4, %2                 \n\t"\
        "decl %3                    \n\t"\
        "jg 1b                      \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
    src += 8*dstStride;\
    dst += 8*dstStride;\
    src2 += 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64

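/* SSSE3 horizontal lowpass: one unaligned lddqu per row plus palignr
 * synthesizes the five shifted neighbour vectors, and the taps are factored
 * as (((c+d)<<2 - (b+e))*5 + (a+f) + 16) >> 5, believed equivalent to the
 * (1,-5,20,20,-5,1) reference filter above. */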
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa %0, %%xmm6          \n\t"\
        :: "m"(ff_pw_5)\
    );\
    do{\
    __asm__ volatile(\
        "lddqu   -5(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "paddw   %%xmm1, %%xmm5     \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "movq    (%2),   %%xmm3     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   %5,     %%xmm5     \n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm5, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        "pavgb   %%xmm3, %%xmm2     \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %4, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "add %3, %2                 \n\t"\
        : "+a"(src), "+c"(dst), "+d"(src2)\
        : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_16)\
        : "memory"\
    );\
    }while(--h);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movdqa %5, %%xmm6          \n\t"\
        "1:                         \n\t"\
        "lddqu   -5(%0), %%xmm1     \n\t"\
        "movdqa  %%xmm1, %%xmm0     \n\t"\
        "punpckhbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "movdqa  %%xmm1, %%xmm2     \n\t"\
        "movdqa  %%xmm1, %%xmm3     \n\t"\
        "movdqa  %%xmm1, %%xmm4     \n\t"\
        "movdqa  %%xmm1, %%xmm5     \n\t"\
        "palignr $6, %%xmm0, %%xmm5 \n\t"\
        "palignr $8, %%xmm0, %%xmm4 \n\t"\
        "palignr $10,%%xmm0, %%xmm3 \n\t"\
        "paddw   %%xmm1, %%xmm5     \n\t"\
        "palignr $12,%%xmm0, %%xmm2 \n\t"\
        "palignr $14,%%xmm0, %%xmm1 \n\t"\
        "paddw   %%xmm3, %%xmm2     \n\t"\
        "paddw   %%xmm4, %%xmm1     \n\t"\
        "psllw   $2,     %%xmm2     \n\t"\
        "psubw   %%xmm1, %%xmm2     \n\t"\
        "paddw   %6,     %%xmm5     \n\t"\
        "pmullw  %%xmm6, %%xmm2     \n\t"\
        "paddw   %%xmm5, %%xmm2     \n\t"\
        "psraw   $5,     %%xmm2     \n\t"\
        "packuswb %%xmm2, %%xmm2    \n\t"\
        OP(%%xmm2, (%1), %%xmm4, q)\
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+g"(h)\
        : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
          "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
}\
static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

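/* SSE2 vertical lowpass: the same register-rotation scheme as the MMX
 * version above, but eight columns per QPEL_H264V_XMM step, so 8-wide blocks
 * need a single pass and 16-wide blocks just two. */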
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    src -= 2*srcStride;\
    \
    __asm__ volatile(\
        "pxor %%xmm7, %%xmm7        \n\t"\
        "movq (%0), %%xmm0          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm1          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm2          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm3          \n\t"\
        "add %2, %0                 \n\t"\
        "movq (%0), %%xmm4          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%xmm7, %%xmm0   \n\t"\
        "punpcklbw %%xmm7, %%xmm1   \n\t"\
        "punpcklbw %%xmm7, %%xmm2   \n\t"\
        "punpcklbw %%xmm7, %%xmm3   \n\t"\
        "punpcklbw %%xmm7, %%xmm4   \n\t"\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
         \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
        : "memory"\
    );\
    if(h==16){\
        __asm__ volatile(\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
            \
            : "+a"(src), "+c"(dst)\
            : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
            : "memory"\
        );\
    }\
}\
static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}

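/* SSE2 variant of the hv first pass: identical in structure to the MMX hv1
 * above, but eight columns per strip, so w = (size+8)>>3 strips suffice. */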
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7        \n\t"
            "movq (%0), %%xmm0          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm1          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm2          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm3          \n\t"
            "add %2, %0                 \n\t"
            "movq (%0), %%xmm4          \n\t"
            "add %2, %0                 \n\t"
            "punpcklbw %%xmm7, %%xmm0   \n\t"
            "punpcklbw %%xmm7, %%xmm1   \n\t"
            "punpcklbw %%xmm7, %%xmm2   \n\t"
            "punpcklbw %%xmm7, %%xmm3   \n\t"
            "punpcklbw %%xmm7, %%xmm4   \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
            : "memory"
        );
        if(size==16){
            __asm__ volatile(
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
                : "+a"(src)
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
                : "memory"
            );
        }
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}

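/* SSSE3 hv second pass: like the MMX hv2 it finishes the 2-D filter over the
 * 16-bit intermediates with the staged 2+2+6-bit shifts, but uses palignr on
 * whole tmp rows; the 16-wide path interleaves two 8-pixel halves, and the
 * 8-wide path below handles one. */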
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
    int h = size;\
    if(size == 16){\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 32(%0), %%xmm4      \n\t"\
            "movdqa 16(%0), %%xmm5      \n\t"\
            "movdqa   (%0), %%xmm7      \n\t"\
            "movdqa %%xmm4, %%xmm3      \n\t"\
            "movdqa %%xmm4, %%xmm2      \n\t"\
            "movdqa %%xmm4, %%xmm1      \n\t"\
            "movdqa %%xmm4, %%xmm0      \n\t"\
            "palignr $10, %%xmm5, %%xmm0 \n\t"\
            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "movdqa %%xmm5, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm3      \n\t"\
            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
            "palignr $10, %%xmm7, %%xmm3 \n\t"\
            "paddw  %%xmm6, %%xmm4      \n\t"\
            "movdqa %%xmm5, %%xmm6      \n\t"\
            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
            "paddw  %%xmm7, %%xmm3      \n\t"\
            "paddw  %%xmm6, %%xmm5      \n\t"\
            \
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psubw  %%xmm4, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm3      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "paddw  %%xmm5, %%xmm3      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "psraw      $6, %%xmm3      \n\t"\
            "packuswb %%xmm0, %%xmm3    \n\t"\
            OP(%%xmm3, (%1), %%xmm7, dqa)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }else{\
        __asm__ volatile(\
            "1:                         \n\t"\
            "movdqa 16(%0), %%xmm1      \n\t"\
            "movdqa   (%0), %%xmm0      \n\t"\
            "movdqa %%xmm1, %%xmm2      \n\t"\
            "movdqa %%xmm1, %%xmm3      \n\t"\
            "movdqa %%xmm1, %%xmm4      \n\t"\
            "movdqa %%xmm1, %%xmm5      \n\t"\
            "palignr $10, %%xmm0, %%xmm5 \n\t"\
            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
            "paddw  %%xmm5, %%xmm0      \n\t"\
            "paddw  %%xmm4, %%xmm1      \n\t"\
            "paddw  %%xmm3, %%xmm2      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "psubw  %%xmm1, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $2, %%xmm0      \n\t"\
            "paddw  %%xmm2, %%xmm0      \n\t"\
            "psraw      $6, %%xmm0      \n\t"\
            "packuswb %%xmm0, %%xmm0    \n\t"\
            OP(%%xmm0, (%1), %%xmm7, q)\
            "add $48, %0                \n\t"\
            "add %3, %1                 \n\t"\
            "decl %2                    \n\t"\
            " jnz 1b                    \n\t"\
            : "+a"(tmp), "+c"(dst), "+g"(h)\
            : "S"((x86_reg)dstStride)\
            : "memory"\
        );\
    }\
}

#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\

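/* For these helpers a dedicated SSE2/SSSE3 variant would presumably bring
 * nothing over the existing code (small widths, or already memory-bound), so
 * the names are simply aliased to the MMX2/SSE2 implementations. */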
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2

#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\

static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2

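/* The mc{xy} entry points follow the usual qpel naming: x and y are the
 * quarter-pel offsets (0..3) of the motion vector, so mc20 is the horizontal
 * half-pel, mc02 the vertical, mc22 the centre, and the remaining positions
 * are formed by averaging the appropriate half-pel planes. */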
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\

#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\

#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\

#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((int)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\

#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\

#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\


#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

#define PAVGB "pavgusb"
QPEL_H264(put_,       PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_,       PUT_OP, mmx2)
QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif

/* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
};

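/* The chroma MC functions are stamped out by #including a template file:
 * the H264_CHROMA_OP* macros select the store operation (empty for put,
 * pavgb/pavgusb for avg) and the *_TMPL names pick the generated
 * identifiers. Each instantiation is followed by thin wrappers that bind
 * the rounding registers defined above. */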
#define H264_CHROMA_OP(S,D)
#define H264_CHROMA_OP4(S,D,T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"

static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
}
static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}

#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                               "pavgb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"
static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0

#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                               "pavgusb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
#include "dsputil_h264_template_mmx.c"
static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

#if HAVE_SSSE3
#define AVG_OP(X)
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"
static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}

#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#define AVG_OP(X) X
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_ssse3.c"
static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#endif

/***********************************/
/* weighted prediction */

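/* Explicit weighted prediction. For the unidirectional case the reference
 * formula is
 *   dst = clip8(((dst*weight) >> log2_denom) + offset)
 * with rounding; pre-shifting offset and folding in the rounding term below
 * lets the inner loop do a single pmullw/paddsw/psraw per 4 pixels. The
 * bidirectional version computes
 *   dst = clip8((dst*weightd + src*weights + offset2) >> (log2_denom+1))
 * where offset2 = ((offset+1)|1) << log2_denom keeps the rounding term odd. */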
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;
    __asm__ volatile(
        "movd    %0, %%mm4        \n\t"
        "movd    %1, %%mm5        \n\t"
        "movd    %2, %%mm6        \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "g"(weight), "g"(offset), "g"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t"
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}

static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t"
        "movd    %1, %%mm4        \n\t"
        "movd    %2, %%mm5        \n\t"
        "movd    %3, %%mm6        \n\t"
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm1, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}

#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)

