libswscale/swscale_template.c

00001 /*
00002  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
00003  *
00004  * This file is part of FFmpeg.
00005  *
00006  * FFmpeg is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * FFmpeg is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with FFmpeg; if not, write to the Free Software
00018  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00019  *
00020  * The C code (not assembly, MMX, ...) of this file can be used
00021  * under the LGPL license.
00022  */
00023 
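/* This template is compiled several times from swscale.c, once per CPU
 * variant (plain MMX, MMX2, 3DNow!, ...), so every instruction-selection
 * macro is #undef'd here before being redefined for the variant being
 * built.  A rough sketch of the inclusion pattern (the exact macro names
 * used in swscale.c may differ; this is only an illustration):
 *
 *     #define RENAME(a) a ## _MMX2
 *     #include "swscale_template.c"
 *
 * which turns RENAME(yuv2yuvX) below into yuv2yuvX_MMX2.
 */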
00024 #undef REAL_MOVNTQ
00025 #undef MOVNTQ
00026 #undef PAVGB
00027 #undef PREFETCH
00028 #undef PREFETCHW
00029 #undef EMMS
00030 #undef SFENCE
00031 
00032 #if HAVE_AMD3DNOW
00033 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
00034 #define EMMS     "femms"
00035 #else
00036 #define EMMS     "emms"
00037 #endif
00038 
00039 #if HAVE_AMD3DNOW
00040 #define PREFETCH  "prefetch"
00041 #define PREFETCHW "prefetchw"
00042 #elif HAVE_MMX2
00043 #define PREFETCH "prefetchnta"
00044 #define PREFETCHW "prefetcht0"
00045 #else
00046 #define PREFETCH  " # nop"
00047 #define PREFETCHW " # nop"
00048 #endif
00049 
00050 #if HAVE_MMX2
00051 #define SFENCE "sfence"
00052 #else
00053 #define SFENCE " # nop"
00054 #endif
00055 
00056 #if HAVE_MMX2
00057 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00058 #elif HAVE_AMD3DNOW
00059 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00060 #endif
00061 
00062 #if HAVE_MMX2
00063 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
00064 #else
00065 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
00066 #endif
00067 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
00068 
00069 #if HAVE_ALTIVEC
00070 #include "swscale_altivec_template.c"
00071 #endif
00072 
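/* YSCALEYUV2YV12X: vertical scaling of one planar output line.  It walks
 * the filter list found at 'offset' inside the SwsContext, multiplies the
 * 16-bit intermediate samples of each contributing line by the filter
 * coefficient (pmulhw), accumulates the products, then shifts the sum down
 * by 3, packs to unsigned bytes and streams 8 pixels at a time to 'dest'
 * with MOVNTQ. */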
00073 #define YSCALEYUV2YV12X(x, offset, dest, width) \
00074     __asm__ volatile(\
00075     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
00076     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
00077     "movq                             %%mm3, %%mm4      \n\t"\
00078     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00079     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00080     ASMALIGN(4) /* FIXME Unroll? */\
00081     "1:                                                 \n\t"\
00082     "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
00083     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
00084     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
00085     "add                                $16, %%"REG_d"  \n\t"\
00086     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00087     "test                         %%"REG_S", %%"REG_S"  \n\t"\
00088     "pmulhw                           %%mm0, %%mm2      \n\t"\
00089     "pmulhw                           %%mm0, %%mm5      \n\t"\
00090     "paddw                            %%mm2, %%mm3      \n\t"\
00091     "paddw                            %%mm5, %%mm4      \n\t"\
00092     " jnz                                1b             \n\t"\
00093     "psraw                               $3, %%mm3      \n\t"\
00094     "psraw                               $3, %%mm4      \n\t"\
00095     "packuswb                         %%mm4, %%mm3      \n\t"\
00096     MOVNTQ(%%mm3, (%1, %%REGa))\
00097     "add                                 $8, %%"REG_a"  \n\t"\
00098     "cmp                                 %2, %%"REG_a"  \n\t"\
00099     "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
00100     "movq                             %%mm3, %%mm4      \n\t"\
00101     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00102     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00103     "jb                                  1b             \n\t"\
00104     :: "r" (&c->redDither),\
00105     "r" (dest), "g" (width)\
00106     : "%"REG_a, "%"REG_d, "%"REG_S\
00107     );
00108 
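/* YSCALEYUV2YV12X_ACCURATE: same job as YSCALEYUV2YV12X but with more
 * precision; the coefficients are applied pairwise with pmaddwd and summed
 * in 32-bit accumulators before rounding, packing and storing.  It is
 * selected in yuv2yuvX() below when SWS_ACCURATE_RND is set. */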
00109 #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
00110     __asm__ volatile(\
00111     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00112     "xor                          %%"REG_a", %%"REG_a"  \n\t"\
00113     "pxor                             %%mm4, %%mm4      \n\t"\
00114     "pxor                             %%mm5, %%mm5      \n\t"\
00115     "pxor                             %%mm6, %%mm6      \n\t"\
00116     "pxor                             %%mm7, %%mm7      \n\t"\
00117     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00118     ASMALIGN(4) \
00119     "1:                                                 \n\t"\
00120     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
00121     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
00122     "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
00123     "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
00124     "movq                             %%mm0, %%mm3      \n\t"\
00125     "punpcklwd                        %%mm1, %%mm0      \n\t"\
00126     "punpckhwd                        %%mm1, %%mm3      \n\t"\
00127     "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
00128     "pmaddwd                          %%mm1, %%mm0      \n\t"\
00129     "pmaddwd                          %%mm1, %%mm3      \n\t"\
00130     "paddd                            %%mm0, %%mm4      \n\t"\
00131     "paddd                            %%mm3, %%mm5      \n\t"\
00132     "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
00133     "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
00134     "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
00135     "test                         %%"REG_S", %%"REG_S"  \n\t"\
00136     "movq                             %%mm2, %%mm0      \n\t"\
00137     "punpcklwd                        %%mm3, %%mm2      \n\t"\
00138     "punpckhwd                        %%mm3, %%mm0      \n\t"\
00139     "pmaddwd                          %%mm1, %%mm2      \n\t"\
00140     "pmaddwd                          %%mm1, %%mm0      \n\t"\
00141     "paddd                            %%mm2, %%mm6      \n\t"\
00142     "paddd                            %%mm0, %%mm7      \n\t"\
00143     " jnz                                1b             \n\t"\
00144     "psrad                              $16, %%mm4      \n\t"\
00145     "psrad                              $16, %%mm5      \n\t"\
00146     "psrad                              $16, %%mm6      \n\t"\
00147     "psrad                              $16, %%mm7      \n\t"\
00148     "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
00149     "packssdw                         %%mm5, %%mm4      \n\t"\
00150     "packssdw                         %%mm7, %%mm6      \n\t"\
00151     "paddw                            %%mm0, %%mm4      \n\t"\
00152     "paddw                            %%mm0, %%mm6      \n\t"\
00153     "psraw                               $3, %%mm4      \n\t"\
00154     "psraw                               $3, %%mm6      \n\t"\
00155     "packuswb                         %%mm6, %%mm4      \n\t"\
00156     MOVNTQ(%%mm4, (%1, %%REGa))\
00157     "add                                 $8, %%"REG_a"  \n\t"\
00158     "cmp                                 %2, %%"REG_a"  \n\t"\
00159     "lea                     " offset "(%0), %%"REG_d"  \n\t"\
00160     "pxor                             %%mm4, %%mm4      \n\t"\
00161     "pxor                             %%mm5, %%mm5      \n\t"\
00162     "pxor                             %%mm6, %%mm6      \n\t"\
00163     "pxor                             %%mm7, %%mm7      \n\t"\
00164     "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
00165     "jb                                  1b             \n\t"\
00166     :: "r" (&c->redDither),\
00167     "r" (dest), "g" (width)\
00168     : "%"REG_a, "%"REG_d, "%"REG_S\
00169     );
00170 
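/* YSCALEYUV2YV121(_ACCURATE): fast path for a 1-tap vertical filter.  The
 * 16-bit intermediate samples are simply shifted down by 7 and packed to
 * bytes; the accurate variant adds a rounding constant of 64 first. */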
00171 #define YSCALEYUV2YV121 \
00172     "mov %2, %%"REG_a"                    \n\t"\
00173     ASMALIGN(4) /* FIXME Unroll? */\
00174     "1:                                   \n\t"\
00175     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
00176     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
00177     "psraw                 $7, %%mm0      \n\t"\
00178     "psraw                 $7, %%mm1      \n\t"\
00179     "packuswb           %%mm1, %%mm0      \n\t"\
00180     MOVNTQ(%%mm0, (%1, %%REGa))\
00181     "add                   $8, %%"REG_a"  \n\t"\
00182     "jnc                   1b             \n\t"
00183 
00184 #define YSCALEYUV2YV121_ACCURATE \
00185     "mov %2, %%"REG_a"                    \n\t"\
00186     "pcmpeqw %%mm7, %%mm7                 \n\t"\
00187     "psrlw                 $15, %%mm7     \n\t"\
00188     "psllw                  $6, %%mm7     \n\t"\
00189     ASMALIGN(4) /* FIXME Unroll? */\
00190     "1:                                   \n\t"\
00191     "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
00192     "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
00193     "paddsw             %%mm7, %%mm0      \n\t"\
00194     "paddsw             %%mm7, %%mm1      \n\t"\
00195     "psraw                 $7, %%mm0      \n\t"\
00196     "psraw                 $7, %%mm1      \n\t"\
00197     "packuswb           %%mm1, %%mm0      \n\t"\
00198     MOVNTQ(%%mm0, (%1, %%REGa))\
00199     "add                   $8, %%"REG_a"  \n\t"\
00200     "jnc                   1b             \n\t"
00201 
00202 /*
00203     :: "m" (-lumFilterSize), "m" (-chrFilterSize),
00204        "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
00205        "r" (dest), "m" (dstW),
00206        "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
00207     : "%eax", "%ebx", "%ecx", "%edx", "%esi"
00208 */
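/* YSCALEYUV2PACKEDX_UV / _YA: vertical scaling for the packed-output path.
 * _UV accumulates the filtered chroma into mm3 (U) and mm4 (V); the V
 * samples sit VOF bytes after the U samples in the intermediate buffer.
 * _YA does the same for two groups of luma samples, ending up in mm1 (Y1)
 * and mm7 (Y2).  YSCALEYUV2RGBX below converts these registers to RGB. */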
00209 #define YSCALEYUV2PACKEDX_UV \
00210     __asm__ volatile(\
00211     "xor                   %%"REG_a", %%"REG_a"     \n\t"\
00212     ASMALIGN(4)\
00213     "nop                                            \n\t"\
00214     "1:                                             \n\t"\
00215     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00216     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00217     "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
00218     "movq                      %%mm3, %%mm4         \n\t"\
00219     ASMALIGN(4)\
00220     "2:                                             \n\t"\
00221     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
00222     "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
00223     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
00224     "add                         $16, %%"REG_d"     \n\t"\
00225     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00226     "pmulhw                    %%mm0, %%mm2         \n\t"\
00227     "pmulhw                    %%mm0, %%mm5         \n\t"\
00228     "paddw                     %%mm2, %%mm3         \n\t"\
00229     "paddw                     %%mm5, %%mm4         \n\t"\
00230     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00231     " jnz                         2b                \n\t"\
00232 
00233 #define YSCALEYUV2PACKEDX_YA(offset) \
00234     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00235     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00236     "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
00237     "movq                      %%mm1, %%mm7         \n\t"\
00238     ASMALIGN(4)\
00239     "2:                                             \n\t"\
00240     "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
00241     "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
00242     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
00243     "add                         $16, %%"REG_d"            \n\t"\
00244     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00245     "pmulhw                    %%mm0, %%mm2         \n\t"\
00246     "pmulhw                    %%mm0, %%mm5         \n\t"\
00247     "paddw                     %%mm2, %%mm1         \n\t"\
00248     "paddw                     %%mm5, %%mm7         \n\t"\
00249     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00250     " jnz                         2b                \n\t"\
00251 
00252 #define YSCALEYUV2PACKEDX \
00253     YSCALEYUV2PACKEDX_UV \
00254     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET) \
00255 
00256 #define YSCALEYUV2PACKEDX_END                 \
00257     :: "r" (&c->redDither),                   \
00258         "m" (dummy), "m" (dummy), "m" (dummy),\
00259         "r" (dest), "m" (dstW)                \
00260     : "%"REG_a, "%"REG_d, "%"REG_S            \
00261     );
00262 
00263 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
00264     __asm__ volatile(\
00265     "xor %%"REG_a", %%"REG_a"                       \n\t"\
00266     ASMALIGN(4)\
00267     "nop                                            \n\t"\
00268     "1:                                             \n\t"\
00269     "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
00270     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00271     "pxor                      %%mm4, %%mm4         \n\t"\
00272     "pxor                      %%mm5, %%mm5         \n\t"\
00273     "pxor                      %%mm6, %%mm6         \n\t"\
00274     "pxor                      %%mm7, %%mm7         \n\t"\
00275     ASMALIGN(4)\
00276     "2:                                             \n\t"\
00277     "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
00278     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
00279     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00280     "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
00281     "movq                      %%mm0, %%mm3         \n\t"\
00282     "punpcklwd                 %%mm1, %%mm0         \n\t"\
00283     "punpckhwd                 %%mm1, %%mm3         \n\t"\
00284     "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
00285     "pmaddwd                   %%mm1, %%mm0         \n\t"\
00286     "pmaddwd                   %%mm1, %%mm3         \n\t"\
00287     "paddd                     %%mm0, %%mm4         \n\t"\
00288     "paddd                     %%mm3, %%mm5         \n\t"\
00289     "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
00290     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00291     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00292     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00293     "movq                      %%mm2, %%mm0         \n\t"\
00294     "punpcklwd                 %%mm3, %%mm2         \n\t"\
00295     "punpckhwd                 %%mm3, %%mm0         \n\t"\
00296     "pmaddwd                   %%mm1, %%mm2         \n\t"\
00297     "pmaddwd                   %%mm1, %%mm0         \n\t"\
00298     "paddd                     %%mm2, %%mm6         \n\t"\
00299     "paddd                     %%mm0, %%mm7         \n\t"\
00300     " jnz                         2b                \n\t"\
00301     "psrad                       $16, %%mm4         \n\t"\
00302     "psrad                       $16, %%mm5         \n\t"\
00303     "psrad                       $16, %%mm6         \n\t"\
00304     "psrad                       $16, %%mm7         \n\t"\
00305     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00306     "packssdw                  %%mm5, %%mm4         \n\t"\
00307     "packssdw                  %%mm7, %%mm6         \n\t"\
00308     "paddw                     %%mm0, %%mm4         \n\t"\
00309     "paddw                     %%mm0, %%mm6         \n\t"\
00310     "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
00311     "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
00312 
00313 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
00314     "lea                "offset"(%0), %%"REG_d"     \n\t"\
00315     "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
00316     "pxor                      %%mm1, %%mm1         \n\t"\
00317     "pxor                      %%mm5, %%mm5         \n\t"\
00318     "pxor                      %%mm7, %%mm7         \n\t"\
00319     "pxor                      %%mm6, %%mm6         \n\t"\
00320     ASMALIGN(4)\
00321     "2:                                             \n\t"\
00322     "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
00323     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
00324     "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
00325     "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
00326     "movq                      %%mm0, %%mm3         \n\t"\
00327     "punpcklwd                 %%mm4, %%mm0         \n\t"\
00328     "punpckhwd                 %%mm4, %%mm3         \n\t"\
00329     "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
00330     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00331     "pmaddwd                   %%mm4, %%mm3         \n\t"\
00332     "paddd                     %%mm0, %%mm1         \n\t"\
00333     "paddd                     %%mm3, %%mm5         \n\t"\
00334     "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
00335     "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
00336     "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
00337     "test                  %%"REG_S", %%"REG_S"     \n\t"\
00338     "movq                      %%mm2, %%mm0         \n\t"\
00339     "punpcklwd                 %%mm3, %%mm2         \n\t"\
00340     "punpckhwd                 %%mm3, %%mm0         \n\t"\
00341     "pmaddwd                   %%mm4, %%mm2         \n\t"\
00342     "pmaddwd                   %%mm4, %%mm0         \n\t"\
00343     "paddd                     %%mm2, %%mm7         \n\t"\
00344     "paddd                     %%mm0, %%mm6         \n\t"\
00345     " jnz                         2b                \n\t"\
00346     "psrad                       $16, %%mm1         \n\t"\
00347     "psrad                       $16, %%mm5         \n\t"\
00348     "psrad                       $16, %%mm7         \n\t"\
00349     "psrad                       $16, %%mm6         \n\t"\
00350     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
00351     "packssdw                  %%mm5, %%mm1         \n\t"\
00352     "packssdw                  %%mm6, %%mm7         \n\t"\
00353     "paddw                     %%mm0, %%mm1         \n\t"\
00354     "paddw                     %%mm0, %%mm7         \n\t"\
00355     "movq               "U_TEMP"(%0), %%mm3         \n\t"\
00356     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
00357 
00358 #define YSCALEYUV2PACKEDX_ACCURATE \
00359     YSCALEYUV2PACKEDX_ACCURATE_UV \
00360     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
00361 
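/* YSCALEYUV2RGBX: converts the vertically filtered Y1/Y2 (mm1/mm7) and U/V
 * (mm3/mm4) left by the macros above into packed bytes, B in mm2, G in mm4
 * and R in mm5, using the per-context offsets and coefficients (Y_OFFSET,
 * Y_COEFF, UB/UG/VG/VR_COEFF). */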
00362 #define YSCALEYUV2RGBX \
00363     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
00364     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
00365     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
00366     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
00367     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
00368     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
00369 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00370     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
00371     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
00372     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
00373     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
00374     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
00375     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
00376 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00377     "paddw           %%mm3, %%mm4       \n\t"\
00378     "movq            %%mm2, %%mm0       \n\t"\
00379     "movq            %%mm5, %%mm6       \n\t"\
00380     "movq            %%mm4, %%mm3       \n\t"\
00381     "punpcklwd       %%mm2, %%mm2       \n\t"\
00382     "punpcklwd       %%mm5, %%mm5       \n\t"\
00383     "punpcklwd       %%mm4, %%mm4       \n\t"\
00384     "paddw           %%mm1, %%mm2       \n\t"\
00385     "paddw           %%mm1, %%mm5       \n\t"\
00386     "paddw           %%mm1, %%mm4       \n\t"\
00387     "punpckhwd       %%mm0, %%mm0       \n\t"\
00388     "punpckhwd       %%mm6, %%mm6       \n\t"\
00389     "punpckhwd       %%mm3, %%mm3       \n\t"\
00390     "paddw           %%mm7, %%mm0       \n\t"\
00391     "paddw           %%mm7, %%mm6       \n\t"\
00392     "paddw           %%mm7, %%mm3       \n\t"\
00393     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00394     "packuswb        %%mm0, %%mm2       \n\t"\
00395     "packuswb        %%mm6, %%mm5       \n\t"\
00396     "packuswb        %%mm3, %%mm4       \n\t"\
00397 
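/* The REAL_YSCALEYUV2PACKED / REAL_YSCALEYUV2RGB_* macros below handle the
 * 2-tap vertical case: each output line is a weighted blend of two
 * adjacent intermediate lines (buf0/buf1 for luma, uvbuf0/uvbuf1 for
 * chroma), with the blend factors read from the LUM_/CHR_MMX_FILTER slots
 * of the context. */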
00398 #define REAL_YSCALEYUV2PACKED(index, c) \
00399     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
00400     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
00401     "psraw                $3, %%mm0                           \n\t"\
00402     "psraw                $3, %%mm1                           \n\t"\
00403     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00404     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
00405     "xor            "#index", "#index"                        \n\t"\
00406     ASMALIGN(4)\
00407     "1:                                 \n\t"\
00408     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00409     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00410     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00411     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00412     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00413     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00414     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00415     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00416     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00417     "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
00418     "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
00419     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
00420     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
00421     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
00422     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
00423     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
00424     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
00425     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
00426     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
00427     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00428     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00429     "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
00430     "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
00431     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00432     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00433 
00434 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
00435 
00436 #define REAL_YSCALEYUV2RGB_UV(index, c) \
00437     "xor            "#index", "#index"  \n\t"\
00438     ASMALIGN(4)\
00439     "1:                                 \n\t"\
00440     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00441     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00442     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00443     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00444     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
00445     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
00446     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
00447     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
00448     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
00449     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00450     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00451     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
00452     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
00453     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00454     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00455     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00456     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00457     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00458     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00459     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00460 
00461 #define REAL_YSCALEYUV2RGB_YA(index, c) \
00462     "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
00463     "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
00464     "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
00465     "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
00466     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
00467     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
00468     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00469     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
00470     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00471     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00472     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00473     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
00474 
00475 #define REAL_YSCALEYUV2RGB_COEFF(c) \
00476     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00477     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00478     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00479     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00480     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00481     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00482     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00483     "paddw             %%mm3, %%mm4     \n\t"\
00484     "movq              %%mm2, %%mm0     \n\t"\
00485     "movq              %%mm5, %%mm6     \n\t"\
00486     "movq              %%mm4, %%mm3     \n\t"\
00487     "punpcklwd         %%mm2, %%mm2     \n\t"\
00488     "punpcklwd         %%mm5, %%mm5     \n\t"\
00489     "punpcklwd         %%mm4, %%mm4     \n\t"\
00490     "paddw             %%mm1, %%mm2     \n\t"\
00491     "paddw             %%mm1, %%mm5     \n\t"\
00492     "paddw             %%mm1, %%mm4     \n\t"\
00493     "punpckhwd         %%mm0, %%mm0     \n\t"\
00494     "punpckhwd         %%mm6, %%mm6     \n\t"\
00495     "punpckhwd         %%mm3, %%mm3     \n\t"\
00496     "paddw             %%mm7, %%mm0     \n\t"\
00497     "paddw             %%mm7, %%mm6     \n\t"\
00498     "paddw             %%mm7, %%mm3     \n\t"\
00499     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00500     "packuswb          %%mm0, %%mm2     \n\t"\
00501     "packuswb          %%mm6, %%mm5     \n\t"\
00502     "packuswb          %%mm3, %%mm4     \n\t"\
00503 
00504 #define YSCALEYUV2RGB_YA(index, c) REAL_YSCALEYUV2RGB_YA(index, c)
00505 
00506 #define YSCALEYUV2RGB(index, c) \
00507     REAL_YSCALEYUV2RGB_UV(index, c) \
00508     REAL_YSCALEYUV2RGB_YA(index, c) \
00509     REAL_YSCALEYUV2RGB_COEFF(c)
00510 
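/* "1" variants: single-source fast path used when only one intermediate
 * line contributes.  The plain *1 macros read a single chroma line, while
 * the *1b macros average two chroma lines (vertical chrominance
 * interpolation) before the YUV->RGB step. */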
00511 #define REAL_YSCALEYUV2PACKED1(index, c) \
00512     "xor            "#index", "#index"  \n\t"\
00513     ASMALIGN(4)\
00514     "1:                                 \n\t"\
00515     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00516     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
00517     "psraw                $7, %%mm3     \n\t" \
00518     "psraw                $7, %%mm4     \n\t" \
00519     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00520     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00521     "psraw                $7, %%mm1     \n\t" \
00522     "psraw                $7, %%mm7     \n\t" \
00523 
00524 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
00525 
00526 #define REAL_YSCALEYUV2RGB1(index, c) \
00527     "xor            "#index", "#index"  \n\t"\
00528     ASMALIGN(4)\
00529     "1:                                 \n\t"\
00530     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
00531     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
00532     "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
00533     "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
00534     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00535     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00536     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00537     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00538     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00539     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00540     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00541     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00542     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00543     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00544     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00545     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00546     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00547     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00548     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00549     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00550     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00551     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00552     "paddw             %%mm3, %%mm4     \n\t"\
00553     "movq              %%mm2, %%mm0     \n\t"\
00554     "movq              %%mm5, %%mm6     \n\t"\
00555     "movq              %%mm4, %%mm3     \n\t"\
00556     "punpcklwd         %%mm2, %%mm2     \n\t"\
00557     "punpcklwd         %%mm5, %%mm5     \n\t"\
00558     "punpcklwd         %%mm4, %%mm4     \n\t"\
00559     "paddw             %%mm1, %%mm2     \n\t"\
00560     "paddw             %%mm1, %%mm5     \n\t"\
00561     "paddw             %%mm1, %%mm4     \n\t"\
00562     "punpckhwd         %%mm0, %%mm0     \n\t"\
00563     "punpckhwd         %%mm6, %%mm6     \n\t"\
00564     "punpckhwd         %%mm3, %%mm3     \n\t"\
00565     "paddw             %%mm7, %%mm0     \n\t"\
00566     "paddw             %%mm7, %%mm6     \n\t"\
00567     "paddw             %%mm7, %%mm3     \n\t"\
00568     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00569     "packuswb          %%mm0, %%mm2     \n\t"\
00570     "packuswb          %%mm6, %%mm5     \n\t"\
00571     "packuswb          %%mm3, %%mm4     \n\t"\
00572 
00573 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
00574 
00575 #define REAL_YSCALEYUV2PACKED1b(index, c) \
00576     "xor "#index", "#index"             \n\t"\
00577     ASMALIGN(4)\
00578     "1:                                 \n\t"\
00579     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00580     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00581     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00582     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00583     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00584     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00585     "psrlw                $8, %%mm3     \n\t" \
00586     "psrlw                $8, %%mm4     \n\t" \
00587     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00588     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00589     "psraw                $7, %%mm1     \n\t" \
00590     "psraw                $7, %%mm7     \n\t"
00591 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
00592 
00593 // do vertical chrominance interpolation
00594 #define REAL_YSCALEYUV2RGB1b(index, c) \
00595     "xor            "#index", "#index"  \n\t"\
00596     ASMALIGN(4)\
00597     "1:                                 \n\t"\
00598     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
00599     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
00600     "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
00601     "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
00602     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
00603     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
00604     "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
00605     "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
00606     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
00607     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
00608     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
00609     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
00610     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
00611     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
00612     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
00613     "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
00614     "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
00615     "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00616     "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
00617     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
00618     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
00619     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
00620     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
00621     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
00622     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
00623     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
00624     "paddw             %%mm3, %%mm4     \n\t"\
00625     "movq              %%mm2, %%mm0     \n\t"\
00626     "movq              %%mm5, %%mm6     \n\t"\
00627     "movq              %%mm4, %%mm3     \n\t"\
00628     "punpcklwd         %%mm2, %%mm2     \n\t"\
00629     "punpcklwd         %%mm5, %%mm5     \n\t"\
00630     "punpcklwd         %%mm4, %%mm4     \n\t"\
00631     "paddw             %%mm1, %%mm2     \n\t"\
00632     "paddw             %%mm1, %%mm5     \n\t"\
00633     "paddw             %%mm1, %%mm4     \n\t"\
00634     "punpckhwd         %%mm0, %%mm0     \n\t"\
00635     "punpckhwd         %%mm6, %%mm6     \n\t"\
00636     "punpckhwd         %%mm3, %%mm3     \n\t"\
00637     "paddw             %%mm7, %%mm0     \n\t"\
00638     "paddw             %%mm7, %%mm6     \n\t"\
00639     "paddw             %%mm7, %%mm3     \n\t"\
00640     /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
00641     "packuswb          %%mm0, %%mm2     \n\t"\
00642     "packuswb          %%mm6, %%mm5     \n\t"\
00643     "packuswb          %%mm3, %%mm4     \n\t"\
00644 
00645 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
00646 
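/* The WRITE* macros below are the output stages: they take the packed
 * B/G/R bytes in mm2/mm4/mm5 (mm7 holding the alpha bytes or zero) and
 * store one 8-pixel group as 32/24/16/15 bpp RGB, while WRITEYUY2 instead
 * interleaves Y (mm1/mm7) with U/V (mm3/mm4) into YUYV.  Each macro
 * advances "index" and loops back to label 1. */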
00647 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
00648     "movq       "#b", "#q2"     \n\t" /* B */\
00649     "movq       "#r", "#t"      \n\t" /* R */\
00650     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
00651     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
00652     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
00653     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
00654     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
00655     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
00656     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
00657     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
00658     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
00659     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
00660 \
00661     MOVNTQ(   q0,   (dst, index, 4))\
00662     MOVNTQ(    b,  8(dst, index, 4))\
00663     MOVNTQ(   q2, 16(dst, index, 4))\
00664     MOVNTQ(   q3, 24(dst, index, 4))\
00665 \
00666     "add      $8, "#index"      \n\t"\
00667     "cmp "#dstw", "#index"      \n\t"\
00668     " jb      1b                \n\t"
00669 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
00670 
00671 #define REAL_WRITERGB16(dst, dstw, index) \
00672     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00673     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
00674     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00675     "psrlq           $3, %%mm2  \n\t"\
00676 \
00677     "movq         %%mm2, %%mm1  \n\t"\
00678     "movq         %%mm4, %%mm3  \n\t"\
00679 \
00680     "punpcklbw    %%mm7, %%mm3  \n\t"\
00681     "punpcklbw    %%mm5, %%mm2  \n\t"\
00682     "punpckhbw    %%mm7, %%mm4  \n\t"\
00683     "punpckhbw    %%mm5, %%mm1  \n\t"\
00684 \
00685     "psllq           $3, %%mm3  \n\t"\
00686     "psllq           $3, %%mm4  \n\t"\
00687 \
00688     "por          %%mm3, %%mm2  \n\t"\
00689     "por          %%mm4, %%mm1  \n\t"\
00690 \
00691     MOVNTQ(%%mm2,  (dst, index, 2))\
00692     MOVNTQ(%%mm1, 8(dst, index, 2))\
00693 \
00694     "add             $8, "#index"   \n\t"\
00695     "cmp        "#dstw", "#index"   \n\t"\
00696     " jb             1b             \n\t"
00697 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
00698 
00699 #define REAL_WRITERGB15(dst, dstw, index) \
00700     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
00701     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
00702     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
00703     "psrlq           $3, %%mm2  \n\t"\
00704     "psrlq           $1, %%mm5  \n\t"\
00705 \
00706     "movq         %%mm2, %%mm1  \n\t"\
00707     "movq         %%mm4, %%mm3  \n\t"\
00708 \
00709     "punpcklbw    %%mm7, %%mm3  \n\t"\
00710     "punpcklbw    %%mm5, %%mm2  \n\t"\
00711     "punpckhbw    %%mm7, %%mm4  \n\t"\
00712     "punpckhbw    %%mm5, %%mm1  \n\t"\
00713 \
00714     "psllq           $2, %%mm3  \n\t"\
00715     "psllq           $2, %%mm4  \n\t"\
00716 \
00717     "por          %%mm3, %%mm2  \n\t"\
00718     "por          %%mm4, %%mm1  \n\t"\
00719 \
00720     MOVNTQ(%%mm2,  (dst, index, 2))\
00721     MOVNTQ(%%mm1, 8(dst, index, 2))\
00722 \
00723     "add             $8, "#index"   \n\t"\
00724     "cmp        "#dstw", "#index"   \n\t"\
00725     " jb             1b             \n\t"
00726 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
00727 
00728 #define WRITEBGR24OLD(dst, dstw, index) \
00729     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00730     "movq      %%mm2, %%mm1             \n\t" /* B */\
00731     "movq      %%mm5, %%mm6             \n\t" /* R */\
00732     "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
00733     "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
00734     "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
00735     "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
00736     "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
00737     "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
00738     "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
00739     "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
00740     "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
00741     "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
00742 \
00743     "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
00744     "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
00745     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
00746     "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
00747     "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
00748     "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
00749     "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
00750     "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
00751 \
00752     "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
00753     "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
00754     "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
00755     "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
00756     "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
00757     "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
00758     "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
00759     "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
00760     "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
00761     "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
00762     "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
00763     "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
00764     "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
00765 \
00766     "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
00767     "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
00768     "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
00769     "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
00770     "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
00771     "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
00772     "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
00773     "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
00774 \
00775     MOVNTQ(%%mm0,   (dst))\
00776     MOVNTQ(%%mm2,  8(dst))\
00777     MOVNTQ(%%mm3, 16(dst))\
00778     "add         $24, "#dst"            \n\t"\
00779 \
00780     "add          $8, "#index"          \n\t"\
00781     "cmp     "#dstw", "#index"          \n\t"\
00782     " jb          1b                    \n\t"
00783 
00784 #define WRITEBGR24MMX(dst, dstw, index) \
00785     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00786     "movq      %%mm2, %%mm1     \n\t" /* B */\
00787     "movq      %%mm5, %%mm6     \n\t" /* R */\
00788     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
00789     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
00790     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
00791     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
00792     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
00793     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
00794     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
00795     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
00796     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
00797     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
00798 \
00799     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
00800     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
00801     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
00802     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
00803 \
00804     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
00805     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
00806     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
00807     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
00808 \
00809     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
00810     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
00811     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
00812     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
00813 \
00814     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
00815     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
00816     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
00817     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
00818     MOVNTQ(%%mm0, (dst))\
00819 \
00820     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
00821     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
00822     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
00823     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
00824     MOVNTQ(%%mm6, 8(dst))\
00825 \
00826     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
00827     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
00828     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
00829     MOVNTQ(%%mm5, 16(dst))\
00830 \
00831     "add         $24, "#dst"    \n\t"\
00832 \
00833     "add          $8, "#index"  \n\t"\
00834     "cmp     "#dstw", "#index"  \n\t"\
00835     " jb          1b            \n\t"
00836 
00837 #define WRITEBGR24MMX2(dst, dstw, index) \
00838     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
00839     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
00840     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
00841     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
00842     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
00843     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
00844 \
00845     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
00846     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
00847     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
00848 \
00849     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
00850     "por    %%mm1, %%mm6        \n\t"\
00851     "por    %%mm3, %%mm6        \n\t"\
00852     MOVNTQ(%%mm6, (dst))\
00853 \
00854     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
00855     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
00856     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
00857     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
00858 \
00859     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
00860     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
00861     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
00862 \
00863     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
00864     "por    %%mm3, %%mm6        \n\t"\
00865     MOVNTQ(%%mm6, 8(dst))\
00866 \
00867     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
00868     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
00869     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
00870 \
00871     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
00872     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
00873     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
00874 \
00875     "por    %%mm1, %%mm3        \n\t"\
00876     "por    %%mm3, %%mm6        \n\t"\
00877     MOVNTQ(%%mm6, 16(dst))\
00878 \
00879     "add      $24, "#dst"       \n\t"\
00880 \
00881     "add       $8, "#index"     \n\t"\
00882     "cmp  "#dstw", "#index"     \n\t"\
00883     " jb       1b               \n\t"
00884 
00885 #if HAVE_MMX2
00886 #undef WRITEBGR24
00887 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
00888 #else
00889 #undef WRITEBGR24
00890 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
00891 #endif
00892 
00893 #define REAL_WRITEYUY2(dst, dstw, index) \
00894     "packuswb  %%mm3, %%mm3     \n\t"\
00895     "packuswb  %%mm4, %%mm4     \n\t"\
00896     "packuswb  %%mm7, %%mm1     \n\t"\
00897     "punpcklbw %%mm4, %%mm3     \n\t"\
00898     "movq      %%mm1, %%mm7     \n\t"\
00899     "punpcklbw %%mm3, %%mm1     \n\t"\
00900     "punpckhbw %%mm3, %%mm7     \n\t"\
00901 \
00902     MOVNTQ(%%mm1, (dst, index, 2))\
00903     MOVNTQ(%%mm7, 8(dst, index, 2))\
00904 \
00905     "add          $8, "#index"  \n\t"\
00906     "cmp     "#dstw", "#index"  \n\t"\
00907     " jb          1b            \n\t"
00908 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
00909 
00910 
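/* RENAME(yuv2yuvX): planar (YV12-style) vertical scaling.  Unless
 * SWS_BITEXACT is requested, it uses the MMX macros above (the _ACCURATE
 * variants when SWS_ACCURATE_RND is set); otherwise it falls back to the
 * AltiVec or plain C implementation. */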
00911 static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00912                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00913                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
00914 {
00915 #if HAVE_MMX
00916     if(!(c->flags & SWS_BITEXACT)){
00917         if (c->flags & SWS_ACCURATE_RND){
00918             if (uDest){
00919                 YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00920                 YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00921             }
00922 
00923             YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00924         }else{
00925             if (uDest){
00926                 YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
00927                 YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
00928             }
00929 
00930             YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
00931         }
00932         return;
00933     }
00934 #endif
00935 #if HAVE_ALTIVEC
00936 yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
00937                       chrFilter, chrSrc, chrFilterSize,
00938                       dest, uDest, vDest, dstW, chrDstW);
00939 #else //HAVE_ALTIVEC
00940 yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
00941             chrFilter, chrSrc, chrFilterSize,
00942             dest, uDest, vDest, dstW, chrDstW);
00943 #endif //!HAVE_ALTIVEC
00944 }
00945 
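/* RENAME(yuv2nv12X): NV12/NV21 output has no SIMD path in this template
 * and always goes through the C implementation. */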
00946 static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
00947                                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
00948                                      uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
00949 {
00950 yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
00951              chrFilter, chrSrc, chrFilterSize,
00952              dest, uDest, dstW, chrDstW, dstFormat);
00953 }
00954 
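/* RENAME(yuv2yuv1): unscaled (1-tap) vertical path.  The MMX branch runs
 * YSCALEYUV2YV121(_ACCURATE) over the luma plane and, if present, both
 * chroma planes; the C fallback shifts down by 7 with rounding and clips
 * the result to 0..255. */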
00955 static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
00956                                     uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
00957 {
00958     int i;
00959 #if HAVE_MMX
00960     if(!(c->flags & SWS_BITEXACT)){
00961         long p= uDest ? 3 : 1;
00962         uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
00963         uint8_t *dst[3]= {dest, uDest, vDest};
00964         long counter[3] = {dstW, chrDstW, chrDstW};
00965 
00966         if (c->flags & SWS_ACCURATE_RND){
00967             while(p--){
00968                 __asm__ volatile(
00969                     YSCALEYUV2YV121_ACCURATE
00970                     :: "r" (src[p]), "r" (dst[p] + counter[p]),
00971                     "g" (-counter[p])
00972                     : "%"REG_a
00973                 );
00974             }
00975         }else{
00976             while(p--){
00977                 __asm__ volatile(
00978                     YSCALEYUV2YV121
00979                     :: "r" (src[p]), "r" (dst[p] + counter[p]),
00980                     "g" (-counter[p])
00981                     : "%"REG_a
00982                 );
00983             }
00984         }
00985         return;
00986     }
00987 #endif
00988     for (i=0; i<dstW; i++)
00989     {
00990         int val= (lumSrc[i]+64)>>7;
00991 
00992         if (val&256){
00993             if (val<0) val=0;
00994             else       val=255;
00995         }
00996 
00997         dest[i]= val;
00998     }
00999 
01000     if (uDest)
01001         for (i=0; i<chrDstW; i++)
01002         {
01003             int u=(chrSrc[i       ]+64)>>7;
01004             int v=(chrSrc[i + VOFW]+64)>>7;
01005 
01006             if ((u|v)&256){
01007                 if (u<0)        u=0;
01008                 else if (u>255) u=255;
01009                 if (v<0)        v=0;
01010                 else if (v>255) v=255;
01011             }
01012 
01013             uDest[i]= u;
01014             vDest[i]= v;
01015         }
01016 }
01017 
01018 
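/* RENAME(yuv2packedX): vertical scaling straight to a packed output format
 * (RGB32, BGR24, RGB555, RGB565, YUYV422).  The MMX paths combine the
 * YSCALEYUV2PACKEDX* macros with the matching WRITE* stage; other formats
 * fall through to the AltiVec path or the generic C code. */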
01022 static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
01023                                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
01024                                        uint8_t *dest, long dstW, long dstY)
01025 {
01026 #if HAVE_MMX
01027     long dummy=0;
01028     if(!(c->flags & SWS_BITEXACT)){
01029         if (c->flags & SWS_ACCURATE_RND){
01030             switch(c->dstFormat){
01031             case PIX_FMT_RGB32:
01032                 YSCALEYUV2PACKEDX_ACCURATE
01033                 YSCALEYUV2RGBX
01034                 "pcmpeqd %%mm7, %%mm7 \n\t"
01035                 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01036 
01037                 YSCALEYUV2PACKEDX_END
01038                 return;
01039             case PIX_FMT_BGR24:
01040                 YSCALEYUV2PACKEDX_ACCURATE
01041                 YSCALEYUV2RGBX
01042                 "pxor %%mm7, %%mm7 \n\t"
01043                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
01044                 "add %4, %%"REG_c"                        \n\t"
01045                 WRITEBGR24(%%REGc, %5, %%REGa)
01046 
01047 
01048                 :: "r" (&c->redDither),
01049                 "m" (dummy), "m" (dummy), "m" (dummy),
01050                 "r" (dest), "m" (dstW)
01051                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
01052                 );
01053                 return;
01054             case PIX_FMT_RGB555:
01055                 YSCALEYUV2PACKEDX_ACCURATE
01056                 YSCALEYUV2RGBX
01057                 "pxor %%mm7, %%mm7 \n\t"
01058                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01059 #ifdef DITHER1XBPP
01060                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
01061                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
01062                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
01063 #endif
01064 
01065                 WRITERGB15(%4, %5, %%REGa)
01066                 YSCALEYUV2PACKEDX_END
01067                 return;
01068             case PIX_FMT_RGB565:
01069                 YSCALEYUV2PACKEDX_ACCURATE
01070                 YSCALEYUV2RGBX
01071                 "pxor %%mm7, %%mm7 \n\t"
01072                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01073 #ifdef DITHER1XBPP
01074                 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
01075                 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
01076                 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
01077 #endif
01078 
01079                 WRITERGB16(%4, %5, %%REGa)
01080                 YSCALEYUV2PACKEDX_END
01081                 return;
01082             case PIX_FMT_YUYV422:
01083                 YSCALEYUV2PACKEDX_ACCURATE
01084                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01085 
01086                 "psraw $3, %%mm3    \n\t"
01087                 "psraw $3, %%mm4    \n\t"
01088                 "psraw $3, %%mm1    \n\t"
01089                 "psraw $3, %%mm7    \n\t"
01090                 WRITEYUY2(%4, %5, %%REGa)
01091                 YSCALEYUV2PACKEDX_END
01092                 return;
01093             }
01094         }else{
01095             switch(c->dstFormat)
01096             {
01097             case PIX_FMT_RGB32:
01098                 YSCALEYUV2PACKEDX
01099                 YSCALEYUV2RGBX
01100                 "pcmpeqd %%mm7, %%mm7 \n\t"
01101                 WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01102                 YSCALEYUV2PACKEDX_END
01103                 return;
01104             case PIX_FMT_BGR24:
01105                 YSCALEYUV2PACKEDX
01106                 YSCALEYUV2RGBX
01107                 "pxor                    %%mm7, %%mm7       \n\t"
01108                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
01109                 "add                        %4, %%"REG_c"   \n\t"
01110                 WRITEBGR24(%%REGc, %5, %%REGa)
01111 
01112                 :: "r" (&c->redDither),
01113                 "m" (dummy), "m" (dummy), "m" (dummy),
01114                 "r" (dest),  "m" (dstW)
01115                 : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
01116                 );
01117                 return;
01118             case PIX_FMT_RGB555:
01119                 YSCALEYUV2PACKEDX
01120                 YSCALEYUV2RGBX
01121                 "pxor %%mm7, %%mm7 \n\t"
01122                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01123 #ifdef DITHER1XBPP
01124                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
01125                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
01126                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
01127 #endif
01128 
01129                 WRITERGB15(%4, %5, %%REGa)
01130                 YSCALEYUV2PACKEDX_END
01131                 return;
01132             case PIX_FMT_RGB565:
01133                 YSCALEYUV2PACKEDX
01134                 YSCALEYUV2RGBX
01135                 "pxor %%mm7, %%mm7 \n\t"
01136                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01137 #ifdef DITHER1XBPP
01138                 "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
01139                 "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
01140                 "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
01141 #endif
01142 
01143                 WRITERGB16(%4, %5, %%REGa)
01144                 YSCALEYUV2PACKEDX_END
01145                 return;
01146             case PIX_FMT_YUYV422:
01147                 YSCALEYUV2PACKEDX
01148                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01149 
01150                 "psraw $3, %%mm3    \n\t"
01151                 "psraw $3, %%mm4    \n\t"
01152                 "psraw $3, %%mm1    \n\t"
01153                 "psraw $3, %%mm7    \n\t"
01154                 WRITEYUY2(%4, %5, %%REGa)
01155                 YSCALEYUV2PACKEDX_END
01156                 return;
01157             }
01158         }
01159     }
01160 #endif /* HAVE_MMX */
01161 #if HAVE_ALTIVEC
01162     /* The following list of supported dstFormat values should
01163        match what's found in the body of altivec_yuv2packedX() */
01164     if (!(c->flags & SWS_BITEXACT) &&
01165        (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
01166         c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
01167         c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
01168             altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
01169                                  chrFilter, chrSrc, chrFilterSize,
01170                                  dest, dstW, dstY);
01171     else
01172 #endif
01173         yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
01174                        chrFilter, chrSrc, chrFilterSize,
01175                        dest, dstW, dstY);
01176 }
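/* Illustrative note: with SWS_BITEXACT set, or for any dstFormat not handled
 * by the MMX/AltiVec branches above, the call falls through to the generic
 * yuv2packedXinC(). A caller wanting that reproducible C path could, for
 * example, request it when creating the context (a sketch, not taken from
 * this file; srcW/srcH/dstW/dstH are placeholder sizes):
 *
 *     struct SwsContext *ctx = sws_getContext(srcW, srcH, PIX_FMT_YUV420P,
 *                                             dstW, dstH, PIX_FMT_RGB32,
 *                                             SWS_BILINEAR | SWS_BITEXACT,
 *                                             NULL, NULL, NULL);
 */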
01177 
01181 static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
01182                           uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
01183 {
01184     int  yalpha1=4095- yalpha;
01185     int uvalpha1=4095-uvalpha;
01186     int i;
01187 
01188 #if HAVE_MMX
01189     if(!(c->flags & SWS_BITEXACT)){
01190         switch(c->dstFormat)
01191         {
01192             //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
01193             case PIX_FMT_RGB32:
01194                 __asm__ volatile(
01195                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01196                 "mov        %4, %%"REG_b"               \n\t"
01197                 "push %%"REG_BP"                        \n\t"
01198                 YSCALEYUV2RGB(%%REGBP, %5)
01199                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01200                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01201                 "pop %%"REG_BP"                         \n\t"
01202                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01203 
01204                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01205                 "a" (&c->redDither)
01206                 );
01207                 return;
01208             case PIX_FMT_BGR24:
01209                 __asm__ volatile(
01210                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01211                 "mov        %4, %%"REG_b"               \n\t"
01212                 "push %%"REG_BP"                        \n\t"
01213                 YSCALEYUV2RGB(%%REGBP, %5)
01214                 "pxor    %%mm7, %%mm7                   \n\t"
01215                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01216                 "pop %%"REG_BP"                         \n\t"
01217                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01218                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01219                 "a" (&c->redDither)
01220                 );
01221                 return;
01222             case PIX_FMT_RGB555:
01223                 __asm__ volatile(
01224                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01225                 "mov        %4, %%"REG_b"               \n\t"
01226                 "push %%"REG_BP"                        \n\t"
01227                 YSCALEYUV2RGB(%%REGBP, %5)
01228                 "pxor    %%mm7, %%mm7                   \n\t"
01229                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01230 #ifdef DITHER1XBPP
01231                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01232                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01233                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01234 #endif
01235 
01236                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01237                 "pop %%"REG_BP"                         \n\t"
01238                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01239 
01240                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01241                 "a" (&c->redDither)
01242                 );
01243                 return;
01244             case PIX_FMT_RGB565:
01245                 __asm__ volatile(
01246                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01247                 "mov        %4, %%"REG_b"               \n\t"
01248                 "push %%"REG_BP"                        \n\t"
01249                 YSCALEYUV2RGB(%%REGBP, %5)
01250                 "pxor    %%mm7, %%mm7                   \n\t"
01251                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01252 #ifdef DITHER1XBPP
01253                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01254                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01255                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01256 #endif
01257 
01258                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01259                 "pop %%"REG_BP"                         \n\t"
01260                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01261                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01262                 "a" (&c->redDither)
01263                 );
01264                 return;
01265             case PIX_FMT_YUYV422:
01266                 __asm__ volatile(
01267                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01268                 "mov %4, %%"REG_b"                        \n\t"
01269                 "push %%"REG_BP"                        \n\t"
01270                 YSCALEYUV2PACKED(%%REGBP, %5)
01271                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01272                 "pop %%"REG_BP"                         \n\t"
01273                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01274                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01275                 "a" (&c->redDither)
01276                 );
01277                 return;
01278             default: break;
01279         }
01280     }
01281 #endif //HAVE_MMX
01282 YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
01283 }
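/* Rough scalar outline of the two-line blend performed above (a sketch
 * assuming the usual swscale scaling: buf0/buf1 hold 15-bit samples and
 * yalpha/uvalpha are 12-bit fractions, hence yalpha1 = 4095 - yalpha):
 *
 *     Y = (buf0[i]   * yalpha1  + buf1[i]   * yalpha ) >> 19;
 *     U = (uvbuf0[i] * uvalpha1 + uvbuf1[i] * uvalpha) >> 19;
 *
 * i.e. a linear interpolation between the two horizontally scaled input
 * lines, which the YSCALE_YUV_2_*2_C fallback invoked above mirrors in C. */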
01284 
01288 static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
01289                           uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
01290 {
01291     const int yalpha1=0;
01292     int i;
01293 
01294     uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
01295     const int yalpha= 4096; //FIXME ...
01296 
01297     if (flags&SWS_FULL_CHR_H_INT)
01298     {
01299         RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
01300         return;
01301     }
01302 
01303 #if HAVE_MMX
01304     if(!(flags & SWS_BITEXACT)){
01305         if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
01306         {
01307             switch(dstFormat)
01308             {
01309             case PIX_FMT_RGB32:
01310                 __asm__ volatile(
01311                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01312                 "mov        %4, %%"REG_b"               \n\t"
01313                 "push %%"REG_BP"                        \n\t"
01314                 YSCALEYUV2RGB1(%%REGBP, %5)
01315                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01316                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01317                 "pop %%"REG_BP"                         \n\t"
01318                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01319 
01320                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01321                 "a" (&c->redDither)
01322                 );
01323                 return;
01324             case PIX_FMT_BGR24:
01325                 __asm__ volatile(
01326                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01327                 "mov        %4, %%"REG_b"               \n\t"
01328                 "push %%"REG_BP"                        \n\t"
01329                 YSCALEYUV2RGB1(%%REGBP, %5)
01330                 "pxor    %%mm7, %%mm7                   \n\t"
01331                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01332                 "pop %%"REG_BP"                         \n\t"
01333                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01334 
01335                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01336                 "a" (&c->redDither)
01337                 );
01338                 return;
01339             case PIX_FMT_RGB555:
01340                 __asm__ volatile(
01341                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01342                 "mov        %4, %%"REG_b"               \n\t"
01343                 "push %%"REG_BP"                        \n\t"
01344                 YSCALEYUV2RGB1(%%REGBP, %5)
01345                 "pxor    %%mm7, %%mm7                   \n\t"
01346                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01347 #ifdef DITHER1XBPP
01348                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01349                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01350                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01351 #endif
01352                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01353                 "pop %%"REG_BP"                         \n\t"
01354                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01355 
01356                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01357                 "a" (&c->redDither)
01358                 );
01359                 return;
01360             case PIX_FMT_RGB565:
01361                 __asm__ volatile(
01362                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01363                 "mov        %4, %%"REG_b"               \n\t"
01364                 "push %%"REG_BP"                        \n\t"
01365                 YSCALEYUV2RGB1(%%REGBP, %5)
01366                 "pxor    %%mm7, %%mm7                   \n\t"
01367                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01368 #ifdef DITHER1XBPP
01369                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01370                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01371                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01372 #endif
01373 
01374                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01375                 "pop %%"REG_BP"                         \n\t"
01376                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01377 
01378                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01379                 "a" (&c->redDither)
01380                 );
01381                 return;
01382             case PIX_FMT_YUYV422:
01383                 __asm__ volatile(
01384                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01385                 "mov        %4, %%"REG_b"               \n\t"
01386                 "push %%"REG_BP"                        \n\t"
01387                 YSCALEYUV2PACKED1(%%REGBP, %5)
01388                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01389                 "pop %%"REG_BP"                         \n\t"
01390                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01391 
01392                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01393                 "a" (&c->redDither)
01394                 );
01395                 return;
01396             }
01397         }
01398         else
01399         {
01400             switch(dstFormat)
01401             {
01402             case PIX_FMT_RGB32:
01403                 __asm__ volatile(
01404                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01405                 "mov        %4, %%"REG_b"               \n\t"
01406                 "push %%"REG_BP"                        \n\t"
01407                 YSCALEYUV2RGB1b(%%REGBP, %5)
01408                 "pcmpeqd %%mm7, %%mm7                   \n\t"
01409                 WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
01410                 "pop %%"REG_BP"                         \n\t"
01411                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01412 
01413                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01414                 "a" (&c->redDither)
01415                 );
01416                 return;
01417             case PIX_FMT_BGR24:
01418                 __asm__ volatile(
01419                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01420                 "mov        %4, %%"REG_b"               \n\t"
01421                 "push %%"REG_BP"                        \n\t"
01422                 YSCALEYUV2RGB1b(%%REGBP, %5)
01423                 "pxor    %%mm7, %%mm7                   \n\t"
01424                 WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
01425                 "pop %%"REG_BP"                         \n\t"
01426                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01427 
01428                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01429                 "a" (&c->redDither)
01430                 );
01431                 return;
01432             case PIX_FMT_RGB555:
01433                 __asm__ volatile(
01434                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01435                 "mov        %4, %%"REG_b"               \n\t"
01436                 "push %%"REG_BP"                        \n\t"
01437                 YSCALEYUV2RGB1b(%%REGBP, %5)
01438                 "pxor    %%mm7, %%mm7                   \n\t"
01439                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01440 #ifdef DITHER1XBPP
01441                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01442                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01443                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01444 #endif
01445                 WRITERGB15(%%REGb, 8280(%5), %%REGBP)
01446                 "pop %%"REG_BP"                         \n\t"
01447                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01448 
01449                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01450                 "a" (&c->redDither)
01451                 );
01452                 return;
01453             case PIX_FMT_RGB565:
01454                 __asm__ volatile(
01455                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01456                 "mov        %4, %%"REG_b"               \n\t"
01457                 "push %%"REG_BP"                        \n\t"
01458                 YSCALEYUV2RGB1b(%%REGBP, %5)
01459                 "pxor    %%mm7, %%mm7                   \n\t"
01460                 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
01461 #ifdef DITHER1XBPP
01462                 "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
01463                 "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
01464                 "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
01465 #endif
01466 
01467                 WRITERGB16(%%REGb, 8280(%5), %%REGBP)
01468                 "pop %%"REG_BP"                         \n\t"
01469                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01470 
01471                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01472                 "a" (&c->redDither)
01473                 );
01474                 return;
01475             case PIX_FMT_YUYV422:
01476                 __asm__ volatile(
01477                 "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
01478                 "mov        %4, %%"REG_b"               \n\t"
01479                 "push %%"REG_BP"                        \n\t"
01480                 YSCALEYUV2PACKED1b(%%REGBP, %5)
01481                 WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
01482                 "pop %%"REG_BP"                         \n\t"
01483                 "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
01484 
01485                 :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
01486                 "a" (&c->redDither)
01487                 );
01488                 return;
01489             }
01490         }
01491     }
01492 #endif /* HAVE_MMX */
01493     if (uvalpha < 2048)
01494     {
01495         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
01496     }else{
01497         YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
01498     }
01499 }
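/* Note on the uvalpha < 2048 split above: the "1" variants (YSCALEYUV2RGB1,
 * YSCALE_YUV_2_*1_C) read chroma from uvbuf0 only, which is cheaper but, as
 * the comment in the MMX block says, effectively shifts chrominance by half
 * a pixel vertically; the "1b" variants average uvbuf0 and uvbuf1 instead,
 * roughly U = (uvbuf0[i] + uvbuf1[i]) / 2 before the usual fixed-point
 * scaling (a sketch; the exact shifts live in the macros). */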
01500 
01501 //FIXME yuy2* can read up to 7 samples too many
01502 
01503 static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01504 {
01505 #if HAVE_MMX
01506     __asm__ volatile(
01507     "movq "MANGLE(bm01010101)", %%mm2           \n\t"
01508     "mov                    %0, %%"REG_a"       \n\t"
01509     "1:                                         \n\t"
01510     "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
01511     "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
01512     "pand                %%mm2, %%mm0           \n\t"
01513     "pand                %%mm2, %%mm1           \n\t"
01514     "packuswb            %%mm1, %%mm0           \n\t"
01515     "movq                %%mm0, (%2, %%"REG_a") \n\t"
01516     "add                    $8, %%"REG_a"       \n\t"
01517     " js                    1b                  \n\t"
01518     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
01519     : "%"REG_a
01520     );
01521 #else
01522     int i;
01523     for (i=0; i<width; i++)
01524         dst[i]= src[2*i];
01525 #endif
01526 }
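/* The MMX loop above uses a common negative-index idiom: both pointers are
 * pre-advanced to the end of their arrays and REG_a counts up from -width
 * to 0, so a single add+js both advances and terminates the loop, while the
 * pand with bm01010101 masks away the chroma bytes so only luma remains.
 * A scalar equivalent (sketch, assuming width > 0):
 *
 *     long i;
 *     for (i = -width; i < 0; i++)
 *         dst[width + i] = src[2 * (width + i)];
 */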
01527 
01528 static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01529 {
01530 #if HAVE_MMX
01531     __asm__ volatile(
01532     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
01533     "mov                    %0, %%"REG_a"       \n\t"
01534     "1:                                         \n\t"
01535     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
01536     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
01537     "psrlw                  $8, %%mm0           \n\t"
01538     "psrlw                  $8, %%mm1           \n\t"
01539     "packuswb            %%mm1, %%mm0           \n\t"
01540     "movq                %%mm0, %%mm1           \n\t"
01541     "psrlw                  $8, %%mm0           \n\t"
01542     "pand                %%mm4, %%mm1           \n\t"
01543     "packuswb            %%mm0, %%mm0           \n\t"
01544     "packuswb            %%mm1, %%mm1           \n\t"
01545     "movd                %%mm0, (%3, %%"REG_a") \n\t"
01546     "movd                %%mm1, (%2, %%"REG_a") \n\t"
01547     "add                    $4, %%"REG_a"       \n\t"
01548     " js                    1b                  \n\t"
01549     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
01550     : "%"REG_a
01551     );
01552 #else
01553     int i;
01554     for (i=0; i<width; i++)
01555     {
01556         dstU[i]= src1[4*i + 1];
01557         dstV[i]= src1[4*i + 3];
01558     }
01559 #endif
01560     assert(src1 == src2);
01561 }
01562 
01563 /* This is almost identical to the previous function and exists only because
01564  * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
01565 static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01566 {
01567 #if HAVE_MMX
01568     __asm__ volatile(
01569     "mov                  %0, %%"REG_a"         \n\t"
01570     "1:                                         \n\t"
01571     "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
01572     "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
01573     "psrlw                $8, %%mm0             \n\t"
01574     "psrlw                $8, %%mm1             \n\t"
01575     "packuswb          %%mm1, %%mm0             \n\t"
01576     "movq              %%mm0, (%2, %%"REG_a")   \n\t"
01577     "add                  $8, %%"REG_a"         \n\t"
01578     " js                  1b                    \n\t"
01579     : : "g" (-width), "r" (src+width*2), "r" (dst+width)
01580     : "%"REG_a
01581     );
01582 #else
01583     int i;
01584     for (i=0; i<width; i++)
01585         dst[i]= src[2*i+1];
01586 #endif
01587 }
01588 
01589 static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01590 {
01591 #if HAVE_MMX
01592     __asm__ volatile(
01593     "movq "MANGLE(bm01010101)", %%mm4           \n\t"
01594     "mov                    %0, %%"REG_a"       \n\t"
01595     "1:                                         \n\t"
01596     "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
01597     "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
01598     "pand                %%mm4, %%mm0           \n\t"
01599     "pand                %%mm4, %%mm1           \n\t"
01600     "packuswb            %%mm1, %%mm0           \n\t"
01601     "movq                %%mm0, %%mm1           \n\t"
01602     "psrlw                  $8, %%mm0           \n\t"
01603     "pand                %%mm4, %%mm1           \n\t"
01604     "packuswb            %%mm0, %%mm0           \n\t"
01605     "packuswb            %%mm1, %%mm1           \n\t"
01606     "movd                %%mm0, (%3, %%"REG_a") \n\t"
01607     "movd                %%mm1, (%2, %%"REG_a") \n\t"
01608     "add                    $4, %%"REG_a"       \n\t"
01609     " js                    1b                  \n\t"
01610     : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
01611     : "%"REG_a
01612     );
01613 #else
01614     int i;
01615     for (i=0; i<width; i++)
01616     {
01617         dstU[i]= src1[4*i + 0];
01618         dstV[i]= src1[4*i + 2];
01619     }
01620 #endif
01621     assert(src1 == src2);
01622 }
01623 
01624 #define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
01625 static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
01626 {\
01627     int i;\
01628     for (i=0; i<width; i++)\
01629     {\
01630         int b= (((type*)src)[i]>>shb)&maskb;\
01631         int g= (((type*)src)[i]>>shg)&maskg;\
01632         int r= (((type*)src)[i]>>shr)&maskr;\
01633 \
01634         dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
01635     }\
01636 }
01637 
01638 BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
01639 BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
01640 BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
01641 BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
01642 BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
01643 BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
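/* Sketch of the reasoning behind the instantiations above: each channel is
 * masked in place at its packed bit position, the coefficient is pre-shifted
 * so every product lands on the scale of an 8-bit value << 8 (or << 7 for
 * the 15-bit formats), and the enlarged shift S = RGB2YUV_SHIFT+8 (or +7)
 * normalizes the result; the 5->8 / 6->8 bit expansion is thus approximated
 * by a plain left shift, letting the 8-bit RY/GY/BY constants be reused. */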
01644 
01645 #define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
01646 static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
01647 {\
01648     int i;\
01649     for (i=0; i<width; i++)\
01650     {\
01651         int b= (((type*)src)[i]&maskb)>>shb;\
01652         int g= (((type*)src)[i]&maskg)>>shg;\
01653         int r= (((type*)src)[i]&maskr)>>shr;\
01654 \
01655         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
01656         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
01657     }\
01658 }\
01659 static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
01660 {\
01661     int i;\
01662     for (i=0; i<width; i++)\
01663     {\
01664         int pix0= ((type*)src)[2*i+0];\
01665         int pix1= ((type*)src)[2*i+1];\
01666         int g= (pix0&(maskg|maska))+(pix1&(maskg|maska));\
01667         int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
01668         int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
01669         g&= maskg|(2*maskg);\
01670 \
01671         g>>=shg;\
01672 \
01673         dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
01674         dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
01675     }\
01676 }
01677 
01678 BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
01679 BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
01680 BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
01681 BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
01682 BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
01683 BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
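/* The *_half variants above average two horizontally adjacent pixels for
 * chroma subsampling without unpacking the channels: the green (and alpha)
 * bits of both pixels are summed first and subtracted from pix0+pix1, so the
 * remaining red and blue sums can be extracted with the one-bit-wider masks
 * maskr|(2*maskr) and maskb|(2*maskb); the division by two is folded into
 * the final >>((S)+1). */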
01684 
01685 #if HAVE_MMX
01686 static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
01687 {
01688 
01689     if(srcFormat == PIX_FMT_BGR24){
01690         __asm__ volatile(
01691             "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
01692             "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
01693             :
01694         );
01695     }else{
01696         __asm__ volatile(
01697             "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
01698             "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
01699             :
01700         );
01701     }
01702 
01703     __asm__ volatile(
01704         "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
01705         "mov                        %2, %%"REG_a"   \n\t"
01706         "pxor                    %%mm7, %%mm7       \n\t"
01707         "1:                                         \n\t"
01708         PREFETCH"               64(%0)              \n\t"
01709         "movd                     (%0), %%mm0       \n\t"
01710         "movd                    2(%0), %%mm1       \n\t"
01711         "movd                    6(%0), %%mm2       \n\t"
01712         "movd                    8(%0), %%mm3       \n\t"
01713         "add                       $12, %0          \n\t"
01714         "punpcklbw               %%mm7, %%mm0       \n\t"
01715         "punpcklbw               %%mm7, %%mm1       \n\t"
01716         "punpcklbw               %%mm7, %%mm2       \n\t"
01717         "punpcklbw               %%mm7, %%mm3       \n\t"
01718         "pmaddwd                 %%mm5, %%mm0       \n\t"
01719         "pmaddwd                 %%mm6, %%mm1       \n\t"
01720         "pmaddwd                 %%mm5, %%mm2       \n\t"
01721         "pmaddwd                 %%mm6, %%mm3       \n\t"
01722         "paddd                   %%mm1, %%mm0       \n\t"
01723         "paddd                   %%mm3, %%mm2       \n\t"
01724         "paddd                   %%mm4, %%mm0       \n\t"
01725         "paddd                   %%mm4, %%mm2       \n\t"
01726         "psrad                     $15, %%mm0       \n\t"
01727         "psrad                     $15, %%mm2       \n\t"
01728         "packssdw                %%mm2, %%mm0       \n\t"
01729         "packuswb                %%mm0, %%mm0       \n\t"
01730         "movd                %%mm0, (%1, %%"REG_a") \n\t"
01731         "add                        $4, %%"REG_a"   \n\t"
01732         " js                        1b              \n\t"
01733     : "+r" (src)
01734     : "r" (dst+width), "g" (-width)
01735     : "%"REG_a
01736     );
01737 }
01738 
01739 static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
01740 {
01741     __asm__ volatile(
01742         "movq                   24(%4), %%mm6       \n\t"
01743         "mov                        %3, %%"REG_a"   \n\t"
01744         "pxor                    %%mm7, %%mm7       \n\t"
01745         "1:                                         \n\t"
01746         PREFETCH"               64(%0)              \n\t"
01747         "movd                     (%0), %%mm0       \n\t"
01748         "movd                    2(%0), %%mm1       \n\t"
01749         "punpcklbw               %%mm7, %%mm0       \n\t"
01750         "punpcklbw               %%mm7, %%mm1       \n\t"
01751         "movq                    %%mm0, %%mm2       \n\t"
01752         "movq                    %%mm1, %%mm3       \n\t"
01753         "pmaddwd                  (%4), %%mm0       \n\t"
01754         "pmaddwd                 8(%4), %%mm1       \n\t"
01755         "pmaddwd                16(%4), %%mm2       \n\t"
01756         "pmaddwd                 %%mm6, %%mm3       \n\t"
01757         "paddd                   %%mm1, %%mm0       \n\t"
01758         "paddd                   %%mm3, %%mm2       \n\t"
01759 
01760         "movd                    6(%0), %%mm1       \n\t"
01761         "movd                    8(%0), %%mm3       \n\t"
01762         "add                       $12, %0          \n\t"
01763         "punpcklbw               %%mm7, %%mm1       \n\t"
01764         "punpcklbw               %%mm7, %%mm3       \n\t"
01765         "movq                    %%mm1, %%mm4       \n\t"
01766         "movq                    %%mm3, %%mm5       \n\t"
01767         "pmaddwd                  (%4), %%mm1       \n\t"
01768         "pmaddwd                 8(%4), %%mm3       \n\t"
01769         "pmaddwd                16(%4), %%mm4       \n\t"
01770         "pmaddwd                 %%mm6, %%mm5       \n\t"
01771         "paddd                   %%mm3, %%mm1       \n\t"
01772         "paddd                   %%mm5, %%mm4       \n\t"
01773 
01774         "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
01775         "paddd                   %%mm3, %%mm0       \n\t"
01776         "paddd                   %%mm3, %%mm2       \n\t"
01777         "paddd                   %%mm3, %%mm1       \n\t"
01778         "paddd                   %%mm3, %%mm4       \n\t"
01779         "psrad                     $15, %%mm0       \n\t"
01780         "psrad                     $15, %%mm2       \n\t"
01781         "psrad                     $15, %%mm1       \n\t"
01782         "psrad                     $15, %%mm4       \n\t"
01783         "packssdw                %%mm1, %%mm0       \n\t"
01784         "packssdw                %%mm4, %%mm2       \n\t"
01785         "packuswb                %%mm0, %%mm0       \n\t"
01786         "packuswb                %%mm2, %%mm2       \n\t"
01787         "movd                %%mm0, (%1, %%"REG_a") \n\t"
01788         "movd                %%mm2, (%2, %%"REG_a") \n\t"
01789         "add                        $4, %%"REG_a"   \n\t"
01790         " js                        1b              \n\t"
01791     : "+r" (src)
01792     : "r" (dstU+width), "r" (dstV+width), "g" (-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
01793     : "%"REG_a
01794     );
01795 }
01796 #endif
01797 
01798 static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01799 {
01800 #if HAVE_MMX
01801     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
01802 #else
01803     int i;
01804     for (i=0; i<width; i++)
01805     {
01806         int b= src[i*3+0];
01807         int g= src[i*3+1];
01808         int r= src[i*3+2];
01809 
01810         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
01811     }
01812 #endif /* HAVE_MMX */
01813 }
01814 
01815 static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01816 {
01817 #if HAVE_MMX
01818     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
01819 #else
01820     int i;
01821     for (i=0; i<width; i++)
01822     {
01823         int b= src1[3*i + 0];
01824         int g= src1[3*i + 1];
01825         int r= src1[3*i + 2];
01826 
01827         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01828         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01829     }
01830 #endif /* HAVE_MMX */
01831     assert(src1 == src2);
01832 }
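/* Worked note on the rounding constants used above and in the BGR2Y/BGR2UV
 * macros: 33<<(RGB2YUV_SHIFT-1) equals 16.5<<RGB2YUV_SHIFT, i.e. the +16
 * luma offset plus 0.5 for round-to-nearest, and 257<<(RGB2YUV_SHIFT-1)
 * equals 128.5<<RGB2YUV_SHIFT, the +128 chroma offset plus 0.5; the *_half
 * variants use 257<<RGB2YUV_SHIFT with one extra bit of shift because their
 * r, g and b are sums over two pixels. */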
01833 
01834 static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01835 {
01836     int i;
01837     for (i=0; i<width; i++)
01838     {
01839         int b= src1[6*i + 0] + src1[6*i + 3];
01840         int g= src1[6*i + 1] + src1[6*i + 4];
01841         int r= src1[6*i + 2] + src1[6*i + 5];
01842 
01843         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01844         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01845     }
01846     assert(src1 == src2);
01847 }
01848 
01849 static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01850 {
01851 #if HAVE_MMX
01852     RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
01853 #else
01854     int i;
01855     for (i=0; i<width; i++)
01856     {
01857         int r= src[i*3+0];
01858         int g= src[i*3+1];
01859         int b= src[i*3+2];
01860 
01861         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
01862     }
01863 #endif
01864 }
01865 
01866 static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01867 {
01868 #if HAVE_MMX
01869     assert(src1==src2);
01870     RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
01871 #else
01872     int i;
01873     assert(src1==src2);
01874     for (i=0; i<width; i++)
01875     {
01876         int r= src1[3*i + 0];
01877         int g= src1[3*i + 1];
01878         int b= src1[3*i + 2];
01879 
01880         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01881         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
01882     }
01883 #endif
01884 }
01885 
01886 static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
01887 {
01888     int i;
01889     assert(src1==src2);
01890     for (i=0; i<width; i++)
01891     {
01892         int r= src1[6*i + 0] + src1[6*i + 3];
01893         int g= src1[6*i + 1] + src1[6*i + 4];
01894         int b= src1[6*i + 2] + src1[6*i + 5];
01895 
01896         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01897         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
01898     }
01899 }
01900 
01901 
01902 static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
01903 {
01904     int i;
01905     for (i=0; i<width; i++)
01906     {
01907         int d= src[i];
01908 
01909         dst[i]= pal[d] & 0xFF;
01910     }
01911 }
01912 
01913 static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
01914 {
01915     int i;
01916     assert(src1 == src2);
01917     for (i=0; i<width; i++)
01918     {
01919         int p= pal[src1[i]];
01920 
01921         dstU[i]= p>>8;
01922         dstV[i]= p>>16;
01923     }
01924 }
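/* As read by palToY/palToUV above, each pal[] entry is expected to hold a
 * precomputed YUV triplet: Y in bits 0-7, U in bits 8-15, V in bits 16-23
 * (the top byte is not used here). */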
01925 
01926 static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01927 {
01928     int i, j;
01929     for (i=0; i<width/8; i++){
01930         int d= ~src[i];
01931         for(j=0; j<8; j++)
01932             dst[8*i+j]= ((d>>(7-j))&1)*255;
01933     }
01934 }
01935 
01936 static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
01937 {
01938     int i, j;
01939     for (i=0; i<width/8; i++){
01940         int d= src[i];
01941         for(j=0; j<8; j++)
01942             dst[8*i+j]= ((d>>(7-j))&1)*255;
01943     }
01944 }
01945 
01946 // bilinear / bicubic scaling
01947 static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
01948                                   int16_t *filter, int16_t *filterPos, long filterSize)
01949 {
01950 #if HAVE_MMX
01951     assert(filterSize % 4 == 0 && filterSize>0);
01952     if (filterSize==4) // Always true for upscaling, sometimes for down, too.
01953     {
01954         long counter= -2*dstW;
01955         filter-= counter*2;
01956         filterPos-= counter/2;
01957         dst-= counter/2;
01958         __asm__ volatile(
01959 #if defined(PIC)
01960         "push            %%"REG_b"              \n\t"
01961 #endif
01962         "pxor                %%mm7, %%mm7       \n\t"
01963         "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
01964         "mov             %%"REG_a", %%"REG_BP"  \n\t"
01965         ASMALIGN(4)
01966         "1:                                     \n\t"
01967         "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
01968         "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
01969         "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
01970         "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
01971         "movd      (%3, %%"REG_a"), %%mm0       \n\t"
01972         "movd      (%3, %%"REG_b"), %%mm2       \n\t"
01973         "punpcklbw           %%mm7, %%mm0       \n\t"
01974         "punpcklbw           %%mm7, %%mm2       \n\t"
01975         "pmaddwd             %%mm1, %%mm0       \n\t"
01976         "pmaddwd             %%mm2, %%mm3       \n\t"
01977         "movq                %%mm0, %%mm4       \n\t"
01978         "punpckldq           %%mm3, %%mm0       \n\t"
01979         "punpckhdq           %%mm3, %%mm4       \n\t"
01980         "paddd               %%mm4, %%mm0       \n\t"
01981         "psrad                  $7, %%mm0       \n\t"
01982         "packssdw            %%mm0, %%mm0       \n\t"
01983         "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
01984         "add                    $4, %%"REG_BP"  \n\t"
01985         " jnc                   1b              \n\t"
01986 
01987         "pop            %%"REG_BP"              \n\t"
01988 #if defined(PIC)
01989         "pop             %%"REG_b"              \n\t"
01990 #endif
01991         : "+a" (counter)
01992         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
01993 #if !defined(PIC)
01994         : "%"REG_b
01995 #endif
01996         );
01997     }
01998     else if (filterSize==8)
01999     {
02000         long counter= -2*dstW;
02001         filter-= counter*4;
02002         filterPos-= counter/2;
02003         dst-= counter/2;
02004         __asm__ volatile(
02005 #if defined(PIC)
02006         "push             %%"REG_b"             \n\t"
02007 #endif
02008         "pxor                 %%mm7, %%mm7      \n\t"
02009         "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
02010         "mov              %%"REG_a", %%"REG_BP" \n\t"
02011         ASMALIGN(4)
02012         "1:                                     \n\t"
02013         "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
02014         "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
02015         "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
02016         "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
02017         "movd       (%3, %%"REG_a"), %%mm0      \n\t"
02018         "movd       (%3, %%"REG_b"), %%mm2      \n\t"
02019         "punpcklbw            %%mm7, %%mm0      \n\t"
02020         "punpcklbw            %%mm7, %%mm2      \n\t"
02021         "pmaddwd              %%mm1, %%mm0      \n\t"
02022         "pmaddwd              %%mm2, %%mm3      \n\t"
02023 
02024         "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
02025         "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
02026         "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
02027         "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
02028         "punpcklbw            %%mm7, %%mm4      \n\t"
02029         "punpcklbw            %%mm7, %%mm2      \n\t"
02030         "pmaddwd              %%mm1, %%mm4      \n\t"
02031         "pmaddwd              %%mm2, %%mm5      \n\t"
02032         "paddd                %%mm4, %%mm0      \n\t"
02033         "paddd                %%mm5, %%mm3      \n\t"
02034         "movq                 %%mm0, %%mm4      \n\t"
02035         "punpckldq            %%mm3, %%mm0      \n\t"
02036         "punpckhdq            %%mm3, %%mm4      \n\t"
02037         "paddd                %%mm4, %%mm0      \n\t"
02038         "psrad                   $7, %%mm0      \n\t"
02039         "packssdw             %%mm0, %%mm0      \n\t"
02040         "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
02041         "add                     $4, %%"REG_BP" \n\t"
02042         " jnc                    1b             \n\t"
02043 
02044         "pop             %%"REG_BP"             \n\t"
02045 #if defined(PIC)
02046         "pop              %%"REG_b"             \n\t"
02047 #endif
02048         : "+a" (counter)
02049         : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
02050 #if !defined(PIC)
02051         : "%"REG_b
02052 #endif
02053         );
02054     }
02055     else
02056     {
02057         uint8_t *offset = src+filterSize;
02058         long counter= -2*dstW;
02059         //filter-= counter*filterSize/2;
02060         filterPos-= counter/2;
02061         dst-= counter/2;
02062         __asm__ volatile(
02063         "pxor                  %%mm7, %%mm7     \n\t"
02064         ASMALIGN(4)
02065         "1:                                     \n\t"
02066         "mov                      %2, %%"REG_c" \n\t"
02067         "movzwl      (%%"REG_c", %0), %%eax     \n\t"
02068         "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
02069         "mov                      %5, %%"REG_c" \n\t"
02070         "pxor                  %%mm4, %%mm4     \n\t"
02071         "pxor                  %%mm5, %%mm5     \n\t"
02072         "2:                                     \n\t"
02073         "movq                   (%1), %%mm1     \n\t"
02074         "movq               (%1, %6), %%mm3     \n\t"
02075         "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
02076         "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
02077         "punpcklbw             %%mm7, %%mm0     \n\t"
02078         "punpcklbw             %%mm7, %%mm2     \n\t"
02079         "pmaddwd               %%mm1, %%mm0     \n\t"
02080         "pmaddwd               %%mm2, %%mm3     \n\t"
02081         "paddd                 %%mm3, %%mm5     \n\t"
02082         "paddd                 %%mm0, %%mm4     \n\t"
02083         "add                      $8, %1        \n\t"
02084         "add                      $4, %%"REG_c" \n\t"
02085         "cmp                      %4, %%"REG_c" \n\t"
02086         " jb                      2b            \n\t"
02087         "add                      %6, %1        \n\t"
02088         "movq                  %%mm4, %%mm0     \n\t"
02089         "punpckldq             %%mm5, %%mm4     \n\t"
02090         "punpckhdq             %%mm5, %%mm0     \n\t"
02091         "paddd                 %%mm0, %%mm4     \n\t"
02092         "psrad                    $7, %%mm4     \n\t"
02093         "packssdw              %%mm4, %%mm4     \n\t"
02094         "mov                      %3, %%"REG_a" \n\t"
02095         "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
02096         "add                      $4, %0        \n\t"
02097         " jnc                     1b            \n\t"
02098 
02099         : "+r" (counter), "+r" (filter)
02100         : "m" (filterPos), "m" (dst), "m"(offset),
02101           "m" (src), "r" (filterSize*2)
02102         : "%"REG_a, "%"REG_c, "%"REG_d
02103         );
02104     }
02105 #else
02106 #if HAVE_ALTIVEC
02107     hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
02108 #else
02109     int i;
02110     for (i=0; i<dstW; i++)
02111     {
02112         int j;
02113         int srcPos= filterPos[i];
02114         int val=0;
02115         //printf("filterPos: %d\n", filterPos[i]);
02116         for (j=0; j<filterSize; j++)
02117         {
02118             //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
02119             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
02120         }
02121         //filter += hFilterSize;
02122         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
02123         //dst[i] = val>>7;
02124     }
02125 #endif /* HAVE_ALTIVEC */
02126 #endif /* HAVE_MMX */
02127 }
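/* Filter layout consumed above (as the C reference path shows): filter holds
 * dstW*filterSize int16 coefficients, filterPos[i] gives the first source
 * sample for output i, and the result is kept as a 15-bit intermediate
 * (val>>7, clipped to 0x7FFF). The MMX paths merely specialize the common
 * filterSize==4 and filterSize==8 cases and fall back to a generic inner
 * loop otherwise. */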
02128 // *** horizontal scale Y line to temp buffer
02129 static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
02130                                    int flags, int canMMX2BeUsed, int16_t *hLumFilter,
02131                                    int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
02132                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
02133                                    int32_t *mmx2FilterPos, uint32_t *pal)
02134 {
02135     if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
02136     {
02137         RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
02138         src= formatConvBuffer;
02139     }
02140     else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
02141     {
02142         RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
02143         src= formatConvBuffer;
02144     }
02145     else if (srcFormat==PIX_FMT_RGB32)
02146     {
02147         RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
02148         src= formatConvBuffer;
02149     }
02150     else if (srcFormat==PIX_FMT_RGB32_1)
02151     {
02152         RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
02153         src= formatConvBuffer;
02154     }
02155     else if (srcFormat==PIX_FMT_BGR24)
02156     {
02157         RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
02158         src= formatConvBuffer;
02159     }
02160     else if (srcFormat==PIX_FMT_BGR565)
02161     {
02162         RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
02163         src= formatConvBuffer;
02164     }
02165     else if (srcFormat==PIX_FMT_BGR555)
02166     {
02167         RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
02168         src= formatConvBuffer;
02169     }
02170     else if (srcFormat==PIX_FMT_BGR32)
02171     {
02172         RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
02173         src= formatConvBuffer;
02174     }
02175     else if (srcFormat==PIX_FMT_BGR32_1)
02176     {
02177         RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
02178         src= formatConvBuffer;
02179     }
02180     else if (srcFormat==PIX_FMT_RGB24)
02181     {
02182         RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
02183         src= formatConvBuffer;
02184     }
02185     else if (srcFormat==PIX_FMT_RGB565)
02186     {
02187         RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
02188         src= formatConvBuffer;
02189     }
02190     else if (srcFormat==PIX_FMT_RGB555)
02191     {
02192         RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
02193         src= formatConvBuffer;
02194     }
02195     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
02196     {
02197         RENAME(palToY)(formatConvBuffer, src, srcW, pal);
02198         src= formatConvBuffer;
02199     }
02200     else if (srcFormat==PIX_FMT_MONOBLACK)
02201     {
02202         RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
02203         src= formatConvBuffer;
02204     }
02205     else if (srcFormat==PIX_FMT_MONOWHITE)
02206     {
02207         RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
02208         src= formatConvBuffer;
02209     }
02210 
02211 #if HAVE_MMX
02212     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
02213     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
02214 #else
02215     if (!(flags&SWS_FAST_BILINEAR))
02216 #endif
02217     {
02218         RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
02219     }
02220     else // fast bilinear upscale / crap downscale
02221     {
02222 #if ARCH_X86 && CONFIG_GPL
02223 #if HAVE_MMX2
02224         int i;
02225 #if defined(PIC)
02226         uint64_t ebxsave __attribute__((aligned(8)));
02227 #endif
02228         if (canMMX2BeUsed)
02229         {
02230             __asm__ volatile(
02231 #if defined(PIC)
02232             "mov               %%"REG_b", %5        \n\t"
02233 #endif
02234             "pxor                  %%mm7, %%mm7     \n\t"
02235             "mov                      %0, %%"REG_c" \n\t"
02236             "mov                      %1, %%"REG_D" \n\t"
02237             "mov                      %2, %%"REG_d" \n\t"
02238             "mov                      %3, %%"REG_b" \n\t"
02239             "xor               %%"REG_a", %%"REG_a" \n\t" // i
02240             PREFETCH"        (%%"REG_c")            \n\t"
02241             PREFETCH"      32(%%"REG_c")            \n\t"
02242             PREFETCH"      64(%%"REG_c")            \n\t"
02243 
02244 #if ARCH_X86_64
02245 
02246 #define FUNNY_Y_CODE \
02247             "movl            (%%"REG_b"), %%esi     \n\t"\
02248             "call                    *%4            \n\t"\
02249             "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
02250             "add               %%"REG_S", %%"REG_c" \n\t"\
02251             "add               %%"REG_a", %%"REG_D" \n\t"\
02252             "xor               %%"REG_a", %%"REG_a" \n\t"\
02253 
02254 #else
02255 
02256 #define FUNNY_Y_CODE \
02257             "movl (%%"REG_b"), %%esi        \n\t"\
02258             "call         *%4                       \n\t"\
02259             "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
02260             "add               %%"REG_a", %%"REG_D" \n\t"\
02261             "xor               %%"REG_a", %%"REG_a" \n\t"\
02262 
02263 #endif /* ARCH_X86_64 */
02264 
02265 FUNNY_Y_CODE
02266 FUNNY_Y_CODE
02267 FUNNY_Y_CODE
02268 FUNNY_Y_CODE
02269 FUNNY_Y_CODE
02270 FUNNY_Y_CODE
02271 FUNNY_Y_CODE
02272 FUNNY_Y_CODE
02273 
02274 #if defined(PIC)
02275             "mov                      %5, %%"REG_b" \n\t"
02276 #endif
02277             :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
02278             "m" (funnyYCode)
02279 #if defined(PIC)
02280             ,"m" (ebxsave)
02281 #endif
02282             : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
02283 #if !defined(PIC)
02284             ,"%"REG_b
02285 #endif
02286             );
02287             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
02288         }
02289         else
02290         {
02291 #endif /* HAVE_MMX2 */
02292         long xInc_shr16 = xInc >> 16;
02293         uint16_t xInc_mask = xInc & 0xffff;
02294         //NO MMX just normal asm ...
02295         __asm__ volatile(
02296         "xor %%"REG_a", %%"REG_a"            \n\t" // i
02297         "xor %%"REG_d", %%"REG_d"            \n\t" // xx
02298         "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
02299         ASMALIGN(4)
02300         "1:                                  \n\t"
02301         "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
02302         "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
02303         "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
02304         "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
02305         "shll      $16, %%edi                \n\t"
02306         "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02307         "mov        %1, %%"REG_D"            \n\t"
02308         "shrl       $9, %%esi                \n\t"
02309         "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
02310         "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
02311         "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
02312 
02313         "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
02314         "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
02315         "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
02316         "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
02317         "shll      $16, %%edi                \n\t"
02318         "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02319         "mov        %1, %%"REG_D"            \n\t"
02320         "shrl       $9, %%esi                \n\t"
02321         "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
02322         "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
02323         "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
02324 
02325 
02326         "add        $2, %%"REG_a"            \n\t"
02327         "cmp        %2, %%"REG_a"            \n\t"
02328         " jb        1b                       \n\t"
02329 
02330 
02331         :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
02332         : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
02333         );
02334 #if HAVE_MMX2
02335         } //if MMX2 can't be used
02336 #endif
02337 #else
02338         int i;
02339         unsigned int xpos=0;
02340         for (i=0;i<dstWidth;i++)
02341         {
02342             register unsigned int xx=xpos>>16;
02343             register unsigned int xalpha=(xpos&0xFFFF)>>9;
02344             dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
02345             xpos+=xInc;
02346         }
02347 #endif /* ARCH_X86 */
02348     }
02349 
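    /* Luma range conversion on the 15-bit (Y<<7) samples:
       full -> limited range scales by ~219/255 (14071/16384) and adds 16<<7,
       limited -> full scales by ~255/219 (19077/16384) and subtracts the
       corresponding offset, with FFMIN() clamping to keep the result in range. */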
02350     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
02351         int i;
02352         //FIXME all pal and rgb srcFormats could do this conversion as well
02353         //FIXME all scalers more complex than bilinear could do half of this transform
02354         if(c->srcRange){
02355             for (i=0; i<dstWidth; i++)
02356                 dst[i]= (dst[i]*14071 + 33561947)>>14;
02357         }else{
02358             for (i=0; i<dstWidth; i++)
02359                 dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
02360         }
02361     }
02362 }
02363 
02364 inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
02365                                    int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
02366                                    int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
02367                                    int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
02368                                    int32_t *mmx2FilterPos, uint32_t *pal)
02369 {
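    /* For packed YUV and RGB/BGR inputs, extract the chroma into planar form
       first: U goes to formatConvBuffer and V to formatConvBuffer+VOFW, and
       src1/src2 are repointed at these buffers for the horizontal scaler below. */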
02370     if (srcFormat==PIX_FMT_YUYV422)
02371     {
02372         RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02373         src1= formatConvBuffer;
02374         src2= formatConvBuffer+VOFW;
02375     }
02376     else if (srcFormat==PIX_FMT_UYVY422)
02377     {
02378         RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02379         src1= formatConvBuffer;
02380         src2= formatConvBuffer+VOFW;
02381     }
02382     else if (srcFormat==PIX_FMT_RGB32)
02383     {
02384         if(c->chrSrcHSubSample)
02385             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02386         else
02387             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02388         src1= formatConvBuffer;
02389         src2= formatConvBuffer+VOFW;
02390     }
02391     else if (srcFormat==PIX_FMT_RGB32_1)
02392     {
02393         if(c->chrSrcHSubSample)
02394             RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
02395         else
02396             RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
02397         src1= formatConvBuffer;
02398         src2= formatConvBuffer+VOFW;
02399     }
02400     else if (srcFormat==PIX_FMT_BGR24)
02401     {
02402         if(c->chrSrcHSubSample)
02403             RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02404         else
02405             RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02406         src1= formatConvBuffer;
02407         src2= formatConvBuffer+VOFW;
02408     }
02409     else if (srcFormat==PIX_FMT_BGR565)
02410     {
02411         if(c->chrSrcHSubSample)
02412             RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02413         else
02414             RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02415         src1= formatConvBuffer;
02416         src2= formatConvBuffer+VOFW;
02417     }
02418     else if (srcFormat==PIX_FMT_BGR555)
02419     {
02420         if(c->chrSrcHSubSample)
02421             RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02422         else
02423             RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02424         src1= formatConvBuffer;
02425         src2= formatConvBuffer+VOFW;
02426     }
02427     else if (srcFormat==PIX_FMT_BGR32)
02428     {
02429         if(c->chrSrcHSubSample)
02430             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02431         else
02432             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02433         src1= formatConvBuffer;
02434         src2= formatConvBuffer+VOFW;
02435     }
02436     else if (srcFormat==PIX_FMT_BGR32_1)
02437     {
02438         if(c->chrSrcHSubSample)
02439             RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
02440         else
02441             RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
02442         src1= formatConvBuffer;
02443         src2= formatConvBuffer+VOFW;
02444     }
02445     else if (srcFormat==PIX_FMT_RGB24)
02446     {
02447         if(c->chrSrcHSubSample)
02448             RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02449         else
02450             RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02451         src1= formatConvBuffer;
02452         src2= formatConvBuffer+VOFW;
02453     }
02454     else if (srcFormat==PIX_FMT_RGB565)
02455     {
02456         if(c->chrSrcHSubSample)
02457             RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02458         else
02459             RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02460         src1= formatConvBuffer;
02461         src2= formatConvBuffer+VOFW;
02462     }
02463     else if (srcFormat==PIX_FMT_RGB555)
02464     {
02465         if(c->chrSrcHSubSample)
02466             RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02467         else
02468             RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02469         src1= formatConvBuffer;
02470         src2= formatConvBuffer+VOFW;
02471     }
02472     else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
02473     {
02474         return;
02475     }
02476     else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
02477     {
02478         RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
02479         src1= formatConvBuffer;
02480         src2= formatConvBuffer+VOFW;
02481     }
02482 
02483 #if HAVE_MMX
02484     // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
02485     if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
02486 #else
02487     if (!(flags&SWS_FAST_BILINEAR))
02488 #endif
02489     {
02490         RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02491         RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
02492     }
02493     else // fast bilinear upscale / crap downscale
02494     {
02495 #if ARCH_X86 && CONFIG_GPL
02496 #if HAVE_MMX2
02497         int i;
02498 #if defined(PIC)
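        /* With PIC, %ebx holds the GOT pointer and must not be clobbered, so it
           is saved to ebxsave around the inline asm instead of being listed as a
           clobber. */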
02499         uint64_t ebxsave __attribute__((aligned(8)));
02500 #endif
02501         if (canMMX2BeUsed)
02502         {
02503             __asm__ volatile(
02504 #if defined(PIC)
02505             "mov          %%"REG_b", %6         \n\t"
02506 #endif
02507             "pxor             %%mm7, %%mm7      \n\t"
02508             "mov                 %0, %%"REG_c"  \n\t"
02509             "mov                 %1, %%"REG_D"  \n\t"
02510             "mov                 %2, %%"REG_d"  \n\t"
02511             "mov                 %3, %%"REG_b"  \n\t"
02512             "xor          %%"REG_a", %%"REG_a"  \n\t" // i
02513             PREFETCH"   (%%"REG_c")             \n\t"
02514             PREFETCH" 32(%%"REG_c")             \n\t"
02515             PREFETCH" 64(%%"REG_c")             \n\t"
02516 
02517 #if ARCH_X86_64
02518 
02519 #define FUNNY_UV_CODE \
02520             "movl       (%%"REG_b"), %%esi      \n\t"\
02521             "call               *%4             \n\t"\
02522             "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
02523             "add          %%"REG_S", %%"REG_c"  \n\t"\
02524             "add          %%"REG_a", %%"REG_D"  \n\t"\
02525             "xor          %%"REG_a", %%"REG_a"  \n\t"\
02526 
02527 #else
02528 
02529 #define FUNNY_UV_CODE \
02530             "movl       (%%"REG_b"), %%esi      \n\t"\
02531             "call               *%4             \n\t"\
02532             "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
02533             "add          %%"REG_a", %%"REG_D"  \n\t"\
02534             "xor          %%"REG_a", %%"REG_a"  \n\t"\
02535 
02536 #endif /* ARCH_X86_64 */
02537 
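/* The generated MMX2 scaler is called four times for the first chroma plane
   (src1); the source pointer is then switched to src2 and the destination
   advanced by VOF bytes, and it is called four more times for the second plane. */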
02538 FUNNY_UV_CODE
02539 FUNNY_UV_CODE
02540 FUNNY_UV_CODE
02541 FUNNY_UV_CODE
02542             "xor          %%"REG_a", %%"REG_a"  \n\t" // i
02543             "mov                 %5, %%"REG_c"  \n\t" // src
02544             "mov                 %1, %%"REG_D"  \n\t" // buf1
02545             "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
02546             PREFETCH"   (%%"REG_c")             \n\t"
02547             PREFETCH" 32(%%"REG_c")             \n\t"
02548             PREFETCH" 64(%%"REG_c")             \n\t"
02549 
02550 FUNNY_UV_CODE
02551 FUNNY_UV_CODE
02552 FUNNY_UV_CODE
02553 FUNNY_UV_CODE
02554 
02555 #if defined(PIC)
02556             "mov %6, %%"REG_b"    \n\t"
02557 #endif
02558             :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
02559             "m" (funnyUVCode), "m" (src2)
02560 #if defined(PIC)
02561             ,"m" (ebxsave)
02562 #endif
02563             : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
02564 #if !defined(PIC)
02565              ,"%"REG_b
02566 #endif
02567             );
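            /* Fix up the right edge: samples whose source position falls at or
               beyond srcW-1 are rewritten from the last input pixel of each
               plane, scaled into the value<<7 intermediate format. */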
02568             for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
02569             {
02570                 //printf("%d %d %d\n", dstWidth, i, srcW);
02571                 dst[i] = src1[srcW-1]*128;
02572                 dst[i+VOFW] = src2[srcW-1]*128;
02573             }
02574         }
02575         else
02576         {
02577 #endif /* HAVE_MMX2 */
02578             long xInc_shr16 = (long) (xInc >> 16);
02579             uint16_t xInc_mask = xInc & 0xffff;
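            /* Same fixed-point bilinear scheme as the luma path, but each loop
               iteration produces one U sample (stored at offset 0) and one V
               sample (stored at offset VOF) from src1 and src2 respectively. */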
02580             __asm__ volatile(
02581             "xor %%"REG_a", %%"REG_a"               \n\t" // i
02582             "xor %%"REG_d", %%"REG_d"               \n\t" // xx
02583             "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
02584             ASMALIGN(4)
02585             "1:                                     \n\t"
02586             "mov        %0, %%"REG_S"               \n\t"
02587             "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
02588             "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
02589             "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
02590             "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
02591             "shll      $16, %%edi                   \n\t"
02592             "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02593             "mov        %1, %%"REG_D"               \n\t"
02594             "shrl       $9, %%esi                   \n\t"
02595             "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
02596 
02597             "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
02598             "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
02599             "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
02600             "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
02601             "shll      $16, %%edi                   \n\t"
02602             "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
02603             "mov        %1, %%"REG_D"               \n\t"
02604             "shrl       $9, %%esi                   \n\t"
02605             "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
02606 
02607             "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
02608             "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
02609             "add        $1, %%"REG_a"               \n\t"
02610             "cmp        %2, %%"REG_a"               \n\t"
02611             " jb        1b                          \n\t"
02612 
02613 /* The "g" operand is needed to support GCC 4.0, but GCC 3.3 miscompiles it and
02614    makes MPlayer crash on IA-32, so it is only used on x86-64 with GCC >= 3.4. */
02615 #if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
02616             :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
02617 #else
02618             :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
02619 #endif
02620             "r" (src2)
02621             : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
02622             );
02623 #if HAVE_MMX2
02624         } //if MMX2 can't be used
02625 #endif
02626 #else
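        /* Portable C fallback: xalpha is the top 7 fraction bits of the 16.16
           position, and xalpha^127 == 127-xalpha, so each output is the blend
           src[xx]*(127-xalpha) + src[xx+1]*xalpha, roughly in the value<<7 range. */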
02627         int i;
02628         unsigned int xpos=0;
02629         for (i=0;i<dstWidth;i++)
02630         {
02631             register unsigned int xx=xpos>>16;
02632             register unsigned int xalpha=(xpos&0xFFFF)>>9;
02633             dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
02634             dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
02635             /* slower
02636             dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
02637             dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
02638             */
02639             xpos+=xInc;
02640         }
02641 #endif /* ARCH_X86 */
02642     }
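    /* Chroma range conversion on the 15-bit (C<<7) samples, keeping the 128<<7
       midpoint fixed: full -> limited scales by ~224/255 (1799/2048),
       limited -> full by ~255/224 (4663/4096), with FFMIN() clamping the input. */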
02643     if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
02644         int i;
02645         //FIXME all pal and rgb srcFormats could do this conversion as well
02646         //FIXME all scalers more complex than bilinear could do half of this transform
02647         if(c->srcRange){
02648             for (i=0; i<dstWidth; i++){
02649                 dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
02650                 dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
02651             }
02652         }else{
02653             for (i=0; i<dstWidth; i++){
02654                 dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
02655                 dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
02656             }
02657         }
02658     }
02659 }
02660 
02661 static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
02662                            int srcSliceH, uint8_t* dst[], int dstStride[]){
02663 
02664     /* load a few things into local vars to make the code more readable and faster */
02665     const int srcW= c->srcW;
02666     const int dstW= c->dstW;
02667     const int dstH= c->dstH;
02668     const int chrDstW= c->chrDstW;
02669     const int chrSrcW= c->chrSrcW;
02670     const int lumXInc= c->lumXInc;
02671     const int chrXInc= c->chrXInc;
02672     const int dstFormat= c->dstFormat;
02673     const int srcFormat= c->srcFormat;
02674     const int flags= c->flags;
02675     const int canMMX2BeUsed= c->canMMX2BeUsed;
02676     int16_t *vLumFilterPos= c->vLumFilterPos;
02677     int16_t *vChrFilterPos= c->vChrFilterPos;
02678     int16_t *hLumFilterPos= c->hLumFilterPos;
02679     int16_t *hChrFilterPos= c->hChrFilterPos;
02680     int16_t *vLumFilter= c->vLumFilter;
02681     int16_t *vChrFilter= c->vChrFilter;
02682     int16_t *hLumFilter= c->hLumFilter;
02683     int16_t *hChrFilter= c->hChrFilter;
02684     int32_t *lumMmxFilter= c->lumMmxFilter;
02685     int32_t *chrMmxFilter= c->chrMmxFilter;
02686     const int vLumFilterSize= c->vLumFilterSize;
02687     const int vChrFilterSize= c->vChrFilterSize;
02688     const int hLumFilterSize= c->hLumFilterSize;
02689     const int hChrFilterSize= c->hChrFilterSize;
02690     int16_t **lumPixBuf= c->lumPixBuf;
02691     int16_t **chrPixBuf= c->chrPixBuf;
02692     const int vLumBufSize= c->vLumBufSize;
02693     const int vChrBufSize= c->vChrBufSize;
02694     uint8_t *funnyYCode= c->funnyYCode;
02695     uint8_t *funnyUVCode= c->funnyUVCode;
02696     uint8_t *formatConvBuffer= c->formatConvBuffer;
02697     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
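    // -((-x) >> n) rounds towards +infinity, i.e. ceil(srcSliceH / 2^chrSrcVSubSample):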
02698     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
02699     int lastDstY;
02700     uint32_t *pal=c->pal_yuv;
02701 
02702     /* vars which will change and which we need to store back in the context */
02703     int dstY= c->dstY;
02704     int lumBufIndex= c->lumBufIndex;
02705     int chrBufIndex= c->chrBufIndex;
02706     int lastInLumBuf= c->lastInLumBuf;
02707     int lastInChrBuf= c->lastInChrBuf;
02708 
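    /* Packed formats carry all components in plane 0; duplicate its pointer and
       stride into planes 1 and 2 so the chroma path below can read from them. */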
02709     if (isPacked(c->srcFormat)){
02710         src[0]=
02711         src[1]=
02712         src[2]= src[0];
02713         srcStride[0]=
02714         srcStride[1]=
02715         srcStride[2]= srcStride[0];
02716     }
02717     srcStride[1]<<= c->vChrDrop;
02718     srcStride[2]<<= c->vChrDrop;
02719 
02720     //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
02721     //       (int)dst[0], (int)dst[1], (int)dst[2]);
02722 
02723 #if 0 //self test FIXME move to a vfilter or something
02724     {
02725     static volatile int i=0;
02726     i++;
02727     if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
02728         selfTest(src, srcStride, c->srcW, c->srcH);
02729     i--;
02730     }
02731 #endif
02732 
02733     //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
02734     //dstStride[0],dstStride[1],dstStride[2]);
02735 
02736     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
02737     {
02738         static int warnedAlready=0; //FIXME move this into the context perhaps
02739         if (flags & SWS_PRINT_INFO && !warnedAlready)
02740         {
02741             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
02742                    "         ->cannot do aligned memory accesses anymore\n");
02743             warnedAlready=1;
02744         }
02745     }
02746 
02747     /* Note: the user might start scaling the picture in the middle, in which
02748        case this block is skipped. That is not really intended, but it works
02749        at the moment, so people might rely on it. */
02750     if (srcSliceY ==0){
02751         lumBufIndex=0;
02752         chrBufIndex=0;
02753         dstY=0;
02754         lastInLumBuf= -1;
02755         lastInChrBuf= -1;
02756     }
02757 
02758     lastDstY= dstY;
02759 
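    /* Main vertical loop: for each output line, horizontally scale any source
       lines that are needed but not yet present in the lumPixBuf/chrPixBuf ring
       buffers, then run the vertical filter (vLumFilterSize/vChrFilterSize taps)
       over those buffered lines to produce the destination line. */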
02760     for (;dstY < dstH; dstY++){
02761         unsigned char *dest =dst[0]+dstStride[0]*dstY;
02762         const int chrDstY= dstY>>c->chrDstVSubSample;
02763         unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
02764         unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
02765 
02766         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
02767         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
02768         const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
02769         const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
02770 
02771         //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
02772         // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
02773         //handle holes (FAST_BILINEAR & weird filters)
02774         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
02775         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
02776         //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
02777         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
02778         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
02779 
02780         // Do we have enough lines in this slice to output the dstY line?
02781         if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
02782         {
02783             //Do horizontal scaling
02784             while(lastInLumBuf < lastLumSrcY)
02785             {
02786                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
02787                 lumBufIndex++;
02788                 //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
02789                 assert(lumBufIndex < 2*vLumBufSize);
02790                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
02791                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
02792                 //printf("%d %d\n", lumBufIndex, vLumBufSize);
02793                 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
02794                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
02795                                 funnyYCode, c->srcFormat, formatConvBuffer,
02796                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
02797                 lastInLumBuf++;
02798             }
02799             while(lastInChrBuf < lastChrSrcY)
02800             {
02801                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
02802                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
02803                 chrBufIndex++;
02804                 assert(chrBufIndex < 2*vChrBufSize);
02805                 assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
02806                 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
02807                 //FIXME replace parameters through context struct (some at least)
02808 
02809                 if (!(isGray(srcFormat) || isGray(dstFormat)))
02810                     RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
02811                                     flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
02812                                     funnyUVCode, c->srcFormat, formatConvBuffer,
02813                                     c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
02814                 lastInChrBuf++;
02815             }
02816             //wrap buf index around to stay inside the ring buffer
02817             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
02818             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
02819         }
02820         else // not enough lines left in this slice -> load the rest in the buffer
02821         {
02822             /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
02823             firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
02824             lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
02825             vChrBufSize, vLumBufSize);*/
02826 
02827             //Do horizontal scaling
02828             while(lastInLumBuf+1 < srcSliceY + srcSliceH)
02829             {
02830                 uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
02831                 lumBufIndex++;
02832                 assert(lumBufIndex < 2*vLumBufSize);
02833                 assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
02834                 assert(lastInLumBuf + 1 - srcSliceY >= 0);
02835                 RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
02836                                 flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
02837                                 funnyYCode, c->srcFormat, formatConvBuffer,
02838                                 c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
02839                 lastInLumBuf++;
02840             }
02841             while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
02842             {
02843                 uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
02844                 uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
02845                 chrBufIndex++;
02846                 assert(chrBufIndex < 2*vChrBufSize);
02847                 assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
02848                 assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
02849 
02850                 if (!(isGray(srcFormat) || isGray(dstFormat)))
02851                     RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
02852                             flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
02853                             funnyUVCode, c->srcFormat, formatConvBuffer,
02854                             c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
02855                 lastInChrBuf++;
02856             }
02857             //wrap buf index around to stay inside the ring buffer
02858             if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
02859             if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
02860             break; //we can't output a dstY line so let's try with the next slice
02861         }
02862 
02863 #if HAVE_MMX
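        /* Per-line dither setup for the MMX output code: the tables are indexed
           by line parity (red on the opposite phase from blue); green uses
           ff_dither8 for 5-bit green (555 formats) and ff_dither4 for 6-bit green. */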
02864         c->blueDither= ff_dither8[dstY&1];
02865         if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
02866             c->greenDither= ff_dither8[dstY&1];
02867         else
02868             c->greenDither= ff_dither4[dstY&1];
02869         c->redDither= ff_dither8[(dstY+1)&1];
02870 #endif
02871         if (dstY < dstH-2)
02872         {
02873             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
02874             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
02875 #if HAVE_MMX
02876             int i;
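        /* Repack the vertical filter for the MMX code: with SWS_ACCURATE_RND,
           pairs of source-line pointers and packed coefficient pairs are stored
           per APCK block; otherwise each line's pointer is split into two 32-bit
           halves and its coefficient duplicated into both 16-bit words. */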
02877         if (flags & SWS_ACCURATE_RND){
02878             int s= APCK_SIZE / 8;
02879             for (i=0; i<vLumFilterSize; i+=2){
02880                 *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
02881                 *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
02882                           lumMmxFilter[s*i+APCK_COEF/4  ]=
02883                           lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
02884                     + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
02885             }
02886             for (i=0; i<vChrFilterSize; i+=2){
02887                 *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
02888                 *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
02889                           chrMmxFilter[s*i+APCK_COEF/4  ]=
02890                           chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
02891                     + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
02892             }
02893         }else{
02894             for (i=0; i<vLumFilterSize; i++)
02895             {
02896                 lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
02897                 lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
02898                 lumMmxFilter[4*i+2]=
02899                 lumMmxFilter[4*i+3]=
02900                     ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
02901             }
02902             for (i=0; i<vChrFilterSize; i++)
02903             {
02904                 chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
02905                 chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
02906                 chrMmxFilter[4*i+2]=
02907                 chrMmxFilter[4*i+3]=
02908                     ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
02909             }
02910         }
02911 #endif
02912             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
02913                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
02914                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
02915                 RENAME(yuv2nv12X)(c,
02916                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
02917                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02918                     dest, uDest, dstW, chrDstW, dstFormat);
02919             }
02920             else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
02921             {
02922                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
02923                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
02924                 if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
02925                 {
02926                     int16_t *lumBuf = lumPixBuf[0];
02927                     int16_t *chrBuf= chrPixBuf[0];
02928                     RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
02929                 }
02930                 else //General YV12
02931                 {
02932                     RENAME(yuv2yuvX)(c,
02933                         vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
02934                         vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02935                         dest, uDest, vDest, dstW, chrDstW);
02936                 }
02937             }
02938             else
02939             {
02940                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
02941                 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
02942                 if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
02943                 {
02944                     int chrAlpha= vChrFilter[2*dstY+1];
02945                     if(flags & SWS_FULL_CHR_H_INT){
02946                         yuv2rgbXinC_full(c, //FIXME write a packed1_full function
02947                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02948                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02949                             dest, dstW, dstY);
02950                     }else{
02951                         RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
02952                             dest, dstW, chrAlpha, dstFormat, flags, dstY);
02953                     }
02954                 }
02955                 else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
02956                 {
02957                     int lumAlpha= vLumFilter[2*dstY+1];
02958                     int chrAlpha= vChrFilter[2*dstY+1];
02959                     lumMmxFilter[2]=
02960                     lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
02961                     chrMmxFilter[2]=
02962                     chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
02963                     if(flags & SWS_FULL_CHR_H_INT){
02964                         yuv2rgbXinC_full(c, //FIXME write a packed2_full function
02965                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02966                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02967                             dest, dstW, dstY);
02968                     }else{
02969                         RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
02970                             dest, dstW, lumAlpha, chrAlpha, dstY);
02971                     }
02972                 }
02973                 else //general RGB
02974                 {
02975                     if(flags & SWS_FULL_CHR_H_INT){
02976                         yuv2rgbXinC_full(c,
02977                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02978                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02979                             dest, dstW, dstY);
02980                     }else{
02981                         RENAME(yuv2packedX)(c,
02982                             vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
02983                             vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02984                             dest, dstW, dstY);
02985                     }
02986                 }
02987             }
02988         }
02989         else // hmm looks like we can't use MMX here without overwriting this array's tail
02990         {
02991             int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
02992             int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
02993             if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
02994                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
02995                 if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
02996                 yuv2nv12XinC(
02997                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
02998                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
02999                     dest, uDest, dstW, chrDstW, dstFormat);
03000             }
03001             else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
03002             {
03003                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
03004                 if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
03005                 yuv2yuvXinC(
03006                     vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
03007                     vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
03008                     dest, uDest, vDest, dstW, chrDstW);
03009             }
03010             else
03011             {
03012                 assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
03013                 assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
03014                 if(flags & SWS_FULL_CHR_H_INT){
03015                     yuv2rgbXinC_full(c,
03016                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
03017                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
03018                         dest, dstW, dstY);
03019                 }else{
03020                     yuv2packedXinC(c,
03021                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
03022                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
03023                         dest, dstW, dstY);
03024                 }
03025             }
03026         }
03027     }
03028 
03029 #if HAVE_MMX
03030     __asm__ volatile(SFENCE:::"memory");
03031     __asm__ volatile(EMMS:::"memory");
03032 #endif
03033     /* store changed local vars back in the context */
03034     c->dstY= dstY;
03035     c->lumBufIndex= lumBufIndex;
03036     c->chrBufIndex= chrBufIndex;
03037     c->lastInLumBuf= lastInLumBuf;
03038     c->lastInChrBuf= lastInChrBuf;
03039 
03040     return dstY - lastDstY;
03041 }
