libavcodec/ppc/imgresample_altivec.c

/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>  /* uint8_t / int16_t / intptr_t; assumed not already pulled in by util_altivec.h */
#include "util_altivec.h"

#define FILTER_BITS   8

typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss;
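/* The union above lets the scalar setup code write individual 16-bit
   lanes (fv[i].s[0]) that the vector code then reads back as a whole
   register (fv[i].v). */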

void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;
    s = src;

    for(i=0;i<4;i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }
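    /*
       Worked example of the scaling above (illustrative numbers, not
       from the original source): a Q8 tap of 64 (i.e. 0.25) becomes
       64 << 7 == 8192 in Q15. For a pixel value of 200, vec_madds adds
       (200 * 8192 + 0x4000) >> 15 == 50 to its accumulator (the
       instruction rounds before shifting), matching the scalar path's
       (200 * 64) >> FILTER_BITS == 50.
    */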

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);

    /*
       When we're resampling, we'd ideally like both our input and our
       output buffers to be 16-byte aligned, so we can do both aligned
       reads and writes. Sadly we can't always have this at the moment, so
       we opt for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    i = (-(intptr_t)dst) & 0xf; /* intptr_t keeps the cast well-defined on 64-bit */
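    /* Example (hypothetical address): if dst == 0x1009, then
       i == (-0x1009) & 0xf == 7, so 7 scalar pixels advance dst to
       0x1010, a 16-byte boundary, and the vector loop can use vec_st. */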
    while(i>0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }

    /* Do our AltiVec resampling on 16 pixels at once. */
    while(dst_width>=16) {
        /* Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3-cycle latency
           of vec_madds. */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
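        /*
           The vec_lvsl/vec_perm pair above is the standard AltiVec
           idiom for an unaligned load: tv[0] and tv[1] are the two
           aligned 16-byte blocks straddling the source address,
           vec_lvsl builds a permute map from its low four bits, and
           vec_perm shifts the 16 wanted bytes into one register.
           vec_mergeh/vec_mergel then interleave zero bytes with the
           pixel bytes, zero-extending the 16 u8 pixels into two
           vectors of eight 16-bit values (safe to treat as signed,
           since pixels never exceed 255). The same pattern repeats
           for the remaining three filter taps below.
        */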

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /* Pack the results into our destination vector,
           and do an aligned write of that back to memory. */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);
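        /* vec_packsu saturates each 16-bit result to the 0..255 range
           while packing, performing the same clamp the scalar paths do
           with their explicit if (sum<0) / (sum>255) checks. */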

        dst+=16;
        s+=16;
        dst_width-=16;
    }

    /* If there are any leftover pixels, resample them
       with the slow scalar method. */
    while(dst_width>0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum<0) sum = 0; else if (sum>255) sum=255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}
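
For reference, a minimal sketch of how this routine might be driven
(hypothetical caller, not part of this file): filter holds four Q8
fixed-point taps that sum to 256 (i.e. 1.0), src points at the first of
four input lines spaced wrap bytes apart, and dst receives dst_width
output pixels.

    #include <stdint.h>

    void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                              int wrap, int16_t *filter);

    /* Vertically resample one output row from four input rows with a
       symmetric 4-tap kernel (32 + 96 + 96 + 32 == 256 == 1.0 in Q8). */
    static void resample_row_example(uint8_t *dst, const uint8_t *src,
                                     int width, int stride)
    {
        int16_t taps[4] = { 32, 96, 96, 32 };
        v_resample16_altivec(dst, width, src, stride, taps);
    }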