Skip to content

AFNI/NIfTI Server

Sections
Personal tools
You are here: Home » AFNI » Documentation

Doxygen Source Code Documentation


Main Page   Alphabetical List   Data Structures   File List   Data Fields   Globals   Search  

motion_comp_mmx.c

Go to the documentation of this file.
00001 /*
00002  * motion_comp_mmx.c
00003  * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
00004  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
00005  *
00006  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
00007  * See http://libmpeg2.sourceforge.net/ for updates.
00008  *
00009  * mpeg2dec is free software; you can redistribute it and/or modify
00010  * it under the terms of the GNU General Public License as published by
00011  * the Free Software Foundation; either version 2 of the License, or
00012  * (at your option) any later version.
00013  *
00014  * mpeg2dec is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  * GNU General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU General Public License
00020  * along with this program; if not, write to the Free Software
00021  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00022  */
00023 
00024 #include "config.h"
00025 
00026 #ifdef ARCH_X86
00027 
00028 #include <inttypes.h>
00029 
00030 #include "mpeg2.h"
00031 #include "mpeg2_internal.h"
00032 #include "attributes.h"
00033 #include "mmx.h"
00034 
/*
 * Values for the "cpu" argument taken by the shared MMXEXT/3DNow! helpers
 * in this file.  The pavg_r2r/pavg_m2r macros compare that argument with
 * CPU_MMXEXT to decide which averaging instruction to emit.
 */
#define CPU_MMXEXT 0
#define CPU_3DNOW 1
00037 
00038 
00039 /* MMX code - needs a rewrite */
00040 
00041 /*
00042  * Motion Compensation frequently needs to average values using the
00043  * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
00044  * to compute this, but it's been left out of classic MMX.
00045  *
00046  * We need to be careful of overflows when doing this computation.
00047  * Rather than unpacking data to 16-bits, which reduces parallelism,
00048  * we use the following formulas:
00049  *
00050  * (x+y)>>1 == (x&y)+((x^y)>>1)
00051  * (x+y+1)>>1 == (x|y)-((x^y)>>1)
00052  */
00053 
00054 /* some rounding constants */
00055 static mmx_t mask1 = {0xfefefefefefefefeLL};
00056 static mmx_t round4 = {0x0002000200020002LL};
00057 
00058 /*
00059  * This code should probably be compiled with loop unrolling
00060  * (i.e., -funroll-loops in gcc) because some of the loops
00061  * use a small static number of iterations. This was written
00062  * with the assumption that the compiler knows best about when
00063  * unrolling will help.
00064  */
00065 
00066 static inline void mmx_zero_reg ()
00067 {
00068     /* load 0 into mm0 */
00069     pxor_r2r (mm0, mm0);
00070 }
00071 
00072 static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
00073                                      const uint8_t * src2)
00074 {
00075     /* *dest = (*src1 + *src2 + 1)/ 2; */
00076 
00077     movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
00078     movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */
00079 
00080     movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
00081     movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */
00082 
00083     pxor_r2r (mm1, mm3);        /* xor src1 and src2 */
00084     pand_m2r (mask1, mm3);      /* mask lower bits */
00085     psrlq_i2r (1, mm3);         /* /2 */
00086     por_r2r (mm2, mm4);         /* or src1 and src2 */
00087     psubb_r2r (mm3, mm4);       /* subtract subresults */
00088     movq_r2m (mm4, *dest);      /* store result in dest */
00089 }
00090 
00091 static inline void mmx_interp_average_2_U8 (uint8_t * dest,
00092                                             const uint8_t * src1,
00093                                             const uint8_t * src2)
00094 {
00095     /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */
00096 
00097     movq_m2r (*dest, mm1);      /* load 8 dest bytes */
00098     movq_r2r (mm1, mm2);        /* copy 8 dest bytes */
00099 
00100     movq_m2r (*src1, mm3);      /* load 8 src1 bytes */
00101     movq_r2r (mm3, mm4);        /* copy 8 src1 bytes */
00102 
00103     movq_m2r (*src2, mm5);      /* load 8 src2 bytes */
00104     movq_r2r (mm5, mm6);        /* copy 8 src2 bytes */
00105 
00106     pxor_r2r (mm3, mm5);        /* xor src1 and src2 */
00107     pand_m2r (mask1, mm5);      /* mask lower bits */
00108     psrlq_i2r (1, mm5);         /* /2 */
00109     por_r2r (mm4, mm6);         /* or src1 and src2 */
00110     psubb_r2r (mm5, mm6);       /* subtract subresults */
00111     movq_r2r (mm6, mm5);        /* copy subresult */
00112 
00113     pxor_r2r (mm1, mm5);        /* xor srcavg and dest */
00114     pand_m2r (mask1, mm5);      /* mask lower bits */
00115     psrlq_i2r (1, mm5);         /* /2 */
00116     por_r2r (mm2, mm6);         /* or srcavg and dest */
00117     psubb_r2r (mm5, mm6);       /* subtract subresults */
00118     movq_r2m (mm6, *dest);      /* store result in dest */
00119 }
00120 
00121 static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
00122                                      const uint8_t * src2,
00123                                      const uint8_t * src3,
00124                                      const uint8_t * src4)
00125 {
00126     /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */
00127 
00128     movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
00129     movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */
00130 
00131     punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
00132     punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */
00133 
00134     movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
00135     movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */
00136 
00137     punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
00138     punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */
00139 
00140     paddw_r2r (mm3, mm1);       /* add lows */
00141     paddw_r2r (mm4, mm2);       /* add highs */
00142 
00143     /* now have partials in mm1 and mm2 */
00144 
00145     movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
00146     movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */
00147 
00148     punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
00149     punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */
00150 
00151     paddw_r2r (mm3, mm1);       /* add lows */
00152     paddw_r2r (mm4, mm2);       /* add highs */
00153 
00154     movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
00155     movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */
00156 
00157     punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
00158     punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */
00159 
00160     paddw_r2r (mm5, mm1);       /* add lows */
00161     paddw_r2r (mm6, mm2);       /* add highs */
00162 
00163     /* now have subtotal in mm1 and mm2 */
00164 
00165     paddw_m2r (round4, mm1);
00166     psraw_i2r (2, mm1);         /* /4 */
00167     paddw_m2r (round4, mm2);
00168     psraw_i2r (2, mm2);         /* /4 */
00169 
00170     packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
00171     movq_r2m (mm1, *dest);      /* store result in dest */
00172 }
00173 
00174 static inline void mmx_interp_average_4_U8 (uint8_t * dest,
00175                                             const uint8_t * src1,
00176                                             const uint8_t * src2,
00177                                             const uint8_t * src3,
00178                                             const uint8_t * src4)
00179 {
00180     /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
00181 
00182     movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
00183     movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */
00184 
00185     punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
00186     punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */
00187 
00188     movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
00189     movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */
00190 
00191     punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
00192     punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */
00193 
00194     paddw_r2r (mm3, mm1);       /* add lows */
00195     paddw_r2r (mm4, mm2);       /* add highs */
00196 
00197     /* now have partials in mm1 and mm2 */
00198 
00199     movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
00200     movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */
00201 
00202     punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
00203     punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */
00204 
00205     paddw_r2r (mm3, mm1);       /* add lows */
00206     paddw_r2r (mm4, mm2);       /* add highs */
00207 
00208     movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
00209     movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */
00210 
00211     punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
00212     punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */
00213 
00214     paddw_r2r (mm5, mm1);       /* add lows */
00215     paddw_r2r (mm6, mm2);       /* add highs */
00216 
00217     paddw_m2r (round4, mm1);
00218     psraw_i2r (2, mm1);         /* /4 */
00219     paddw_m2r (round4, mm2);
00220     psraw_i2r (2, mm2);         /* /4 */
00221 
00222     /* now have subtotal/4 in mm1 and mm2 */
00223 
00224     movq_m2r (*dest, mm3);      /* load 8 dest bytes */
00225     movq_r2r (mm3, mm4);        /* copy 8 dest bytes */
00226 
00227     packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
00228     movq_r2r (mm1,mm2);         /* copy subresult */
00229 
00230     pxor_r2r (mm1, mm3);        /* xor srcavg and dest */
00231     pand_m2r (mask1, mm3);      /* mask lower bits */
00232     psrlq_i2r (1, mm3);         /* /2 */
00233     por_r2r (mm2, mm4);         /* or srcavg and dest */
00234     psubb_r2r (mm3, mm4);       /* subtract subresults */
00235     movq_r2m (mm4, *dest);      /* store result in dest */
00236 }
00237 
00238 /*-----------------------------------------------------------------------*/
00239 
/*
 * Average `ref` into `dest`, row by row, with no interpolation of `ref`.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_avg_mmx(const int width, int height, uint8_t *dest,
                              const uint8_t *ref, const int stride)
{
    mmx_zero_reg();

    for (;;) {
        mmx_average_2_U8(dest, dest, ref);

        if (width == 16) {
            mmx_average_2_U8(dest+8, dest+8, ref+8);
        }

        dest += stride;
        ref += stride;

        if (--height == 0) {
            break;
        }
    }
}
00255 
/* 16-byte-wide entry point: runs MC_avg_mmx with width fixed at 16. */
static void MC_avg_o_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_avg_mmx(16, height, dest, ref, stride);
}
00261 
/* 8-byte-wide entry point: runs MC_avg_mmx with width fixed at 8. */
static void MC_avg_o_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_avg_mmx(8, height, dest, ref, stride);
}
00267 
00268 /*-----------------------------------------------------------------------*/
00269 
00270 static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
00271                                const uint8_t * ref, const int stride)
00272 {
00273     mmx_zero_reg ();
00274 
00275     do {
00276         movq_m2r (* ref, mm1);  /* load 8 ref bytes */
00277         movq_r2m (mm1,* dest);  /* store 8 bytes at curr */
00278 
00279         if (width == 16)
00280             {
00281                 movq_m2r (* (ref+8), mm1);      /* load 8 ref bytes */
00282                 movq_r2m (mm1,* (dest+8));      /* store 8 bytes at curr */
00283             }
00284 
00285         dest += stride;
00286         ref += stride;
00287     } while (--height);
00288 }
00289 
/* 16-byte-wide entry point: runs MC_put_mmx with width fixed at 16. */
static void MC_put_o_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_put_mmx(16, height, dest, ref, stride);
}
00295 
/* 8-byte-wide entry point: runs MC_put_mmx with width fixed at 8. */
static void MC_put_o_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_put_mmx(8, height, dest, ref, stride);
}
00301 
00302 /*-----------------------------------------------------------------------*/
00303 
/*
 * Half-pixel interpolation in the x direction, averaged into `dest`:
 * each output byte blends dest with the rounded average of ref[x] and
 * ref[x+1].
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_avg_x_mmx(const int width, int height, uint8_t *dest,
                                const uint8_t *ref, const int stride)
{
    mmx_zero_reg();

    for (;;) {
        mmx_interp_average_2_U8(dest, ref, ref+1);

        if (width == 16) {
            mmx_interp_average_2_U8(dest+8, ref+8, ref+9);
        }

        dest += stride;
        ref += stride;

        if (--height == 0) {
            break;
        }
    }
}
00320 
/* 16-byte-wide entry point: runs MC_avg_x_mmx with width fixed at 16. */
static void MC_avg_x_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_avg_x_mmx(16, height, dest, ref, stride);
}
00326 
/* 8-byte-wide entry point: runs MC_avg_x_mmx with width fixed at 8. */
static void MC_avg_x_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_avg_x_mmx(8, height, dest, ref, stride);
}
00332 
00333 /*-----------------------------------------------------------------------*/
00334 
/*
 * Write to `dest` the rounded average of ref[x] and ref[x+1], row by row.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_put_x_mmx(const int width, int height, uint8_t *dest,
                                const uint8_t *ref, const int stride)
{
    mmx_zero_reg();

    for (;;) {
        mmx_average_2_U8(dest, ref, ref+1);

        if (width == 16) {
            mmx_average_2_U8(dest+8, ref+8, ref+9);
        }

        dest += stride;
        ref += stride;

        if (--height == 0) {
            break;
        }
    }
}
00350 
/* 16-byte-wide entry point: runs MC_put_x_mmx with width fixed at 16. */
static void MC_put_x_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_put_x_mmx(16, height, dest, ref, stride);
}
00356 
/* 8-byte-wide entry point: runs MC_put_x_mmx with width fixed at 8. */
static void MC_put_x_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_put_x_mmx(8, height, dest, ref, stride);
}
00362 
00363 /*-----------------------------------------------------------------------*/
00364 
/*
 * Blend into `dest` the four-sample rounded average of ref[x], ref[x+1]
 * and the same two positions one row further down, row by row.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_avg_xy_mmx(const int width, int height, uint8_t *dest,
                                 const uint8_t *ref, const int stride)
{
    const uint8_t *below = ref + stride;    /* row following `ref` */

    mmx_zero_reg();     /* the four-sample helper unpacks against mm0 */

    for (;;) {
        mmx_interp_average_4_U8(dest, ref, ref+1, below, below+1);

        if (width == 16) {
            mmx_interp_average_4_U8(dest+8, ref+8, ref+9,
                                    below+8, below+9);
        }

        dest += stride;
        ref += stride;
        below += stride;

        if (--height == 0) {
            break;
        }
    }
}
00384 
/* 16-byte-wide entry point: runs MC_avg_xy_mmx with width fixed at 16. */
static void MC_avg_xy_16_mmx(uint8_t *dest, const uint8_t *ref,
                             int stride, int height)
{
    MC_avg_xy_mmx(16, height, dest, ref, stride);
}
00390 
/* 8-byte-wide entry point: runs MC_avg_xy_mmx with width fixed at 8. */
static void MC_avg_xy_8_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_avg_xy_mmx(8, height, dest, ref, stride);
}
00396 
00397 /*-----------------------------------------------------------------------*/
00398 
/*
 * Write to `dest` the four-sample rounded average of ref[x], ref[x+1]
 * and the same two positions one row further down, row by row.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_put_xy_mmx(const int width, int height, uint8_t *dest,
                                 const uint8_t *ref, const int stride)
{
    const uint8_t *below = ref + stride;    /* row following `ref` */

    mmx_zero_reg();     /* the four-sample helper unpacks against mm0 */

    for (;;) {
        mmx_average_4_U8(dest, ref, ref+1, below, below+1);

        if (width == 16) {
            mmx_average_4_U8(dest+8, ref+8, ref+9, below+8, below+9);
        }

        dest += stride;
        ref += stride;
        below += stride;

        if (--height == 0) {
            break;
        }
    }
}
00417 
/* 16-byte-wide entry point: runs MC_put_xy_mmx with width fixed at 16. */
static void MC_put_xy_16_mmx(uint8_t *dest, const uint8_t *ref,
                             int stride, int height)
{
    MC_put_xy_mmx(16, height, dest, ref, stride);
}
00423 
/* 8-byte-wide entry point: runs MC_put_xy_mmx with width fixed at 8. */
static void MC_put_xy_8_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_put_xy_mmx(8, height, dest, ref, stride);
}
00429 
00430 /*-----------------------------------------------------------------------*/
00431 
/*
 * Blend into `dest` the rounded average of each `ref` byte and the byte
 * one row below it, row by row.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_avg_y_mmx(const int width, int height, uint8_t *dest,
                                const uint8_t *ref, const int stride)
{
    const uint8_t *below = ref + stride;    /* row following `ref` */

    mmx_zero_reg();

    for (;;) {
        mmx_interp_average_2_U8(dest, ref, below);

        if (width == 16) {
            mmx_interp_average_2_U8(dest+8, ref+8, below+8);
        }

        dest += stride;
        ref += stride;
        below += stride;

        if (--height == 0) {
            break;
        }
    }
}
00450 
/* 16-byte-wide entry point: runs MC_avg_y_mmx with width fixed at 16. */
static void MC_avg_y_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_avg_y_mmx(16, height, dest, ref, stride);
}
00456 
/* 8-byte-wide entry point: runs MC_avg_y_mmx with width fixed at 8. */
static void MC_avg_y_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_avg_y_mmx(8, height, dest, ref, stride);
}
00462 
00463 /*-----------------------------------------------------------------------*/
00464 
/*
 * Write to `dest` the rounded average of each `ref` byte and the byte
 * one row below it, row by row.
 *
 * width   8 or 16; the second group of eight bytes is processed only
 *         when width == 16
 * height  number of rows; the loop body always runs at least once
 * stride  byte distance between consecutive rows of both buffers
 */
static inline void MC_put_y_mmx(const int width, int height, uint8_t *dest,
                                const uint8_t *ref, const int stride)
{
    const uint8_t *below = ref + stride;    /* row following `ref` */

    mmx_zero_reg();

    for (;;) {
        mmx_average_2_U8(dest, ref, below);

        if (width == 16) {
            mmx_average_2_U8(dest+8, ref+8, below+8);
        }

        dest += stride;
        ref += stride;
        below += stride;

        if (--height == 0) {
            break;
        }
    }
}
00483 
/* 16-byte-wide entry point: runs MC_put_y_mmx with width fixed at 16. */
static void MC_put_y_16_mmx(uint8_t *dest, const uint8_t *ref,
                            int stride, int height)
{
    MC_put_y_mmx(16, height, dest, ref, stride);
}
00489 
/* 8-byte-wide entry point: runs MC_put_y_mmx with width fixed at 8. */
static void MC_put_y_8_mmx(uint8_t *dest, const uint8_t *ref,
                           int stride, int height)
{
    MC_put_y_mmx(8, height, dest, ref, stride);
}
00495 
00496 
00497 MPEG2_MC_EXTERN (mmx)
00498 
00499 
00500 
00501 
00502 
00503 
00504 
00505 /* CPU_MMXEXT/CPU_3DNOW adaptation layer */
00506 
/*
 * Register-to-register averaging instruction, chosen by the `cpu` variable
 * that must be in scope where the macro is used: pavgb_r2r when cpu is
 * CPU_MMXEXT, pavgusb_r2r for any other value.
 */
#define pavg_r2r(src,dest)              \
do {                                    \
    if (cpu != CPU_MMXEXT)              \
        pavgusb_r2r (src, dest);        \
    else                                \
        pavgb_r2r (src, dest);          \
} while (0)
00514 
/*
 * Memory-to-register averaging instruction, chosen by the `cpu` variable
 * that must be in scope where the macro is used: pavgb_m2r when cpu is
 * CPU_MMXEXT, pavgusb_m2r for any other value.
 */
#define pavg_m2r(src,dest)              \
do {                                    \
    if (cpu != CPU_MMXEXT)              \
        pavgusb_m2r (src, dest);        \
    else                                \
        pavgb_m2r (src, dest);          \
} while (0)
00522 
00523 
00524 /* CPU_MMXEXT code */
00525 
00526 
00527 static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
00528                               const int stride)
00529 {
00530     do {
00531         movq_m2r (*ref, mm0);
00532         movq_r2m (mm0, *dest);
00533         ref += stride;
00534         dest += stride;
00535     } while (--height);
00536 }
00537 
00538 static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
00539                                const int stride)
00540 {
00541     do {
00542         movq_m2r (*ref, mm0);
00543         movq_m2r (*(ref+8), mm1);
00544         ref += stride;
00545         movq_r2m (mm0, *dest);
00546         movq_r2m (mm1, *(dest+8));
00547         dest += stride;
00548     } while (--height);
00549 }
00550 
00551 static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
00552                               const int stride, const int cpu)
00553 {
00554     do {
00555         movq_m2r (*ref, mm0);
00556         pavg_m2r (*dest, mm0);
00557         ref += stride;
00558         movq_r2m (mm0, *dest);
00559         dest += stride;
00560     } while (--height);
00561 }
00562 
00563 static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
00564                                const int stride, const int cpu)
00565 {
00566     do {
00567         movq_m2r (*ref, mm0);
00568         movq_m2r (*(ref+8), mm1);
00569         pavg_m2r (*dest, mm0);
00570         pavg_m2r (*(dest+8), mm1);
00571         movq_r2m (mm0, *dest);
00572         ref += stride;
00573         movq_r2m (mm1, *(dest+8));
00574         dest += stride;
00575     } while (--height);
00576 }
00577 
00578 static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
00579                               const int stride, const int offset,
00580                               const int cpu)
00581 {
00582     do {
00583         movq_m2r (*ref, mm0);
00584         pavg_m2r (*(ref+offset), mm0);
00585         ref += stride;
00586         movq_r2m (mm0, *dest);
00587         dest += stride;
00588     } while (--height);
00589 }
00590 
00591 static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
00592                                const int stride, const int offset,
00593                                const int cpu)
00594 {
00595     do {
00596         movq_m2r (*ref, mm0);
00597         movq_m2r (*(ref+8), mm1);
00598         pavg_m2r (*(ref+offset), mm0);
00599         pavg_m2r (*(ref+offset+8), mm1);
00600         movq_r2m (mm0, *dest);
00601         ref += stride;
00602         movq_r2m (mm1, *(dest+8));
00603         dest += stride;
00604     } while (--height);
00605 }
00606 
00607 static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
00608                               const int stride, const int offset,
00609                               const int cpu)
00610 {
00611     do {
00612         movq_m2r (*ref, mm0);
00613         pavg_m2r (*(ref+offset), mm0);
00614         pavg_m2r (*dest, mm0);
00615         ref += stride;
00616         movq_r2m (mm0, *dest);
00617         dest += stride;
00618     } while (--height);
00619 }
00620 
00621 static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
00622                                const int stride, const int offset,
00623                                const int cpu)
00624 {
00625     do {
00626         movq_m2r (*ref, mm0);
00627         movq_m2r (*(ref+8), mm1);
00628         pavg_m2r (*(ref+offset), mm0);
00629         pavg_m2r (*(ref+offset+8), mm1);
00630         pavg_m2r (*dest, mm0);
00631         pavg_m2r (*(dest+8), mm1);
00632         ref += stride;
00633         movq_r2m (mm0, *dest);
00634         movq_r2m (mm1, *(dest+8));
00635         dest += stride;
00636     } while (--height);
00637 }
00638 
00639 static mmx_t mask_one = {0x0101010101010101LL};
00640 
00641 static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
00642                               const int stride, const int cpu)
00643 {
00644     movq_m2r (*ref, mm0);
00645     movq_m2r (*(ref+1), mm1);
00646     movq_r2r (mm0, mm7);
00647     pxor_r2r (mm1, mm7);
00648     pavg_r2r (mm1, mm0);
00649     ref += stride;
00650 
00651     do {
00652         movq_m2r (*ref, mm2);
00653         movq_r2r (mm0, mm5);
00654 
00655         movq_m2r (*(ref+1), mm3);
00656         movq_r2r (mm2, mm6);
00657 
00658         pxor_r2r (mm3, mm6);
00659         pavg_r2r (mm3, mm2);
00660 
00661         por_r2r (mm6, mm7);
00662         pxor_r2r (mm2, mm5);
00663 
00664         pand_r2r (mm5, mm7);
00665         pavg_r2r (mm2, mm0);
00666 
00667         pand_m2r (mask_one, mm7);
00668 
00669         psubusb_r2r (mm7, mm0);
00670 
00671         ref += stride;
00672         movq_r2m (mm0, *dest);
00673         dest += stride;
00674 
00675         movq_r2r (mm6, mm7);    /* unroll ! */
00676         movq_r2r (mm2, mm0);    /* unroll ! */
00677     } while (--height);
00678 }
00679 
00680 static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
00681                                const int stride, const int cpu)
00682 {
00683     do {
00684         movq_m2r (*ref, mm0);
00685         movq_m2r (*(ref+stride+1), mm1);
00686         movq_r2r (mm0, mm7);
00687         movq_m2r (*(ref+1), mm2);
00688         pxor_r2r (mm1, mm7);
00689         movq_m2r (*(ref+stride), mm3);
00690         movq_r2r (mm2, mm6);
00691         pxor_r2r (mm3, mm6);
00692         pavg_r2r (mm1, mm0);
00693         pavg_r2r (mm3, mm2);
00694         por_r2r (mm6, mm7);
00695         movq_r2r (mm0, mm6);
00696         pxor_r2r (mm2, mm6);
00697         pand_r2r (mm6, mm7);
00698         pand_m2r (mask_one, mm7);
00699         pavg_r2r (mm2, mm0);
00700         psubusb_r2r (mm7, mm0);
00701         movq_r2m (mm0, *dest);
00702 
00703         movq_m2r (*(ref+8), mm0);
00704         movq_m2r (*(ref+stride+9), mm1);
00705         movq_r2r (mm0, mm7);
00706         movq_m2r (*(ref+9), mm2);
00707         pxor_r2r (mm1, mm7);
00708         movq_m2r (*(ref+stride+8), mm3);
00709         movq_r2r (mm2, mm6);
00710         pxor_r2r (mm3, mm6);
00711         pavg_r2r (mm1, mm0);
00712         pavg_r2r (mm3, mm2);
00713         por_r2r (mm6, mm7);
00714         movq_r2r (mm0, mm6);
00715         pxor_r2r (mm2, mm6);
00716         pand_r2r (mm6, mm7);
00717         pand_m2r (mask_one, mm7);
00718         pavg_r2r (mm2, mm0);
00719         psubusb_r2r (mm7, mm0);
00720         ref += stride;
00721         movq_r2m (mm0, *(dest+8));
00722         dest += stride;
00723     } while (--height);
00724 }
00725 
00726 static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
00727                               const int stride, const int cpu)
00728 {
00729     do {
00730         movq_m2r (*ref, mm0);
00731         movq_m2r (*(ref+stride+1), mm1);
00732         movq_r2r (mm0, mm7);
00733         movq_m2r (*(ref+1), mm2);
00734         pxor_r2r (mm1, mm7);
00735         movq_m2r (*(ref+stride), mm3);
00736         movq_r2r (mm2, mm6);
00737         pxor_r2r (mm3, mm6);
00738         pavg_r2r (mm1, mm0);
00739         pavg_r2r (mm3, mm2);
00740         por_r2r (mm6, mm7);
00741         movq_r2r (mm0, mm6);
00742         pxor_r2r (mm2, mm6);
00743         pand_r2r (mm6, mm7);
00744         pand_m2r (mask_one, mm7);
00745         pavg_r2r (mm2, mm0);
00746         psubusb_r2r (mm7, mm0);
00747         movq_m2r (*dest, mm1);
00748         pavg_r2r (mm1, mm0);
00749         ref += stride;
00750         movq_r2m (mm0, *dest);
00751         dest += stride;
00752     } while (--height);
00753 }
00754 
00755 static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
00756                                const int stride, const int cpu)
00757 {
00758     do {
00759         movq_m2r (*ref, mm0);
00760         movq_m2r (*(ref+stride+1), mm1);
00761         movq_r2r (mm0, mm7);
00762         movq_m2r (*(ref+1), mm2);
00763         pxor_r2r (mm1, mm7);
00764         movq_m2r (*(ref+stride), mm3);
00765         movq_r2r (mm2, mm6);
00766         pxor_r2r (mm3, mm6);
00767         pavg_r2r (mm1, mm0);
00768         pavg_r2r (mm3, mm2);
00769         por_r2r (mm6, mm7);
00770         movq_r2r (mm0, mm6);
00771         pxor_r2r (mm2, mm6);
00772         pand_r2r (mm6, mm7);
00773         pand_m2r (mask_one, mm7);
00774         pavg_r2r (mm2, mm0);
00775         psubusb_r2r (mm7, mm0);
00776         movq_m2r (*dest, mm1);
00777         pavg_r2r (mm1, mm0);
00778         movq_r2m (mm0, *dest);
00779 
00780         movq_m2r (*(ref+8), mm0);
00781         movq_m2r (*(ref+stride+9), mm1);
00782         movq_r2r (mm0, mm7);
00783         movq_m2r (*(ref+9), mm2);
00784         pxor_r2r (mm1, mm7);
00785         movq_m2r (*(ref+stride+8), mm3);
00786         movq_r2r (mm2, mm6);
00787         pxor_r2r (mm3, mm6);
00788         pavg_r2r (mm1, mm0);
00789         pavg_r2r (mm3, mm2);
00790         por_r2r (mm6, mm7);
00791         movq_r2r (mm0, mm6);
00792         pxor_r2r (mm2, mm6);
00793         pand_r2r (mm6, mm7);
00794         pand_m2r (mask_one, mm7);
00795         pavg_r2r (mm2, mm0);
00796         psubusb_r2r (mm7, mm0);
00797         movq_m2r (*(dest+8), mm1);
00798         pavg_r2r (mm1, mm0);
00799         ref += stride;
00800         movq_r2m (mm0, *(dest+8));
00801         dest += stride;
00802     } while (--height);
00803 }
00804 
00805 static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
00806                                 int stride, int height)
00807 {
00808     MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
00809 }
00810 
00811 static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
00812                                int stride, int height)
00813 {
00814     MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
00815 }
00816 
/* MMXEXT, 16 bytes wide: copy ref to dest via MC_put1_16. */
static void MC_put_o_16_mmxext(uint8_t *dest, const uint8_t *ref,
                               int stride, int height)
{
    MC_put1_16(height, dest, ref, stride);
}
00822 
/* MMXEXT, 8 bytes wide: copy ref to dest via MC_put1_8. */
static void MC_put_o_8_mmxext(uint8_t *dest, const uint8_t *ref,
                              int stride, int height)
{
    MC_put1_8(height, dest, ref, stride);
}
00828 
00829 static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
00830                                 int stride, int height)
00831 {
00832     MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
00833 }
00834 
00835 static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
00836                                int stride, int height)
00837 {
00838     MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
00839 }
00840 
00841 static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
00842                                 int stride, int height)
00843 {
00844     MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
00845 }
00846 
00847 static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
00848                                int stride, int height)
00849 {
00850     MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
00851 }
00852 
00853 static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
00854                                 int stride, int height)
00855 {
00856     MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
00857 }
00858 
00859 static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
00860                                int stride, int height)
00861 {
00862     MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
00863 }
00864 
00865 static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
00866                                 int stride, int height)
00867 {
00868     MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
00869 }
00870 
00871 static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
00872                                int stride, int height)
00873 {
00874     MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
00875 }
00876 
00877 static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
00878                                  int stride, int height)
00879 {
00880     MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
00881 }
00882 
/*
 * MC_avg_xy_8_mmxext: thin wrapper that forwards to MC_avg4_8 (height
 * passed first) with the CPU_MMXEXT selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_avg_xy_16_mmxext -- confirm against MC_avg4_8.
 */
00883 static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
00884                                 int stride, int height)
00885 {
00886     MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
00887 }
00888 
/*
 * MC_put_xy_16_mmxext: thin wrapper that forwards to MC_put4_16 (height
 * passed first) with the CPU_MMXEXT selector.
 * NOTE(review): helper defined outside this excerpt; presumably writes to
 * dest the 16-wide four-neighbour interpolation of ref -- confirm against
 * MC_put4_16.
 */
00889 static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
00890                                  int stride, int height)
00891 {
00892     MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
00893 }
00894 
/*
 * MC_put_xy_8_mmxext: thin wrapper that forwards to MC_put4_8 (height
 * passed first) with the CPU_MMXEXT selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_put_xy_16_mmxext -- confirm against MC_put4_8.
 */
00895 static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
00896                                 int stride, int height)
00897 {
00898     MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
00899 }
00900 
00901 
/*
 * NOTE(review): MPEG2_MC_EXTERN is a macro defined outside this file
 * (presumably in one of the project headers included at the top).  It is
 * invoked with the suffix "mmxext", so it presumably publishes the
 * MC_*_mmxext functions defined above -- confirm against the macro.
 */
00902 MPEG2_MC_EXTERN (mmxext)
00903 
00904 
00905 
/*
 * MC_avg_o_16_3dnow: thin wrapper that forwards to MC_avg1_16 (height
 * passed first) with the CPU_3DNOW selector; no offset argument is passed.
 * NOTE(review): helper defined outside this excerpt; the "1"/"o" naming
 * suggests the non-interpolated case (ref averaged into dest as is), 16
 * wide -- confirm against MC_avg1_16.
 */
00906 static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
00907                                int stride, int height)
00908 {
00909     MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
00910 }
00911 
/*
 * MC_avg_o_8_3dnow: thin wrapper that forwards to MC_avg1_8 (height passed
 * first) with the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_avg_o_16_3dnow -- confirm against MC_avg1_8.
 */
00912 static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
00913                               int stride, int height)
00914 {
00915     MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
00916 }
00917 
/*
 * MC_put_o_16_3dnow: thin wrapper that forwards to MC_put1_16 (height
 * passed first).  Unlike its siblings, no CPU selector is passed, so the
 * call itself carries nothing 3DNow!-specific.
 * NOTE(review): helper defined outside this excerpt; presumably a plain
 * 16-wide copy of ref into dest -- confirm against MC_put1_16.
 */
00918 static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
00919                                int stride, int height)
00920 {
00921     MC_put1_16 (height, dest, ref, stride);
00922 }
00923 
/*
 * MC_put_o_8_3dnow: thin wrapper that forwards to MC_put1_8 (height passed
 * first).  No CPU selector is passed to the helper.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_put_o_16_3dnow -- confirm against MC_put1_8.
 */
00924 static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
00925                               int stride, int height)
00926 {
00927     MC_put1_8 (height, dest, ref, stride);
00928 }
00929 
/*
 * MC_avg_x_16_3dnow: thin wrapper that forwards to MC_avg2_16 (height
 * passed first) with a fifth argument of 1 and the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 16-wide
 * "average into dest" case interpolating between horizontally adjacent
 * reference samples -- confirm against MC_avg2_16.
 */
00930 static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
00931                                int stride, int height)
00932 {
00933     MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
00934 }
00935 
/*
 * MC_avg_x_8_3dnow: thin wrapper that forwards to MC_avg2_8 (height passed
 * first) with a fifth argument of 1 and the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_avg_x_16_3dnow -- confirm against MC_avg2_8.
 */
00936 static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
00937                               int stride, int height)
00938 {
00939     MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
00940 }
00941 
/*
 * MC_put_x_16_3dnow: thin wrapper that forwards to MC_put2_16 (height
 * passed first) with a fifth argument of 1 and the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably writes to
 * dest the 16-wide interpolation of horizontally adjacent reference
 * samples -- confirm against MC_put2_16.
 */
00942 static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
00943                                int stride, int height)
00944 {
00945     MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
00946 }
00947 
/*
 * MC_put_x_8_3dnow: thin wrapper that forwards to MC_put2_8 (height passed
 * first) with a fifth argument of 1 and the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_put_x_16_3dnow -- confirm against MC_put2_8.
 */
00948 static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
00949                               int stride, int height)
00950 {
00951     MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
00952 }
00953 
/*
 * MC_avg_y_16_3dnow: thin wrapper that forwards to MC_avg2_16 (height
 * passed first), passing stride again as the fifth argument, plus the
 * CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; with the fifth
 * argument equal to stride this is presumably the vertical (row-to-row)
 * interpolation, 16 wide, averaged into dest -- confirm against MC_avg2_16.
 */
00954 static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
00955                                int stride, int height)
00956 {
00957     MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
00958 }
00959 
/*
 * MC_avg_y_8_3dnow: thin wrapper that forwards to MC_avg2_8 (height passed
 * first), passing stride again as the fifth argument, plus CPU_3DNOW.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_avg_y_16_3dnow -- confirm against MC_avg2_8.
 */
00960 static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
00961                               int stride, int height)
00962 {
00963     MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
00964 }
00965 
/*
 * MC_put_y_16_3dnow: thin wrapper that forwards to MC_put2_16 (height
 * passed first), passing stride again as the fifth argument, plus
 * CPU_3DNOW.
 * NOTE(review): helper defined outside this excerpt; presumably writes to
 * dest the 16-wide vertical (row-to-row) interpolation of ref -- confirm
 * against MC_put2_16.
 */
00966 static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
00967                                int stride, int height)
00968 {
00969     MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
00970 }
00971 
/*
 * MC_put_y_8_3dnow: thin wrapper that forwards to MC_put2_8 (height passed
 * first), passing stride again as the fifth argument, plus CPU_3DNOW.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_put_y_16_3dnow -- confirm against MC_put2_8.
 */
00972 static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
00973                               int stride, int height)
00974 {
00975     MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
00976 }
00977 
/*
 * MC_avg_xy_16_3dnow: thin wrapper that forwards to MC_avg4_16 (height
 * passed first) with the CPU_3DNOW selector; no offset argument is passed.
 * NOTE(review): helper defined outside this excerpt; the "4"/"xy" naming
 * suggests a four-neighbour (horizontal + vertical) interpolation, 16 wide,
 * averaged into dest -- confirm against MC_avg4_16.
 */
00978 static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
00979                                 int stride, int height)
00980 {
00981     MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
00982 }
00983 
/*
 * MC_avg_xy_8_3dnow: thin wrapper that forwards to MC_avg4_8 (height passed
 * first) with the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_avg_xy_16_3dnow -- confirm against MC_avg4_8.
 */
00984 static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
00985                                int stride, int height)
00986 {
00987     MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
00988 }
00989 
/*
 * MC_put_xy_16_3dnow: thin wrapper that forwards to MC_put4_16 (height
 * passed first) with the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably writes to
 * dest the 16-wide four-neighbour interpolation of ref -- confirm against
 * MC_put4_16.
 */
00990 static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
00991                                 int stride, int height)
00992 {
00993     MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
00994 }
00995 
/*
 * MC_put_xy_8_3dnow: thin wrapper that forwards to MC_put4_8 (height passed
 * first) with the CPU_3DNOW selector.
 * NOTE(review): helper defined outside this excerpt; presumably the 8-wide
 * counterpart of MC_put_xy_16_3dnow -- confirm against MC_put4_8.
 */
00996 static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
00997                                int stride, int height)
00998 {
00999     MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
01000 }
01001 
01002 
/*
 * NOTE(review): MPEG2_MC_EXTERN is a macro defined outside this file
 * (presumably in one of the project headers included at the top).  It is
 * invoked with the suffix "3dnow", so it presumably publishes the
 * MC_*_3dnow functions defined above -- confirm against the macro.
 */
01003 MPEG2_MC_EXTERN (3dnow)
01004 
01005 #endif /* presumably closes the #ifdef ARCH_X86 near the top of the file -- confirm */
 

Powered by Plone

This site conforms to the following standards: