Doxygen Source Code Documentation

Main Page Alphabetical List Data Structures File List Data Fields Globals Search
yuv2rgb_mmx.c

00001 /*
00002  * yuv2rgb_mmx.c
00003  * Copyright (C) 2000-2002 Silicon Integrated System Corp.
00004  * All Rights Reserved.
00005  *
00006  * Author: Olie Lho <ollie@sis.com.tw>
00007  *
00008  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
00009  * See http://libmpeg2.sourceforge.net/ for updates.
00010  *
00011  * mpeg2dec is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License as published by
00013  * the Free Software Foundation; either version 2 of the License, or
00014  * (at your option) any later version.
00015  *
00016  * mpeg2dec is distributed in the hope that it will be useful,
00017  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00018  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00019  * GNU General Public License for more details.
00020  *
00021  * You should have received a copy of the GNU General Public License
00022  * along with this program; if not, write to the Free Software
00023  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00024  */
00025 
00026 #include "config.h"
00027 
00028 #ifdef ARCH_X86
00029 
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <inttypes.h>
00033 
00034 #include "convert.h"
00035 #include "convert_internal.h"
00036 #include "attributes.h"
00037 #include "mmx.h"
00038 
#define CPU_MMXEXT 0
#define CPU_MMX 1

/*
 * Store-instruction selection for the two supported CPU flavours.
 *
 * movntq (src, dest) expands to movntq_r2m when the variable `cpu` that is
 * in scope at the expansion site equals CPU_MMXEXT, and to a plain
 * movq_r2m for every other value.  The macro therefore may only be used
 * inside functions that have a `cpu` variable or parameter.
 */

#define movntq(src,dest)                \
do {                                    \
    if (cpu != CPU_MMXEXT)              \
        movq_r2m (src, dest);           \
    else                                \
        movntq_r2m (src, dest);         \
} while (0)
00051 
/*
 * mmx_yuv2rgb: convert 8 horizontally adjacent pixels from YUV to three
 * planes of 8-bit B, R and G values.
 *
 * py points at 8 luma bytes; pu and pv point at 4 chroma bytes each, so one
 * chroma sample is shared by an even/odd pair of pixels.
 *
 * Nothing is returned through C.  The results are left in MMX registers for
 * the mmx_unpack_* helper that is called right afterwards:
 *   mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (one byte per pixel).
 * mm3-mm7 are clobbered as scratch.
 *
 * Fixed-point scheme: every input is shifted left by 3 and then multiplied
 * with pmulhw, which keeps the high 16 bits of the signed 16x16 product.
 * Each coefficient below is therefore the real factor scaled by 2^13
 * (for example 0x253f / 8192 ~= 1.164).
 */
00052 static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
00053 {
00054     static mmx_t mmx_80w = {0x0080008000800080LL};      /* 128 in each 16-bit word: chroma bias */
00055     static mmx_t mmx_U_green = {0xf37df37df37df37dLL};  /* -3203 / 8192 ~= -0.391 */
00056     static mmx_t mmx_U_blue = {0x4093409340934093LL};   /* 16531 / 8192 ~=  2.018 */
00057     static mmx_t mmx_V_red = {0x3312331233123312LL};    /* 13074 / 8192 ~=  1.596 */
00058     static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};  /* -6660 / 8192 ~= -0.813 */
00059     static mmx_t mmx_10w = {0x1010101010101010LL};      /* 16 in each BYTE (used with psubusb), despite the "w" suffix */
00060     static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};    /* keeps the low byte of each word */
00061     static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};  /* 9535 / 8192 ~= 1.164 */
00062 
00063     movd_m2r (*pu, mm0);                /* mm0 = 00 00 00 00 u3 u2 u1 u0 */
00064     movd_m2r (*pv, mm1);                /* mm1 = 00 00 00 00 v3 v2 v1 v0 */
00065     movq_m2r (*py, mm6);                /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
00066     pxor_r2r (mm4, mm4);                /* mm4 = 0 (zero source for the byte->word unpacks below) */
00067     /* XXX might do cache preload for image here */
00068 
00069     /*
00070      * Do the multiply part of the conversion for even and odd pixels
00071      * register usage:
00072      * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
00073      * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
00074      * mm6 -> Y even, mm7 -> Y odd
00075      * (within this part mm3 temporarily holds v * v_green and mm4 is
00076      * still zero; the odd-pixel copies are made in the addition part)
00077      */
00076 
00077     punpcklbw_r2r (mm4, mm0);           /* mm0 = u3 u2 u1 u0 (zero-extended to words) */
00078     punpcklbw_r2r (mm4, mm1);           /* mm1 = v3 v2 v1 v0 (zero-extended to words) */
00079     psubsw_m2r (mmx_80w, mm0);          /* u -= 128 */
00080     psubsw_m2r (mmx_80w, mm1);          /* v -= 128 */
00081     psllw_i2r (3, mm0);                 /* promote precision */
00082     psllw_i2r (3, mm1);                 /* promote precision */
00083     movq_r2r (mm0, mm2);                /* mm2 = u3 u2 u1 u0 */
00084     movq_r2r (mm1, mm3);                /* mm3 = v3 v2 v1 v0 */
00085     pmulhw_m2r (mmx_U_green, mm2);      /* mm2 = u * u_green */
00086     pmulhw_m2r (mmx_V_green, mm3);      /* mm3 = v * v_green */
00087     pmulhw_m2r (mmx_U_blue, mm0);       /* mm0 = chroma_b */
00088     pmulhw_m2r (mmx_V_red, mm1);        /* mm1 = chroma_r */
00089     paddsw_r2r (mm3, mm2);              /* mm2 = chroma_g */
00090 
00091     psubusb_m2r (mmx_10w, mm6);         /* Y -= 16 (unsigned saturating, per byte) */
00092     movq_r2r (mm6, mm7);                /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
00093     pand_m2r (mmx_00ffw, mm6);          /* mm6 =    Y6    Y4    Y2    Y0 */
00094     psrlw_i2r (8, mm7);                 /* mm7 =    Y7    Y5    Y3    Y1 */
00095     psllw_i2r (3, mm6);                 /* promote precision */
00096     psllw_i2r (3, mm7);                 /* promote precision */
00097     pmulhw_m2r (mmx_Y_coeff, mm6);      /* mm6 = luma_rgb even */
00098     pmulhw_m2r (mmx_Y_coeff, mm7);      /* mm7 = luma_rgb odd */
00099 
00100     /*
00101      * Do the addition part of the conversion for even and odd pixels
00102      * register usage:
00103      * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
00104      * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
00105      * mm6 -> Y even, mm7 -> Y odd
00106      */
00107 
00108     movq_r2r (mm0, mm3);                /* mm3 = chroma_b */
00109     movq_r2r (mm1, mm4);                /* mm4 = chroma_r */
00110     movq_r2r (mm2, mm5);                /* mm5 = chroma_g */
00111     paddsw_r2r (mm6, mm0);              /* mm0 = B6 B4 B2 B0 */
00112     paddsw_r2r (mm7, mm3);              /* mm3 = B7 B5 B3 B1 */
00113     paddsw_r2r (mm6, mm1);              /* mm1 = R6 R4 R2 R0 */
00114     paddsw_r2r (mm7, mm4);              /* mm4 = R7 R5 R3 R1 */
00115     paddsw_r2r (mm6, mm2);              /* mm2 = G6 G4 G2 G0 */
00116     paddsw_r2r (mm7, mm5);              /* mm5 = G7 G5 G3 G1 */
00117     packuswb_r2r (mm0, mm0);            /* saturate to 0-255 */
00118     packuswb_r2r (mm1, mm1);            /* saturate to 0-255 */
00119     packuswb_r2r (mm2, mm2);            /* saturate to 0-255 */
00120     packuswb_r2r (mm3, mm3);            /* saturate to 0-255 */
00121     packuswb_r2r (mm4, mm4);            /* saturate to 0-255 */
00122     packuswb_r2r (mm5, mm5);            /* saturate to 0-255 */
00123     punpcklbw_r2r (mm3, mm0);           /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
00124     punpcklbw_r2r (mm4, mm1);           /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
00125     punpcklbw_r2r (mm5, mm2);           /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
00126 }
00127 
/*
 * mmx_unpack_16rgb: pack the 8 pixels left in mm0 (B), mm1 (R) and mm2 (G)
 * by mmx_yuv2rgb into eight 16-bit words laid out as
 * r7..r3 g7..g2 b7..b3 (5-6-5 bits) and store them as two 8-byte writes at
 * image and image+8.
 *
 * `cpu` looks unused but is read by the movntq() macro, which uses it to
 * pick the store instruction.  mm0, mm2, mm4, mm5 and mm7 are clobbered.
 */
00128 static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
00129 {
00130     static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};   /* top 5 bits of each byte */
00131     static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};  /* top 6 bits of each byte */
00132     static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};    /* top 5 bits of each byte */
00133 
00134     /*
00135      * convert RGB plane to RGB 16 bits
00136      * mm0 -> B, mm1 -> R, mm2 -> G
00137      * mm4 -> 0, mm5 -> copy of B, then packed pixels 4-7
00138      * mm7 -> copy of G, used for pixels 4-7
00139      */
00140 
00141     pand_m2r (mmx_bluemask, mm0);       /* mm0 = b7b6b5b4b3______ */
00142     pand_m2r (mmx_greenmask, mm2);      /* mm2 = g7g6g5g4g3g2____ */
00143     pand_m2r (mmx_redmask, mm1);        /* mm1 = r7r6r5r4r3______ */
00144     psrlq_i2r (3, mm0);                 /* mm0 = ______b7b6b5b4b3 (safe as a 64-bit shift: the low 3 bits of every byte were just cleared) */
00145     pxor_r2r (mm4, mm4);                /* mm4 = 0 */
00146     movq_r2r (mm0, mm5);                /* mm5 = ______b7b6b5b4b3 */
00147     movq_r2r (mm2, mm7);                /* mm7 = g7g6g5g4g3g2____ */
00148 
00149     punpcklbw_r2r (mm4, mm2);           /* mm2 = G3 G2 G1 G0 zero-extended to words */
00150     punpcklbw_r2r (mm1, mm0);           /* mm0 words = red in the high byte, shifted blue in the low byte */
00151     psllq_i2r (3, mm2);                 /* move green up to bits 10..5 of each word */
00152     por_r2r (mm2, mm0);                 /* mm0 = four packed 5-6-5 pixels (0-3) */
00153     movntq (mm0, *image);               /* store pixels 0-3 */
00154 
00155     punpckhbw_r2r (mm4, mm7);           /* mm7 = G7 G6 G5 G4 zero-extended to words */
00156     punpckhbw_r2r (mm1, mm5);           /* mm5 words = red high byte, shifted blue low byte */
00157     psllq_i2r (3, mm7);                 /* move green up to bits 10..5 of each word */
00158     por_r2r (mm7, mm5);                 /* mm5 = four packed 5-6-5 pixels (4-7) */
00159     movntq (mm5, *(image+8));           /* store pixels 4-7 */
00160 }
00161 
/*
 * mmx_unpack_32rgb: interleave the 8 pixels left in mm0 (B), mm1 (R) and
 * mm2 (G) by mmx_yuv2rgb into eight 4-byte pixels whose bytes are, in
 * increasing address order, B, G, R, 0, and store them as four 8-byte
 * writes at image, image+8, image+16 and image+24 (two pixels per write).
 *
 * `cpu` looks unused but is read by the movntq() macro, which uses it to
 * pick the store instruction.  mm3-mm7 are clobbered; mm0-mm2 are only read.
 */
00162 static inline void mmx_unpack_32rgb (uint8_t * image, const int cpu)
00163 {
00164     /*
00165      * convert RGB plane to RGB packed format,
00166      * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
00167      * mm4 -> GB, mm5 -> AR pixel 4-7,
00168      * mm6 -> GB, mm7 -> AR pixel 0-3
00169      */
00170 
00171     pxor_r2r (mm3, mm3);                /* mm3 = 0 (becomes the fourth byte of every pixel) */
00172     movq_r2r (mm0, mm6);                /* mm6 = B7..B0 */
00173     movq_r2r (mm1, mm7);                /* mm7 = R7..R0 */
00174     movq_r2r (mm0, mm4);                /* mm4 = B7..B0 (for pixels 4-7) */
00175     movq_r2r (mm1, mm5);                /* mm5 = R7..R0 (for pixels 4-7) */
00176     punpcklbw_r2r (mm2, mm6);           /* mm6 = G3 B3 G2 B2 G1 B1 G0 B0 */
00177     punpcklbw_r2r (mm3, mm7);           /* mm7 = 00 R3 00 R2 00 R1 00 R0 */
00178     punpcklwd_r2r (mm7, mm6);           /* mm6 = 00 R1 G1 B1 00 R0 G0 B0 */
00179     movntq (mm6, *image);               /* store pixels 0-1 */
00180     movq_r2r (mm0, mm6);                /* reload B: mm6 was consumed by the previous unpack */
00181     punpcklbw_r2r (mm2, mm6);           /* mm6 = G3 B3 G2 B2 G1 B1 G0 B0 */
00182     punpckhwd_r2r (mm7, mm6);           /* mm6 = 00 R3 G3 B3 00 R2 G2 B2 */
00183     movntq (mm6, *(image+8));           /* store pixels 2-3 */
00184     punpckhbw_r2r (mm2, mm4);           /* mm4 = G7 B7 G6 B6 G5 B5 G4 B4 */
00185     punpckhbw_r2r (mm3, mm5);           /* mm5 = 00 R7 00 R6 00 R5 00 R4 */
00186     punpcklwd_r2r (mm5, mm4);           /* mm4 = 00 R5 G5 B5 00 R4 G4 B4 */
00187     movntq (mm4, *(image+16));          /* store pixels 4-5 */
00188     movq_r2r (mm0, mm4);                /* reload B: mm4 was consumed by the previous unpack */
00189     punpckhbw_r2r (mm2, mm4);           /* mm4 = G7 B7 G6 B6 G5 B5 G4 B4 */
00190     punpckhwd_r2r (mm5, mm4);           /* mm4 = 00 R7 G7 B7 00 R6 G6 B6 */
00191     movntq (mm4, *(image+24));          /* store pixels 6-7 */
00192 }
00193 
/*
 * Convert a band of planar YUV rows, with one chroma row shared by two luma
 * rows, into packed 16-bit RGB.
 *
 * image       destination of the first output row
 * py, pu, pv  first luma row and first chroma rows of the band
 * width       pixels per row; rows are processed in groups of 8 pixels and
 *             a trailing remainder of fewer than 8 pixels is not converted
 * height      number of luma rows to convert
 * rgb_stride, y_stride, uv_stride
 *             byte distance between the starts of consecutive rows of the
 *             output, luma and chroma planes respectively
 * cpu         CPU_MMXEXT or CPU_MMX, forwarded to mmx_unpack_16rgb
 *
 * Nothing is done when width is below 8 or height is not positive.  The
 * loops below are do/while loops that count down to zero, so without this
 * guard such arguments would run the counters past zero and access memory
 * far outside the buffers.
 */
static inline void yuv420_rgb16 (uint8_t * image,
                                 uint8_t * py, uint8_t * pu, uint8_t * pv,
                                 int width, int height,
                                 int rgb_stride, int y_stride, int uv_stride,
                                 const int cpu)
{
    int i;

    width >>= 3;        /* from here on: number of 8-pixel groups per row */
    if (width <= 0 || height <= 0)
        return;

    /*
     * Turn the strides into "end of this row -> start of next row" deltas.
     * They are derived from the number of groups actually processed (16
     * output bytes, 8 luma bytes and 4 chroma bytes per group), so rows
     * stay aligned even when the pixel width is not a multiple of 8.
     */
    rgb_stride -= 16 * width;
    y_stride -= 8 * width;
    uv_stride -= 4 * width;

    do {
        i = width;
        do {
            mmx_yuv2rgb (py, pu, pv);
            mmx_unpack_16rgb (image, cpu);
            py += 8;
            pu += 4;
            pv += 4;
            image += 16;
        } while (--i);

        py += y_stride;
        image += rgb_stride;
        if (height & 1) {
            /* odd number of rows left: move on to the next chroma row */
            pu += uv_stride;
            pv += uv_stride;
        } else {
            /* even number of rows left: rewind so the next luma row
               reuses the chroma row that was just consumed */
            pu -= 4 * width;
            pv -= 4 * width;
        }
    } while (--height);
}
00229 
/*
 * Convert a band of planar YUV rows, with one chroma row shared by two luma
 * rows, into packed 32-bit pixels (4 bytes per pixel).
 *
 * image       destination of the first output row
 * py, pu, pv  first luma row and first chroma rows of the band
 * width       pixels per row; rows are processed in groups of 8 pixels and
 *             a trailing remainder of fewer than 8 pixels is not converted
 * height      number of luma rows to convert
 * rgb_stride, y_stride, uv_stride
 *             byte distance between the starts of consecutive rows of the
 *             output, luma and chroma planes respectively
 * cpu         CPU_MMXEXT or CPU_MMX, forwarded to mmx_unpack_32rgb
 *
 * Nothing is done when width is below 8 or height is not positive.  The
 * loops below are do/while loops that count down to zero, so without this
 * guard such arguments would run the counters past zero and access memory
 * far outside the buffers.
 */
static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
                                  uint8_t * pu, uint8_t * pv,
                                  int width, int height,
                                  int rgb_stride, int y_stride, int uv_stride,
                                  const int cpu)
{
    int i;

    width >>= 3;        /* from here on: number of 8-pixel groups per row */
    if (width <= 0 || height <= 0)
        return;

    /*
     * Turn the strides into "end of this row -> start of next row" deltas.
     * They are derived from the number of groups actually processed (32
     * output bytes, 8 luma bytes and 4 chroma bytes per group), so rows
     * stay aligned even when the pixel width is not a multiple of 8.
     */
    rgb_stride -= 32 * width;
    y_stride -= 8 * width;
    uv_stride -= 4 * width;

    do {
        i = width;
        do {
            mmx_yuv2rgb (py, pu, pv);
            mmx_unpack_32rgb (image, cpu);
            py += 8;
            pu += 4;
            pv += 4;
            image += 32;
        } while (--i);

        py += y_stride;
        image += rgb_stride;
        if (height & 1) {
            /* odd number of rows left: move on to the next chroma row */
            pu += uv_stride;
            pv += uv_stride;
        } else {
            /* even number of rows left: rewind so the next luma row
               reuses the chroma row that was just consumed */
            pu -= 4 * width;
            pv -= 4 * width;
        }
    } while (--height);
}
00265 
00266 static void mmxext_rgb16 (void * _id, uint8_t * const * src,
00267                           unsigned int v_offset)
00268 {
00269     convert_rgb_t * id = (convert_rgb_t *) _id;
00270 
00271     yuv420_rgb16 (id->rgb_ptr + id->rgb_stride * v_offset,
00272                   src[0], src[1], src[2], id->width, 16,
00273                   id->rgb_stride, id->uv_stride << 1, id->uv_stride,
00274                   CPU_MMXEXT);
00275 }
00276 
00277 static void mmxext_argb32 (void * _id, uint8_t * const * src,
00278                            unsigned int v_offset)
00279 {
00280     convert_rgb_t * id = (convert_rgb_t *) _id;
00281 
00282     yuv420_argb32 (id->rgb_ptr + id->rgb_stride * v_offset,
00283                    src[0], src[1], src[2], id->width, 16,
00284                   id->rgb_stride, id->uv_stride << 1, id->uv_stride,
00285                   CPU_MMXEXT);
00286 }
00287 
00288 static void mmx_rgb16 (void * _id, uint8_t * const * src,
00289                        unsigned int v_offset)
00290 {
00291     convert_rgb_t * id = (convert_rgb_t *) _id;
00292 
00293     yuv420_rgb16 (id->rgb_ptr + id->rgb_stride * v_offset,
00294                   src[0], src[1], src[2], id->width, 16,
00295                   id->rgb_stride, id->uv_stride << 1, id->uv_stride, CPU_MMX);
00296 }
00297 
00298 static void mmx_argb32 (void * _id, uint8_t * const * src,
00299                         unsigned int v_offset)
00300 {
00301     convert_rgb_t * id = (convert_rgb_t *) _id;
00302 
00303     yuv420_argb32 (id->rgb_ptr + id->rgb_stride * v_offset,
00304                    src[0], src[1], src[2], id->width, 16,
00305                   id->rgb_stride, id->uv_stride << 1, id->uv_stride, CPU_MMX);
00306 }
00307 
00308 yuv2rgb_copy * yuv2rgb_init_mmxext (int order, int bpp)
00309 {
00310     if ((order == CONVERT_RGB) && (bpp == 16))
00311         return mmxext_rgb16;
00312     else if ((order == CONVERT_RGB) && (bpp == 32))
00313         return mmxext_argb32;
00314     return NULL;        /* Fallback to C */
00315 }
00316 
00317 yuv2rgb_copy * yuv2rgb_init_mmx (int order, int bpp)
00318 {
00319     if ((order == CONVERT_RGB) && (bpp == 16))
00320         return mmx_rgb16;
00321     else if ((order == CONVERT_RGB) && (bpp == 32))
00322         return mmx_argb32;
00323     return NULL;        /* Fallback to C */
00324 }
00325 #endif
AFNI/NIfTI Server

Sections

Personal tools

Navigation

Doxygen Source Code Documentation

yuv2rgb_mmx.c