Doxygen Source Code Documentation

Main Page Alphabetical List Data Structures File List Data Fields Globals Search
idct_alpha.c

00001 /*
00002  * idct_alpha.c
00003  * Copyright (C) 2002 Falk Hueffner <falk@debian.org>
00004  * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
00005  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
00006  *
00007  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
00008  * See http://libmpeg2.sourceforge.net/ for updates.
00009  *
00010  * mpeg2dec is free software; you can redistribute it and/or modify
00011  * it under the terms of the GNU General Public License as published by
00012  * the Free Software Foundation; either version 2 of the License, or
00013  * (at your option) any later version.
00014  *
00015  * mpeg2dec is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU General Public License
00021  * along with this program; if not, write to the Free Software
00022  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00023  */
00024 
00025 #include "config.h"
00026 
00027 #ifdef ARCH_ALPHA
00028 
00029 #include <stdlib.h>
00030 #include <inttypes.h>
00031 
00032 #include "alpha_asm.h"
00033 #include "attributes.h"
00034 
/*
 * Fixed-point IDCT weights: Wk = 2048 * sqrt (2) * cos (k * pi / 16),
 * rounded to the nearest integer.
 */
#define W1 2841 /* k = 1 */
#define W2 2676 /* k = 2 */
#define W3 2408 /* k = 3 */
#define W5 1609 /* k = 5 */
#define W6 1108 /* k = 6 */
#define W7 565  /* k = 7 */

/*
 * Saturation table used by the non-MVI output routines.  CLIP (i) reads
 * entry i + 384, so i may range from -384 up to 639 without leaving the
 * 1024-entry array.  mpeg2_idct_alpha_init () fills the table (only when
 * it is called with a nonzero no_mvi argument).
 */
static uint8_t clip_lut[1024];
#define CLIP(i) ((clip_lut + 384)[(i)])
00044 
/*
 * BUTTERFLY (t0, t1, wa, wb, d0, d1) stores the rotation
 *
 *     t0 = wa * d0 + wb * d1
 *     t1 = wa * d1 - wb * d0
 *
 * It is evaluated with three multiplications instead of four: the product
 * wa * (d0 + d1) is shared by both outputs, and (wb - wa) / (wb + wa) are
 * constant expressions whenever the weights are literals.
 *
 * Every argument is parenthesized in the expansion, so arbitrary
 * expressions may be passed.  d0 and d1 are each expanded twice and must
 * therefore be free of side effects.  t0 and t1 must be assignable.
 */
#define BUTTERFLY(t0,t1,wa,wb,d0,d1)                            \
do {                                                            \
    int_fast32_t butterfly_sum_ = (wa) * ((d0) + (d1));         \
    (t0) = butterfly_sum_ + ((wb) - (wa)) * (d1);               \
    (t1) = butterfly_sum_ - ((wb) + (wa)) * (d0);               \
} while (0)
00059 
00060 static void inline idct_row (int16_t * const block)
00061 {
00062     uint64_t l, r;
00063     int_fast32_t d0, d1, d2, d3;
00064     int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
00065     int_fast32_t t0, t1, t2, t3;
00066 
00067     l = ldq (block);
00068     r = ldq (block + 4);
00069 
00070     /* shortcut */
00071     if (likely (!((l & ~0xffffUL) | r))) {
00072         uint64_t tmp = (uint16_t) (l << 3);
00073         tmp |= tmp << 16;
00074         tmp |= tmp << 32;
00075         ((int32_t *)block)[0] = tmp;
00076         ((int32_t *)block)[1] = tmp;
00077         ((int32_t *)block)[2] = tmp;
00078         ((int32_t *)block)[3] = tmp;
00079         return;
00080     }
00081 
00082     d0 = (sextw (l) << 11) + 128;
00083     d1 = sextw (extwl (l, 2));
00084     d2 = sextw (extwl (l, 4)) << 11;
00085     d3 = sextw (extwl (l, 6));
00086     t0 = d0 + d2;
00087     t1 = d0 - d2;
00088     BUTTERFLY (t2, t3, W6, W2, d3, d1);
00089     a0 = t0 + t2;
00090     a1 = t1 + t3;
00091     a2 = t1 - t3;
00092     a3 = t0 - t2;
00093 
00094     d0 = sextw (r);
00095     d1 = sextw (extwl (r, 2));
00096     d2 = sextw (extwl (r, 4));
00097     d3 = sextw (extwl (r, 6));
00098     BUTTERFLY (t0, t1, W7, W1, d3, d0);
00099     BUTTERFLY (t2, t3, W3, W5, d1, d2);
00100     b0 = t0 + t2;
00101     b3 = t1 + t3;
00102     t0 -= t2;
00103     t1 -= t3;
00104     b1 = ((t0 + t1) * 181) >> 8;
00105     b2 = ((t0 - t1) * 181) >> 8;
00106 
00107     block[0] = (a0 + b0) >> 8;
00108     block[1] = (a1 + b1) >> 8;
00109     block[2] = (a2 + b2) >> 8;
00110     block[3] = (a3 + b3) >> 8;
00111     block[4] = (a3 - b3) >> 8;
00112     block[5] = (a2 - b2) >> 8;
00113     block[6] = (a1 - b1) >> 8;
00114     block[7] = (a0 - b0) >> 8;
00115 }
00116 
00117 static void inline idct_col (int16_t * const block)
00118 {
00119     int_fast32_t d0, d1, d2, d3;
00120     int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
00121     int_fast32_t t0, t1, t2, t3;
00122 
00123     d0 = (block[8*0] << 11) + 65536;
00124     d1 = block[8*1];
00125     d2 = block[8*2] << 11;
00126     d3 = block[8*3];
00127     t0 = d0 + d2;
00128     t1 = d0 - d2;
00129     BUTTERFLY (t2, t3, W6, W2, d3, d1);
00130     a0 = t0 + t2;
00131     a1 = t1 + t3;
00132     a2 = t1 - t3;
00133     a3 = t0 - t2;
00134 
00135     d0 = block[8*4];
00136     d1 = block[8*5];
00137     d2 = block[8*6];
00138     d3 = block[8*7];
00139     BUTTERFLY (t0, t1, W7, W1, d3, d0);
00140     BUTTERFLY (t2, t3, W3, W5, d1, d2);
00141     b0 = t0 + t2;
00142     b3 = t1 + t3;
00143     t0 = (t0 - t2) >> 8;
00144     t1 = (t1 - t3) >> 8;
00145     b1 = (t0 + t1) * 181;
00146     b2 = (t0 - t1) * 181;
00147 
00148     block[8*0] = (a0 + b0) >> 17;
00149     block[8*1] = (a1 + b1) >> 17;
00150     block[8*2] = (a2 + b2) >> 17;
00151     block[8*3] = (a3 + b3) >> 17;
00152     block[8*4] = (a3 - b3) >> 17;
00153     block[8*5] = (a2 - b2) >> 17;
00154     block[8*6] = (a1 - b1) >> 17;
00155     block[8*7] = (a0 - b0) >> 17;
00156 }
00157 
/*
 * Inverse-transform an 8x8 coefficient block, write the result to dest
 * (replacing what was there) and zero the coefficient block afterwards.
 *
 * block:  64 coefficients, transformed in place and then cleared.
 * dest:   top-left output byte; 8 bytes are written per row.
 * stride: distance in bytes between consecutive output rows.
 *
 * Output values are limited to 0..255 per 16-bit lane using maxsw4 () /
 * minsw4 () before pkwb () packs them to bytes (presumably the Alpha MVI
 * operations of the same names -- see alpha_asm.h).
 */
void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, const int stride)
{
    /* 0x00ff00ff00ff00ff: the value 255 in each of the four 16-bit lanes */
    const uint64_t lane_max = zap (-1, 0xaa);
    int row, col;

    for (row = 0; row < 8; row++)
        idct_row (block + 8 * row);

    for (col = 0; col < 8; col++)
        idct_col (block + col);

    for (row = 0; row < 8; row++) {
        uint64_t left, right;

        /* Left four samples of this row. */
        left = ldq (block);
        left = maxsw4 (left, 0);
        left = minsw4 (left, lane_max);
        stl (pkwb (left), dest);

        /* Right four samples of this row. */
        right = ldq (block + 4);
        right = maxsw4 (right, 0);
        right = minsw4 (right, lane_max);
        stl (pkwb (right), dest + 4);

        /* Clear the consumed coefficients. */
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
00190 
/*
 * Inverse-transform an 8x8 coefficient block and add the result to the
 * pixels already in dest, saturating to 0..255.  MVI variant.
 *
 * last:   the value 129 selects the DC-only fast path below (unless the
 *         DC term sits on a rounding tie).  NOTE(review): the meaning of
 *         this value is defined by the caller -- confirm there.
 * block:  64 coefficients; cleared row by row on the full path, only
 *         block[0] and block[63] are cleared on the DC-only path.
 * dest:   top-left pixel of the 8x8 destination area.
 * stride: distance in bytes between consecutive destination rows.
 */
void mpeg2_idct_add_mvi (const int last, int16_t * block,
                         uint8_t * dest, const int stride)
{
    uint64_t lane_max;
    uint64_t sign_bits;
    int k;

    /*
     * DC-only fast path.  (block[0] & 7) == 4 is the case in which
     * (block[0] + 4) >> 3 lands exactly on a rounding tie; it is sent
     * through the full transform instead.
     */
    if (last == 129 && (block[0] & 7) != 4) {
        uint64_t pix[8];
        uint64_t delta;
        int DC;

        DC = (block[0] + 4) >> 3;
        block[0] = block[63] = 0;

        /* All eight rows are loaded before any of them is written back. */
        for (k = 0; k < 8; k++)
            pix[k] = ldq (dest + k * stride);

        if (DC > 0) {
            /* Add min (DC, 255 - pixel) to each byte: saturating add. */
            delta = BYTE_VEC (likely (DC <= 255) ? DC : 255);
            for (k = 0; k < 8; k++)
                pix[k] += minub8 (delta, ~pix[k]);
        } else {
            /* Subtract min (-DC, pixel) from each byte: saturating sub. */
            delta = BYTE_VEC (likely (-DC <= 255) ? -DC : 255);
            for (k = 0; k < 8; k++)
                pix[k] -= minub8 (delta, pix[k]);
        }

        for (k = 0; k < 8; k++)
            stq (pix[k], dest + k * stride);
        return;
    }

    for (k = 0; k < 8; k++)
        idct_row (block + 8 * k);
    for (k = 0; k < 8; k++)
        idct_col (block + k);

    lane_max = zap (-1, 0xaa);          /* 0x00ff00ff00ff00ff */
    sign_bits = zap (-1, 0x33);
    sign_bits ^= sign_bits >> 1;        /* 0x8000800080008000 */

    for (k = 0; k < 8; k++) {
        uint64_t res_lo, res_hi;
        uint64_t sgn_lo, sgn_hi;

        res_lo = ldq (block);
        res_hi = ldq (block + 4);

        /*
         * Four independent 16-bit additions inside one 64-bit register
         * (the equivalent of MMX paddw): strip the sign bit of each lane
         * so a carry cannot spill into the next lane, add, then fold the
         * sign bit back in with xor.
         */
        sgn_lo = res_lo & sign_bits;
        res_lo &= ~sign_bits;
        res_lo += unpkbw (ldl (dest));
        res_lo ^= sgn_lo;
        /* Limit each lane to 0..255. */
        res_lo = maxsw4 (res_lo, 0);
        res_lo = minsw4 (res_lo, lane_max);

        /* Same again for the right four pixels. */
        sgn_hi = res_hi & sign_bits;
        res_hi &= ~sign_bits;
        res_hi += unpkbw (ldl (dest + 4));
        res_hi ^= sgn_hi;
        res_hi = maxsw4 (res_hi, 0);
        res_hi = minsw4 (res_hi, lane_max);

        stl (pkwb (res_lo), dest);
        stl (pkwb (res_hi), dest + 4);

        /* Clear the consumed coefficients. */
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
00290 
/*
 * Inverse-transform an 8x8 coefficient block, write the result to dest
 * (replacing what was there) and zero the coefficient block afterwards.
 * Variant without MVI instructions: saturation goes through the CLIP ()
 * lookup table.
 *
 * block:  64 coefficients, transformed in place and then cleared.
 * dest:   top-left output byte; 8 bytes are written per row.
 * stride: distance in bytes between consecutive output rows.
 */
void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, const int stride)
{
    int row, col;

    for (row = 0; row < 8; row++)
        idct_row (block + 8 * row);
    for (col = 0; col < 8; col++)
        idct_col (block + col);

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            dest[col] = CLIP (block[col]);

        /* Clear the consumed coefficients. */
        stq (0, block);
        stq (0, block + 4);

        dest += stride;
        block += 8;
    }
}
00316 
/*
 * Inverse-transform an 8x8 coefficient block and add the result to the
 * pixels already in dest, saturating to 0..255 through the CLIP () lookup
 * table.  Variant without MVI instructions.
 *
 * last:   the value 129 selects the DC-only fast path below (unless the
 *         DC term sits on a rounding tie).  NOTE(review): the meaning of
 *         this value is defined by the caller -- confirm there.
 * block:  64 coefficients; cleared row by row on the full path, only
 *         block[0] and block[63] are cleared on the DC-only path.
 * dest:   top-left pixel of the 8x8 destination area.
 * stride: distance in bytes between consecutive destination rows.
 */
void mpeg2_idct_add_alpha (const int last, int16_t * block,
                           uint8_t * dest, const int stride)
{
    int row, col;

    /*
     * (block[0] & 7) == 4 is the case in which (block[0] + 4) >> 3 lands
     * exactly on a rounding tie; it is sent through the full transform.
     */
    if (last != 129 || (block[0] & 7) == 4) {
        for (row = 0; row < 8; row++)
            idct_row (block + 8 * row);
        for (col = 0; col < 8; col++)
            idct_col (block + col);

        for (row = 0; row < 8; row++) {
            for (col = 0; col < 8; col++)
                dest[col] = CLIP (block[col] + dest[col]);

            /* Clear the consumed coefficients. */
            stq (0, block);
            stq (0, block + 4);

            dest += stride;
            block += 8;
        }
    } else {
        int DC;

        DC = (block[0] + 4) >> 3;
        block[0] = block[63] = 0;

        /*
         * block[0] is a 16-bit value, so DC can be anywhere in
         * -4096..4095, while CLIP () only covers indices -384..639.
         * Limiting DC to +/-255 keeps DC + pixel inside the table and does
         * not change any result: a pixel plus 255 or more already
         * saturates to 255, and a pixel minus 255 or more to 0.
         */
        if (DC > 255)
            DC = 255;
        else if (DC < -255)
            DC = -255;

        for (row = 0; row < 8; row++) {
            for (col = 0; col < 8; col++)
                dest[col] = CLIP (DC + dest[col]);
            dest += stride;
        }
    }
}
00362 
00363 void mpeg2_idct_alpha_init(int no_mvi)
00364 {
00365     extern uint8_t mpeg2_scan_norm[64];
00366     extern uint8_t mpeg2_scan_alt[64];
00367     int i, j;
00368 
00369     if (no_mvi)
00370         for (i = -384; i < 640; i++)
00371             clip_lut[i + 384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
00372     for (i = 0; i < 64; i++) {
00373         j = mpeg2_scan_norm[i];
00374         mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
00375         j = mpeg2_scan_alt[i];
00376         mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
00377     }
00378 }
00379 
00380 #endif /* ARCH_ALPHA */
AFNI/NIfTI Server

Sections

Personal tools

Navigation

Doxygen Source Code Documentation

idct_alpha.c