Doxygen Source Code Documentation

idct_altivec.c

/*
 * idct_altivec.c
 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifndef __ALTIVEC__

#include "config.h"

#ifdef ARCH_PPC

#include <inttypes.h>

#include "mpeg2.h"
#include "mpeg2_internal.h"
#include "attributes.h"

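/*
 * Note on the table below (reading it against the C version at the end of
 * this file): row 0 packs the scalar coefficients c4, a0, a1, a2, -c4, -a2
 * plus the two bias halfwords (32, 31); rows 1-4 are the per-row prescale
 * vectors multiplied into the block before the first IDCT pass.
 */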
static const int16_t constants[5][8] ATTR_ALIGN(16) = {
    {23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
    {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
    {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
    {21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
    {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
};

/*
 * The asm code is generated with:
 *
 * gcc-2.95 -fvec -D__ALTIVEC__ -O9 -fomit-frame-pointer -mregnames -S
 *      idct_altivec.c
 *
 * awk '{args=""; len=split ($2, arg, ",");
 *      for (i=1; i<=len; i++) { a=arg[i]; if (i<len) a=a",";
 *                               args = args sprintf ("%-6s", a) }
 *      printf ("\t\"\t%-16s%-24s\\n\"\n", $1, args) }' idct_altivec.s |
 * unexpand -a
 *
 * I then do some simple trimming on the function prologs/epilogs.
 */

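/*
 * Computes the 2-D IDCT of the 8x8 coefficient block, stores the result as
 * eight rows of 8 unsigned bytes at dest (rows are stride bytes apart) and
 * clears the block; the asm below is the generated form of the C version of
 * mpeg2_idct_copy_altivec in the __ALTIVEC__ branch at the end of this file.
 */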
void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride)
{
    asm ("                                              \n"
        "#      stwu            %r1,  -128(%r1)         \n"
        "#      mflr            %r0                     \n"
        "#      stw             %r0,  132(%r1)          \n"
        "#      addi            %r0,  %r1,  128         \n"
        "#      bl              _savev25                \n"

        "       addi            %r9,  %r3,  112         \n"
        "       vspltish        %v25, 4                 \n"
        "       vxor            %v13, %v13, %v13        \n"
        "       lis             %r10, constants@ha      \n"
        "       lvx             %v1,  0,    %r9         \n"
        "       la              %r10, constants@l(%r10) \n"
        "       lvx             %v5,  0,    %r3         \n"
        "       addi            %r9,  %r3,  16          \n"
        "       lvx             %v8,  0,    %r10        \n"
        "       addi            %r11, %r10, 32          \n"
        "       lvx             %v12, 0,    %r9         \n"
        "       lvx             %v6,  0,    %r11        \n"
        "       addi            %r8,  %r3,  48          \n"
        "       vslh            %v1,  %v1,  %v25        \n"
        "       addi            %r9,  %r3,  80          \n"
        "       lvx             %v11, 0,    %r8         \n"
        "       vslh            %v5,  %v5,  %v25        \n"
        "       lvx             %v0,  0,    %r9         \n"
        "       addi            %r11, %r10, 64          \n"
        "       vsplth          %v3,  %v8,  2           \n"
        "       lvx             %v7,  0,    %r11        \n"
        "       addi            %r9,  %r3,  96          \n"
        "       vslh            %v12, %v12, %v25        \n"
        "       vmhraddshs      %v27, %v1,  %v6,  %v13  \n"
        "       addi            %r8,  %r3,  32          \n"
        "       vsplth          %v2,  %v8,  5           \n"
        "       lvx             %v1,  0,    %r9         \n"
        "       vslh            %v11, %v11, %v25        \n"
        "       addi            %r3,  %r3,  64          \n"
        "       lvx             %v9,  0,    %r8         \n"
        "       addi            %r9,  %r10, 48          \n"
        "       vslh            %v0,  %v0,  %v25        \n"
        "       lvx             %v4,  0,    %r9         \n"
        "       vmhraddshs      %v31, %v12, %v6,  %v13  \n"
        "       addi            %r10, %r10, 16          \n"
        "       vmhraddshs      %v30, %v0,  %v7,  %v13  \n"
        "       lvx             %v10, 0,    %r3         \n"
        "       vsplth          %v19, %v8,  3           \n"
        "       vmhraddshs      %v15, %v11, %v7,  %v13  \n"
        "       lvx             %v12, 0,    %r10        \n"
        "       vsplth          %v6,  %v8,  4           \n"
        "       vslh            %v1,  %v1,  %v25        \n"
        "       vsplth          %v11, %v8,  1           \n"
        "       li              %r9,  4                 \n"
        "       vslh            %v9,  %v9,  %v25        \n"
        "       vsplth          %v7,  %v8,  0           \n"
        "       vmhraddshs      %v18, %v1,  %v4,  %v13  \n"
        "       vspltw          %v8,  %v8,  3           \n"
        "       vsubshs         %v0,  %v13, %v27        \n"
        "       vmhraddshs      %v1,  %v9,  %v4,  %v13  \n"
        "       vmhraddshs      %v17, %v3,  %v31, %v0   \n"
        "       vmhraddshs      %v4,  %v2,  %v15, %v30  \n"
        "       vslh            %v10, %v10, %v25        \n"
        "       vmhraddshs      %v9,  %v5,  %v12, %v13  \n"
        "       vspltish        %v25, 6                 \n"
        "       vmhraddshs      %v5,  %v10, %v12, %v13  \n"
        "       vmhraddshs      %v28, %v19, %v30, %v15  \n"
        "       vmhraddshs      %v27, %v3,  %v27, %v31  \n"
        "       vsubshs         %v0,  %v13, %v18        \n"
        "       vmhraddshs      %v18, %v11, %v18, %v1   \n"
        "       vaddshs         %v30, %v17, %v4         \n"
        "       vmhraddshs      %v12, %v11, %v1,  %v0   \n"
        "       vsubshs         %v4,  %v17, %v4         \n"
        "       vaddshs         %v10, %v9,  %v5         \n"
        "       vsubshs         %v17, %v27, %v28        \n"
        "       vaddshs         %v27, %v27, %v28        \n"
        "       vsubshs         %v1,  %v9,  %v5         \n"
        "       vaddshs         %v28, %v10, %v18        \n"
        "       vsubshs         %v18, %v10, %v18        \n"
        "       vaddshs         %v10, %v1,  %v12        \n"
        "       vsubshs         %v1,  %v1,  %v12        \n"
        "       vsubshs         %v12, %v17, %v4         \n"
        "       vaddshs         %v4,  %v17, %v4         \n"
        "       vmhraddshs      %v5,  %v7,  %v12, %v1   \n"
        "       vmhraddshs      %v26, %v6,  %v4,  %v10  \n"
        "       vmhraddshs      %v29, %v6,  %v12, %v1   \n"
        "       vmhraddshs      %v14, %v7,  %v4,  %v10  \n"
        "       vsubshs         %v12, %v18, %v30        \n"
        "       vaddshs         %v9,  %v28, %v27        \n"
        "       vaddshs         %v16, %v18, %v30        \n"
        "       vsubshs         %v10, %v28, %v27        \n"
        "       vmrglh          %v31, %v9,  %v12        \n"
        "       vmrglh          %v30, %v5,  %v26        \n"
        "       vmrglh          %v15, %v14, %v29        \n"
        "       vmrghh          %v5,  %v5,  %v26        \n"
        "       vmrglh          %v27, %v16, %v10        \n"
        "       vmrghh          %v9,  %v9,  %v12        \n"
        "       vmrghh          %v18, %v16, %v10        \n"
        "       vmrghh          %v1,  %v14, %v29        \n"
        "       vmrglh          %v14, %v9,  %v5         \n"
        "       vmrglh          %v16, %v31, %v30        \n"
        "       vmrglh          %v10, %v15, %v27        \n"
        "       vmrghh          %v9,  %v9,  %v5         \n"
        "       vmrghh          %v26, %v15, %v27        \n"
        "       vmrglh          %v27, %v16, %v10        \n"
        "       vmrghh          %v12, %v1,  %v18        \n"
        "       vmrglh          %v29, %v1,  %v18        \n"
        "       vsubshs         %v0,  %v13, %v27        \n"
        "       vmrghh          %v5,  %v31, %v30        \n"
        "       vmrglh          %v31, %v9,  %v12        \n"
        "       vmrglh          %v30, %v5,  %v26        \n"
        "       vmrglh          %v15, %v14, %v29        \n"
        "       vmhraddshs      %v17, %v3,  %v31, %v0   \n"
        "       vmrghh          %v18, %v16, %v10        \n"
        "       vmhraddshs      %v27, %v3,  %v27, %v31  \n"
        "       vmhraddshs      %v4,  %v2,  %v15, %v30  \n"
        "       vmrghh          %v1,  %v14, %v29        \n"
        "       vmhraddshs      %v28, %v19, %v30, %v15  \n"
        "       vmrghh          %v0,  %v9,  %v12        \n"
        "       vsubshs         %v13, %v13, %v18        \n"
        "       vmrghh          %v5,  %v5,  %v26        \n"
        "       vmhraddshs      %v18, %v11, %v18, %v1   \n"
        "       vaddshs         %v9,  %v0,  %v8         \n"
        "       vaddshs         %v30, %v17, %v4         \n"
        "       vmhraddshs      %v12, %v11, %v1,  %v13  \n"
        "       vsubshs         %v4,  %v17, %v4         \n"
        "       vaddshs         %v10, %v9,  %v5         \n"
        "       vsubshs         %v17, %v27, %v28        \n"
        "       vaddshs         %v27, %v27, %v28        \n"
        "       vsubshs         %v1,  %v9,  %v5         \n"
        "       vaddshs         %v28, %v10, %v18        \n"
        "       vsubshs         %v18, %v10, %v18        \n"
        "       vaddshs         %v10, %v1,  %v12        \n"
        "       vsubshs         %v1,  %v1,  %v12        \n"
        "       vsubshs         %v12, %v17, %v4         \n"
        "       vaddshs         %v4,  %v17, %v4         \n"
        "       vaddshs         %v9,  %v28, %v27        \n"
        "       vmhraddshs      %v14, %v7,  %v4,  %v10  \n"
        "       vsrah           %v9,  %v9,  %v25        \n"
        "       vmhraddshs      %v5,  %v7,  %v12, %v1   \n"
        "       vpkshus         %v0,  %v9,  %v9         \n"
        "       vmhraddshs      %v29, %v6,  %v12, %v1   \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       vaddshs         %v16, %v18, %v30        \n"
        "       vsrah           %v31, %v14, %v25        \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       vsrah           %v15, %v16, %v25        \n"
        "       vpkshus         %v0,  %v31, %v31        \n"
        "       vsrah           %v1,  %v5,  %v25        \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       vsubshs         %v12, %v18, %v30        \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       vmhraddshs      %v26, %v6,  %v4,  %v10  \n"
        "       vpkshus         %v0,  %v1,  %v1         \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       vsrah           %v5,  %v12, %v25        \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       vsrah           %v30, %v29, %v25        \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       vsubshs         %v10, %v28, %v27        \n"
        "       vpkshus         %v0,  %v15, %v15        \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       vsrah           %v18, %v26, %v25        \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       vsrah           %v27, %v10, %v25        \n"
        "       vpkshus         %v0,  %v5,  %v5         \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       vpkshus         %v0,  %v30, %v30        \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       vpkshus         %v0,  %v18, %v18        \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       stvewx          %v0,  %r9,  %r4         \n"
        "       add             %r4,  %r4,  %r5         \n"
        "       vpkshus         %v0,  %v27, %v27        \n"
        "       stvewx          %v0,  0,    %r4         \n"
        "       stvewx          %v0,  %r9,  %r4         \n"

        "#      addi            %r0,  %r1,  128         \n"
        "#      bl              _restv25                \n"
        "#      lwz             %r0,  132(%r1)          \n"
        "#      mtlr            %r0                     \n"
        "#      la              %r1,  128(%r1)          \n"

        "       vxor            %v1,  %v1,  %v1         \n"
        "       addi            %r9,  %r3,  16          \n"
        "       stvx            %v1,  0,    %r3         \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r3,  32          \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r9,  %r3,  48          \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r3,  -64         \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r9,  %r3,  -48         \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r3,  -32         \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r3,  %r3,  -16         \n"
        "       stvx            %v1,  0,    %r3         \n"
         );
}

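/*
 * Same dataflow as mpeg2_idct_copy_altivec, except that the IDCT result is
 * added to the existing dest pixels with saturation (the lvx / vperm /
 * vaddshs / vpkshus sequences) rather than overwriting them; again this is
 * the generated form of the C version at the end of this file.
 */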
void mpeg2_idct_add_altivec (int last, int16_t * block,
                             uint8_t * dest, int stride)
{
    asm ("                                              \n"
        "#      stwu            %r1,  -192(%r1)         \n"
        "#      mflr            %r0                     \n"
        "#      stw             %r0,  196(%r1)          \n"
        "#      addi            %r0,  %r1,  192         \n"
        "#      bl              _savev21                \n"

        "       addi            %r9,  %r4,  112         \n"
        "       vspltish        %v21, 4                 \n"
        "       vxor            %v1,  %v1,  %v1         \n"
        "       lvx             %v13, 0,    %r9         \n"
        "       lis             %r10, constants@ha      \n"
        "       vspltisw        %v3,  -1                \n"
        "       la              %r10, constants@l(%r10) \n"
        "       lvx             %v5,  0,    %r4         \n"
        "       addi            %r9,  %r4,  16          \n"
        "       lvx             %v8,  0,    %r10        \n"
        "       lvx             %v12, 0,    %r9         \n"
        "       addi            %r11, %r10, 32          \n"
        "       lvx             %v6,  0,    %r11        \n"
        "       addi            %r8,  %r4,  48          \n"
        "       vslh            %v13, %v13, %v21        \n"
        "       addi            %r9,  %r4,  80          \n"
        "       lvx             %v11, 0,    %r8         \n"
        "       vslh            %v5,  %v5,  %v21        \n"
        "       lvx             %v0,  0,    %r9         \n"
        "       addi            %r11, %r10, 64          \n"
        "       vsplth          %v2,  %v8,  2           \n"
        "       lvx             %v7,  0,    %r11        \n"
        "       vslh            %v12, %v12, %v21        \n"
        "       addi            %r9,  %r4,  96          \n"
        "       vmhraddshs      %v24, %v13, %v6,  %v1   \n"
        "       addi            %r8,  %r4,  32          \n"
        "       vsplth          %v17, %v8,  5           \n"
        "       lvx             %v13, 0,    %r9         \n"
        "       vslh            %v11, %v11, %v21        \n"
        "       addi            %r4,  %r4,  64          \n"
        "       lvx             %v10, 0,    %r8         \n"
        "       vslh            %v0,  %v0,  %v21        \n"
        "       addi            %r9,  %r10, 48          \n"
        "       vmhraddshs      %v31, %v12, %v6,  %v1   \n"
        "       lvx             %v4,  0,    %r9         \n"
        "       addi            %r10, %r10, 16          \n"
        "       vmhraddshs      %v26, %v0,  %v7,  %v1   \n"
        "       lvx             %v9,  0,    %r4         \n"
        "       vsplth          %v16, %v8,  3           \n"
        "       vmhraddshs      %v22, %v11, %v7,  %v1   \n"
        "       lvx             %v6,  0,    %r10        \n"
        "       lvsl            %v19, 0,    %r5         \n"
        "       vsubshs         %v12, %v1,  %v24        \n"
        "       lvsl            %v0,  %r6,  %r5         \n"
        "       vsplth          %v11, %v8,  1           \n"
        "       vslh            %v10, %v10, %v21        \n"
        "       vmrghb          %v19, %v3,  %v19        \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vslh            %v13, %v13, %v21        \n"
        "       vmrghb          %v3,  %v3,  %v0         \n"
        "       li              %r9,  4                 \n"
        "       vmhraddshs      %v14, %v2,  %v31, %v12  \n"
        "       vsplth          %v7,  %v8,  0           \n"
        "       vmhraddshs      %v23, %v13, %v4,  %v1   \n"
        "       vsplth          %v18, %v8,  4           \n"
        "       vmhraddshs      %v27, %v10, %v4,  %v1   \n"
        "       vspltw          %v8,  %v8,  3           \n"
        "       vmhraddshs      %v12, %v17, %v22, %v26  \n"
        "       vperm           %v15, %v15, %v1,  %v19  \n"
        "       vslh            %v9,  %v9,  %v21        \n"
        "       vmhraddshs      %v10, %v5,  %v6,  %v1   \n"
        "       vspltish        %v21, 6                 \n"
        "       vmhraddshs      %v30, %v9,  %v6,  %v1   \n"
        "       vmhraddshs      %v26, %v16, %v26, %v22  \n"
        "       vmhraddshs      %v24, %v2,  %v24, %v31  \n"
        "       vmhraddshs      %v31, %v11, %v23, %v27  \n"
        "       vsubshs         %v0,  %v1,  %v23        \n"
        "       vaddshs         %v23, %v14, %v12        \n"
        "       vmhraddshs      %v9,  %v11, %v27, %v0   \n"
        "       vsubshs         %v12, %v14, %v12        \n"
        "       vaddshs         %v6,  %v10, %v30        \n"
        "       vsubshs         %v14, %v24, %v26        \n"
        "       vaddshs         %v24, %v24, %v26        \n"
        "       vsubshs         %v13, %v10, %v30        \n"
        "       vaddshs         %v26, %v6,  %v31        \n"
        "       vsubshs         %v31, %v6,  %v31        \n"
        "       vaddshs         %v6,  %v13, %v9         \n"
        "       vsubshs         %v13, %v13, %v9         \n"
        "       vsubshs         %v9,  %v14, %v12        \n"
        "       vaddshs         %v12, %v14, %v12        \n"
        "       vmhraddshs      %v30, %v7,  %v9,  %v13  \n"
        "       vmhraddshs      %v25, %v18, %v12, %v6   \n"
        "       vmhraddshs      %v28, %v18, %v9,  %v13  \n"
        "       vmhraddshs      %v29, %v7,  %v12, %v6   \n"
        "       vaddshs         %v10, %v26, %v24        \n"
        "       vsubshs         %v5,  %v31, %v23        \n"
        "       vsubshs         %v13, %v26, %v24        \n"
        "       vaddshs         %v4,  %v31, %v23        \n"
        "       vmrglh          %v26, %v30, %v25        \n"
        "       vmrglh          %v31, %v10, %v5         \n"
        "       vmrglh          %v22, %v29, %v28        \n"
        "       vmrghh          %v30, %v30, %v25        \n"
        "       vmrglh          %v24, %v4,  %v13        \n"
        "       vmrghh          %v10, %v10, %v5         \n"
        "       vmrghh          %v23, %v4,  %v13        \n"
        "       vmrghh          %v27, %v29, %v28        \n"
        "       vmrglh          %v29, %v10, %v30        \n"
        "       vmrglh          %v4,  %v31, %v26        \n"
        "       vmrglh          %v13, %v22, %v24        \n"
        "       vmrghh          %v10, %v10, %v30        \n"
        "       vmrghh          %v25, %v22, %v24        \n"
        "       vmrglh          %v24, %v4,  %v13        \n"
        "       vmrghh          %v5,  %v27, %v23        \n"
        "       vmrglh          %v28, %v27, %v23        \n"
        "       vsubshs         %v0,  %v1,  %v24        \n"
        "       vmrghh          %v30, %v31, %v26        \n"
        "       vmrglh          %v31, %v10, %v5         \n"
        "       vmrglh          %v26, %v30, %v25        \n"
        "       vmrglh          %v22, %v29, %v28        \n"
        "       vmhraddshs      %v14, %v2,  %v31, %v0   \n"
        "       vmrghh          %v23, %v4,  %v13        \n"
        "       vmhraddshs      %v24, %v2,  %v24, %v31  \n"
        "       vmhraddshs      %v12, %v17, %v22, %v26  \n"
        "       vmrghh          %v27, %v29, %v28        \n"
        "       vmhraddshs      %v26, %v16, %v26, %v22  \n"
        "       vmrghh          %v0,  %v10, %v5         \n"
        "       vmhraddshs      %v31, %v11, %v23, %v27  \n"
        "       vmrghh          %v30, %v30, %v25        \n"
        "       vsubshs         %v13, %v1,  %v23        \n"
        "       vaddshs         %v10, %v0,  %v8         \n"
        "       vaddshs         %v23, %v14, %v12        \n"
        "       vsubshs         %v12, %v14, %v12        \n"
        "       vaddshs         %v6,  %v10, %v30        \n"
        "       vsubshs         %v14, %v24, %v26        \n"
        "       vmhraddshs      %v9,  %v11, %v27, %v13  \n"
        "       vaddshs         %v24, %v24, %v26        \n"
        "       vaddshs         %v26, %v6,  %v31        \n"
        "       vsubshs         %v13, %v10, %v30        \n"
        "       vaddshs         %v10, %v26, %v24        \n"
        "       vsubshs         %v31, %v6,  %v31        \n"
        "       vaddshs         %v6,  %v13, %v9         \n"
        "       vsrah           %v10, %v10, %v21        \n"
        "       vsubshs         %v13, %v13, %v9         \n"
        "       vaddshs         %v0,  %v15, %v10        \n"
        "       vsubshs         %v9,  %v14, %v12        \n"
        "       vaddshs         %v12, %v14, %v12        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       vaddshs         %v4,  %v31, %v23        \n"
        "       vmhraddshs      %v29, %v7,  %v12, %v6   \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       vsubshs         %v5,  %v31, %v23        \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vmhraddshs      %v30, %v7,  %v9,  %v13  \n"
        "       vsrah           %v22, %v4,  %v21        \n"
        "       vperm           %v15, %v15, %v1,  %v3   \n"
        "       vmhraddshs      %v28, %v18, %v9,  %v13  \n"
        "       vsrah           %v31, %v29, %v21        \n"
        "       vsubshs         %v13, %v26, %v24        \n"
        "       vaddshs         %v0,  %v15, %v31        \n"
        "       vsrah           %v27, %v30, %v21        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       vsrah           %v30, %v5,  %v21        \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       vsrah           %v26, %v28, %v21        \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       vmhraddshs      %v25, %v18, %v12, %v6   \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       vsrah           %v24, %v13, %v21        \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v19  \n"
        "       vsrah           %v23, %v25, %v21        \n"
        "       vaddshs         %v0,  %v15, %v27        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v3   \n"
        "       vaddshs         %v0,  %v15, %v22        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v19  \n"
        "       vaddshs         %v0,  %v15, %v30        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v3   \n"
        "       vaddshs         %v0,  %v15, %v26        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v19  \n"
        "       vaddshs         %v0,  %v15, %v23        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"
        "       add             %r5,  %r5,  %r6         \n"
        "       lvx             %v15, 0,    %r5         \n"
        "       vperm           %v15, %v15, %v1,  %v3   \n"
        "       vaddshs         %v0,  %v15, %v24        \n"
        "       vpkshus         %v15, %v0,  %v0         \n"
        "       stvewx          %v15, 0,    %r5         \n"
        "       stvewx          %v15, %r9,  %r5         \n"

        "#      addi            %r0,  %r1,  192         \n"
        "#      bl              _restv21                \n"
        "#      lwz             %r0,  196(%r1)          \n"
        "#      mtlr            %r0                     \n"
        "#      la              %r1,  192(%r1)          \n"

        "       addi            %r9,  %r4,  16          \n"
        "       stvx            %v1,  0,    %r4         \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r4,  32          \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r9,  %r4,  48          \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r4,  -64         \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r9,  %r4,  -48         \n"
        "       stvx            %v1,  0,    %r9         \n"
        "       addi            %r11, %r4,  -32         \n"
        "       stvx            %v1,  0,    %r11        \n"
        "       addi            %r4,  %r4,  -16         \n"
        "       stvx            %v1,  0,    %r4         \n"
         );
}

void mpeg2_idct_altivec_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    int i, j;

    i = constants[0][0];        /* just pretending - keeps gcc happy */

    /* the altivec idct uses a transposed input, so we patch scan tables */
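    /* e.g. j = 10 (row 1, column 2) becomes (10 >> 3) | ((10 & 7) << 3) = 17
       (row 2, column 1): the two 3-bit halves of the 0..63 index swap places */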
    for (i = 0; i < 64; i++) {
        j = mpeg2_scan_norm[i];
        mpeg2_scan_norm[i] = (j >> 3) | ((j & 7) << 3);
        j = mpeg2_scan_alt[i];
        mpeg2_scan_alt[i] = (j >> 3) | ((j & 7) << 3);
    }
}

#endif  /* ARCH_PPC */

#else   /* __ALTIVEC__ */

#include <string.h>     /* for the memset calls below */

#define vector_s16_t vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int

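/*
 * IDCT_HALF: one 1-D pass of the 8-point IDCT over eight vectors at once
 * (vx0..vx7 in, vy0..vy7 out), organized as a four-stage butterfly network
 * with vec_mradds doing the rounded fixed-point multiply-adds.
 */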
#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1);                     \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);

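/*
 * IDCT: the full 2-D transform. The rows are prescaled and run through
 * IDCT_HALF, transposed with the vec_mergeh/vec_mergel ladder, biased for
 * rounding, run through IDCT_HALF again, and finally shifted right by 6.
 */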
#define IDCT                                                            \
    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                \
    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                \
    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;                  \
    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;                    \
    vector_u16_t shift;                                                 \
                                                                        \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);     \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    IDCT_HALF                                                           \
                                                                        \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);

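/* note: the parenthesised vector literals below are the old gcc-2.95 AltiVec
   syntax this file was generated with; current compilers expect curly braces */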
static const vector_s16_t constants[5] = {
    (vector_s16_t)(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
    (vector_s16_t)(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
    (vector_s16_t)(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
    (vector_s16_t)(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
    (vector_s16_t)(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
};

void mpeg2_idct_copy_altivec (vector_s16_t * const block, unsigned char * dest,
                              const int stride)
{
    vector_u8_t tmp;

    IDCT

#define COPY(dest,src)                                          \
    tmp = vec_packsu (src, src);                                \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);       \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
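
/* vec_packsu duplicates the row into both halves of tmp; the two 4-byte
   vec_ste stores at offsets 0 and 4 then cover the 8 output pixels */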

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)

    memset (block, 0, 64 * sizeof (signed short));
}

void mpeg2_idct_add_altivec (const int last, vector_s16_t * const block,
                             unsigned char * dest, const int stride)
{
    vector_u8_t tmp;
    vector_s16_t tmp2, tmp3;
    vector_u8_t perm0;
    vector_u8_t perm1;
    vector_u8_t p0, p1, p;

    IDCT

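    /* lvsl yields the alignment permute vector for dest; merging it with an
       all-ones vector builds masks whose 0xff entries pull bytes from the
       zero vector in vec_perm, zero-extending the 8 dest bytes to 16 bits */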
    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

#define ADD(dest,src,perm)                                              \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                        \
    tmp = vec_ld (0, dest);                                             \
    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);       \
    tmp3 = vec_adds (tmp2, src);                                        \
    tmp = vec_packsu (tmp3, tmp3);                                      \
    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);               \
    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)      dest += stride;
    ADD (dest, vx1, perm1)      dest += stride;
    ADD (dest, vx2, perm0)      dest += stride;
    ADD (dest, vx3, perm1)      dest += stride;
    ADD (dest, vx4, perm0)      dest += stride;
    ADD (dest, vx5, perm1)      dest += stride;
    ADD (dest, vx6, perm0)      dest += stride;
    ADD (dest, vx7, perm1)

    memset (block, 0, 64 * sizeof (signed short));
}

#endif  /* __ALTIVEC__ */
 
