00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "config.h"
00025
00026 #ifdef ARCH_X86
00027
00028 #include <inttypes.h>
00029
00030 #include "mpeg2.h"
00031 #include "mpeg2_internal.h"
00032 #include "attributes.h"
00033 #include "mmx.h"
00034
/*
 * Fixed-point scaling used by this file: row-pass results are shifted right
 * by ROW_SHIFT bits, column-pass results by COL_SHIFT bits.
 */
#define ROW_SHIFT 11
#define COL_SHIFT 6

/*
 * round (bias): turn a fractional bias into the integer that is added before
 * the row-pass shift, i.e. (bias + 0.5) scaled by 2^ROW_SHIFT and truncated
 * to int.
 * NOTE(review): this macro has the same name as the C99 <math.h> function
 * round(); this file does not include <math.h>.
 */
#define round(bias) ((int) (((bias) + 0.5) * (1 << ROW_SHIFT)))

/*
 * rounder (bias): brace initializer holding the rounded bias twice (one
 * value per 32-bit half of a quadword).
 */
#define rounder(bias) { round (bias), round (bias) }
00040
00041
#if 0   /* scalar row transform, compiled out; kept as a readable reference */

/*
 * Reference C version of the 8-point row transform.
 *
 * row     - coefficient block; the row processed starts at row[offset]
 * offset  - index of the first coefficient of the row inside `row`
 * table   - constants; entries 1..7 are read as C1..C7
 * rounder - bias added to the even-part sums before the final shift
 *
 * The eight inputs are read first, then the row is overwritten in place
 * with the results, each shifted right by ROW_SHIFT.
 */
static inline void idct_row (int16_t * row, int offset,
                             int16_t * table, int32_t * rounder)
{
    int16_t * const r = row + offset;

    const int C1 = table[1];
    const int C2 = table[2];
    const int C3 = table[3];
    const int C4 = table[4];
    const int C5 = table[5];
    const int C6 = table[6];
    const int C7 = table[7];

    const int bias = *rounder;

    const int x0 = r[0], x1 = r[1], x2 = r[2], x3 = r[3];
    const int x4 = r[4], x5 = r[5], x6 = r[6], x7 = r[7];

    /* even part: inputs 0, 2, 4, 6 plus the rounding bias */
    const int a0 = C4*x0 + C2*x2 + C4*x4 + C6*x6 + bias;
    const int a1 = C4*x0 + C6*x2 - C4*x4 - C2*x6 + bias;
    const int a2 = C4*x0 - C6*x2 - C4*x4 + C2*x6 + bias;
    const int a3 = C4*x0 - C2*x2 + C4*x4 - C6*x6 + bias;

    /* odd part: inputs 1, 3, 5, 7 */
    const int b0 = C1*x1 + C3*x3 + C5*x5 + C7*x7;
    const int b1 = C3*x1 - C7*x3 - C1*x5 - C5*x7;
    const int b2 = C5*x1 - C1*x3 + C7*x5 + C3*x7;
    const int b3 = C7*x1 - C5*x3 + C3*x5 - C1*x7;

    /* butterfly: outputs k and 7-k are the sum and difference of a_k, b_k */
    r[0] = (a0 + b0) >> ROW_SHIFT;
    r[7] = (a0 - b0) >> ROW_SHIFT;
    r[1] = (a1 + b1) >> ROW_SHIFT;
    r[6] = (a1 - b1) >> ROW_SHIFT;
    r[2] = (a2 + b2) >> ROW_SHIFT;
    r[5] = (a2 - b2) >> ROW_SHIFT;
    r[3] = (a3 + b3) >> ROW_SHIFT;
    r[4] = (a3 - b3) >> ROW_SHIFT;
}
#endif
00080
00081
00082
00083
/*
 * mmxext_table (c1..c7): brace initializer of 32 int16_t constants used by
 * the MMXEXT row pass (mmxext_row_head, mmxext_row, mmxext_row_mid).
 *
 * Each line below is one quadword (4 words, lowest word first).  The row
 * functions read them at table+0, +4, +8, ... +28 and use them as pmaddwd
 * operands.  The signs and ordering are arranged so that the pairwise
 * products add up to the even sums a0..a3 and odd sums b0..b3 of the scalar
 * reference idct_row() when the row words are in the order loaded and
 * pshufw-swapped by those functions.  Reordering any entry changes the
 * transform.
 */
00084 #define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \
00085 c4, c6, c4, c6, \
00086 c1, c3, -c1, -c5, \
00087 c5, c7, c3, -c7, \
00088 c4, -c6, c4, -c6, \
00089 -c4, c2, c4, -c2, \
00090 c5, -c1, c3, -c1, \
00091 c7, c3, c7, -c5 }
00092
/*
 * mmxext_row_head: start the MMXEXT row pass for the row at row+offset.
 *
 * row    - coefficient block
 * offset - index of the first word of the row to load
 * table  - constants laid out by mmxext_table()
 *
 * Loads the row and issues the first multiply-add; the results stay in MMX
 * registers (mm0, mm2..mm6) for mmxext_row() to continue.  Nothing is
 * written to memory.
 *
 * Register comments list words most-significant first.  xN is input
 * coefficient N of the row in natural order; the code assumes the row is
 * stored as x0 x2 x4 x6 x1 x3 x5 x7, which matches the column permutation
 * that mpeg2_idct_mmx_init() applies to the scan tables.  The comments
 * assume the mmx.h macros take (source, destination) in that order -
 * TODO confirm against mmx.h.
 */
00093 static inline void mmxext_row_head (int16_t * const row, const int offset,
00094 const int16_t * const table)
00095 {
00096 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
00097
00098 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
00099 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
00100
00101 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
00102 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
00103
00104 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
00105 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
00106
00107 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 (dwords swapped) */
00108 }
00109
/*
 * mmxext_row: main body of the MMXEXT row pass.
 *
 * table   - constants laid out by mmxext_table()
 * rounder - two 32-bit bias values added to the even-part sums
 *
 * Expects the register state left by mmxext_row_head() or mmxext_row_mid()
 * (mm0, mm2..mm6) and touches no coefficient memory itself.  On exit:
 *   mm1 = y1 y0 and mm3 = y6 y7, already shifted by ROW_SHIFT;
 *   mm0 = a3+b3 a2+b2 and mm4 = a3-b3 a2-b2, not yet shifted.
 * mmxext_row_tail() or mmxext_row_mid() finishes the shift and stores.
 *
 * Register comments list dwords/words most-significant first.  a0..a3 and
 * b0..b3 are the even and odd partial sums named as in the scalar reference
 * idct_row(); yN is output N of the row.  Comments assume the mmx.h macros
 * take (source, destination) - TODO confirm against mmx.h.
 */
00110 static inline void mmxext_row (const int16_t * const table,
00111 const int32_t * const rounder)
00112 {
00113 movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */
00114 pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
00115
00116 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
00117 pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */
00118
00119 movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */
00120 pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
00121
00122 paddd_m2r (*rounder, mm3); /* mm3 += rounder */
00123 pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
00124
00125 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
00126 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
00127
00128 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
00129 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
00130
00131 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
00132 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
00133
00134 paddd_m2r (*rounder, mm0); /* mm0 += rounder */
00135 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
00136
00137 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
00138 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
00139
00140 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
00141 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
00142
00143 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
00144 movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */
00145
00146 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
00147 psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */
00148 }
00149
/*
 * mmxext_row_tail: finish the MMXEXT row pass and store the last row.
 *
 * row   - coefficient block
 * store - index of the first word of the row to write
 *
 * Expects the register state left by mmxext_row() (mm0, mm1, mm3, mm4).
 * Applies the remaining ROW_SHIFT shifts, packs the 32-bit results to 16
 * bits with signed saturation and writes y0..y7 in natural order to
 * row[store .. store+7].
 *
 * Register comments list words most-significant first and assume the mmx.h
 * macros take (source, destination) - TODO confirm against mmx.h.
 */
00150 static inline void mmxext_row_tail (int16_t * const row, const int store)
00151 {
00152 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
00153
00154 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
00155
00156 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
00157
00158 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
00159
00160 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
00161 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 (words swapped in each dword) */
00162
00163
00164
00165 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
00166 }
00167
/*
 * mmxext_row_mid: store the row just computed and start the next one.
 *
 * row    - coefficient block
 * store  - index of the first word of the finished row (written)
 * offset - index of the first word of the next row (read)
 * table  - mmxext_table() constants for the next row
 *
 * This interleaves the work of mmxext_row_tail (row, store) with
 * mmxext_row_head (row, offset, table): on entry the registers hold the
 * output of mmxext_row(), on exit they hold the state mmxext_row() expects
 * for the next row.  The next row is loaded before the finished row is
 * stored; every use in declare_idct passes different rows for `store` and
 * `offset`.
 *
 * Register comments list words most-significant first and assume the mmx.h
 * macros take (source, destination) - TODO confirm against mmx.h.
 */
00168 static inline void mmxext_row_mid (int16_t * const row, const int store,
00169 const int offset,
00170 const int16_t * const table)
00171 {
00172 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 (next row) */
00173 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
00174
00175 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
00176 psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */
00177
00178 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
00179 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
00180
00181 packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */
00182 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
00183
00184 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
00185 pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */
00186
00187 movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */
00188 movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */
00189
00190 pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
00191
00192 movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */
00193 pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */
00194 }
00195
00196
00197
00198
/*
 * mmx_table (c1..c7): brace initializer of 32 int16_t constants used by the
 * plain-MMX row pass (mmx_row_head, mmx_row, mmx_row_mid).
 *
 * Each line below is one quadword (4 words, lowest word first), read at
 * table+0, +4, ... +28 as a pmaddwd operand.  The layout differs from
 * mmxext_table because the MMX row functions duplicate input pairs with
 * punpckldq/punpckhdq instead of swapping halves with pshufw; the products
 * still add up to the a0..a3 / b0..b3 sums of the scalar reference
 * idct_row().  Reordering any entry changes the transform.
 */
00199 #define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \
00200 c4, c6, -c4, -c2, \
00201 c1, c3, c3, -c7, \
00202 c5, c7, -c1, -c5, \
00203 c4, -c6, c4, -c2, \
00204 -c4, c2, c4, -c6, \
00205 c5, -c1, c7, -c5, \
00206 c7, c3, c3, -c1 }
00207
/*
 * mmx_row_head: start the plain-MMX row pass for the row at row+offset.
 *
 * row    - coefficient block
 * offset - index of the first word of the row to load
 * table  - constants laid out by mmx_table()
 *
 * Loads the row and issues the first multiply-add; the results stay in MMX
 * registers (mm0..mm6) for mmx_row() to continue.  Nothing is written to
 * memory.
 *
 * Register comments list words most-significant first.  xN is input
 * coefficient N of the row in natural order; the code assumes the row is
 * stored as x0 x2 x4 x6 x1 x3 x5 x7, which matches the column permutation
 * that mpeg2_idct_mmx_init() applies to the scan tables.  The comments
 * assume the mmx.h macros take (source, destination) - TODO confirm
 * against mmx.h.
 */
00208 static inline void mmx_row_head (int16_t * const row, const int offset,
00209 const int16_t * const table)
00210 {
00211 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */
00212
00213 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
00214 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
00215
00216 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
00217 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
00218
00219 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
00220
00221 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
00222 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
00223
00224 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
00225 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
00226 }
00227
/*
 * mmx_row: main body of the plain-MMX row pass.
 *
 * table   - constants laid out by mmx_table()
 * rounder - two 32-bit bias values added to the even-part sums
 *
 * Expects the register state left by mmx_row_head() or mmx_row_mid()
 * (mm0..mm6) and touches no coefficient memory itself.  On exit:
 *   mm1 = y1 y0 and mm3 = y6 y7, already shifted by ROW_SHIFT;
 *   mm0 = a3+b3 a2+b2 and mm7 = a3-b3 a2-b2, not yet shifted.
 * mmx_row_tail() or mmx_row_mid() finishes the shift and stores.
 *
 * Register comments list dwords/words most-significant first.  a0..a3 and
 * b0..b3 are the even and odd partial sums named as in the scalar reference
 * idct_row(); yN is output N of the row.  Comments assume the mmx.h macros
 * take (source, destination) - TODO confirm against mmx.h.
 */
00228 static inline void mmx_row (const int16_t * const table,
00229 const int32_t * const rounder)
00230 {
00231 pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
00232 punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */
00233
00234 pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
00235 punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */
00236
00237 movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */
00238 pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
00239
00240 paddd_m2r (*rounder, mm3); /* mm3 += rounder */
00241 pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
00242
00243 pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
00244 paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */
00245
00246 pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
00247 movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */
00248
00249 pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
00250 paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */
00251
00252 paddd_m2r (*rounder, mm0); /* mm0 += rounder */
00253 psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */
00254
00255 psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */
00256 paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */
00257
00258 paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */
00259 psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */
00260
00261 paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */
00262 movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */
00263
00264 paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */
00265 psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */
00266 }
00267
/*
 * mmx_row_tail: finish the plain-MMX row pass and store the last row.
 *
 * row   - coefficient block
 * store - index of the first word of the row to write
 *
 * Expects the register state left by mmx_row() (mm0, mm1, mm3, mm7).
 * Applies the remaining ROW_SHIFT shifts, packs to 16 bits with signed
 * saturation and writes y0..y7 in natural order to row[store .. store+7].
 * The upper half comes out of the pack as y6 y7 y4 y5; the words of each
 * dword are swapped with a shift-left / shift-right / or sequence (the
 * MMXEXT variant uses a single pshufw for this step).
 *
 * Register comments list words most-significant first and assume the mmx.h
 * macros take (source, destination) - TODO confirm against mmx.h.
 */
00268 static inline void mmx_row_tail (int16_t * const row, const int store)
00269 {
00270 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
00271
00272 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
00273
00274 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
00275
00276 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
00277
00278 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
00279 movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */
00280
00281 pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */
00282
00283 psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */
00284
00285 por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */
00286
00287
00288
00289 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
00290 }
00291
/*
 * mmx_row_mid: store the row just computed and start the next one.
 *
 * row    - coefficient block
 * store  - index of the first word of the finished row (written)
 * offset - index of the first word of the next row (read)
 * table  - mmx_table() constants for the next row
 *
 * This interleaves the work of mmx_row_tail (row, store) with
 * mmx_row_head (row, offset, table): on entry the registers hold the
 * output of mmx_row(), on exit they hold the state mmx_row() expects for
 * the next row.  The next row is loaded before the finished row is stored;
 * every use in declare_idct passes different rows for `store` and `offset`.
 *
 * Register comments list words most-significant first and assume the mmx.h
 * macros take (source, destination) - TODO confirm against mmx.h.
 */
00292 static inline void mmx_row_mid (int16_t * const row, const int store,
00293 const int offset, const int16_t * const table)
00294 {
00295 movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 (next row) */
00296 psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */
00297
00298 movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */
00299 psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */
00300
00301 packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */
00302 movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */
00303
00304 packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */
00305 movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */
00306
00307 movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */
00308 movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */
00309
00310 punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */
00311 psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */
00312
00313 movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */
00314 pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */
00315
00316 movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */
00317 por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */
00318
00319 movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */
00320 punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */
00321
00322 movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */
00323 pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
00324 }
00325
00326
#if 0   /* scalar column transform, compiled out; kept as a readable reference */

/* multiply by a 16.16 fixed-point constant and keep the integer part */
#define F(c,x) (((c) * (x)) >> 16)

/* clamp to the int16_t range (saturating arithmetic) */
#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x))

/*
 * Reference C version of the 8-point column transform.
 *
 * col    - coefficient block, 8 values per row
 * offset - column index; the column is col[offset + k*8] for k = 0..7
 *
 * Every intermediate value is saturated to 16 bits with S().  The column
 * is overwritten in place with the results shifted right by COL_SHIFT.
 * T1, T2, T3 and C4 are the constants #defined inside the MMX idct_col()
 * later in this file.
 */
static inline void idct_col (int16_t * col, int offset)
{
    int16_t * const c = col + offset;
    int16_t x[8], y[8];
    int16_t a0, a1, a2, a3, b0, b1, b2, b3;
    int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12;
    int k;

    for (k = 0; k < 8; k++) {
        x[k] = c[k*8];
    }

    /* even part */
    u04 = S (x[0] + x[4]);
    v04 = S (x[0] - x[4]);
    u26 = S (F (T2, x[6]) + x[2]);
    v26 = S (F (T2, x[2]) - x[6]);

    a0 = S (u04 + u26);
    a1 = S (v04 + v26);
    a2 = S (v04 - v26);
    a3 = S (u04 - u26);

    /* odd part */
    u17 = S (F (T1, x[7]) + x[1]);
    v17 = S (F (T1, x[1]) - x[7]);
    u35 = S (F (T3, x[5]) + x[3]);
    v35 = S (F (T3, x[3]) - x[5]);

    b0 = S (u17 + u35);
    b3 = S (v17 - v35);
    u12 = S (u17 - u35);
    v12 = S (v17 + v35);
    u12 = S (2 * F (C4, u12));
    v12 = S (2 * F (C4, v12));
    b1 = S (u12 + v12);
    b2 = S (u12 - v12);

    /* butterfly: outputs k and 7-k are the sum and difference of a_k, b_k */
    y[0] = S (a0 + b0) >> COL_SHIFT;
    y[1] = S (a1 + b1) >> COL_SHIFT;
    y[2] = S (a2 + b2) >> COL_SHIFT;
    y[3] = S (a3 + b3) >> COL_SHIFT;

    y[4] = S (a3 - b3) >> COL_SHIFT;
    y[5] = S (a2 - b2) >> COL_SHIFT;
    y[6] = S (a1 - b1) >> COL_SHIFT;
    y[7] = S (a0 - b0) >> COL_SHIFT;

    for (k = 0; k < 8; k++) {
        c[k*8] = y[k];
    }
}
#endif
00397
00398
00399
/*
 * idct_col: MMX column pass of the inverse DCT for four adjacent columns.
 *
 * col    - coefficient block, 8 int16_t values per row
 * offset - index of the first of the four columns (declare_idct calls this
 *          with 0 and 4)
 *
 * Each quadword access at col+offset+k*8 covers row k of the four columns,
 * so every instruction works on four columns at once.  All adds and
 * subtracts are the saturating 16-bit forms.  Results are shifted right by
 * COL_SHIFT and written back in place.  Rows 3 and 5 of the block are also
 * used as scratch storage for b3 and b0 once their inputs have been
 * consumed.
 *
 * Names in the comments (u17, v26, a0, b3, ...) follow the scalar reference
 * idct_col() kept under #if 0 earlier in this file; xk is input row k and
 * yk is output row k.  Comments assume the mmx.h macros take
 * (source, destination) - TODO confirm against mmx.h.
 *
 * NOTE(review): identifiers that begin with an underscore followed by an
 * upper-case letter (_T1, _T2, _T3, _C4) are reserved for the
 * implementation in C.
 * NOTE(review): the macros T1, T2, T3 and C4 are defined inside the
 * function body but, being preprocessor macros, remain defined for the
 * rest of the file.
 */
00400 static inline void idct_col (int16_t * const col, const int offset)
00401 {
00402 #define T1 13036 /* ~= tan(pi/16) * 65536 */
00403 #define T2 27146 /* ~= tan(2*pi/16) * 65536 */
00404 #define T3 43790 /* ~= tan(3*pi/16) * 65536 */
00405 #define C4 23170 /* ~= cos(pi/4) * 32768, i.e. half of the 16.16 value */
00406
00407 static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
00408 static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
/* NOTE(review): 43790 does not fit a 16-bit short, so the stored value is
 * implementation-defined (T3 - 65536 on the usual two's-complement targets).
 * The code below adds x3 / x5 back after the multiply, which is consistent
 * with the stored value being T3 - 65536 (shown as "T3-1" in 16.16 terms). */
00409 static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
00410 static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
00411
00412
00413
00414
00415 movq_m2r (*_T1, mm0); /* mm0 = T1 */
00416
00417 movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */
00418 movq_r2r (mm0, mm2); /* mm2 = T1 */
00419
00420 movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */
00421 pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */
00422
00423 movq_m2r (*_T3, mm5); /* mm5 = T3-1 */
00424 pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */
00425
00426 movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */
00427 movq_r2r (mm5, mm7); /* mm7 = T3-1 */
00428
00429 movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */
00430 psubsw_r2r (mm4, mm0); /* mm0 = v17 */
00431
00432 movq_m2r (*_T2, mm4); /* mm4 = T2 */
00433 pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */
00434
00435 paddsw_r2r (mm2, mm1); /* mm1 = u17 */
00436 pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */
00437
00438
00439
00440 movq_r2r (mm4, mm2); /* mm2 = T2 */
00441 paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */
00442
00443 pmulhw_m2r (*(col+offset+2*8), mm4); /* mm4 = T2*x2 */
00444 paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */
00445
00446 psubsw_r2r (mm6, mm5); /* mm5 = v35 */
00447 paddsw_r2r (mm3, mm7); /* mm7 = u35 */
00448
00449 movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */
00450 movq_r2r (mm0, mm6); /* mm6 = v17 */
00451
00452 pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */
00453 psubsw_r2r (mm5, mm0); /* mm0 = b3 */
00454
00455 psubsw_r2r (mm3, mm4); /* mm4 = v26 */
00456 paddsw_r2r (mm6, mm5); /* mm5 = v12 */
00457
00458 movq_r2m (mm0, *(col+offset+3*8)); /* park b3 in row 3 (x3 already used) */
00459 movq_r2r (mm1, mm6); /* mm6 = u17 */
00460
00461 paddsw_m2r (*(col+offset+2*8), mm2); /* mm2 = u26 */
00462 paddsw_r2r (mm7, mm6); /* mm6 = b0 */
00463
00464 psubsw_r2r (mm7, mm1); /* mm1 = u12 */
00465 movq_r2r (mm1, mm7); /* mm7 = u12 */
00466
00467 movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */
00468 paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */
00469
00470 movq_m2r (*_C4, mm0); /* mm0 = C4/2 */
00471 psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */
00472
00473 movq_r2m (mm6, *(col+offset+5*8)); /* park b0 in row 5 (x5 already used) */
00474 pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */
00475
00476 movq_r2r (mm4, mm6); /* mm6 = v26 */
00477 pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */
00478
00479 movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */
00480 movq_r2r (mm3, mm0); /* mm0 = x0 */
00481
00482 psubsw_r2r (mm5, mm3); /* mm3 = v04 */
00483 paddsw_r2r (mm5, mm0); /* mm0 = u04 */
00484
00485 paddsw_r2r (mm3, mm4); /* mm4 = a1 */
00486 movq_r2r (mm0, mm5); /* mm5 = u04 */
00487
00488 psubsw_r2r (mm6, mm3); /* mm3 = a2 */
00489 paddsw_r2r (mm2, mm5); /* mm5 = a0 */
00490
00491 paddsw_r2r (mm1, mm1); /* mm1 = b1 */
00492 psubsw_r2r (mm2, mm0); /* mm0 = a3 */
00493
00494 paddsw_r2r (mm7, mm7); /* mm7 = b2 */
00495 movq_r2r (mm3, mm2); /* mm2 = a2 */
00496
00497 movq_r2r (mm4, mm6); /* mm6 = a1 */
00498 paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */
00499
00500 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */
00501 paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */
00502
00503 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */
00504 psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */
00505
00506 movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 (reloaded from row 5) */
00507 psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */
00508
00509 psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */
00510 movq_r2r (mm5, mm7); /* mm7 = a0 */
00511
00512 movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */
00513 psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */
00514
00515 movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */
00516 paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */
00517
00518 movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 (reloaded from row 3) */
00519 psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */
00520
00521 psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */
00522 movq_r2r (mm0, mm3); /* mm3 = a3 */
00523
00524 movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */
00525 psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */
00526
00527 psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */
00528 paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */
00529
00530 movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */
00531 psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */
00532
00533 movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */
00534 psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */
00535
00536 movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */
00537
00538 movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */
00539
00540 movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */
00541 }
00542
00543
00544 static const int32_t rounder0[] ATTR_ALIGN(8) =
00545 rounder ((1 << (COL_SHIFT - 1)) - 0.5);
00546 static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
00547 static const int32_t rounder1[] ATTR_ALIGN(8) =
00548 rounder (1.25683487303);
00549 static const int32_t rounder7[] ATTR_ALIGN(8) =
00550 rounder (-0.25);
00551 static const int32_t rounder2[] ATTR_ALIGN(8) =
00552 rounder (0.60355339059);
00553 static const int32_t rounder6[] ATTR_ALIGN(8) =
00554 rounder (-0.25);
00555 static const int32_t rounder3[] ATTR_ALIGN(8) =
00556 rounder (0.087788325588);
00557 static const int32_t rounder5[] ATTR_ALIGN(8) =
00558 rounder (-0.441341716183);
00559
00560
/*
 * declare_idct: define a static inline function `idct (int16_t * block)`
 * that performs the complete in-place 8x8 inverse DCT.
 *
 * idct          - name of the function to define
 * table         - table-layout macro (mmxext_table or mmx_table)
 * idct_row_head - loads the first row and starts its transform
 * idct_row      - main body of the row transform
 * idct_row_tail - stores the last row
 * idct_row_mid  - stores the finished row and loads the next one
 *
 * The row pass is software-pipelined: each idct_row_mid (block, S, L, t)
 * call stores the result for the row at index S and loads the row at index
 * L.  Rows are handled in the order 0, 4, 1, 7, 2, 6, 3, 5, so that each
 * pair shares one coefficient table (table04, table17, table26, table35)
 * while every row gets its own rounder.  Registers carry state between the
 * calls, so the sequence must not be reordered.
 *
 * The column pass then runs idct_col() twice, on columns 0-3 and 4-7.
 */
00561 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
00562 static inline void idct (int16_t * const block) \
00563 { \
00564 static const int16_t table04[] ATTR_ALIGN(16) = \
00565 table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \
00566 static const int16_t table17[] ATTR_ALIGN(16) = \
00567 table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \
00568 static const int16_t table26[] ATTR_ALIGN(16) = \
00569 table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \
00570 static const int16_t table35[] ATTR_ALIGN(16) = \
00571 table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \
00572 \
00573 idct_row_head (block, 0*8, table04); \
00574 idct_row (table04, rounder0); \
00575 idct_row_mid (block, 0*8, 4*8, table04); \
00576 idct_row (table04, rounder4); \
00577 idct_row_mid (block, 4*8, 1*8, table17); \
00578 idct_row (table17, rounder1); \
00579 idct_row_mid (block, 1*8, 7*8, table17); \
00580 idct_row (table17, rounder7); \
00581 idct_row_mid (block, 7*8, 2*8, table26); \
00582 idct_row (table26, rounder2); \
00583 idct_row_mid (block, 2*8, 6*8, table26); \
00584 idct_row (table26, rounder6); \
00585 idct_row_mid (block, 6*8, 3*8, table35); \
00586 idct_row (table35, rounder3); \
00587 idct_row_mid (block, 3*8, 5*8, table35); \
00588 idct_row (table35, rounder5); \
00589 idct_row_tail (block, 5*8); \
00590 \
00591 idct_col (block, 0); \
00592 idct_col (block, 4); \
00593 }
00594
00595
/*
 * COPY_MMX (offset, r0, r1, r2): one pipelined step of block_copy().
 *
 * Loads the 8 words at block+offset into r0/r1, advances `dest` by `stride`,
 * stores the already packed previous row held in r2 at the new *dest, then
 * packs r0/r1 into 8 unsigned bytes (left in r0 for the next step).
 * Uses `block`, `dest` and `stride` from the expanding scope and modifies
 * `dest`.
 */
00596 #define COPY_MMX(offset,r0,r1,r2) \
00597 do { \
00598 movq_m2r (*(block+offset), r0); \
00599 dest += stride; \
00600 movq_m2r (*(block+offset+4), r1); \
00601 movq_r2m (r2, *dest); \
00602 packuswb_r2r (r1, r0); \
00603 } while (0)
00604
/*
 * block_copy: write an 8x8 block of 16-bit values to an 8-bit picture.
 *
 * block  - 64 int16_t values, 8 per row
 * dest   - address of the first output row
 * stride - distance in bytes between consecutive output rows
 *
 * Each row is packed to 8 bytes with unsigned saturation (packuswb) and
 * stored with one 8-byte write.  Packing of row k overlaps with loading of
 * row k+1, which is why the register roles alternate between the COPY_MMX
 * calls.  `block` is only read.
 */
00605 static inline void block_copy (int16_t * const block, uint8_t * dest,
00606 const int stride)
00607 {
00608 movq_m2r (*(block+0*8), mm0); /* row 0, words 0-3 */
00609 movq_m2r (*(block+0*8+4), mm1); /* row 0, words 4-7 */
00610 movq_m2r (*(block+1*8), mm2); /* row 1, words 0-3 */
00611 packuswb_r2r (mm1, mm0); /* mm0 = row 0 as 8 bytes */
00612 movq_m2r (*(block+1*8+4), mm3); /* row 1, words 4-7 */
00613 movq_r2m (mm0, *dest); /* store row 0 */
00614 packuswb_r2r (mm3, mm2); /* mm2 = row 1 as 8 bytes */
00615 COPY_MMX (2*8, mm0, mm1, mm2); /* store row 1, pack row 2 */
00616 COPY_MMX (3*8, mm2, mm3, mm0); /* store row 2, pack row 3 */
00617 COPY_MMX (4*8, mm0, mm1, mm2); /* store row 3, pack row 4 */
00618 COPY_MMX (5*8, mm2, mm3, mm0); /* store row 4, pack row 5 */
00619 COPY_MMX (6*8, mm0, mm1, mm2); /* store row 5, pack row 6 */
00620 COPY_MMX (7*8, mm2, mm3, mm0); /* store row 6, pack row 7 */
00621 movq_r2m (mm2, *(dest+stride)); /* store row 7 */
00622 }
00623
00624
/*
 * ADD_MMX (offset, r1, r2, r3, r4): one pipelined step of block_add().
 *
 * Loads the picture row two strides below the current `dest` into r1, packs
 * the previous row's sums (r3/r4) to bytes, advances `dest` by `stride` and
 * stores that packed row there.  It then widens the newly loaded bytes to
 * words (interleaving with mm0, which block_add() keeps at zero) and adds
 * the block row at block+offset with signed saturation, leaving the sums
 * in r1/r2.
 * Uses `block`, `dest`, `stride` and mm0 from the expanding scope and
 * modifies `dest`.
 */
00625 #define ADD_MMX(offset,r1,r2,r3,r4) \
00626 do { \
00627 movq_m2r (*(dest+2*stride), r1); \
00628 packuswb_r2r (r4, r3); \
00629 movq_r2r (r1, r2); \
00630 dest += stride; \
00631 movq_r2m (r3, *dest); \
00632 punpcklbw_r2r (mm0, r1); \
00633 paddsw_m2r (*(block+offset), r1); \
00634 punpckhbw_r2r (mm0, r2); \
00635 paddsw_m2r (*(block+offset+4), r2); \
00636 } while (0)
00637
/*
 * block_add: add an 8x8 block of 16-bit values to an 8-bit picture.
 *
 * block  - 64 int16_t values, 8 per row
 * dest   - address of the first picture row (read and written)
 * stride - distance in bytes between consecutive picture rows
 *
 * For each row the 8 picture bytes are zero-extended to words, the block
 * row is added with signed saturation, and the sum is packed back to bytes
 * with unsigned saturation.  Loads, adds and stores of neighbouring rows
 * are interleaved.  `block` is only read.
 */
00638 static inline void block_add (int16_t * const block, uint8_t * dest,
00639 const int stride)
00640 {
00641 movq_m2r (*dest, mm1); /* picture row 0 */
00642 pxor_r2r (mm0, mm0); /* mm0 = 0, used to widen bytes to words */
00643 movq_m2r (*(dest+stride), mm3); /* picture row 1 */
00644 movq_r2r (mm1, mm2);
00645 punpcklbw_r2r (mm0, mm1); /* row 0, pixels 0-3 as words */
00646 movq_r2r (mm3, mm4);
00647 paddsw_m2r (*(block+0*8), mm1);
00648 punpckhbw_r2r (mm0, mm2); /* row 0, pixels 4-7 as words */
00649 paddsw_m2r (*(block+0*8+4), mm2);
00650 punpcklbw_r2r (mm0, mm3); /* row 1, pixels 0-3 as words */
00651 paddsw_m2r (*(block+1*8), mm3);
00652 packuswb_r2r (mm2, mm1); /* row 0 result as 8 bytes */
00653 punpckhbw_r2r (mm0, mm4); /* row 1, pixels 4-7 as words */
00654 movq_r2m (mm1, *dest); /* store row 0 */
00655 paddsw_m2r (*(block+1*8+4), mm4);
00656 ADD_MMX (2*8, mm1, mm2, mm3, mm4); /* store row 1, add row 2 */
00657 ADD_MMX (3*8, mm3, mm4, mm1, mm2); /* store row 2, add row 3 */
00658 ADD_MMX (4*8, mm1, mm2, mm3, mm4); /* store row 3, add row 4 */
00659 ADD_MMX (5*8, mm3, mm4, mm1, mm2); /* store row 4, add row 5 */
00660 ADD_MMX (6*8, mm1, mm2, mm3, mm4); /* store row 5, add row 6 */
00661 ADD_MMX (7*8, mm3, mm4, mm1, mm2); /* store row 6, add row 7 */
00662 packuswb_r2r (mm4, mm3); /* row 7 result as 8 bytes */
00663 movq_r2m (mm3, *(dest+stride)); /* store row 7 */
00664 }
00665
00666
00667 static inline void block_zero (int16_t * const block)
00668 {
00669 pxor_r2r (mm0, mm0);
00670 movq_r2m (mm0, *(block+0*4));
00671 movq_r2m (mm0, *(block+1*4));
00672 movq_r2m (mm0, *(block+2*4));
00673 movq_r2m (mm0, *(block+3*4));
00674 movq_r2m (mm0, *(block+4*4));
00675 movq_r2m (mm0, *(block+5*4));
00676 movq_r2m (mm0, *(block+6*4));
00677 movq_r2m (mm0, *(block+7*4));
00678 movq_r2m (mm0, *(block+8*4));
00679 movq_r2m (mm0, *(block+9*4));
00680 movq_r2m (mm0, *(block+10*4));
00681 movq_r2m (mm0, *(block+11*4));
00682 movq_r2m (mm0, *(block+12*4));
00683 movq_r2m (mm0, *(block+13*4));
00684 movq_r2m (mm0, *(block+14*4));
00685 movq_r2m (mm0, *(block+15*4));
00686 }
00687
00688
/* Instruction-set selectors passed as the `cpu` argument of block_add_DC() */
#define CPU_MMXEXT 0
#define CPU_MMX 1

/*
 * dup4 (reg): replicate the low 16-bit word of `reg` into all four words.
 *
 * Reads a variable named `cpu` from the expanding scope: CPU_MMXEXT selects
 * a single pshufw, any other value selects the two-instruction unpack
 * sequence.
 */
#define dup4(reg)                       \
do {                                    \
    if (cpu == CPU_MMXEXT)              \
        pshufw_r2r (reg, reg, 0x00);    \
    else {                              \
        punpcklwd_r2r (reg, reg);       \
        punpckldq_r2r (reg, reg);       \
    }                                   \
} while (0)
00700
/*
 * block_add_DC: add one constant value to every pixel of an 8x8 area.
 *
 * block  - coefficient block; only block[0] is read.  block[0] and
 *          block[63] are set to zero (why block[63] needs clearing is not
 *          visible in this file - TODO confirm with the callers).
 * dest   - address of the first picture row (read and written)
 * stride - distance in bytes between consecutive picture rows
 * cpu    - CPU_MMXEXT or CPU_MMX; selects the dup4() implementation
 *
 * The value added is (block[0] + 4) >> 3.  Because the picture holds
 * unsigned bytes, the value is split into a non-negative part (mm0) and the
 * magnitude of a negative part (mm1); one of the two is always zero.  Each
 * row is then computed as saturating add of mm0 followed by saturating
 * subtract of mm1.  Rows are processed two at a time with loads, arithmetic
 * and stores interleaved.
 */
00701 static inline void block_add_DC (int16_t * const block, uint8_t * dest,
00702 const int stride, const int cpu)
00703 {
00704 movd_v2r ((block[0] + 4) >> 3, mm0); /* mm0 = DC value */
00705 pxor_r2r (mm1, mm1); /* mm1 = 0 */
00706 movq_m2r (*dest, mm2); /* mm2 = row 0 */
00707 dup4 (mm0); /* DC in all four words */
00708 psubsw_r2r (mm0, mm1); /* mm1 = -DC in all four words */
00709 packuswb_r2r (mm0, mm0); /* mm0 = 8 bytes of DC clamped to 0..255 */
00710 paddusb_r2r (mm0, mm2);
00711 packuswb_r2r (mm1, mm1); /* mm1 = 8 bytes of -DC clamped to 0..255 */
00712 movq_m2r (*(dest + stride), mm3); /* mm3 = row 1 */
00713 psubusb_r2r (mm1, mm2);
00714 block[0] = 0;
00715 paddusb_r2r (mm0, mm3);
00716 movq_r2m (mm2, *dest); /* store row 0 */
00717 psubusb_r2r (mm1, mm3);
00718 movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 2 */
00719 dest += stride;
00720 movq_r2m (mm3, *dest); /* store row 1 */
00721 paddusb_r2r (mm0, mm2);
00722 movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 3 */
00723 psubusb_r2r (mm1, mm2);
00724 dest += stride;
00725 paddusb_r2r (mm0, mm3);
00726 movq_r2m (mm2, *dest); /* store row 2 */
00727 psubusb_r2r (mm1, mm3);
00728 movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 4 */
00729 dest += stride;
00730 movq_r2m (mm3, *dest); /* store row 3 */
00731 paddusb_r2r (mm0, mm2);
00732 movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 5 */
00733 psubusb_r2r (mm1, mm2);
00734 dest += stride;
00735 paddusb_r2r (mm0, mm3);
00736 movq_r2m (mm2, *dest); /* store row 4 */
00737 psubusb_r2r (mm1, mm3);
00738 movq_m2r (*(dest + 2*stride), mm2); /* mm2 = row 6 */
00739 dest += stride;
00740 movq_r2m (mm3, *dest); /* store row 5 */
00741 paddusb_r2r (mm0, mm2);
00742 movq_m2r (*(dest + 2*stride), mm3); /* mm3 = row 7 */
00743 psubusb_r2r (mm1, mm2);
00744 block[63] = 0;
00745 paddusb_r2r (mm0, mm3);
00746 movq_r2m (mm2, *(dest + stride)); /* store row 6 */
00747 psubusb_r2r (mm1, mm3);
00748 movq_r2m (mm3, *(dest + 2*stride)); /* store row 7 */
00749 }
00750
00751
/*
 * Defines static inline void mmxext_idct (int16_t * const block): the full
 * in-place 8x8 inverse DCT built from the MMXEXT row helpers and the
 * mmxext_table coefficient layout.
 */
00752 declare_idct (mmxext_idct, mmxext_table,
00753 mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
00754
/*
 * mpeg2_idct_copy_mmxext: inverse-transform a block and write it to the
 * picture (MMXEXT version).
 *
 * block  - 64 coefficients; transformed in place, then cleared to zero
 * dest   - address of the first output row
 * stride - distance in bytes between consecutive output rows
 *
 * The three steps must run in this order: the copy needs the transformed
 * values and the clear destroys them.
 */
00755 void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
00756 const int stride)
00757 {
00758 mmxext_idct (block); /* in-place inverse DCT */
00759 block_copy (block, dest, stride); /* saturate to bytes and store */
00760 block_zero (block); /* leave the block zeroed */
00761 }
00762
00763 void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
00764 uint8_t * const dest, const int stride)
00765 {
00766 if (last != 129 || (block[0] & 7) == 4) {
00767 mmxext_idct (block);
00768 block_add (block, dest, stride);
00769 block_zero (block);
00770 } else
00771 block_add_DC (block, dest, stride, CPU_MMXEXT);
00772 }
00773
00774
/*
 * Defines static inline void mmx_idct (int16_t * const block): the full
 * in-place 8x8 inverse DCT built from the plain-MMX row helpers and the
 * mmx_table coefficient layout.
 */
00775 declare_idct (mmx_idct, mmx_table,
00776 mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
00777
/*
 * mpeg2_idct_copy_mmx: inverse-transform a block and write it to the
 * picture (plain MMX version).
 *
 * block  - 64 coefficients; transformed in place, then cleared to zero
 * dest   - address of the first output row
 * stride - distance in bytes between consecutive output rows
 *
 * The three steps must run in this order: the copy needs the transformed
 * values and the clear destroys them.
 */
00778 void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
00779 const int stride)
00780 {
00781 mmx_idct (block); /* in-place inverse DCT */
00782 block_copy (block, dest, stride); /* saturate to bytes and store */
00783 block_zero (block); /* leave the block zeroed */
00784 }
00785
00786 void mpeg2_idct_add_mmx (const int last, int16_t * const block,
00787 uint8_t * const dest, const int stride)
00788 {
00789 if (last != 129 || (block[0] & 7) == 4) {
00790 mmx_idct (block);
00791 block_add (block, dest, stride);
00792 block_zero (block);
00793 } else
00794 block_add_DC (block, dest, stride, CPU_MMX);
00795 }
00796
00797
/*
 * mpeg2_idct_mmx_init: rewrite the two scan tables for the coefficient
 * order the MMX row transforms expect.
 *
 * Every entry of mpeg2_scan_norm and mpeg2_scan_alt is remapped in place:
 * bits 3-5 are kept and the low three bits are rotated right by one, so a
 * low-bits value c becomes (c >> 1) | ((c & 1) << 2).  Values 0,2,4,6 map
 * to 0..3 and values 1,3,5,7 map to 4..7, i.e. even positions first, then
 * odd positions.
 *
 * NOTE(review): the remapping is not idempotent; calling this function a
 * second time permutes the tables again.  It should run exactly once.
 */
void mpeg2_idct_mmx_init (void)
{
    extern uint8_t mpeg2_scan_norm[64];
    extern uint8_t mpeg2_scan_alt[64];
    uint8_t * const scans[2] = { mpeg2_scan_norm, mpeg2_scan_alt };
    int t, i;

    for (t = 0; t < 2; t++) {
        uint8_t * const scan = scans[t];

        for (i = 0; i < 64; i++) {
            const int j = scan[i];

            scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
        }
    }
}
00813
00814 #endif