| OLD | NEW |
| 1 /* | 1 /* |
| 2 * VC-1 and WMV3 - DSP functions MMX-optimized | 2 * VC-1 and WMV3 - DSP functions MMX-optimized |
| 3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> | 3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> |
| 4 * | 4 * |
| 5 * Permission is hereby granted, free of charge, to any person | 5 * Permission is hereby granted, free of charge, to any person |
| 6 * obtaining a copy of this software and associated documentation | 6 * obtaining a copy of this software and associated documentation |
| 7 * files (the "Software"), to deal in the Software without | 7 * files (the "Software"), to deal in the Software without |
| 8 * restriction, including without limitation the rights to use, | 8 * restriction, including without limitation the rights to use, |
| 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell | 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 10 * copies of the Software, and to permit persons to whom the | 10 * copies of the Software, and to permit persons to whom the |
| (...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \ | 66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \ |
| 67 "movd (%0,%2), %%mm"#R3" \n\t" \ | 67 "movd (%0,%2), %%mm"#R3" \n\t" \ |
| 68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \ | 68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \ |
| 69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \ | 69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \ |
| 70 "paddw %%mm7, %%mm"#R1" \n\t" \ | 70 "paddw %%mm7, %%mm"#R1" \n\t" \ |
| 71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \ | 71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \ |
| 72 "psraw %4, %%mm"#R1" \n\t" \ | 72 "psraw %4, %%mm"#R1" \n\t" \ |
| 73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \ | 73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \ |
| 74 "add %2, %0 \n\t" | 74 "add %2, %0 \n\t" |
| 75 | 75 |
| 76 DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; | 76 DECLARE_ALIGNED(16, const uint64_t, ff_pw_9) = 0x0009000900090009ULL; |
| 77 | 77 |
| 78 /** Sacrificing mm6 allows pipelining loads from src */ | 78 /** Sacrificing mm6 allows pipelining loads from src */ |
| 79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, | 79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, |
| 80 const uint8_t *src, x86_reg stride, | 80 const uint8_t *src, x86_reg stride, |
| 81 int rnd, int64_t shift) | 81 int rnd, int64_t shift) |
| 82 { | 82 { |
| 83 __asm__ volatile( | 83 __asm__ volatile( |
| 84 "mov $3, %%"REG_c" \n\t" | 84 "mov $3, %%"REG_c" \n\t" |
| 85 LOAD_ROUNDER_MMX("%5") | 85 LOAD_ROUNDER_MMX("%5") |
| 86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" | 86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" |
| (...skipping 348 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 435 __asm__ volatile(\ | 435 __asm__ volatile(\ |
| 436 "pxor %%mm0, %%mm0 \n\t"\ | 436 "pxor %%mm0, %%mm0 \n\t"\ |
| 437 ::: "memory"\ | 437 ::: "memory"\ |
| 438 );\ | 438 );\ |
| 439 \ | 439 \ |
| 440 if (vmode) { /* Vertical filter to apply */\ | 440 if (vmode) { /* Vertical filter to apply */\ |
| 441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ | 441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ |
| 442 static const int shift_value[] = { 0, 5, 1, 5 };\ | 442 static const int shift_value[] = { 0, 5, 1, 5 };\ |
| 443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ | 443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ |
| 444 int r;\ | 444 int r;\ |
| 445 DECLARE_ALIGNED_16(int16_t, tmp)[12*8];\ | 445 DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ |
| 446 \ | 446 \ |
| 447 r = (1<<(shift-1)) + rnd-1;\ | 447 r = (1<<(shift-1)) + rnd-1;\ |
| 448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ | 448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ |
| 449 \ | 449 \ |
| 450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ | 450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ |
| 451 return;\ | 451 return;\ |
| 452 }\ | 452 }\ |
| 453 else { /* No horizontal filter, output 8 lines to dst */\ | 453 else { /* No horizontal filter, output 8 lines to dst */\ |
| 454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ | 454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ |
| 455 return;\ | 455 return;\ |
| 456 }\ | 456 }\ |
| 457 }\ | 457 }\ |
| 458 \ | 458 \ |
| 459 /* Horizontal mode with no vertical mode */\ | 459 /* Horizontal mode with no vertical mode */\ |
| 460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ | 460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ |
| 461 } | 461 } |
| 462 | 462 |
| 463 VC1_MSPEL_MC(put_) | 463 VC1_MSPEL_MC(put_) |
| 464 VC1_MSPEL_MC(avg_) | 464 VC1_MSPEL_MC(avg_) |
| 465 | 465 |
| 466 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |
| 467 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |
| 468 | |
| 469 /** Macro to ease bicubic filter interpolation functions declarations */ | 466 /** Macro to ease bicubic filter interpolation functions declarations */ |
| 470 #define DECLARE_FUNCTION(a, b) \ | 467 #define DECLARE_FUNCTION(a, b) \ |
| 471 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | 468 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ |
| 472 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | 469 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
| 473 }\ | 470 }\ |
| 474 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | 471 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ |
| 475 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | 472 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
| 476 } | 473 } |
| 477 | 474 |
| 478 DECLARE_FUNCTION(0, 1) | 475 DECLARE_FUNCTION(0, 1) |
| (...skipping 256 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 735 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; | 732 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; |
| 736 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; | 733 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; |
| 737 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; | 734 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; |
| 738 | 735 |
| 739 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; | 736 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
| 740 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; | 737 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
| 741 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; | 738 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
| 742 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; | 739 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
| 743 } | 740 } |
| 744 } | 741 } |
| OLD | NEW |