OLD | NEW |
1 /* | 1 /* |
2 * VC-1 and WMV3 - DSP functions MMX-optimized | 2 * VC-1 and WMV3 - DSP functions MMX-optimized |
3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> | 3 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> |
4 * | 4 * |
5 * Permission is hereby granted, free of charge, to any person | 5 * Permission is hereby granted, free of charge, to any person |
6 * obtaining a copy of this software and associated documentation | 6 * obtaining a copy of this software and associated documentation |
7 * files (the "Software"), to deal in the Software without | 7 * files (the "Software"), to deal in the Software without |
8 * restriction, including without limitation the rights to use, | 8 * restriction, including without limitation the rights to use, |
9 * copy, modify, merge, publish, distribute, sublicense, and/or sell | 9 * copy, modify, merge, publish, distribute, sublicense, and/or sell |
10 * copies of the Software, and to permit persons to whom the | 10 * copies of the Software, and to permit persons to whom the |
(...skipping 55 matching lines...) |
66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \ | 66 "punpcklbw %%mm0, %%mm"#R0" \n\t" \ |
67 "movd (%0,%2), %%mm"#R3" \n\t" \ | 67 "movd (%0,%2), %%mm"#R3" \n\t" \ |
68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \ | 68 "psubw %%mm"#R0", %%mm"#R1" \n\t" \ |
69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \ | 69 "punpcklbw %%mm0, %%mm"#R3" \n\t" \ |
70 "paddw %%mm7, %%mm"#R1" \n\t" \ | 70 "paddw %%mm7, %%mm"#R1" \n\t" \ |
71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \ | 71 "psubw %%mm"#R3", %%mm"#R1" \n\t" \ |
72 "psraw %4, %%mm"#R1" \n\t" \ | 72 "psraw %4, %%mm"#R1" \n\t" \ |
73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \ | 73 "movq %%mm"#R1", "#OFF"(%1) \n\t" \ |
74 "add %2, %0 \n\t" | 74 "add %2, %0 \n\t" |
75 | 75 |
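Editor's note: the tail of this macro visible above (punpcklbw against the zeroed mm0 to widen bytes to words, two psubw for the outer taps, paddw of the mm7 rounder, psraw by the %4 operand) together with the ff_pw_9 constant below appears to implement VC-1's 4-tap -1/9/9/-1 filter. A minimal scalar sketch under that assumption, with illustrative names not taken from the file (a..d are four source rows one stride apart, rnd is the mm7 rounder, shift is the %4 operand):

    #include <stdint.h>

    /* Hedged scalar sketch of one output sample of the -1/9/9/-1 filter;
     * the full macro body is in the skipped region, so this mirrors only
     * what the visible instructions and ff_pw_9 suggest. */
    static inline int16_t shift2_tap(const uint8_t *a, const uint8_t *b,
                                     const uint8_t *c, const uint8_t *d,
                                     int i, int rnd, int shift)
    {
        return (int16_t)((9 * (b[i] + c[i]) - a[i] - d[i] + rnd) >> shift);
    }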
76 DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL; | 76 DECLARE_ALIGNED(16, const uint64_t, ff_pw_9) = 0x0009000900090009ULL; |
77 | 77 |
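Editor's note: the NEW column replaces the fixed-width DECLARE_ALIGNED_16(type, name) helper with the parameterized DECLARE_ALIGNED(16, type, name) form from libavutil. On GCC-style compilers the macro boils down to an alignment attribute on the declaration; a sketch of the idea (an illustrative stand-in, not the exact libavutil definition, which also covers MSVC/ICC spellings):

    #include <stdint.h>

    /* Illustrative stand-in for an alignment-declaration macro of this shape. */
    #define MY_DECLARE_ALIGNED(n, t, v) t __attribute__((aligned(n))) v

    MY_DECLARE_ALIGNED(16, static const uint64_t, pw_9_example) = 0x0009000900090009ULL;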
78 /** Sacrificing mm6 allows pipelining loads from src */ | 78 /** Sacrificing mm6 allows pipelining loads from src */ |
79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, | 79 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, |
80 const uint8_t *src, x86_reg stride, | 80 const uint8_t *src, x86_reg stride, |
81 int rnd, int64_t shift) | 81 int rnd, int64_t shift) |
82 { | 82 { |
83 __asm__ volatile( | 83 __asm__ volatile( |
84 "mov $3, %%"REG_c" \n\t" | 84 "mov $3, %%"REG_c" \n\t" |
85 LOAD_ROUNDER_MMX("%5") | 85 LOAD_ROUNDER_MMX("%5") |
86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" | 86 "movq "MANGLE(ff_pw_9)", %%mm6 \n\t" |
(...skipping 348 matching lines...) |
435 __asm__ volatile(\ | 435 __asm__ volatile(\ |
436 "pxor %%mm0, %%mm0 \n\t"\ | 436 "pxor %%mm0, %%mm0 \n\t"\ |
437 ::: "memory"\ | 437 ::: "memory"\ |
438 );\ | 438 );\ |
439 \ | 439 \ |
440 if (vmode) { /* Vertical filter to apply */\ | 440 if (vmode) { /* Vertical filter to apply */\ |
441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ | 441 if (hmode) { /* Horizontal filter to apply, output to tmp */\ |
442 static const int shift_value[] = { 0, 5, 1, 5 };\ | 442 static const int shift_value[] = { 0, 5, 1, 5 };\ |
443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ | 443 int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ |
444 int r;\ | 444 int r;\ |
445 DECLARE_ALIGNED_16(int16_t, tmp)[12*8];\ | 445 DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ |
446 \ | 446 \ |
447 r = (1<<(shift-1)) + rnd-1;\ | 447 r = (1<<(shift-1)) + rnd-1;\ |
448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ | 448 vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ |
449 \ | 449 \ |
450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ | 450 vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ |
451 return;\ | 451 return;\ |
452 }\ | 452 }\ |
453 else { /* No horizontal filter, output 8 lines to dst */\ | 453 else { /* No horizontal filter, output 8 lines to dst */\ |
454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ | 454 vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ |
455 return;\ | 455 return;\ |
456 }\ | 456 }\ |
457 }\ | 457 }\ |
458 \ | 458 \ |
459 /* Horizontal mode with no vertical mode */\ | 459 /* Horizontal mode with no vertical mode */\ |
460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ | 460 vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ |
461 } | 461 } |
462 | 462 |
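Editor's note: the shift bookkeeping in the two-pass branch above is easy to trip over: shift_value[] maps the filter mode to its per-pass scaling, the combined shift is their average, and r rebuilds the rounding constant for the first (vertical) pass while the horizontal pass gets 64-rnd. A small worked sketch using only the formulas visible in the macro (names are illustrative):

    /* Worked example of the rounding setup in the two-pass path.
     * For hmode = 1, vmode = 2: shift = (5 + 1) >> 1 = 3 and, with rnd = 1,
     * r = (1 << (3 - 1)) + 1 - 1 = 4. */
    static const int shift_value_example[] = { 0, 5, 1, 5 };

    static inline int first_pass_rounder(int hmode, int vmode, int rnd)
    {
        int shift = (shift_value_example[hmode] + shift_value_example[vmode]) >> 1;
        return (1 << (shift - 1)) + rnd - 1;
    }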
463 VC1_MSPEL_MC(put_) | 463 VC1_MSPEL_MC(put_) |
464 VC1_MSPEL_MC(avg_) | 464 VC1_MSPEL_MC(avg_) |
465 | 465 |
466 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |
467 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd); | |
468 | |
469 /** Macro to ease declaring bicubic filter interpolation functions */ | 466 /** Macro to ease declaring bicubic filter interpolation functions */ |
470 #define DECLARE_FUNCTION(a, b) \ | 467 #define DECLARE_FUNCTION(a, b) \ |
471 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | 468 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ |
472 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | 469 put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
473 }\ | 470 }\ |
474 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ | 471 static void avg_vc1_mspel_mc ## a ## b ## _mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) { \ |
475 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ | 472 avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
476 } | 473 } |
477 | 474 |
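Editor's note: DECLARE_FUNCTION(a, b) stamps out one put_ and one avg_ wrapper per subpel position, each forwarding to the put_vc1_mspel_mc/avg_vc1_mspel_mc bodies generated by VC1_MSPEL_MC above. Expanding the first instantiation by hand gives roughly:

    /* Hand expansion of DECLARE_FUNCTION(0, 1); the real expansion is produced
     * by the preprocessor from the macro above. */
    static void put_vc1_mspel_mc01_mmx(uint8_t *dst, const uint8_t *src,
                                       int stride, int rnd) {
        put_vc1_mspel_mc(dst, src, stride, 0, 1, rnd);
    }
    static void avg_vc1_mspel_mc01_mmx2(uint8_t *dst, const uint8_t *src,
                                        int stride, int rnd) {
        avg_vc1_mspel_mc(dst, src, stride, 0, 1, rnd);
    }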
478 DECLARE_FUNCTION(0, 1) | 475 DECLARE_FUNCTION(0, 1) |
(...skipping 256 matching lines...) |
735 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; | 732 dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmx2; |
736 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; | 733 dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmx2; |
737 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; | 734 dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmx2; |
738 | 735 |
739 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; | 736 dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmx2; |
740 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; | 737 dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmx2; |
741 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; | 738 dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmx2; |
742 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; | 739 dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmx2; |
743 } | 740 } |
744 } | 741 } |
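Editor's note: the visible tail of the init function also shows how the dispatch table is indexed: tab[7] = mc31, tab[11] = mc32, tab[15] = mc33, i.e. index = a + 4*b for mc##a##b. A hypothetical helper (not in the file) spelling that out:

    /* Indexing implied by the assignments above:
     * mspel_tab_index(3, 1) == 7, (3, 2) == 11, (3, 3) == 15. */
    static inline int mspel_tab_index(int a, int b)
    {
        return a + 4 * b;
    }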