| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
| 3 * | 3 * |
| 4 * This file is part of FFmpeg. | 4 * This file is part of FFmpeg. |
| 5 * | 5 * |
| 6 * FFmpeg is free software; you can redistribute it and/or | 6 * FFmpeg is free software; you can redistribute it and/or |
| 7 * modify it under the terms of the GNU Lesser General Public | 7 * modify it under the terms of the GNU Lesser General Public |
| 8 * License as published by the Free Software Foundation; either | 8 * License as published by the Free Software Foundation; either |
| 9 * version 2.1 of the License, or (at your option) any later version. | 9 * version 2.1 of the License, or (at your option) any later version. |
| 10 * | 10 * |
| 11 * FFmpeg is distributed in the hope that it will be useful, | 11 * FFmpeg is distributed in the hope that it will be useful, |
| 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 * Lesser General Public License for more details. | 14 * Lesser General Public License for more details. |
| 15 * | 15 * |
| 16 * You should have received a copy of the GNU Lesser General Public | 16 * You should have received a copy of the GNU Lesser General Public |
| 17 * License along with FFmpeg; if not, write to the Free Software | 17 * License along with FFmpeg; if not, write to the Free Software |
| 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 */ | 19 */ |
| 20 | 20 |
| 21 #include "dsputil_mmx.h" | 21 #include "dsputil_mmx.h" |
| 22 | 22 |
| 23 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; | 23 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; |
| 24 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; | 24 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; |
| 25 | 25 |
| 26 /***********************************/ | 26 /***********************************/ |
| 27 /* IDCT */ | 27 /* IDCT */ |
| 28 | 28 |
| 29 #define SUMSUB_BADC( a, b, c, d ) \ | 29 #define SUMSUB_BADC( a, b, c, d ) \ |
| 30 "paddw "#b", "#a" \n\t"\ | 30 "paddw "#b", "#a" \n\t"\ |
| 31 "paddw "#d", "#c" \n\t"\ | 31 "paddw "#d", "#c" \n\t"\ |
| 32 "paddw "#b", "#b" \n\t"\ | 32 "paddw "#b", "#b" \n\t"\ |
| 33 "paddw "#d", "#d" \n\t"\ | 33 "paddw "#d", "#d" \n\t"\ |
| 34 "psubw "#a", "#b" \n\t"\ | 34 "psubw "#a", "#b" \n\t"\ |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 150 SUMSUB_BA( %%mm5, %%mm4 ) | 150 SUMSUB_BA( %%mm5, %%mm4 ) |
| 151 SUMSUB_BA( %%mm3, %%mm2 ) | 151 SUMSUB_BA( %%mm3, %%mm2 ) |
| 152 SUMSUB_BA( %%mm1, %%mm0 ) | 152 SUMSUB_BA( %%mm1, %%mm0 ) |
| 153 :: "r"(block) | 153 :: "r"(block) |
| 154 ); | 154 ); |
| 155 } | 155 } |
| 156 | 156 |
| 157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) | 157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) |
| 158 { | 158 { |
| 159 int i; | 159 int i; |
| 160 DECLARE_ALIGNED_8(int16_t, b2)[64]; | 160 DECLARE_ALIGNED(8, int16_t, b2)[64]; |
| 161 | 161 |
| 162 block[0] += 32; | 162 block[0] += 32; |
| 163 | 163 |
| 164 for(i=0; i<2; i++){ | 164 for(i=0; i<2; i++){ |
| 165 DECLARE_ALIGNED_8(uint64_t, tmp); | 165 DECLARE_ALIGNED(8, uint64_t, tmp); |
| 166 | 166 |
| 167 h264_idct8_1d(block+4*i); | 167 h264_idct8_1d(block+4*i); |
| 168 | 168 |
| 169 __asm__ volatile( | 169 __asm__ volatile( |
| 170 "movq %%mm7, %0 \n\t" | 170 "movq %%mm7, %0 \n\t" |
| 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) | 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) |
| 172 "movq %%mm0, 8(%1) \n\t" | 172 "movq %%mm0, 8(%1) \n\t" |
| 173 "movq %%mm6, 24(%1) \n\t" | 173 "movq %%mm6, 24(%1) \n\t" |
| 174 "movq %%mm7, 40(%1) \n\t" | 174 "movq %%mm7, 40(%1) \n\t" |
| 175 "movq %%mm4, 56(%1) \n\t" | 175 "movq %%mm4, 56(%1) \n\t" |
| (...skipping 445 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ | 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ |
| 622 "movq "#p1", "#tmp" \n\t"\ | 622 "movq "#p1", "#tmp" \n\t"\ |
| 623 "psubusb "#tc0", "#tmp" \n\t"\ | 623 "psubusb "#tc0", "#tmp" \n\t"\ |
| 624 "paddusb "#p1", "#tc0" \n\t"\ | 624 "paddusb "#p1", "#tc0" \n\t"\ |
| 625 "pmaxub "#tmp", "#q2" \n\t"\ | 625 "pmaxub "#tmp", "#q2" \n\t"\ |
| 626 "pminub "#tc0", "#q2" \n\t"\ | 626 "pminub "#tc0", "#q2" \n\t"\ |
| 627 "movq "#q2", "q1addr" \n\t" | 627 "movq "#q2", "q1addr" \n\t" |
| 628 | 628 |
| 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) |
| 630 { | 630 { |
| 631 DECLARE_ALIGNED_8(uint64_t, tmp0)[2]; | 631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; |
| 632 | 632 |
| 633 __asm__ volatile( | 633 __asm__ volatile( |
| 634 "movq (%2,%4), %%mm0 \n\t" //p1 | 634 "movq (%2,%4), %%mm0 \n\t" //p1 |
| 635 "movq (%2,%4,2), %%mm1 \n\t" //p0 | 635 "movq (%2,%4,2), %%mm1 \n\t" //p0 |
| 636 "movq (%3), %%mm2 \n\t" //q0 | 636 "movq (%3), %%mm2 \n\t" //q0 |
| 637 "movq (%3,%4), %%mm3 \n\t" //q1 | 637 "movq (%3,%4), %%mm3 \n\t" //q1 |
| 638 H264_DEBLOCK_MASK(%7, %8) | 638 H264_DEBLOCK_MASK(%7, %8) |
| 639 | 639 |
| 640 "movd %6, %%mm4 \n\t" | 640 "movd %6, %%mm4 \n\t" |
| 641 "punpcklbw %%mm4, %%mm4 \n\t" | 641 "punpcklbw %%mm4, %%mm4 \n\t" |
| (...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 683 { | 683 { |
| 684 if((tc0[0] & tc0[1]) >= 0) | 684 if((tc0[0] & tc0[1]) >= 0) |
| 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); | 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); |
| 686 if((tc0[2] & tc0[3]) >= 0) | 686 if((tc0[2] & tc0[3]) >= 0) |
| 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); | 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); |
| 688 } | 688 } |
| 689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
| 690 { | 690 { |
| 691 //FIXME: could cut some load/stores by merging transpose with filter | 691 //FIXME: could cut some load/stores by merging transpose with filter |
| 692 // also, it only needs to transpose 6x8 | 692 // also, it only needs to transpose 6x8 |
| 693 DECLARE_ALIGNED_8(uint8_t, trans)[8*8]; | 693 DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; |
| 694 int i; | 694 int i; |
| 695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { | 695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { |
| 696 if((tc0[0] & tc0[1]) < 0) | 696 if((tc0[0] & tc0[1]) < 0) |
| 697 continue; | 697 continue; |
| 698 transpose4x4(trans, pix-4, 8, stride); | 698 transpose4x4(trans, pix-4, 8, stride); |
| 699 transpose4x4(trans +4*8, pix, 8, stride); | 699 transpose4x4(trans +4*8, pix, 8, stride); |
| 700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); | 700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); |
| 701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); | 701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); |
| 702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); | 702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); |
| 703 transpose4x4(pix-2, trans +2*8, stride, 8); | 703 transpose4x4(pix-2, trans +2*8, stride, 8); |
| (...skipping 23 matching lines...) Expand all Loading... |
| 727 } | 727 } |
| 728 | 728 |
| 729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
| 730 { | 730 { |
| 731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); | 731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); |
| 732 } | 732 } |
| 733 | 733 |
| 734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
| 735 { | 735 { |
| 736 //FIXME: could cut some load/stores by merging transpose with filter | 736 //FIXME: could cut some load/stores by merging transpose with filter |
| 737 DECLARE_ALIGNED_8(uint8_t, trans)[8*4]; | 737 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; |
| 738 transpose4x4(trans, pix-2, 8, stride); | 738 transpose4x4(trans, pix-2, 8, stride); |
| 739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | 739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); |
| 740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); | 740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); |
| 741 transpose4x4(pix-2, trans, stride, 8); | 741 transpose4x4(pix-2, trans, stride, 8); |
| 742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | 742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); |
| 743 } | 743 } |
| 744 | 744 |
| 745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2 | 745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2 |
| 746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ | 746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ |
| 747 "movq "#p0", %%mm4 \n\t"\ | 747 "movq "#p0", %%mm4 \n\t"\ |
| (...skipping 29 matching lines...) Expand all Loading... |
| 777 } | 777 } |
| 778 | 778 |
| 779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | 779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) |
| 780 { | 780 { |
| 781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); | 781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); |
| 782 } | 782 } |
| 783 | 783 |
| 784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | 784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) |
| 785 { | 785 { |
| 786 //FIXME: could cut some load/stores by merging transpose with filter | 786 //FIXME: could cut some load/stores by merging transpose with filter |
| 787 DECLARE_ALIGNED_8(uint8_t, trans)[8*4]; | 787 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; |
| 788 transpose4x4(trans, pix-2, 8, stride); | 788 transpose4x4(trans, pix-2, 8, stride); |
| 789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | 789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); |
| 790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); | 790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); |
| 791 transpose4x4(pix-2, trans, stride, 8); | 791 transpose4x4(pix-2, trans, stride, 8); |
| 792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | 792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); |
| 793 } | 793 } |
| 794 | 794 |
| 795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], | 795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], |
| 796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { | 796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { |
| 797 int dir; | 797 int dir; |
| (...skipping 10 matching lines...) Expand all Loading... |
| 808 __asm__ volatile( | 808 __asm__ volatile( |
| 809 "movq %%mm6, %%mm5 \n" | 809 "movq %%mm6, %%mm5 \n" |
| 810 "paddb %%mm5, %%mm5 \n" | 810 "paddb %%mm5, %%mm5 \n" |
| 811 :); | 811 :); |
| 812 | 812 |
| 813 // could do a special case for dir==0 && edges==1, but it only reduces the | 813 // could do a special case for dir==0 && edges==1, but it only reduces the |
| 814 // average filter time by 1.2% | 814 // average filter time by 1.2% |
| 815 for( dir=1; dir>=0; dir-- ) { | 815 for( dir=1; dir>=0; dir-- ) { |
| 816 const x86_reg d_idx = dir ? -8 : -1; | 816 const x86_reg d_idx = dir ? -8 : -1; |
| 817 const int mask_mv = dir ? mask_mv1 : mask_mv0; | 817 const int mask_mv = dir ? mask_mv1 : mask_mv0; |
| 818 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; | 818 DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; |
| 819 int b_idx, edge; | 819 int b_idx, edge; |
| 820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { | 820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { |
| 821 __asm__ volatile( | 821 __asm__ volatile( |
| 822 "pand %0, %%mm0 \n\t" | 822 "pand %0, %%mm0 \n\t" |
| 823 ::"m"(mask_dir) | 823 ::"m"(mask_dir) |
| 824 ); | 824 ); |
| 825 if(!(mask_mv & edge)) { | 825 if(!(mask_mv & edge)) { |
| 826 if(bidir) { | 826 if(bidir) { |
| 827 __asm__ volatile( | 827 __asm__ volatile( |
| 828 "movd (%1,%0), %%mm2 \n" | 828 "movd (%1,%0), %%mm2 \n" |
| (...skipping 1270 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2099 H264_MC_4816(3dnow) | 2099 H264_MC_4816(3dnow) |
| 2100 H264_MC_4816(mmx2) | 2100 H264_MC_4816(mmx2) |
| 2101 H264_MC_816(H264_MC_V, sse2) | 2101 H264_MC_816(H264_MC_V, sse2) |
| 2102 H264_MC_816(H264_MC_HV, sse2) | 2102 H264_MC_816(H264_MC_HV, sse2) |
| 2103 #if HAVE_SSSE3 | 2103 #if HAVE_SSSE3 |
| 2104 H264_MC_816(H264_MC_H, ssse3) | 2104 H264_MC_816(H264_MC_H, ssse3) |
| 2105 H264_MC_816(H264_MC_HV, ssse3) | 2105 H264_MC_816(H264_MC_HV, ssse3) |
| 2106 #endif | 2106 #endif |
| 2107 | 2107 |
| 2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ | 2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ |
| 2109 DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg)[4] = { | 2109 DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { |
| 2110 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL | 2110 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL |
| 2111 }; | 2111 }; |
| 2112 | 2112 |
| 2113 #define H264_CHROMA_OP(S,D) | 2113 #define H264_CHROMA_OP(S,D) |
| 2114 #define H264_CHROMA_OP4(S,D,T) | 2114 #define H264_CHROMA_OP4(S,D,T) |
| 2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx | 2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx |
| 2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx | 2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx |
| 2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 | 2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 |
| 2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx | 2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx |
| 2119 #include "dsputil_h264_template_mmx.c" | 2119 #include "dsputil_h264_template_mmx.c" |
| (...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2316 | 2316 |
| 2317 H264_WEIGHT(16,16) | 2317 H264_WEIGHT(16,16) |
| 2318 H264_WEIGHT(16, 8) | 2318 H264_WEIGHT(16, 8) |
| 2319 H264_WEIGHT( 8,16) | 2319 H264_WEIGHT( 8,16) |
| 2320 H264_WEIGHT( 8, 8) | 2320 H264_WEIGHT( 8, 8) |
| 2321 H264_WEIGHT( 8, 4) | 2321 H264_WEIGHT( 8, 4) |
| 2322 H264_WEIGHT( 4, 8) | 2322 H264_WEIGHT( 4, 8) |
| 2323 H264_WEIGHT( 4, 4) | 2323 H264_WEIGHT( 4, 4) |
| 2324 H264_WEIGHT( 4, 2) | 2324 H264_WEIGHT( 4, 2) |
| 2325 | 2325 |
| OLD | NEW |