OLD | NEW |
1 /* | 1 /* |
2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt | 2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
3 * | 3 * |
4 * This file is part of FFmpeg. | 4 * This file is part of FFmpeg. |
5 * | 5 * |
6 * FFmpeg is free software; you can redistribute it and/or | 6 * FFmpeg is free software; you can redistribute it and/or |
7 * modify it under the terms of the GNU Lesser General Public | 7 * modify it under the terms of the GNU Lesser General Public |
8 * License as published by the Free Software Foundation; either | 8 * License as published by the Free Software Foundation; either |
9 * version 2.1 of the License, or (at your option) any later version. | 9 * version 2.1 of the License, or (at your option) any later version. |
10 * | 10 * |
11 * FFmpeg is distributed in the hope that it will be useful, | 11 * FFmpeg is distributed in the hope that it will be useful, |
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 * Lesser General Public License for more details. | 14 * Lesser General Public License for more details. |
15 * | 15 * |
16 * You should have received a copy of the GNU Lesser General Public | 16 * You should have received a copy of the GNU Lesser General Public |
17 * License along with FFmpeg; if not, write to the Free Software | 17 * License along with FFmpeg; if not, write to the Free Software |
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 */ | 19 */ |
20 | 20 |
21 #include "dsputil_mmx.h" | 21 #include "dsputil_mmx.h" |
22 | 22 |
23 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; | 23 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL; |
24 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; | 24 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL; |
25 | 25 |
26 /***********************************/ | 26 /***********************************/ |
27 /* IDCT */ | 27 /* IDCT */ |
28 | 28 |
29 #define SUMSUB_BADC( a, b, c, d ) \ | 29 #define SUMSUB_BADC( a, b, c, d ) \ |
30 "paddw "#b", "#a" \n\t"\ | 30 "paddw "#b", "#a" \n\t"\ |
31 "paddw "#d", "#c" \n\t"\ | 31 "paddw "#d", "#c" \n\t"\ |
32 "paddw "#b", "#b" \n\t"\ | 32 "paddw "#b", "#b" \n\t"\ |
33 "paddw "#d", "#d" \n\t"\ | 33 "paddw "#d", "#d" \n\t"\ |
34 "psubw "#a", "#b" \n\t"\ | 34 "psubw "#a", "#b" \n\t"\ |
(...skipping 115 matching lines...)
150 SUMSUB_BA( %%mm5, %%mm4 ) | 150 SUMSUB_BA( %%mm5, %%mm4 ) |
151 SUMSUB_BA( %%mm3, %%mm2 ) | 151 SUMSUB_BA( %%mm3, %%mm2 ) |
152 SUMSUB_BA( %%mm1, %%mm0 ) | 152 SUMSUB_BA( %%mm1, %%mm0 ) |
153 :: "r"(block) | 153 :: "r"(block) |
154 ); | 154 ); |
155 } | 155 } |
156 | 156 |
157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) | 157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) |
158 { | 158 { |
159 int i; | 159 int i; |
160 DECLARE_ALIGNED_8(int16_t, b2)[64]; | 160 DECLARE_ALIGNED(8, int16_t, b2)[64]; |
161 | 161 |
162 block[0] += 32; | 162 block[0] += 32; |
163 | 163 |
164 for(i=0; i<2; i++){ | 164 for(i=0; i<2; i++){ |
165 DECLARE_ALIGNED_8(uint64_t, tmp); | 165 DECLARE_ALIGNED(8, uint64_t, tmp); |
166 | 166 |
167 h264_idct8_1d(block+4*i); | 167 h264_idct8_1d(block+4*i); |
168 | 168 |
169 __asm__ volatile( | 169 __asm__ volatile( |
170 "movq %%mm7, %0 \n\t" | 170 "movq %%mm7, %0 \n\t" |
171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) | 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) |
172 "movq %%mm0, 8(%1) \n\t" | 172 "movq %%mm0, 8(%1) \n\t" |
173 "movq %%mm6, 24(%1) \n\t" | 173 "movq %%mm6, 24(%1) \n\t" |
174 "movq %%mm7, 40(%1) \n\t" | 174 "movq %%mm7, 40(%1) \n\t" |
175 "movq %%mm4, 56(%1) \n\t" | 175 "movq %%mm4, 56(%1) \n\t" |
(...skipping 445 matching lines...)
621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ | 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ |
622 "movq "#p1", "#tmp" \n\t"\ | 622 "movq "#p1", "#tmp" \n\t"\ |
623 "psubusb "#tc0", "#tmp" \n\t"\ | 623 "psubusb "#tc0", "#tmp" \n\t"\ |
624 "paddusb "#p1", "#tc0" \n\t"\ | 624 "paddusb "#p1", "#tc0" \n\t"\ |
625 "pmaxub "#tmp", "#q2" \n\t"\ | 625 "pmaxub "#tmp", "#q2" \n\t"\ |
626 "pminub "#tc0", "#q2" \n\t"\ | 626 "pminub "#tc0", "#q2" \n\t"\ |
627 "movq "#q2", "q1addr" \n\t" | 627 "movq "#q2", "q1addr" \n\t" |
628 | 628 |
629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) | 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) |
630 { | 630 { |
631 DECLARE_ALIGNED_8(uint64_t, tmp0)[2]; | 631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2]; |
632 | 632 |
633 __asm__ volatile( | 633 __asm__ volatile( |
634 "movq (%2,%4), %%mm0 \n\t" //p1 | 634 "movq (%2,%4), %%mm0 \n\t" //p1 |
635 "movq (%2,%4,2), %%mm1 \n\t" //p0 | 635 "movq (%2,%4,2), %%mm1 \n\t" //p0 |
636 "movq (%3), %%mm2 \n\t" //q0 | 636 "movq (%3), %%mm2 \n\t" //q0 |
637 "movq (%3,%4), %%mm3 \n\t" //q1 | 637 "movq (%3,%4), %%mm3 \n\t" //q1 |
638 H264_DEBLOCK_MASK(%7, %8) | 638 H264_DEBLOCK_MASK(%7, %8) |
639 | 639 |
640 "movd %6, %%mm4 \n\t" | 640 "movd %6, %%mm4 \n\t" |
641 "punpcklbw %%mm4, %%mm4 \n\t" | 641 "punpcklbw %%mm4, %%mm4 \n\t" |
(...skipping 41 matching lines...)
683 { | 683 { |
684 if((tc0[0] & tc0[1]) >= 0) | 684 if((tc0[0] & tc0[1]) >= 0) |
685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); | 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); |
686 if((tc0[2] & tc0[3]) >= 0) | 686 if((tc0[2] & tc0[3]) >= 0) |
687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); | 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); |
688 } | 688 } |
689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
690 { | 690 { |
691 //FIXME: could cut some load/stores by merging transpose with filter | 691 //FIXME: could cut some load/stores by merging transpose with filter |
692 // also, it only needs to transpose 6x8 | 692 // also, it only needs to transpose 6x8 |
693 DECLARE_ALIGNED_8(uint8_t, trans)[8*8]; | 693 DECLARE_ALIGNED(8, uint8_t, trans)[8*8]; |
694 int i; | 694 int i; |
695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { | 695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { |
696 if((tc0[0] & tc0[1]) < 0) | 696 if((tc0[0] & tc0[1]) < 0) |
697 continue; | 697 continue; |
698 transpose4x4(trans, pix-4, 8, stride); | 698 transpose4x4(trans, pix-4, 8, stride); |
699 transpose4x4(trans +4*8, pix, 8, stride); | 699 transpose4x4(trans +4*8, pix, 8, stride); |
700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); | 700 transpose4x4(trans+4, pix-4+4*stride, 8, stride); |
701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); | 701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride); |
702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); | 702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0); |
703 transpose4x4(pix-2, trans +2*8, stride, 8); | 703 transpose4x4(pix-2, trans +2*8, stride, 8); |
(...skipping 23 matching lines...)
727 } | 727 } |
728 | 728 |
729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
730 { | 730 { |
731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); | 731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0); |
732 } | 732 } |
733 | 733 |
734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) | 734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) |
735 { | 735 { |
736 //FIXME: could cut some load/stores by merging transpose with filter | 736 //FIXME: could cut some load/stores by merging transpose with filter |
737 DECLARE_ALIGNED_8(uint8_t, trans)[8*4]; | 737 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; |
738 transpose4x4(trans, pix-2, 8, stride); | 738 transpose4x4(trans, pix-2, 8, stride); |
739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | 739 transpose4x4(trans+4, pix-2+4*stride, 8, stride); |
740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); | 740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); |
741 transpose4x4(pix-2, trans, stride, 8); | 741 transpose4x4(pix-2, trans, stride, 8); |
742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | 742 transpose4x4(pix-2+4*stride, trans+4, stride, 8); |
743 } | 743 } |
744 | 744 |
745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2 | 745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2 |
746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ | 746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \ |
747 "movq "#p0", %%mm4 \n\t"\ | 747 "movq "#p0", %%mm4 \n\t"\ |
(...skipping 29 matching lines...)
777 } | 777 } |
778 | 778 |
779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | 779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) |
780 { | 780 { |
781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); | 781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1); |
782 } | 782 } |
783 | 783 |
784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) | 784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) |
785 { | 785 { |
786 //FIXME: could cut some load/stores by merging transpose with filter | 786 //FIXME: could cut some load/stores by merging transpose with filter |
787 DECLARE_ALIGNED_8(uint8_t, trans)[8*4]; | 787 DECLARE_ALIGNED(8, uint8_t, trans)[8*4]; |
788 transpose4x4(trans, pix-2, 8, stride); | 788 transpose4x4(trans, pix-2, 8, stride); |
789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); | 789 transpose4x4(trans+4, pix-2+4*stride, 8, stride); |
790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); | 790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); |
791 transpose4x4(pix-2, trans, stride, 8); | 791 transpose4x4(pix-2, trans, stride, 8); |
792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); | 792 transpose4x4(pix-2+4*stride, trans+4, stride, 8); |
793 } | 793 } |
794 | 794 |
795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], | 795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], |
796                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { | 796                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { |
797 int dir; | 797 int dir; |
(...skipping 10 matching lines...)
808 __asm__ volatile( | 808 __asm__ volatile( |
809 "movq %%mm6, %%mm5 \n" | 809 "movq %%mm6, %%mm5 \n" |
810 "paddb %%mm5, %%mm5 \n" | 810 "paddb %%mm5, %%mm5 \n" |
811 :); | 811 :); |
812 | 812 |
813 // could do a special case for dir==0 && edges==1, but it only reduces the | 813 // could do a special case for dir==0 && edges==1, but it only reduces the |
814 // average filter time by 1.2% | 814 // average filter time by 1.2% |
815 for( dir=1; dir>=0; dir-- ) { | 815 for( dir=1; dir>=0; dir-- ) { |
816 const x86_reg d_idx = dir ? -8 : -1; | 816 const x86_reg d_idx = dir ? -8 : -1; |
817 const int mask_mv = dir ? mask_mv1 : mask_mv0; | 817 const int mask_mv = dir ? mask_mv1 : mask_mv0; |
818         DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; | 818         DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; |
819 int b_idx, edge; | 819 int b_idx, edge; |
820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { | 820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { |
821 __asm__ volatile( | 821 __asm__ volatile( |
822 "pand %0, %%mm0 \n\t" | 822 "pand %0, %%mm0 \n\t" |
823 ::"m"(mask_dir) | 823 ::"m"(mask_dir) |
824 ); | 824 ); |
825 if(!(mask_mv & edge)) { | 825 if(!(mask_mv & edge)) { |
826 if(bidir) { | 826 if(bidir) { |
827 __asm__ volatile( | 827 __asm__ volatile( |
828 "movd (%1,%0), %%mm2 \n" | 828 "movd (%1,%0), %%mm2 \n" |
(...skipping 1270 matching lines...)
2099 H264_MC_4816(3dnow) | 2099 H264_MC_4816(3dnow) |
2100 H264_MC_4816(mmx2) | 2100 H264_MC_4816(mmx2) |
2101 H264_MC_816(H264_MC_V, sse2) | 2101 H264_MC_816(H264_MC_V, sse2) |
2102 H264_MC_816(H264_MC_HV, sse2) | 2102 H264_MC_816(H264_MC_HV, sse2) |
2103 #if HAVE_SSSE3 | 2103 #if HAVE_SSSE3 |
2104 H264_MC_816(H264_MC_H, ssse3) | 2104 H264_MC_816(H264_MC_H, ssse3) |
2105 H264_MC_816(H264_MC_HV, ssse3) | 2105 H264_MC_816(H264_MC_HV, ssse3) |
2106 #endif | 2106 #endif |
2107 | 2107 |
2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ | 2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */ |
2109 DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg)[4] = { | 2109 DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = { |
2110     0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL | 2110     0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL |
2111 }; | 2111 }; |
2112 | 2112 |
2113 #define H264_CHROMA_OP(S,D) | 2113 #define H264_CHROMA_OP(S,D) |
2114 #define H264_CHROMA_OP4(S,D,T) | 2114 #define H264_CHROMA_OP4(S,D,T) |
2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx | 2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx |
2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx | 2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx |
2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 | 2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2 |
2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx | 2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx |
2119 #include "dsputil_h264_template_mmx.c" | 2119 #include "dsputil_h264_template_mmx.c" |
(...skipping 196 matching lines...)
2316 | 2316 |
2317 H264_WEIGHT(16,16) | 2317 H264_WEIGHT(16,16) |
2318 H264_WEIGHT(16, 8) | 2318 H264_WEIGHT(16, 8) |
2319 H264_WEIGHT( 8,16) | 2319 H264_WEIGHT( 8,16) |
2320 H264_WEIGHT( 8, 8) | 2320 H264_WEIGHT( 8, 8) |
2321 H264_WEIGHT( 8, 4) | 2321 H264_WEIGHT( 8, 4) |
2322 H264_WEIGHT( 4, 8) | 2322 H264_WEIGHT( 4, 8) |
2323 H264_WEIGHT( 4, 4) | 2323 H264_WEIGHT( 4, 4) |
2324 H264_WEIGHT( 4, 2) | 2324 H264_WEIGHT( 4, 2) |
2325 | 2325 |
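For context, the substantive change throughout this patch is the switch from the old fixed-alignment helper DECLARE_ALIGNED_8(type, name) to the parameterized DECLARE_ALIGNED(8, type, name) form. A minimal sketch of what such an alignment macro boils down to is below, assuming a GCC-style or MSVC-style compiler; MY_DECLARE_ALIGNED is a hypothetical stand-in, not the actual libavutil definition, which handles more compilers and corner cases.

    /* Simplified illustration of a parameterized alignment macro,
     * in the spirit of DECLARE_ALIGNED(n, t, v). */
    #include <stdint.h>

    #if defined(__GNUC__)
    #   define MY_DECLARE_ALIGNED(n, t, v) t __attribute__((aligned(n))) v
    #elif defined(_MSC_VER)
    #   define MY_DECLARE_ALIGNED(n, t, v) __declspec(align(n)) t v
    #else
    #   define MY_DECLARE_ALIGNED(n, t, v) t v   /* no alignment guarantee */
    #endif

    /* Usage mirroring the hunks above: an aligned constant and an aligned buffer. */
    MY_DECLARE_ALIGNED(8, static const uint64_t, pb_3_1) = 0x0103010301030103ULL;

    int main(void)
    {
        MY_DECLARE_ALIGNED(8, int16_t, b2)[64];   /* 8-byte-aligned stack array */
        b2[0] = (int16_t)pb_3_1;
        return (int)((uintptr_t)b2 & 7);          /* 0 when the alignment held */
    }

Passing the alignment as the first macro argument lets one macro replace the whole family of fixed-alignment helpers (DECLARE_ALIGNED_8 and friends), which is why every hunk visible in this file is a mechanical one-line substitution.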