patched-ffmpeg-mt/libavcodec/x86/h264dsp_mmx.c - Issue 789004: ffmpeg roll of source to mar 9 version...

Side by Side Diff: patched-ffmpeg-mt/libavcodec/x86/h264dsp_mmx.c

Issue 789004: ffmpeg roll of source to mar 9 version... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/

Patch Set: '' Created 10 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 /*	1 /*

2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt	2 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt

3 *	3 *

4 * This file is part of FFmpeg.	4 * This file is part of FFmpeg.

5 *	5 *

6 * FFmpeg is free software; you can redistribute it and/or	6 * FFmpeg is free software; you can redistribute it and/or

7 * modify it under the terms of the GNU Lesser General Public	7 * modify it under the terms of the GNU Lesser General Public

8 * License as published by the Free Software Foundation; either	8 * License as published by the Free Software Foundation; either

9 * version 2.1 of the License, or (at your option) any later version.	9 * version 2.1 of the License, or (at your option) any later version.

10 *	10 *

11 * FFmpeg is distributed in the hope that it will be useful,	11 * FFmpeg is distributed in the hope that it will be useful,

12 * but WITHOUT ANY WARRANTY; without even the implied warranty of	12 * but WITHOUT ANY WARRANTY; without even the implied warranty of

13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU	13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 * Lesser General Public License for more details.	14 * Lesser General Public License for more details.

15 *	15 *

16 * You should have received a copy of the GNU Lesser General Public	16 * You should have received a copy of the GNU Lesser General Public

17 * License along with FFmpeg; if not, write to the Free Software	17 * License along with FFmpeg; if not, write to the Free Software

18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA	18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

19 */	19 */

20	20

21 #include "dsputil_mmx.h"	21 #include "dsputil_mmx.h"

22	22

23 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;	23 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;

24 DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;	24 DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;

25	25

26 /***********************************/	26 /***********************************/

27 /* IDCT */	27 /* IDCT */

28	28

29 #define SUMSUB_BADC( a, b, c, d ) \	29 #define SUMSUB_BADC( a, b, c, d ) \

30 "paddw "#b", "#a" \n\t"\	30 "paddw "#b", "#a" \n\t"\

31 "paddw "#d", "#c" \n\t"\	31 "paddw "#d", "#c" \n\t"\

32 "paddw "#b", "#b" \n\t"\	32 "paddw "#b", "#b" \n\t"\

33 "paddw "#d", "#d" \n\t"\	33 "paddw "#d", "#d" \n\t"\

34 "psubw "#a", "#b" \n\t"\	34 "psubw "#a", "#b" \n\t"\

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
150 SUMSUB_BA( %%mm5, %%mm4 )	150 SUMSUB_BA( %%mm5, %%mm4 )

151 SUMSUB_BA( %%mm3, %%mm2 )	151 SUMSUB_BA( %%mm3, %%mm2 )

152 SUMSUB_BA( %%mm1, %%mm0 )	152 SUMSUB_BA( %%mm1, %%mm0 )

153 :: "r"(block)	153 :: "r"(block)

154 );	154 );

155 }	155 }

156	156

157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)	157 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)

158 {	158 {

159 int i;	159 int i;

160 DECLARE_ALIGNED_8(int16_t, b2)[64];	160 DECLARE_ALIGNED(8, int16_t, b2)[64];

161	161

162 block[0] += 32;	162 block[0] += 32;

163	163

164 for(i=0; i<2; i++){	164 for(i=0; i<2; i++){

165 DECLARE_ALIGNED_8(uint64_t, tmp);	165 DECLARE_ALIGNED(8, uint64_t, tmp);

166	166

167 h264_idct8_1d(block+4*i);	167 h264_idct8_1d(block+4*i);

168	168

169 __asm__ volatile(	169 __asm__ volatile(

170 "movq %%mm7, %0 \n\t"	170 "movq %%mm7, %0 \n\t"

171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )	171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )

172 "movq %%mm0, 8(%1) \n\t"	172 "movq %%mm0, 8(%1) \n\t"

173 "movq %%mm6, 24(%1) \n\t"	173 "movq %%mm6, 24(%1) \n\t"

174 "movq %%mm7, 40(%1) \n\t"	174 "movq %%mm7, 40(%1) \n\t"

175 "movq %%mm4, 56(%1) \n\t"	175 "movq %%mm4, 56(%1) \n\t"

(...skipping 445 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\	621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\

622 "movq "#p1", "#tmp" \n\t"\	622 "movq "#p1", "#tmp" \n\t"\

623 "psubusb "#tc0", "#tmp" \n\t"\	623 "psubusb "#tc0", "#tmp" \n\t"\

624 "paddusb "#p1", "#tc0" \n\t"\	624 "paddusb "#p1", "#tc0" \n\t"\

625 "pmaxub "#tmp", "#q2" \n\t"\	625 "pmaxub "#tmp", "#q2" \n\t"\

626 "pminub "#tc0", "#q2" \n\t"\	626 "pminub "#tc0", "#q2" \n\t"\

627 "movq "#q2", "q1addr" \n\t"	627 "movq "#q2", "q1addr" \n\t"

628	628

629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)	629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)

630 {	630 {

631 DECLARE_ALIGNED_8(uint64_t, tmp0)[2];	631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2];

632	632

633 __asm__ volatile(	633 __asm__ volatile(

634 "movq (%2,%4), %%mm0 \n\t" //p1	634 "movq (%2,%4), %%mm0 \n\t" //p1

635 "movq (%2,%4,2), %%mm1 \n\t" //p0	635 "movq (%2,%4,2), %%mm1 \n\t" //p0

636 "movq (%3), %%mm2 \n\t" //q0	636 "movq (%3), %%mm2 \n\t" //q0

637 "movq (%3,%4), %%mm3 \n\t" //q1	637 "movq (%3,%4), %%mm3 \n\t" //q1

638 H264_DEBLOCK_MASK(%7, %8)	638 H264_DEBLOCK_MASK(%7, %8)

639	639

640 "movd %6, %%mm4 \n\t"	640 "movd %6, %%mm4 \n\t"

641 "punpcklbw %%mm4, %%mm4 \n\t"	641 "punpcklbw %%mm4, %%mm4 \n\t"

(...skipping 41 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
683 {	683 {

684 if((tc0[0] & tc0[1]) >= 0)	684 if((tc0[0] & tc0[1]) >= 0)

685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);	685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);

686 if((tc0[2] & tc0[3]) >= 0)	686 if((tc0[2] & tc0[3]) >= 0)

687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);	687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);

688 }	688 }

689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)	689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)

690 {	690 {

691 //FIXME: could cut some load/stores by merging transpose with filter	691 //FIXME: could cut some load/stores by merging transpose with filter

692 // also, it only needs to transpose 6x8	692 // also, it only needs to transpose 6x8

693 DECLARE_ALIGNED_8(uint8_t, trans)[8*8];	693 DECLARE_ALIGNED(8, uint8_t, trans)[8*8];

694 int i;	694 int i;

695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {	695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {

696 if((tc0[0] & tc0[1]) < 0)	696 if((tc0[0] & tc0[1]) < 0)

697 continue;	697 continue;

698 transpose4x4(trans, pix-4, 8, stride);	698 transpose4x4(trans, pix-4, 8, stride);

699 transpose4x4(trans +4*8, pix, 8, stride);	699 transpose4x4(trans +4*8, pix, 8, stride);

700 transpose4x4(trans+4, pix-4+4*stride, 8, stride);	700 transpose4x4(trans+4, pix-4+4*stride, 8, stride);

701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);	701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);

702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);	702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);

703 transpose4x4(pix-2, trans +2*8, stride, 8);	703 transpose4x4(pix-2, trans +2*8, stride, 8);

(...skipping 23 matching lines...) Expand all Loading...
727 }	727 }

728	728

729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)	729 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)

730 {	730 {

731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);	731 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);

732 }	732 }

733	733

734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)	734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)

735 {	735 {

736 //FIXME: could cut some load/stores by merging transpose with filter	736 //FIXME: could cut some load/stores by merging transpose with filter

737 DECLARE_ALIGNED_8(uint8_t, trans)[8*4];	737 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];

738 transpose4x4(trans, pix-2, 8, stride);	738 transpose4x4(trans, pix-2, 8, stride);

739 transpose4x4(trans+4, pix-2+4*stride, 8, stride);	739 transpose4x4(trans+4, pix-2+4*stride, 8, stride);

740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);	740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);

741 transpose4x4(pix-2, trans, stride, 8);	741 transpose4x4(pix-2, trans, stride, 8);

742 transpose4x4(pix-2+4*stride, trans+4, stride, 8);	742 transpose4x4(pix-2+4*stride, trans+4, stride, 8);

743 }	743 }

744	744

745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2	745 // p0 = (p0 + q1 + 2*p1 + 2) >> 2

746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \	746 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \

747 "movq "#p0", %%mm4 \n\t"\	747 "movq "#p0", %%mm4 \n\t"\

(...skipping 29 matching lines...) Expand all Loading...
777 }	777 }

778	778

779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)	779 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)

780 {	780 {

781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);	781 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);

782 }	782 }

783	783

784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)	784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)

785 {	785 {

786 //FIXME: could cut some load/stores by merging transpose with filter	786 //FIXME: could cut some load/stores by merging transpose with filter

787 DECLARE_ALIGNED_8(uint8_t, trans)[8*4];	787 DECLARE_ALIGNED(8, uint8_t, trans)[8*4];

788 transpose4x4(trans, pix-2, 8, stride);	788 transpose4x4(trans, pix-2, 8, stride);

789 transpose4x4(trans+4, pix-2+4*stride, 8, stride);	789 transpose4x4(trans+4, pix-2+4*stride, 8, stride);

790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);	790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);

791 transpose4x4(pix-2, trans, stride, 8);	791 transpose4x4(pix-2, trans, stride, 8);

792 transpose4x4(pix-2+4*stride, trans+4, stride, 8);	792 transpose4x4(pix-2+4*stride, trans+4, stride, 8);

793 }	793 }

794	794

795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],	795 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],

796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {	796 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {

797 int dir;	797 int dir;

(...skipping 10 matching lines...) Expand all Loading...
808 __asm__ volatile(	808 __asm__ volatile(

809 "movq %%mm6, %%mm5 \n"	809 "movq %%mm6, %%mm5 \n"

810 "paddb %%mm5, %%mm5 \n"	810 "paddb %%mm5, %%mm5 \n"

811 :);	811 :);

812	812

813 // could do a special case for dir==0 && edges==1, but it only reduces the	813 // could do a special case for dir==0 && edges==1, but it only reduces the

814 // average filter time by 1.2%	814 // average filter time by 1.2%

815 for( dir=1; dir>=0; dir-- ) {	815 for( dir=1; dir>=0; dir-- ) {

816 const x86_reg d_idx = dir ? -8 : -1;	816 const x86_reg d_idx = dir ? -8 : -1;

817 const int mask_mv = dir ? mask_mv1 : mask_mv0;	817 const int mask_mv = dir ? mask_mv1 : mask_mv0;

818 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;	818 DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;

819 int b_idx, edge;	819 int b_idx, edge;

820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {	820 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {

821 __asm__ volatile(	821 __asm__ volatile(

822 "pand %0, %%mm0 \n\t"	822 "pand %0, %%mm0 \n\t"

823 ::"m"(mask_dir)	823 ::"m"(mask_dir)

824 );	824 );

825 if(!(mask_mv & edge)) {	825 if(!(mask_mv & edge)) {

826 if(bidir) {	826 if(bidir) {

827 __asm__ volatile(	827 __asm__ volatile(

828 "movd (%1,%0), %%mm2 \n"	828 "movd (%1,%0), %%mm2 \n"

(...skipping 1270 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2099 H264_MC_4816(3dnow)	2099 H264_MC_4816(3dnow)

2100 H264_MC_4816(mmx2)	2100 H264_MC_4816(mmx2)

2101 H264_MC_816(H264_MC_V, sse2)	2101 H264_MC_816(H264_MC_V, sse2)

2102 H264_MC_816(H264_MC_HV, sse2)	2102 H264_MC_816(H264_MC_HV, sse2)

2103 #if HAVE_SSSE3	2103 #if HAVE_SSSE3

2104 H264_MC_816(H264_MC_H, ssse3)	2104 H264_MC_816(H264_MC_H, ssse3)

2105 H264_MC_816(H264_MC_HV, ssse3)	2105 H264_MC_816(H264_MC_HV, ssse3)

2106 #endif	2106 #endif

2107	2107

2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */	2108 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */

2109 DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg)[4] = {	2109 DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {

2110 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL	2110 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL

2111 };	2111 };

2112	2112

2113 #define H264_CHROMA_OP(S,D)	2113 #define H264_CHROMA_OP(S,D)

2114 #define H264_CHROMA_OP4(S,D,T)	2114 #define H264_CHROMA_OP4(S,D,T)

2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx	2115 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx

2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx	2116 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx

2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2	2117 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2

2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx	2118 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx

2119 #include "dsputil_h264_template_mmx.c"	2119 #include "dsputil_h264_template_mmx.c"

(...skipping 196 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2316	2316

2317 H264_WEIGHT(16,16)	2317 H264_WEIGHT(16,16)

2318 H264_WEIGHT(16, 8)	2318 H264_WEIGHT(16, 8)

2319 H264_WEIGHT( 8,16)	2319 H264_WEIGHT( 8,16)

2320 H264_WEIGHT( 8, 8)	2320 H264_WEIGHT( 8, 8)

2321 H264_WEIGHT( 8, 4)	2321 H264_WEIGHT( 8, 4)

2322 H264_WEIGHT( 4, 8)	2322 H264_WEIGHT( 4, 8)

2323 H264_WEIGHT( 4, 4)	2323 H264_WEIGHT( 4, 4)

2324 H264_WEIGHT( 4, 2)	2324 H264_WEIGHT( 4, 2)

2325	2325

OLD	NEW

« patched-ffmpeg-mt/libavcodec/mpeg4video_es_bsf.c ('K') | « patched-ffmpeg-mt/libavcodec/x86/fft_3dn2.c ('k') | patched-ffmpeg-mt/libavcodec/x86/idct_mmx.c » ('j') | no next file with comments »