Chromium Code Reviews

Diff: patched-ffmpeg-mt/libavcodec/x86/h264dsp_mmx.c

Issue 789004: ffmpeg roll of source to mar 9 version... (Closed) Base URL: svn://chrome-svn/chrome/trunk/deps/third_party/ffmpeg/
Patch Set: '' Created 10 years, 9 months ago
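Every hunk in this file is the same mechanical substitution: the fixed-width DECLARE_ALIGNED_8(type, name) macro is replaced by the parameterized DECLARE_ALIGNED(8, type, name) spelling that upstream FFmpeg moved to in this roll. As a reminder of what the macro does, here is a rough sketch of its usual shape; the authoritative definition lives in libavutil/mem.h and differs by compiler and FFmpeg version, so treat this as an illustration only.

    /* Illustrative sketch only; see libavutil/mem.h for the real definition. */
    #if defined(__GNUC__)
    #   define DECLARE_ALIGNED(n, t, v)    t __attribute__ ((aligned (n))) v
    #elif defined(_MSC_VER)
    #   define DECLARE_ALIGNED(n, t, v)    __declspec(align(n)) t v
    #else
    #   define DECLARE_ALIGNED(n, t, v)    t v
    #endif

    /* Old spelling:  DECLARE_ALIGNED_8(uint64_t, tmp);
     * New spelling:  DECLARE_ALIGNED(8, uint64_t, tmp);   (alignment is now an argument) */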
 /*
  * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */

 #include "dsputil_mmx.h"

-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;

 /***********************************/
 /* IDCT */

 #define SUMSUB_BADC( a, b, c, d ) \
     "paddw "#b", "#a" \n\t"\
     "paddw "#d", "#c" \n\t"\
     "paddw "#b", "#b" \n\t"\
     "paddw "#d", "#d" \n\t"\
     "psubw "#a", "#b" \n\t"\
(...skipping 115 matching lines...)

         SUMSUB_BA( %%mm5, %%mm4 )
         SUMSUB_BA( %%mm3, %%mm2 )
         SUMSUB_BA( %%mm1, %%mm0 )
         :: "r"(block)
     );
 }

 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     int i;
-    DECLARE_ALIGNED_8(int16_t, b2)[64];
+    DECLARE_ALIGNED(8, int16_t, b2)[64];

     block[0] += 32;

     for(i=0; i<2; i++){
-        DECLARE_ALIGNED_8(uint64_t, tmp);
+        DECLARE_ALIGNED(8, uint64_t, tmp);

         h264_idct8_1d(block+4*i);

         __asm__ volatile(
             "movq %%mm7, %0 \n\t"
             TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
             "movq %%mm0, 8(%1) \n\t"
             "movq %%mm6, 24(%1) \n\t"
             "movq %%mm7, 40(%1) \n\t"
             "movq %%mm4, 56(%1) \n\t"

(...skipping 445 matching lines...)
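
One detail in the chunk above: block[0] += 32; biases the DC coefficient before the transform's final arithmetic shift right by 6, so the shift rounds to nearest instead of truncating. A minimal scalar illustration (my own sketch, not code from this patch):

    /* Hypothetical helper: adding half of 64 before ">> 6" rounds to nearest. */
    static inline int round_shift6(int v)
    {
        return (v + 32) >> 6;   /* 97 >> 6 == 1, but (97 + 32) >> 6 == 2, since 97/64 is ~1.5 */
    }
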
621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\ 621 "psubusb "#tmp", "#q2" \n\t" /* (p2+((p0+q0+1)>>1))>>1 */\
622 "movq "#p1", "#tmp" \n\t"\ 622 "movq "#p1", "#tmp" \n\t"\
623 "psubusb "#tc0", "#tmp" \n\t"\ 623 "psubusb "#tc0", "#tmp" \n\t"\
624 "paddusb "#p1", "#tc0" \n\t"\ 624 "paddusb "#p1", "#tc0" \n\t"\
625 "pmaxub "#tmp", "#q2" \n\t"\ 625 "pmaxub "#tmp", "#q2" \n\t"\
626 "pminub "#tc0", "#q2" \n\t"\ 626 "pminub "#tc0", "#q2" \n\t"\
627 "movq "#q2", "q1addr" \n\t" 627 "movq "#q2", "q1addr" \n\t"
628 628
629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph a1, int beta1, int8_t *tc0) 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph a1, int beta1, int8_t *tc0)
630 { 630 {
631 DECLARE_ALIGNED_8(uint64_t, tmp0)[2]; 631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
632 632
633 __asm__ volatile( 633 __asm__ volatile(
634 "movq (%2,%4), %%mm0 \n\t" //p1 634 "movq (%2,%4), %%mm0 \n\t" //p1
635 "movq (%2,%4,2), %%mm1 \n\t" //p0 635 "movq (%2,%4,2), %%mm1 \n\t" //p0
636 "movq (%3), %%mm2 \n\t" //q0 636 "movq (%3), %%mm2 \n\t" //q0
637 "movq (%3,%4), %%mm3 \n\t" //q1 637 "movq (%3,%4), %%mm3 \n\t" //q1
638 H264_DEBLOCK_MASK(%7, %8) 638 H264_DEBLOCK_MASK(%7, %8)
639 639
640 "movd %6, %%mm4 \n\t" 640 "movd %6, %%mm4 \n\t"
641 "punpcklbw %%mm4, %%mm4 \n\t" 641 "punpcklbw %%mm4, %%mm4 \n\t"
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
683 { 683 {
684 if((tc0[0] & tc0[1]) >= 0) 684 if((tc0[0] & tc0[1]) >= 0)
685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0); 685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
686 if((tc0[2] & tc0[3]) >= 0) 686 if((tc0[2] & tc0[3]) >= 0)
687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2); 687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
688 } 688 }
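
The (tc0[0] & tc0[1]) >= 0 tests in the wrapper above exploit the sign bit of a bitwise AND: it is set only when both operands are negative, so an 8-pixel half is skipped only when both of its tc0 entries are -1 (deblocking disabled for both 4-pixel groups). A scalar restatement, for illustration only:

    /* Hypothetical helper, not part of the patch. */
    static inline int should_filter_half(const int8_t *tc0)
    {
        /* (-1 & -1) == -1 -> skip;  (-1 & 3) == 3, (0 & 0) == 0 -> filter */
        return (tc0[0] & tc0[1]) >= 0;
    }
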
 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
     // also, it only needs to transpose 6x8
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*8];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
     int i;
     for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
         if((tc0[0] & tc0[1]) < 0)
             continue;
         transpose4x4(trans, pix-4, 8, stride);
         transpose4x4(trans +4*8, pix, 8, stride);
         transpose4x4(trans+4, pix-4+4*stride, 8, stride);
         transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
         h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
         transpose4x4(pix-2, trans +2*8, stride, 8);

(...skipping 23 matching lines...)

 }
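
The horizontal luma filter above has no dedicated left/right kernel: it transposes 4x4 tiles of the edge into a small aligned buffer, runs the vertical filter h264_loop_filter_luma_mmx2 on that buffer, and transposes the modified columns back (the FIXME notes a 6x8 transpose would be enough). For reference, a plain C version of what transpose4x4() accomplishes, with the argument order assumed from the call sites above:

    /* Scalar sketch of transpose4x4(dst, src, dst_stride, src_stride);
     * the real helper is an MMX routine provided by the dsputil headers. */
    static void transpose4x4_c(uint8_t *dst, const uint8_t *src,
                               int dst_stride, int src_stride)
    {
        int i, j;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                dst[i * dst_stride + j] = src[j * src_stride + i];
    }
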

 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
 }

 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*4];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
     transpose4x4(trans, pix-2, 8, stride);
     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
     h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
     transpose4x4(pix-2, trans, stride, 8);
     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
 }

 // p0 = (p0 + q1 + 2*p1 + 2) >> 2
 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
     "movq "#p0", %%mm4 \n\t"\

(...skipping 29 matching lines...)

 }
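
H264_FILTER_CHROMA4 implements the formula in the comment above it, p0 = (p0 + q1 + 2*p1 + 2) >> 2, applied (under the deblocking mask computed earlier) to both sides of the chroma edge in the intra case. A scalar restatement of the per-pixel arithmetic, as an illustration only:

    /* Each edge pixel becomes a rounded 2:1:1 weighted average of its own-side
     * neighbour, itself, and the pixel across the edge. Hypothetical helper. */
    static inline void chroma_intra_filter_pair(uint8_t *p0, uint8_t *q0,
                                                uint8_t p1, uint8_t q1)
    {
        int new_p0 = (2 * p1 + *p0 + q1 + 2) >> 2;
        int new_q0 = (2 * q1 + *q0 + p1 + 2) >> 2;
        *p0 = (uint8_t)new_p0;
        *q0 = (uint8_t)new_q0;
    }
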

 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 {
     h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
 }

 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*4];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
     transpose4x4(trans, pix-2, 8, stride);
     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
     h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
     transpose4x4(pix-2, trans, stride, 8);
     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
 }

 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
     int dir;

(...skipping 10 matching lines...)

     __asm__ volatile(
         "movq %%mm6, %%mm5 \n"
         "paddb %%mm5, %%mm5 \n"
     :);

     // could do a special case for dir==0 && edges==1, but it only reduces the
     // average filter time by 1.2%
     for( dir=1; dir>=0; dir-- ) {
         const x86_reg d_idx = dir ? -8 : -1;
         const int mask_mv = dir ? mask_mv1 : mask_mv0;
-        DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
+        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
         int b_idx, edge;
         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
             __asm__ volatile(
                 "pand %0, %%mm0 \n\t"
                 ::"m"(mask_dir)
             );
             if(!(mask_mv & edge)) {
                 if(bidir) {
                     __asm__ volatile(
                         "movd (%1,%0), %%mm2 \n"

(...skipping 1270 matching lines...)
 H264_MC_4816(3dnow)
 H264_MC_4816(mmx2)
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
 #if HAVE_SSSE3
 H264_MC_816(H264_MC_H, ssse3)
 H264_MC_816(H264_MC_HV, ssse3)
 #endif

 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
-DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg)[4] = {
+DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
     0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
 };
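
About the comment on h264_rnd_reg: each rounding constant is stored next to the same constant divided by 8, so code holding a pointer to one entry can reach the "div 8" value as p+1 without a second table. Spelled out per 16-bit lane (my arithmetic, not from the patch):

    /* h264_rnd_reg[0] = 0x0020 = 32    h264_rnd_reg[1] = 0x0004 = 32/8 = 4
     * h264_rnd_reg[2] = 0x001C = 28    h264_rnd_reg[3] = 0x0003 = 28/8 = 3 (rounded down)
     * e.g. with  const uint64_t *p = &h264_rnd_reg[2];  p[1] is the matching "div 8" constant. */
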

 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
 #define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
 #define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
 #include "dsputil_h264_template_mmx.c"
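
The block above is FFmpeg's usual template trick: the chroma MC operation macros are bound first, then dsputil_h264_template_mmx.c is included so one template body is compiled per flavour (the avg variants typically rebind the macros and include the template again). A tiny self-contained illustration of the pattern, with hypothetical names:

    #include <stdint.h>   /* for uint8_t in this standalone sketch */

    #define PIXEL_OP(d, s)   ((d) = (s))         /* "put" flavour; an avg flavour would blend */
    #define FLAVOUR(name)    name ## _put_demo

    /* The body is generic over PIXEL_OP; re-including it with different macro
     * bindings stamps out another variant without duplicating the code. */
    static void FLAVOUR(copy_row)(uint8_t *dst, const uint8_t *src, int w)
    {
        int i;
        for (i = 0; i < w; i++)
            PIXEL_OP(dst[i], src[i]);
    }

    #undef PIXEL_OP
    #undef FLAVOUR
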
(...skipping 196 matching lines...)

 H264_WEIGHT(16,16)
 H264_WEIGHT(16, 8)
 H264_WEIGHT( 8,16)
 H264_WEIGHT( 8, 8)
 H264_WEIGHT( 8, 4)
 H264_WEIGHT( 4, 8)
 H264_WEIGHT( 4, 4)
 H264_WEIGHT( 4, 2)
