| OLD | NEW |
| 1 /* | 1 /* |
| 2 * FFT/MDCT transform with SSE optimizations | 2 * FFT/MDCT transform with SSE optimizations |
| 3 * Copyright (c) 2008 Loren Merritt | 3 * Copyright (c) 2008 Loren Merritt |
| 4 * | 4 * |
| 5 * This file is part of FFmpeg. | 5 * This file is part of FFmpeg. |
| 6 * | 6 * |
| 7 * FFmpeg is free software; you can redistribute it and/or | 7 * FFmpeg is free software; you can redistribute it and/or |
| 8 * modify it under the terms of the GNU Lesser General Public | 8 * modify it under the terms of the GNU Lesser General Public |
| 9 * License as published by the Free Software Foundation; either | 9 * License as published by the Free Software Foundation; either |
| 10 * version 2.1 of the License, or (at your option) any later version. | 10 * version 2.1 of the License, or (at your option) any later version. |
| (...skipping 53 matching lines...) | |
| 64 "movlps %%xmm0, %0 \n" | 64 "movlps %%xmm0, %0 \n" |
| 65 "movhps %%xmm0, %1 \n" | 65 "movhps %%xmm0, %1 \n" |
| 66 :"=m"(s->tmp_buf[s->revtab[i]]), | 66 :"=m"(s->tmp_buf[s->revtab[i]]), |
| 67 "=m"(s->tmp_buf[s->revtab[i+1]]) | 67 "=m"(s->tmp_buf[s->revtab[i+1]]) |
| 68 :"m"(z[i]) | 68 :"m"(z[i]) |
| 69 ); | 69 ); |
| 70 } | 70 } |
| 71 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); | 71 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); |
| 72 } | 72 } |
| 73 | 73 |
| 74 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | |
| 75 { | |
| 76 av_unused x86_reg i, j, k, l; | |
| 77 long n = 1 << s->mdct_bits; | |
| 78 long n2 = n >> 1; | |
| 79 long n4 = n >> 2; | |
| 80 long n8 = n >> 3; | |
| 81 const uint16_t *revtab = s->revtab + n8; | |
| 82 const FFTSample *tcos = s->tcos; | |
| 83 const FFTSample *tsin = s->tsin; | |
| 84 FFTComplex *z = (FFTComplex *)output; | |
| 85 | |
| 86 /* pre rotation */ | |
| 87 for(k=n8-2; k>=0; k-=2) { | |
| 88 __asm__ volatile( | |
| 89 "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1
].re, z[k+1].im } | |
| 90 "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-
1].re, z[-k-1].im } | |
| 91 "movaps %%xmm0, %%xmm2 \n" | |
| 92 "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-
2].re, z[-k-1].re } | |
| 93 "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1
].im, z[k].im } | |
| 94 "movlps (%3,%1), %%xmm4 \n" | |
| 95 "movlps (%4,%1), %%xmm5 \n" | |
| 96 "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-
k-2], cos[-k-1] } | |
| 97 "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-
k-2], sin[-k-1] } | |
| 98 "movaps %%xmm0, %%xmm2 \n" | |
| 99 "movaps %%xmm1, %%xmm3 \n" | |
| 100 "mulps %%xmm5, %%xmm0 \n" // re*sin | |
| 101 "mulps %%xmm4, %%xmm1 \n" // im*cos | |
| 102 "mulps %%xmm4, %%xmm2 \n" // re*cos | |
| 103 "mulps %%xmm5, %%xmm3 \n" // im*sin | |
| 104 "subps %%xmm0, %%xmm1 \n" // -> re | |
| 105 "addps %%xmm3, %%xmm2 \n" // -> im | |
| 106 "movaps %%xmm1, %%xmm0 \n" | |
| 107 "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } | |
| 108 "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] } | |
| 109 ::"r"(-4*k), "r"(4*k), | |
| 110 "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) | |
| 111 ); | |
| 112 #if ARCH_X86_64 | |
| 113 // if we have enough regs, don't let gcc make the luts latency-bound | |
| 114 // but if not, latency is faster than spilling | |
| 115 __asm__("movlps %%xmm0, %0 \n" | |
| 116 "movhps %%xmm0, %1 \n" | |
| 117 "movlps %%xmm1, %2 \n" | |
| 118 "movhps %%xmm1, %3 \n" | |
| 119 :"=m"(z[revtab[-k-2]]), | |
| 120 "=m"(z[revtab[-k-1]]), | |
| 121 "=m"(z[revtab[ k ]]), | |
| 122 "=m"(z[revtab[ k+1]]) | |
| 123 ); | |
| 124 #else | |
| 125 __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]])); | |
| 126 __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]])); | |
| 127 __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); | |
| 128 __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); | |
| 129 #endif | |
| 130 } | |
| 131 | |
| 132 ff_fft_dispatch_sse(z, s->nbits); | |
| 133 | |
| 134 /* post rotation + reinterleave + reorder */ | |
| 135 | |
| 136 #define CMUL(j,xmm0,xmm1)\ | |
| 137 "movaps (%2,"#j",2), %%xmm6 \n"\ | |
| 138 "movaps 16(%2,"#j",2), "#xmm0"\n"\ | |
| 139 "movaps %%xmm6, "#xmm1"\n"\ | |
| 140 "movaps "#xmm0",%%xmm7 \n"\ | |
| 141 "mulps (%3,"#j"), %%xmm6 \n"\ | |
| 142 "mulps (%4,"#j"), "#xmm0"\n"\ | |
| 143 "mulps (%4,"#j"), "#xmm1"\n"\ | |
| 144 "mulps (%3,"#j"), %%xmm7 \n"\ | |
| 145 "subps %%xmm6, "#xmm0"\n"\ | |
| 146 "addps %%xmm7, "#xmm1"\n" | |
| 147 | |
| 148 j = -n2; | |
| 149 k = n2-16; | |
| 150 __asm__ volatile( | |
| 151 "1: \n" | |
| 152 CMUL(%0, %%xmm0, %%xmm1) | |
| 153 CMUL(%1, %%xmm4, %%xmm5) | |
| 154 "shufps $0x1b, %%xmm1, %%xmm1 \n" | |
| 155 "shufps $0x1b, %%xmm5, %%xmm5 \n" | |
| 156 "movaps %%xmm4, %%xmm6 \n" | |
| 157 "unpckhps %%xmm1, %%xmm4 \n" | |
| 158 "unpcklps %%xmm1, %%xmm6 \n" | |
| 159 "movaps %%xmm0, %%xmm2 \n" | |
| 160 "unpcklps %%xmm5, %%xmm0 \n" | |
| 161 "unpckhps %%xmm5, %%xmm2 \n" | |
| 162 "movaps %%xmm6, (%2,%1,2) \n" | |
| 163 "movaps %%xmm4, 16(%2,%1,2) \n" | |
| 164 "movaps %%xmm0, (%2,%0,2) \n" | |
| 165 "movaps %%xmm2, 16(%2,%0,2) \n" | |
| 166 "sub $16, %1 \n" | |
| 167 "add $16, %0 \n" | |
| 168 "jl 1b \n" | |
| 169 :"+&r"(j), "+&r"(k) | |
| 170 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) | |
| 171 :"memory" | |
| 172 ); | |
| 173 } | |
| 174 | |
| 175 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | 74 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) |
| 176 { | 75 { |
| 177 x86_reg j, k; | 76 x86_reg j, k; |
| 178 long n = 1 << s->mdct_bits; | 77 long n = s->mdct_size; |
| 179 long n4 = n >> 2; | 78 long n4 = n >> 2; |
| 180 | 79 |
| 181 ff_imdct_half_sse(s, output+n4, input); | 80 ff_imdct_half_sse(s, output+n4, input); |
| 182 | 81 |
| 183 j = -n; | 82 j = -n; |
| 184 k = n-16; | 83 k = n-16; |
| 185 __asm__ volatile( | 84 __asm__ volatile( |
| 186 "movaps %4, %%xmm7 \n" | 85 "movaps %4, %%xmm7 \n" |
| 187 "1: \n" | 86 "1: \n" |
| 188 "movaps (%2,%1), %%xmm0 \n" | 87 "movaps (%2,%1), %%xmm0 \n" |
| 189 "movaps (%3,%0), %%xmm1 \n" | 88 "movaps (%3,%0), %%xmm1 \n" |
| 190 "shufps $0x1b, %%xmm0, %%xmm0 \n" | 89 "shufps $0x1b, %%xmm0, %%xmm0 \n" |
| 191 "shufps $0x1b, %%xmm1, %%xmm1 \n" | 90 "shufps $0x1b, %%xmm1, %%xmm1 \n" |
| 192 "xorps %%xmm7, %%xmm0 \n" | 91 "xorps %%xmm7, %%xmm0 \n" |
| 193 "movaps %%xmm1, (%3,%1) \n" | 92 "movaps %%xmm1, (%3,%1) \n" |
| 194 "movaps %%xmm0, (%2,%0) \n" | 93 "movaps %%xmm0, (%2,%0) \n" |
| 195 "sub $16, %1 \n" | 94 "sub $16, %1 \n" |
| 196 "add $16, %0 \n" | 95 "add $16, %0 \n" |
| 197 "jl 1b \n" | 96 "jl 1b \n" |
| 198 :"+r"(j), "+r"(k) | 97 :"+r"(j), "+r"(k) |
| 199 :"r"(output+n4), "r"(output+n4*3), | 98 :"r"(output+n4), "r"(output+n4*3), |
| 200 "m"(*m1m1m1m1) | 99 "m"(*m1m1m1m1) |
| 201 ); | 100 ); |
| 202 } | 101 } |
| 203 | 102 |
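The sketches below are scalar C references for the vector code in this diff. They are illustrative reconstructions, not FFmpeg's own C paths; the stand-in types (`FFTSample`, `FFTComplex`) and helper names are hypothetical. First, the permute fragment visible at the top of the diff is a bit-reversal scatter through `tmp_buf`:

```c
#include <stdint.h>
#include <string.h>

typedef float FFTSample;
typedef struct { FFTSample re, im; } FFTComplex;

/* Scalar equivalent of the movlps/movhps fragment: scatter each complex
 * value to its bit-reversed slot in tmp_buf, then copy the buffer back.
 * The asm handles two complex values per pair of 8-byte stores. */
static void fft_permute_scalar(FFTComplex *z, FFTComplex *tmp_buf,
                               const uint16_t *revtab, long n)
{
    for (long i = 0; i < n; i++)
        tmp_buf[revtab[i]] = z[i];
    memcpy(z, tmp_buf, n * sizeof(*z));
}
```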
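Second, the deleted `ff_imdct_half_sse` follows the standard IMDCT-half decomposition: pre-rotate the input by the twiddles into bit-reversed order, run an in-place complex FFT, then post-rotate the pairs mirrored around `n8`. A scalar sketch, reusing the stand-in types above; the `fft` callback stands in for `ff_fft_dispatch_sse`, and twiddle sign conventions vary between FFmpeg versions, so treat the exact signs here as an assumption:

```c
/* (are + i*aim) * (bre + i*bim): the same mulps/subps/addps pattern as
 * the asm's complex-multiply sequences */
#define CMUL(dre, dim, are, aim, bre, bim) do { \
        (dre) = (are) * (bre) - (aim) * (bim);  \
        (dim) = (are) * (bim) + (aim) * (bre);  \
    } while (0)

static void imdct_half_scalar(FFTComplex *z, const FFTSample *input,
                              const uint16_t *revtab,
                              const FFTSample *tcos, const FFTSample *tsin,
                              long n, void (*fft)(FFTComplex *, int), int nbits)
{
    long n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
    const FFTSample *in1 = input;          /* even samples, walking forward  */
    const FFTSample *in2 = input + n2 - 1; /* odd samples, walking backward  */

    /* pre rotation: one complex output per iteration, where the asm packs
     * two forward and two backward values into each xmm register */
    for (long k = 0; k < n4; k++) {
        long j = revtab[k];
        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
    }

    fft(z, nbits);

    /* post rotation + reordering: pairs mirrored around n8, matching the
     * shufps $0x1b reversal plus unpcklps/unpckhps interleave in the asm */
    for (long k = 0; k < n8; k++) {
        FFTSample r0, i0, r1, i1;
        CMUL(r0, i1, z[n8 - k - 1].im, z[n8 - k - 1].re,
             tsin[n8 - k - 1], tcos[n8 - k - 1]);
        CMUL(r1, i0, z[n8 + k].im, z[n8 + k].re,
             tsin[n8 + k], tcos[n8 + k]);
        z[n8 - k - 1].re = r0;
        z[n8 - k - 1].im = i0;
        z[n8 + k].re = r1;
        z[n8 + k].im = i1;
    }
}
```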
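Finally, the asm loop kept in `ff_imdct_calc_sse` extends the half-size IMDCT output to the full window: `shufps $0x1b` reverses four floats at a time and the `xorps` against the `m1m1m1m1` sign mask negates the first quarter. A scalar equivalent, under the same assumptions as above:

```c
/* Scalar sketch of the mirror loop in ff_imdct_calc_sse; assumes
 * imdct_half has already filled output[n4 .. 3*n4). */
static void imdct_mirror_scalar(FFTSample *output, long n)
{
    long n2 = n >> 1, n4 = n >> 2;
    for (long k = 0; k < n4; k++) {
        output[k]         = -output[n2 - k - 1]; /* negated, reversed copy */
        output[n - k - 1] =  output[n2 + k];     /* plain mirrored copy    */
    }
}
```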