| OLD | NEW |
| 1 /* | 1 /* |
| 2 * FFT/MDCT transform with SSE optimizations | 2 * FFT/MDCT transform with SSE optimizations |
| 3 * Copyright (c) 2008 Loren Merritt | 3 * Copyright (c) 2008 Loren Merritt |
| 4 * | 4 * |
| 5 * This file is part of FFmpeg. | 5 * This file is part of FFmpeg. |
| 6 * | 6 * |
| 7 * FFmpeg is free software; you can redistribute it and/or | 7 * FFmpeg is free software; you can redistribute it and/or |
| 8 * modify it under the terms of the GNU Lesser General Public | 8 * modify it under the terms of the GNU Lesser General Public |
| 9 * License as published by the Free Software Foundation; either | 9 * License as published by the Free Software Foundation; either |
| 10 * version 2.1 of the License, or (at your option) any later version. | 10 * version 2.1 of the License, or (at your option) any later version. |
| (...skipping 53 matching lines...) | |
| 64 "movlps %%xmm0, %0 \n" | 64 "movlps %%xmm0, %0 \n" |
| 65 "movhps %%xmm0, %1 \n" | 65 "movhps %%xmm0, %1 \n" |
| 66 :"=m"(s->tmp_buf[s->revtab[i]]), | 66 :"=m"(s->tmp_buf[s->revtab[i]]), |
| 67 "=m"(s->tmp_buf[s->revtab[i+1]]) | 67 "=m"(s->tmp_buf[s->revtab[i+1]]) |
| 68 :"m"(z[i]) | 68 :"m"(z[i]) |
| 69 ); | 69 ); |
| 70 } | 70 } |
| 71 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); | 71 memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); |
| 72 } | 72 } |
| 73 | 73 |
| 74 void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | |
| 75 { | |
| 76 av_unused x86_reg i, j, k, l; | |
| 77 long n = 1 << s->mdct_bits; | |
| 78 long n2 = n >> 1; | |
| 79 long n4 = n >> 2; | |
| 80 long n8 = n >> 3; | |
| 81 const uint16_t *revtab = s->revtab + n8; | |
| 82 const FFTSample *tcos = s->tcos; | |
| 83 const FFTSample *tsin = s->tsin; | |
| 84 FFTComplex *z = (FFTComplex *)output; | |
| 85 | |
| 86 /* pre rotation */ | |
| 87 for(k=n8-2; k>=0; k-=2) { | |
| 88 __asm__ volatile( | |
| 89 "movaps (%2,%1,2), %%xmm0 \n" // { z[k].re, z[k].im, z[k+1
].re, z[k+1].im } | |
| 90 "movaps -16(%2,%0,2), %%xmm1 \n" // { z[-k-2].re, z[-k-2].im, z[-k-
1].re, z[-k-1].im } | |
| 91 "movaps %%xmm0, %%xmm2 \n" | |
| 92 "shufps $0x88, %%xmm1, %%xmm0 \n" // { z[k].re, z[k+1].re, z[-k-
2].re, z[-k-1].re } | |
| 93 "shufps $0x77, %%xmm2, %%xmm1 \n" // { z[-k-1].im, z[-k-2].im, z[k+1
].im, z[k].im } | |
| 94 "movlps (%3,%1), %%xmm4 \n" | |
| 95 "movlps (%4,%1), %%xmm5 \n" | |
| 96 "movhps -8(%3,%0), %%xmm4 \n" // { cos[k], cos[k+1], cos[-
k-2], cos[-k-1] } | |
| 97 "movhps -8(%4,%0), %%xmm5 \n" // { sin[k], sin[k+1], sin[-
k-2], sin[-k-1] } | |
| 98 "movaps %%xmm0, %%xmm2 \n" | |
| 99 "movaps %%xmm1, %%xmm3 \n" | |
| 100 "mulps %%xmm5, %%xmm0 \n" // re*sin | |
| 101 "mulps %%xmm4, %%xmm1 \n" // im*cos | |
| 102 "mulps %%xmm4, %%xmm2 \n" // re*cos | |
| 103 "mulps %%xmm5, %%xmm3 \n" // im*sin | |
| 104 "subps %%xmm0, %%xmm1 \n" // -> re | |
| 105 "addps %%xmm3, %%xmm2 \n" // -> im | |
| 106 "movaps %%xmm1, %%xmm0 \n" | |
| 107 "unpcklps %%xmm2, %%xmm1 \n" // { z[k], z[k+1] } | |
| 108 "unpckhps %%xmm2, %%xmm0 \n" // { z[-k-2], z[-k-1] } | |
| 109 ::"r"(-4*k), "r"(4*k), | |
| 110 "r"(input+n4), "r"(tcos+n8), "r"(tsin+n8) | |
| 111 ); | |
| 112 #if ARCH_X86_64 | |
| 113 // if we have enough regs, don't let gcc make the luts latency-bound | |
| 114 // but if not, latency is faster than spilling | |
| 115 __asm__("movlps %%xmm0, %0 \n" | |
| 116 "movhps %%xmm0, %1 \n" | |
| 117 "movlps %%xmm1, %2 \n" | |
| 118 "movhps %%xmm1, %3 \n" | |
| 119 :"=m"(z[revtab[-k-2]]), | |
| 120 "=m"(z[revtab[-k-1]]), | |
| 121 "=m"(z[revtab[ k ]]), | |
| 122 "=m"(z[revtab[ k+1]]) | |
| 123 ); | |
| 124 #else | |
| 125 __asm__("movlps %%xmm0, %0" :"=m"(z[revtab[-k-2]])); | |
| 126 __asm__("movhps %%xmm0, %0" :"=m"(z[revtab[-k-1]])); | |
| 127 __asm__("movlps %%xmm1, %0" :"=m"(z[revtab[ k ]])); | |
| 128 __asm__("movhps %%xmm1, %0" :"=m"(z[revtab[ k+1]])); | |
| 129 #endif | |
| 130 } | |
| 131 | |
| 132 ff_fft_dispatch_sse(z, s->nbits); | |
| 133 | |
| 134 /* post rotation + reinterleave + reorder */ | |
| 135 | |
| 136 #define CMUL(j,xmm0,xmm1)\ | |
| 137 "movaps (%2,"#j",2), %%xmm6 \n"\ | |
| 138 "movaps 16(%2,"#j",2), "#xmm0"\n"\ | |
| 139 "movaps %%xmm6, "#xmm1"\n"\ | |
| 140 "movaps "#xmm0",%%xmm7 \n"\ | |
| 141 "mulps (%3,"#j"), %%xmm6 \n"\ | |
| 142 "mulps (%4,"#j"), "#xmm0"\n"\ | |
| 143 "mulps (%4,"#j"), "#xmm1"\n"\ | |
| 144 "mulps (%3,"#j"), %%xmm7 \n"\ | |
| 145 "subps %%xmm6, "#xmm0"\n"\ | |
| 146 "addps %%xmm7, "#xmm1"\n" | |
| 147 | |
| 148 j = -n2; | |
| 149 k = n2-16; | |
| 150 __asm__ volatile( | |
| 151 "1: \n" | |
| 152 CMUL(%0, %%xmm0, %%xmm1) | |
| 153 CMUL(%1, %%xmm4, %%xmm5) | |
| 154 "shufps $0x1b, %%xmm1, %%xmm1 \n" | |
| 155 "shufps $0x1b, %%xmm5, %%xmm5 \n" | |
| 156 "movaps %%xmm4, %%xmm6 \n" | |
| 157 "unpckhps %%xmm1, %%xmm4 \n" | |
| 158 "unpcklps %%xmm1, %%xmm6 \n" | |
| 159 "movaps %%xmm0, %%xmm2 \n" | |
| 160 "unpcklps %%xmm5, %%xmm0 \n" | |
| 161 "unpckhps %%xmm5, %%xmm2 \n" | |
| 162 "movaps %%xmm6, (%2,%1,2) \n" | |
| 163 "movaps %%xmm4, 16(%2,%1,2) \n" | |
| 164 "movaps %%xmm0, (%2,%0,2) \n" | |
| 165 "movaps %%xmm2, 16(%2,%0,2) \n" | |
| 166 "sub $16, %1 \n" | |
| 167 "add $16, %0 \n" | |
| 168 "jl 1b \n" | |
| 169 :"+&r"(j), "+&r"(k) | |
| 170 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) | |
| 171 :"memory" | |
| 172 ); | |
| 173 } | |
| 174 | |
| 175 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | 74 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) |
| 176 { | 75 { |
| 177 x86_reg j, k; | 76 x86_reg j, k; |
| 178 long n = 1 << s->mdct_bits; | 77 long n = s->mdct_size; |
| 179 long n4 = n >> 2; | 78 long n4 = n >> 2; |
| 180 | 79 |
| 181 ff_imdct_half_sse(s, output+n4, input); | 80 ff_imdct_half_sse(s, output+n4, input); |
| 182 | 81 |
| 183 j = -n; | 82 j = -n; |
| 184 k = n-16; | 83 k = n-16; |
| 185 __asm__ volatile( | 84 __asm__ volatile( |
| 186 "movaps %4, %%xmm7 \n" | 85 "movaps %4, %%xmm7 \n" |
| 187 "1: \n" | 86 "1: \n" |
| 188 "movaps (%2,%1), %%xmm0 \n" | 87 "movaps (%2,%1), %%xmm0 \n" |
| 189 "movaps (%3,%0), %%xmm1 \n" | 88 "movaps (%3,%0), %%xmm1 \n" |
| 190 "shufps $0x1b, %%xmm0, %%xmm0 \n" | 89 "shufps $0x1b, %%xmm0, %%xmm0 \n" |
| 191 "shufps $0x1b, %%xmm1, %%xmm1 \n" | 90 "shufps $0x1b, %%xmm1, %%xmm1 \n" |
| 192 "xorps %%xmm7, %%xmm0 \n" | 91 "xorps %%xmm7, %%xmm0 \n" |
| 193 "movaps %%xmm1, (%3,%1) \n" | 92 "movaps %%xmm1, (%3,%1) \n" |
| 194 "movaps %%xmm0, (%2,%0) \n" | 93 "movaps %%xmm0, (%2,%0) \n" |
| 195 "sub $16, %1 \n" | 94 "sub $16, %1 \n" |
| 196 "add $16, %0 \n" | 95 "add $16, %0 \n" |
| 197 "jl 1b \n" | 96 "jl 1b \n" |
| 198 :"+r"(j), "+r"(k) | 97 :"+r"(j), "+r"(k) |
| 199 :"r"(output+n4), "r"(output+n4*3), | 98 :"r"(output+n4), "r"(output+n4*3), |
| 200 "m"(*m1m1m1m1) | 99 "m"(*m1m1m1m1) |
| 201 ); | 100 ); |
| 202 } | 101 } |
| 203 | 102 |
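The sketches below are scalar C references for the vector code in this diff. They are illustrative reconstructions, not FFmpeg's own C paths; the stand-in types (`FFTSample`, `FFTComplex`) and helper names are hypothetical. First, the permute fragment visible at the top of the diff is a bit-reversal scatter through `tmp_buf`:

```c
#include <stdint.h>
#include <string.h>

typedef float FFTSample;
typedef struct { FFTSample re, im; } FFTComplex;

/* Scalar equivalent of the movlps/movhps fragment: scatter each complex
 * value to its bit-reversed slot in tmp_buf, then copy the buffer back.
 * The asm handles two complex values per pair of 8-byte stores. */
static void fft_permute_scalar(FFTComplex *z, FFTComplex *tmp_buf,
                               const uint16_t *revtab, long n)
{
    for (long i = 0; i < n; i++)
        tmp_buf[revtab[i]] = z[i];
    memcpy(z, tmp_buf, n * sizeof(*z));
}
```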
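Second, the deleted `ff_imdct_half_sse` follows the standard IMDCT-half decomposition: pre-rotate the input by the twiddles into bit-reversed order, run an in-place complex FFT, then post-rotate the pairs mirrored around `n8`. A scalar sketch, reusing the stand-in types above; the `fft` callback stands in for `ff_fft_dispatch_sse`, and twiddle sign conventions vary between FFmpeg versions, so treat the exact signs here as an assumption:

```c
/* (are + i*aim) * (bre + i*bim): the same mulps/subps/addps pattern as
 * the asm's complex-multiply sequences */
#define CMUL(dre, dim, are, aim, bre, bim) do { \
        (dre) = (are) * (bre) - (aim) * (bim);  \
        (dim) = (are) * (bim) + (aim) * (bre);  \
    } while (0)

static void imdct_half_scalar(FFTComplex *z, const FFTSample *input,
                              const uint16_t *revtab,
                              const FFTSample *tcos, const FFTSample *tsin,
                              long n, void (*fft)(FFTComplex *, int), int nbits)
{
    long n2 = n >> 1, n4 = n >> 2, n8 = n >> 3;
    const FFTSample *in1 = input;          /* even samples, walking forward  */
    const FFTSample *in2 = input + n2 - 1; /* odd samples, walking backward  */

    /* pre rotation: one complex output per iteration, where the asm packs
     * two forward and two backward values into each xmm register */
    for (long k = 0; k < n4; k++) {
        long j = revtab[k];
        CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
        in1 += 2;
        in2 -= 2;
    }

    fft(z, nbits);

    /* post rotation + reordering: pairs mirrored around n8, matching the
     * shufps $0x1b reversal plus unpcklps/unpckhps interleave in the asm */
    for (long k = 0; k < n8; k++) {
        FFTSample r0, i0, r1, i1;
        CMUL(r0, i1, z[n8 - k - 1].im, z[n8 - k - 1].re,
             tsin[n8 - k - 1], tcos[n8 - k - 1]);
        CMUL(r1, i0, z[n8 + k].im, z[n8 + k].re,
             tsin[n8 + k], tcos[n8 + k]);
        z[n8 - k - 1].re = r0;
        z[n8 - k - 1].im = i0;
        z[n8 + k].re = r1;
        z[n8 + k].im = i1;
    }
}
```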
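Finally, the asm loop kept in `ff_imdct_calc_sse` extends the half-size IMDCT output to the full window: `shufps $0x1b` reverses four floats at a time and the `xorps` against the `m1m1m1m1` sign mask negates the first quarter. A scalar equivalent, under the same assumptions as above:

```c
/* Scalar sketch of the mirror loop in ff_imdct_calc_sse; assumes
 * imdct_half has already filled output[n4 .. 3*n4). */
static void imdct_mirror_scalar(FFTSample *output, long n)
{
    long n2 = n >> 1, n4 = n >> 2;
    for (long k = 0; k < n4; k++) {
        output[k]         = -output[n2 - k - 1]; /* negated, reversed copy */
        output[n - k - 1] =  output[n2 + k];     /* plain mirrored copy    */
    }
}
```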