| OLD | NEW |
| (Empty) | |
| 1 /* Copyright (c) 2014, Cisco Systems, INC |
| 2 Written by XiangMingZhu WeiZhou MinPeng YanWang |
| 3 |
| 4 Redistribution and use in source and binary forms, with or without |
| 5 modification, are permitted provided that the following conditions |
| 6 are met: |
| 7 |
| 8 - Redistributions of source code must retain the above copyright |
| 9 notice, this list of conditions and the following disclaimer. |
| 10 |
| 11 - Redistributions in binary form must reproduce the above copyright |
| 12 notice, this list of conditions and the following disclaimer in the |
| 13 documentation and/or other materials provided with the distribution. |
| 14 |
| 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
| 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 */ |
| 27 |
| 28 #ifdef HAVE_CONFIG_H |
| 29 #include "config.h" |
| 30 #endif |
| 31 |
| 32 #include "macros.h" |
| 33 #include "celt_lpc.h" |
| 34 #include "stack_alloc.h" |
| 35 #include "mathops.h" |
| 36 #include "pitch.h" |
| 37 |
| 38 #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT) |
| 39 |
| 40 #include <xmmintrin.h> |
| 41 #include "arch.h" |
| 42 |
| 43 void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4
], int len) |
| 44 { |
| 45 int j; |
| 46 __m128 xsum1, xsum2; |
| 47 xsum1 = _mm_loadu_ps(sum); |
| 48 xsum2 = _mm_setzero_ps(); |
| 49 |
| 50 for (j = 0; j < len-3; j += 4) |
| 51 { |
| 52 __m128 x0 = _mm_loadu_ps(x+j); |
| 53 __m128 yj = _mm_loadu_ps(y+j); |
| 54 __m128 y3 = _mm_loadu_ps(y+j+3); |
| 55 |
| 56 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj)); |
| 57 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55), |
| 58 _mm_shuffle_ps(yj,y3,0x49))); |
| 59 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa), |
| 60 _mm_shuffle_ps(yj,y3,0x9e))); |
| 61 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3)); |
| 62 } |
| 63 if (j < len) |
| 64 { |
| 65 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j))); |
| 66 if (++j < len) |
| 67 { |
| 68 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)
)); |
| 69 if (++j < len) |
| 70 { |
| 71 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y
+j))); |
| 72 } |
| 73 } |
| 74 } |
| 75 _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2)); |
| 76 } |
| 77 |
| 78 |
| 79 void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_
val16 *y02, |
| 80 int N, opus_val32 *xy1, opus_val32 *xy2) |
| 81 { |
| 82 int i; |
| 83 __m128 xsum1, xsum2; |
| 84 xsum1 = _mm_setzero_ps(); |
| 85 xsum2 = _mm_setzero_ps(); |
| 86 for (i=0;i<N-3;i+=4) |
| 87 { |
| 88 __m128 xi = _mm_loadu_ps(x+i); |
| 89 __m128 y1i = _mm_loadu_ps(y01+i); |
| 90 __m128 y2i = _mm_loadu_ps(y02+i); |
| 91 xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i)); |
| 92 xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i)); |
| 93 } |
| 94 /* Horizontal sum */ |
| 95 xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1)); |
| 96 xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55)); |
| 97 _mm_store_ss(xy1, xsum1); |
| 98 xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2)); |
| 99 xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55)); |
| 100 _mm_store_ss(xy2, xsum2); |
| 101 for (;i<N;i++) |
| 102 { |
| 103 *xy1 = MAC16_16(*xy1, x[i], y01[i]); |
| 104 *xy2 = MAC16_16(*xy2, x[i], y02[i]); |
| 105 } |
| 106 } |
| 107 |
| 108 opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y, |
| 109 int N) |
| 110 { |
| 111 int i; |
| 112 float xy; |
| 113 __m128 sum; |
| 114 sum = _mm_setzero_ps(); |
| 115 /* FIXME: We should probably go 8-way and use 2 sums. */ |
| 116 for (i=0;i<N-3;i+=4) |
| 117 { |
| 118 __m128 xi = _mm_loadu_ps(x+i); |
| 119 __m128 yi = _mm_loadu_ps(y+i); |
| 120 sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi)); |
| 121 } |
| 122 /* Horizontal sum */ |
| 123 sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum)); |
| 124 sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55)); |
| 125 _mm_store_ss(&xy, sum); |
| 126 for (;i<N;i++) |
| 127 { |
| 128 xy = MAC16_16(xy, x[i], y[i]); |
| 129 } |
| 130 return xy; |
| 131 } |
| 132 |
| 133 void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N, |
| 134 opus_val16 g10, opus_val16 g11, opus_val16 g12) |
| 135 { |
| 136 int i; |
| 137 __m128 x0v; |
| 138 __m128 g10v, g11v, g12v; |
| 139 g10v = _mm_load1_ps(&g10); |
| 140 g11v = _mm_load1_ps(&g11); |
| 141 g12v = _mm_load1_ps(&g12); |
| 142 x0v = _mm_loadu_ps(&x[-T-2]); |
| 143 for (i=0;i<N-3;i+=4) |
| 144 { |
| 145 __m128 yi, yi2, x1v, x2v, x3v, x4v; |
| 146 const opus_val32 *xp = &x[i-T-2]; |
| 147 yi = _mm_loadu_ps(x+i); |
| 148 x4v = _mm_loadu_ps(xp+4); |
| 149 #if 0 |
| 150 /* Slower version with all loads */ |
| 151 x1v = _mm_loadu_ps(xp+1); |
| 152 x2v = _mm_loadu_ps(xp+2); |
| 153 x3v = _mm_loadu_ps(xp+3); |
| 154 #else |
| 155 x2v = _mm_shuffle_ps(x0v, x4v, 0x4e); |
| 156 x1v = _mm_shuffle_ps(x0v, x2v, 0x99); |
| 157 x3v = _mm_shuffle_ps(x2v, x4v, 0x99); |
| 158 #endif |
| 159 |
| 160 yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v)); |
| 161 #if 0 /* Set to 1 to make it bit-exact with the non-SSE version */ |
| 162 yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v))); |
| 163 yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); |
| 164 #else |
| 165 /* Use partial sums */ |
| 166 yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)), |
| 167 _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v))); |
| 168 yi = _mm_add_ps(yi, yi2); |
| 169 #endif |
| 170 x0v=x4v; |
| 171 _mm_storeu_ps(y+i, yi); |
| 172 } |
| 173 #ifdef CUSTOM_MODES |
| 174 for (;i<N;i++) |
| 175 { |
| 176 y[i] = x[i] |
| 177 + MULT16_32_Q15(g10,x[i-T]) |
| 178 + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1])) |
| 179 + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2])); |
| 180 } |
| 181 #endif |
| 182 } |
| 183 |
| 184 |
| 185 #endif |
| OLD | NEW |