OLD | NEW |
1 /*********************************************************************** | 1 /* Copyright (c) 2014, Cisco Systems, INC |
2 Copyright (c) 2006-2011, Skype Limited. All rights reserved. | 2 Written by XiangMingZhu WeiZhou MinPeng YanWang |
3 Redistribution and use in source and binary forms, with or without | 3 |
4 modification, are permitted provided that the following conditions | 4 Redistribution and use in source and binary forms, with or without |
5 are met: | 5 modification, are permitted provided that the following conditions |
6 - Redistributions of source code must retain the above copyright notice, | 6 are met: |
7 this list of conditions and the following disclaimer. | 7 |
8 - Redistributions in binary form must reproduce the above copyright | 8 - Redistributions of source code must retain the above copyright |
9 notice, this list of conditions and the following disclaimer in the | 9 notice, this list of conditions and the following disclaimer. |
10 documentation and/or other materials provided with the distribution. | 10 |
11 - Neither the name of Internet Society, IETF or IETF Trust, nor the | 11 - Redistributions in binary form must reproduce the above copyright |
12 names of specific contributors, may be used to endorse or promote | 12 notice, this list of conditions and the following disclaimer in the |
13 products derived from this software without specific prior written | 13 documentation and/or other materials provided with the distribution. |
14 permission. | 14 |
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
16 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
17 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
18 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
19 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
20 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
21 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
22 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
23 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
24 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 POSSIBILITY OF SUCH DAMAGE. | 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 ***********************************************************************/ | 26 */ |
27 | 27 |
28 #ifdef HAVE_CONFIG_H | 28 #ifdef HAVE_CONFIG_H |
29 #include "config.h" | 29 #include "config.h" |
30 #endif | 30 #endif |
31 | 31 |
| 32 #include <xmmintrin.h> |
| 33 #include <emmintrin.h> |
| 34 #include <smmintrin.h> |
| 35 |
32 #include "SigProc_FIX.h" | 36 #include "SigProc_FIX.h" |
33 #include "define.h" | 37 #include "define.h" |
34 #include "tuning_parameters.h" | 38 #include "tuning_parameters.h" |
35 #include "pitch.h" | 39 #include "pitch.h" |
| 40 #include "celt/x86/x86cpu.h" |
36 | 41 |
37 #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ | 42 #define MAX_FRAME_SIZE 384 /* subfr_length * nb_subfr = ( 0.005 * 16000 + 16 ) * 4 = 384 */ |
38 | 43 |
39 #define QA 25 | 44 #define QA 25 |
40 #define N_BITS_HEAD_ROOM 2 | 45 #define N_BITS_HEAD_ROOM 2 |
41 #define MIN_RSHIFTS -16 | 46 #define MIN_RSHIFTS -16 |
42 #define MAX_RSHIFTS (32 - QA) | 47 #define MAX_RSHIFTS (32 - QA) |
43 | 48 |
44 /* Compute reflection coefficients from input signal */ | 49 /* Compute reflection coefficients from input signal */ |
45 void silk_burg_modified( | 50 void silk_burg_modified_sse4_1( |
46 opus_int32 *res_nrg, /* O Residual energy */ | 51 opus_int32 *res_nrg, /* O Residual energy */ |
47 opus_int *res_nrg_Q, /* O Residual energy Q value */ | 52 opus_int *res_nrg_Q, /* O Residual energy Q value */ |
48 opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ | 53 opus_int32 A_Q16[], /* O Prediction coefficients (length order) */ |
49 const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ | 54 const opus_int16 x[], /* I Input signal, length: nb_subfr * ( D + subfr_length ) */ |
50 const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ | 55 const opus_int32 minInvGain_Q30, /* I Inverse of max prediction gain */ |
51 const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ | 56 const opus_int subfr_length, /* I Input signal subframe length (incl. D preceding samples) */ |
52 const opus_int nb_subfr, /* I Number of subframes stacked in x */ | 57 const opus_int nb_subfr, /* I Number of subframes stacked in x */ |
53 const opus_int D, /* I Order */ | 58 const opus_int D, /* I Order */ |
54 int arch /* I Run-time architecture */ | 59 int arch /* I Run-time architecture */ |
55 ) | 60 ) |
56 { | 61 { |
57 opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; | 62 opus_int k, n, s, lz, rshifts, rshifts_extra, reached_max_gain; |
58 opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; | 63 opus_int32 C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2; |
59 const opus_int16 *x_ptr; | 64 const opus_int16 *x_ptr; |
60 opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; | 65 opus_int32 C_first_row[ SILK_MAX_ORDER_LPC ]; |
61 opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; | 66 opus_int32 C_last_row[ SILK_MAX_ORDER_LPC ]; |
62 opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; | 67 opus_int32 Af_QA[ SILK_MAX_ORDER_LPC ]; |
63 opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; | 68 opus_int32 CAf[ SILK_MAX_ORDER_LPC + 1 ]; |
64 opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; | 69 opus_int32 CAb[ SILK_MAX_ORDER_LPC + 1 ]; |
65 opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; | 70 opus_int32 xcorr[ SILK_MAX_ORDER_LPC ]; |
66 | 71 |
| 72 __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210; |
| 73 __m128i CONST1 = _mm_set1_epi32(1); |
| 74 |
67 silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); | 75 silk_assert( subfr_length * nb_subfr <= MAX_FRAME_SIZE ); |
68 | 76 |
69 /* Compute autocorrelations, added over subframes */ | 77 /* Compute autocorrelations, added over subframes */ |
70 silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length ); | 78 silk_sum_sqr_shift( &C0, &rshifts, x, nb_subfr * subfr_length ); |
71 if( rshifts > MAX_RSHIFTS ) { | 79 if( rshifts > MAX_RSHIFTS ) { |
72 C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS ); | 80 C0 = silk_LSHIFT32( C0, rshifts - MAX_RSHIFTS ); |
73 silk_assert( C0 > 0 ); | 81 silk_assert( C0 > 0 ); |
74 rshifts = MAX_RSHIFTS; | 82 rshifts = MAX_RSHIFTS; |
75 } else { | 83 } else { |
76 lz = silk_CLZ32( C0 ) - 1; | 84 lz = silk_CLZ32( C0 ) - 1; |
77 rshifts_extra = N_BITS_HEAD_ROOM - lz; | 85 rshifts_extra = N_BITS_HEAD_ROOM - lz; |
78 if( rshifts_extra > 0 ) { | 86 if( rshifts_extra > 0 ) { |
79 rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts ); | 87 rshifts_extra = silk_min( rshifts_extra, MAX_RSHIFTS - rshifts ); |
80 C0 = silk_RSHIFT32( C0, rshifts_extra ); | 88 C0 = silk_RSHIFT32( C0, rshifts_extra ); |
81 } else { | 89 } else { |
82 rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts ); | 90 rshifts_extra = silk_max( rshifts_extra, MIN_RSHIFTS - rshifts ); |
83 C0 = silk_LSHIFT32( C0, -rshifts_extra ); | 91 C0 = silk_LSHIFT32( C0, -rshifts_extra ); |
84 } | 92 } |
85 rshifts += rshifts_extra; | 93 rshifts += rshifts_extra; |
86 } | 94 } |
87 CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ | 95 CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ) + 1; /* Q(-rshifts) */ |
88 silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); | 96 silk_memset( C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof( opus_int32 ) ); |
89 if( rshifts > 0 ) { | 97 if( rshifts > 0 ) { |
90 for( s = 0; s < nb_subfr; s++ ) { | 98 for( s = 0; s < nb_subfr; s++ ) { |
91 x_ptr = x + s * subfr_length; | 99 x_ptr = x + s * subfr_length; |
92 for( n = 1; n < D + 1; n++ ) { | 100 for( n = 1; n < D + 1; n++ ) { |
93 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( | 101 C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64( |
94 silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n ), rshifts ); | 102 silk_inner_prod16_aligned_64( x_ptr, x_ptr + n, subfr_length - n, arch ), rshifts ); |
95 } | 103 } |
96 } | 104 } |
97 } else { | 105 } else { |
98 for( s = 0; s < nb_subfr; s++ ) { | 106 for( s = 0; s < nb_subfr; s++ ) { |
99 int i; | 107 int i; |
100 opus_int32 d; | 108 opus_int32 d; |
101 x_ptr = x + s * subfr_length; | 109 x_ptr = x + s * subfr_length; |
102 celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); | 110 celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch ); |
103 for( n = 1; n < D + 1; n++ ) { | 111 for( n = 1; n < D + 1; n++ ) { |
104 for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) | 112 for ( i = n + subfr_length - D, d = 0; i < subfr_length; i++ ) |
(...skipping 38 matching lines...)
143 CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */ | 151 CAb[ k ] = silk_SMLAWB( CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ] ); /* Q( -rshift ) */ |
144 } | 152 } |
145 } | 153 } |
146 } else { | 154 } else { |
147 for( s = 0; s < nb_subfr; s++ ) { | 155 for( s = 0; s < nb_subfr; s++ ) { |
148 x_ptr = x + s * subfr_length; | 156 x_ptr = x + s * subfr_length; |
149 x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */ | 157 x1 = -silk_LSHIFT32( (opus_int32)x_ptr[ n ], -rshifts ); /* Q( -rshifts ) */ |
150 x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */ | 158 x2 = -silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts ); /* Q( -rshifts ) */ |
151 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */ | 159 tmp1 = silk_LSHIFT32( (opus_int32)x_ptr[ n ], 17 ); /* Q17 */ |
152 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */ | 160 tmp2 = silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n - 1 ], 17 ); /* Q17 */ |
153 for( k = 0; k < n; k++ ) { | 161 |
| 162 X1_3210 = _mm_set1_epi32( x1 ); |
| 163 X2_3210 = _mm_set1_epi32( x2 ); |
| 164 TMP1_3210 = _mm_setzero_si128(); |
| 165 TMP2_3210 = _mm_setzero_si128(); |
| 166 for( k = 0; k < n - 3; k += 4 ) { |
| 167 PTR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 1 - 3 ] ); |
| 168 SUBFR_3210 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k ] ); |
| 169 FIRST_3210 = _mm_loadu_si128( (__m128i *)&C_first_row[ k ] ); |
| 170 PTR_3210 = _mm_shuffle_epi32( PTR_3210, _MM_SHUFFLE( 0, 1, 2, 3 ) ); |
| 171 LAST_3210 = _mm_loadu_si128( (__m128i *)&C_last_row[ k ] ); |
| 172 ATMP_3210 = _mm_loadu_si128( (__m128i *)&Af_QA[ k ] ); |
| 173 |
| 174 T1_3210 = _mm_mullo_epi32( PTR_3210, X1_3210 ); |
| 175 T2_3210 = _mm_mullo_epi32( SUBFR_3210, X2_3210 ); |
| 176 |
| 177 ATMP_3210 = _mm_srai_epi32( ATMP_3210, 7 ); |
| 178 ATMP_3210 = _mm_add_epi32( ATMP_3210, CONST1 ); |
| 179 ATMP_3210 = _mm_srai_epi32( ATMP_3210, 1 ); |
| 180 |
| 181 FIRST_3210 = _mm_add_epi32( FIRST_3210, T1_3210 ); |
| 182 LAST_3210 = _mm_add_epi32( LAST_3210, T2_3210 ); |
| 183 |
| 184 PTR_3210 = _mm_mullo_epi32( ATMP_3210, PTR_3210 ); |
| 185 SUBFR_3210 = _mm_mullo_epi32( ATMP_3210, SUBFR_3210 ); |
| 186 |
| 187 _mm_storeu_si128( (__m128i *)&C_first_row[ k ], FIRST_3210 ); |
| 188 _mm_storeu_si128( (__m128i *)&C_last_row[ k ], LAST_3210 ); |
| 189 |
| 190 TMP1_3210 = _mm_add_epi32( TMP1_3210, PTR_3210 ); |
| 191 TMP2_3210 = _mm_add_epi32( TMP2_3210, SUBFR_3210 ); |
| 192 } |
| 193 |
| 194 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210 ) ); |
| 195 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210 ) ); |
| 196 TMP1_3210 = _mm_add_epi32( TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E ) ); |
| 197 TMP2_3210 = _mm_add_epi32( TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E ) ); |
| 198 |
| 199 tmp1 += _mm_cvtsi128_si32( TMP1_3210 ); |
| 200 tmp2 += _mm_cvtsi128_si32( TMP2_3210 ); |
| 201 |
| 202 for( ; k < n; k++ ) { |
154 C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ | 203 C_first_row[ k ] = silk_MLA( C_first_row[ k ], x1, x_ptr[ n - k - 1 ] ); /* Q( -rshifts ) */ |
155 C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ | 204 C_last_row[ k ] = silk_MLA( C_last_row[ k ], x2, x_ptr[ subfr_length - n + k ] ); /* Q( -rshifts ) */ |
156 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ | 205 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 17 ); /* Q17 */ |
157 tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ | 206 tmp1 = silk_MLA( tmp1, x_ptr[ n - k - 1 ], Atmp1 ); /* Q17 */ |
158 tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ | 207 tmp2 = silk_MLA( tmp2, x_ptr[ subfr_length - n + k ], Atmp1 ); /* Q17 */ |
159 } | 208 } |
160 tmp1 = -tmp1; /* Q17 */ | 209 |
161 tmp2 = -tmp2; /* Q17 */ | 210 tmp1 = -tmp1; /* Q17 */ |
162 for( k = 0; k <= n; k++ ) { | 211 tmp2 = -tmp2; /* Q17 */ |
163 CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1, | 212 |
164 silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */ | 213 { |
165 CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2, | 214 __m128i xmm_tmp1, xmm_tmp2; |
166 silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */ | 215 __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1; |
| 216 __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1; |
| 217 |
| 218 xmm_tmp1 = _mm_set1_epi32( tmp1 ); |
| 219 xmm_tmp2 = _mm_set1_epi32( tmp2 ); |
| 220 |
| 221 for( k = 0; k <= n - 3; k += 4 ) { |
| 222 xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ n - k - 3 ] ); |
| 223 xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64( &x_ptr[ subfr_length - n + k - 1 ] ); |
| 224 |
| 225 xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 1, 2, 3 ) ); |
| 226 |
| 227 xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32( xmm_x_ptr_n_k_x2x0, -rshifts - 1 ); |
| 228 xmm_x_ptr_sub_x2x0 = _mm_slli_epi32( xmm_x_ptr_sub_x2x0, -rshifts - 1 ); |
| 229 |
| 230 /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/ |
| 231 xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); |
| 232 xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32( xmm_x_ptr_sub_x2x0, _MM_SHUFFLE( 0, 3, 2, 1 ) ); |
| 233 |
| 234 xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32( xmm_x_ptr_n_k_x2x0, xmm_tmp1 ); |
| 235 xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32( xmm_x_ptr_n_k_x3x1, xmm_tmp1 ); |
| 236 xmm_x_ptr_sub_x2x0 = _mm_mul_epi32( xmm_x_ptr_sub_x2x0, xmm_tmp2 ); |
| 237 xmm_x_ptr_sub_x3x1 = _mm_mul_epi32( xmm_x_ptr_sub_x3x1, xmm_tmp2 ); |
| 238 |
| 239 xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64( xmm_x_ptr_n_k_x2x0, 16 ); |
| 240 xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64( xmm_x_ptr_n_k_x3x1, 16 ); |
| 241 xmm_x_ptr_sub_x2x0 = _mm_srli_epi64( xmm_x_ptr_sub_x2x0, 16 ); |
| 242 xmm_x_ptr_sub_x3x1 = _mm_slli_epi64( xmm_x_ptr_sub_x3x1, 16 ); |
| 243 |
| 244 xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16( xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC ); |
| 245 xmm_x_ptr_sub_x2x0 = _mm_blend_epi16( xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC ); |
| 246 |
| 247 X1_3210 = _mm_loadu_si128( (__m128i *)&CAf[ k ] ); |
| 248 PTR_3210 = _mm_loadu_si128( (__m128i *)&CAb[ k ] ); |
| 249 |
| 250 X1_3210 = _mm_add_epi32( X1_3210, xmm_x_ptr_n_k_x2x0 ); |
| 251 PTR_3210 = _mm_add_epi32( PTR_3210, xmm_x_ptr_sub_x2x0 ); |
| 252 |
| 253 _mm_storeu_si128( (__m128i *)&CAf[ k ], X1_3210 ); |
| 254 _mm_storeu_si128( (__m128i *)&CAb[ k ], PTR_3210 ); |
| 255 } |
| 256 |
| 257 for( ; k <= n; k++ ) { |
| 258 CAf[ k ] = silk_SMLAWW( CAf[ k ], tmp1, |
| 259 silk_LSHIFT32( (opus_int32)x_ptr[ n - k ], -rshifts - 1 ) ); /* Q( -rshift ) */ |
| 260 CAb[ k ] = silk_SMLAWW( CAb[ k ], tmp2, |
| 261 silk_LSHIFT32( (opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1 ) ); /* Q( -rshift ) */ |
| 262 } |
167 } | 263 } |
168 } | 264 } |
169 } | 265 } |
170 | 266 |
171 /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ | 267 /* Calculate nominator and denominator for the next order reflection (parcor) coefficient */ |
172 tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */ | 268 tmp1 = C_first_row[ n ]; /* Q( -rshifts ) */ |
173 tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */ | 269 tmp2 = C_last_row[ n ]; /* Q( -rshifts ) */ |
174 num = 0; /* Q( -rshifts ) */ | 270 num = 0; /* Q( -rshifts ) */ |
175 nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */ | 271 nrg = silk_ADD32( CAb[ 0 ], CAf[ 0 ] ); /* Q( 1-rshifts ) */ |
176 for( k = 0; k < n; k++ ) { | 272 for( k = 0; k < n; k++ ) { |
(...skipping 68 matching lines...)
245 | 341 |
246 if( reached_max_gain ) { | 342 if( reached_max_gain ) { |
247 for( k = 0; k < D; k++ ) { | 343 for( k = 0; k < D; k++ ) { |
248 /* Scale coefficients */ | 344 /* Scale coefficients */ |
249 A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); | 345 A_Q16[ k ] = -silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); |
250 } | 346 } |
251 /* Subtract energy of preceding samples from C0 */ | 347 /* Subtract energy of preceding samples from C0 */ |
252 if( rshifts > 0 ) { | 348 if( rshifts > 0 ) { |
253 for( s = 0; s < nb_subfr; s++ ) { | 349 for( s = 0; s < nb_subfr; s++ ) { |
254 x_ptr = x + s * subfr_length; | 350 x_ptr = x + s * subfr_length; |
255 C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D ), rshifts ); | 351 C0 -= (opus_int32)silk_RSHIFT64( silk_inner_prod16_aligned_64( x_ptr, x_ptr, D, arch ), rshifts ); |
256 } | 352 } |
257 } else { | 353 } else { |
258 for( s = 0; s < nb_subfr; s++ ) { | 354 for( s = 0; s < nb_subfr; s++ ) { |
259 x_ptr = x + s * subfr_length; | 355 x_ptr = x + s * subfr_length; |
260 C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D ), -rshifts ); | 356 C0 -= silk_LSHIFT32( silk_inner_prod_aligned( x_ptr, x_ptr, D, arch ), -rshifts ); |
261 } | 357 } |
262 } | 358 } |
263 /* Approximate residual energy */ | 359 /* Approximate residual energy */ |
264 *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 ); | 360 *res_nrg = silk_LSHIFT( silk_SMMUL( invGain_Q30, C0 ), 2 ); |
265 *res_nrg_Q = -rshifts; | 361 *res_nrg_Q = -rshifts; |
266 } else { | 362 } else { |
267 /* Return residual energy */ | 363 /* Return residual energy */ |
268 nrg = CAf[ 0 ]; /* Q( -rshifts ) */ | 364 nrg = CAf[ 0 ]; /* Q( -rshifts ) */ |
269 tmp1 = (opus_int32)1 << 16; /* Q16 */ | 365 tmp1 = (opus_int32)1 << 16; /* Q16 */ |
270 for( k = 0; k < D; k++ ) { | 366 for( k = 0; k < D; k++ ) { |
271 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */ | 367 Atmp1 = silk_RSHIFT_ROUND( Af_QA[ k ], QA - 16 ); /* Q16 */ |
272 nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */ | 368 nrg = silk_SMLAWW( nrg, CAf[ k + 1 ], Atmp1 ); /* Q( -rshifts ) */ |
273 tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */ | 369 tmp1 = silk_SMLAWW( tmp1, Atmp1, Atmp1 ); /* Q16 */ |
274 A_Q16[ k ] = -Atmp1; | 370 A_Q16[ k ] = -Atmp1; |
275 } | 371 } |
276 *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ | 372 *res_nrg = silk_SMLAWW( nrg, silk_SMMUL( SILK_FIX_CONST( FIND_LPC_COND_FAC, 32 ), C0 ), -tmp1 );/* Q( -rshifts ) */ |
277 *res_nrg_Q = -rshifts; | 373 *res_nrg_Q = -rshifts; |
278 } | 374 } |
279 } | 375 } |
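
Note on the data loads in the new code: the vector loops rely on OP_CVTEPI16_EPI32_M64 from celt/x86/x86cpu.h to bring four consecutive 16-bit samples into a 32-bit-per-lane vector. As a reading aid only (the real macro is defined in that header; the helper name below is hypothetical), the SSE4.1 behaviour is assumed to be roughly:

    #include <stdint.h>      /* int16_t stands in for opus_int16 here */
    #include <smmintrin.h>   /* SSE4.1 */

    /* Assumed equivalent of OP_CVTEPI16_EPI32_M64(): load four consecutive
     * 16-bit samples and sign-extend each one into a 32-bit lane. */
    static inline __m128i load4_s16_to_s32(const int16_t *p)
    {
        __m128i lo = _mm_loadl_epi64((const __m128i *)p); /* p[0..3] into the low 64 bits */
        return _mm_cvtepi16_epi32(lo);                    /* four sign-extended int32 lanes */
    }

Because the scalar code walks the signal backwards (x_ptr[ n - k - 1 ]) while the load reads ascending memory, the loaded vector is immediately reversed with _mm_shuffle_epi32( ..., _MM_SHUFFLE( 0, 1, 2, 3 ) ).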
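Note on the CAf[]/CAb[] update: the _mm_mul_epi32 / _mm_srli_epi64 / _mm_slli_epi64 / _mm_blend_epi16 sequence reconstructs, four lanes at a time, the middle 32 bits of a signed 32x32-bit product. Per lane this is roughly the scalar silk_SMLAWW step; the sketch below is only an approximation (silk_SMLAWW's exact rounding of the low word is not modelled, and the helper name is hypothetical):

    #include <stdint.h>   /* int32_t/int64_t stand in for opus_int32/opus_int64 */

    /* Approximate value held by each 32-bit lane after the blend and add:
     * accumulator plus bits 16..47 of the 32x32-bit product. */
    static inline int32_t acc_mulw_lane(int32_t acc, int32_t xq, int32_t t)
    {
        return acc + (int32_t)(((int64_t)xq * t) >> 16);
    }

The even lanes are multiplied directly and shifted right by 16 within each 64-bit field; the odd lanes are first rotated into even positions with _mm_shuffle_epi32( ..., _MM_SHUFFLE( 0, 3, 2, 1 ) ), multiplied, shifted left by 16 so the wanted bits land in the upper 32-bit halves, and _mm_blend_epi16 with mask 0xCC stitches the two results back into four 32-bit lanes before they are added to CAf[] and CAb[].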
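Note on the reduction of TMP1_3210 / TMP2_3210 back to scalars: it is a plain SSE2 horizontal add, folding the upper 64 bits onto the lower half and then adding 32-bit lane 1 into lane 0 via _mm_shufflelo_epi16( v, 0x0E ). A standalone sketch of the same reduction (hypothetical helper name):

    #include <stdint.h>
    #include <emmintrin.h>   /* SSE2 is sufficient for this reduction */

    /* Returns lane0 + lane1 + lane2 + lane3 of a 4 x int32 vector. */
    static inline int32_t hsum4_epi32(__m128i v)
    {
        v = _mm_add_epi32(v, _mm_unpackhi_epi64(v, v));      /* lanes become {0+2, 1+3, ...} */
        v = _mm_add_epi32(v, _mm_shufflelo_epi16(v, 0x0E));  /* lane 0 += lane 1 */
        return _mm_cvtsi128_si32(v);                         /* total sum sits in lane 0 */
    }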