| OLD | NEW |
| (Empty) | |
| 1 /* Copyright (c) 2014, Cisco Systems, INC |
| 2 Written by XiangMingZhu WeiZhou MinPeng YanWang |
| 3 |
| 4 Redistribution and use in source and binary forms, with or without |
| 5 modification, are permitted provided that the following conditions |
| 6 are met: |
| 7 |
| 8 - Redistributions of source code must retain the above copyright |
| 9 notice, this list of conditions and the following disclaimer. |
| 10 |
| 11 - Redistributions in binary form must reproduce the above copyright |
| 12 notice, this list of conditions and the following disclaimer in the |
| 13 documentation and/or other materials provided with the distribution. |
| 14 |
| 15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 16 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER |
| 19 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 20 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 21 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 22 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 23 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 24 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 */ |
| 27 |
| 28 #ifdef HAVE_CONFIG_H |
| 29 #include "config.h" |
| 30 #endif |
| 31 |
| 32 #include <xmmintrin.h> |
| 33 #include <emmintrin.h> |
| 34 #include <smmintrin.h> |
| 35 #include "main.h" |
| 36 #include "celt/x86/x86cpu.h" |
| 37 |
| 38 #include "stack_alloc.h" |
| 39 |
| 40 typedef struct { |
| 41 opus_int32 sLPC_Q14[ MAX_SUB_FRAME_LENGTH + NSQ_LPC_BUF_LENGTH ]; |
| 42 opus_int32 RandState[ DECISION_DELAY ]; |
| 43 opus_int32 Q_Q10[ DECISION_DELAY ]; |
| 44 opus_int32 Xq_Q14[ DECISION_DELAY ]; |
| 45 opus_int32 Pred_Q15[ DECISION_DELAY ]; |
| 46 opus_int32 Shape_Q14[ DECISION_DELAY ]; |
| 47 opus_int32 sAR2_Q14[ MAX_SHAPE_LPC_ORDER ]; |
| 48 opus_int32 LF_AR_Q14; |
| 49 opus_int32 Seed; |
| 50 opus_int32 SeedInit; |
| 51 opus_int32 RD_Q10; |
| 52 } NSQ_del_dec_struct; |
| 53 |
| 54 typedef struct { |
| 55 opus_int32 Q_Q10; |
| 56 opus_int32 RD_Q10; |
| 57 opus_int32 xq_Q14; |
| 58 opus_int32 LF_AR_Q14; |
| 59 opus_int32 sLTP_shp_Q14; |
| 60 opus_int32 LPC_exc_Q14; |
| 61 } NSQ_sample_struct; |
| 62 |
| 63 typedef NSQ_sample_struct NSQ_sample_pair[ 2 ]; |
| 64 |
| 65 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( |
| 66 const silk_encoder_state *psEncC, /* I Encoder State
*/ |
| 67 silk_nsq_state *NSQ, /* I/O NSQ state
*/ |
| 68 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision sta
tes */ |
| 69 const opus_int32 x_Q3[], /* I Input in Q3
*/ |
| 70 opus_int32 x_sc_Q10[], /* O Input scaled with 1/
Gain in Q10 */ |
| 71 const opus_int16 sLTP[], /* I Re-whitened LTP stat
e in Q0 */ |
| 72 opus_int32 sLTP_Q15[], /* O LTP state matching s
caled input */ |
| 73 opus_int subfr, /* I Subframe number
*/ |
| 74 opus_int nStatesDelayedDecision, /* I Number of del dec st
ates */ |
| 75 const opus_int LTP_scale_Q14, /* I LTP state scaling
*/ |
| 76 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I
*/ |
| 77 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag
*/ |
| 78 const opus_int signal_type, /* I Signal type
*/ |
| 79 const opus_int decisionDelay /* I Decision delay
*/ |
| 80 ); |
| 81 |
| 82 /******************************************/ |
| 83 /* Noise shape quantizer for one subframe */ |
| 84 /******************************************/ |
| 85 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( |
| 86 silk_nsq_state *NSQ, /* I/O NSQ state
*/ |
| 87 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states
*/ |
| 88 opus_int signalType, /* I Signal type
*/ |
| 89 const opus_int32 x_Q10[], /* I
*/ |
| 90 opus_int8 pulses[], /* O
*/ |
| 91 opus_int16 xq[], /* O
*/ |
| 92 opus_int32 sLTP_Q15[], /* I/O LTP filter state
*/ |
| 93 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer
*/ |
| 94 const opus_int16 a_Q12[], /* I Short term prediction co
efs */ |
| 95 const opus_int16 b_Q14[], /* I Long term prediction coe
fs */ |
| 96 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs
*/ |
| 97 opus_int lag, /* I Pitch lag
*/ |
| 98 opus_int32 HarmShapeFIRPacked_Q14, /* I
*/ |
| 99 opus_int Tilt_Q14, /* I Spectral tilt
*/ |
| 100 opus_int32 LF_shp_Q14, /* I
*/ |
| 101 opus_int32 Gain_Q16, /* I
*/ |
| 102 opus_int Lambda_Q10, /* I
*/ |
| 103 opus_int offset_Q10, /* I
*/ |
| 104 opus_int length, /* I Input length
*/ |
| 105 opus_int subfr, /* I Subframe number
*/ |
| 106 opus_int shapingLPCOrder, /* I Shaping LPC filter order
*/ |
| 107 opus_int predictLPCOrder, /* I Prediction filter order
*/ |
| 108 opus_int warping_Q16, /* I
*/ |
| 109 opus_int nStatesDelayedDecision, /* I Number of states in deci
sion tree */ |
| 110 opus_int *smpl_buf_idx, /* I Index to newest samples
in buffers */ |
| 111 opus_int decisionDelay /* I
*/ |
| 112 ); |
| 113 |
| 114 void silk_NSQ_del_dec_sse4_1( |
| 115 const silk_encoder_state *psEncC, /* I
/O Encoder State */ |
| 116 silk_nsq_state *NSQ, /* I
/O NSQ state */ |
| 117 SideInfoIndices *psIndices, /* I
/O Quantization Indices */ |
| 118 const opus_int32 x_Q3[], /* I
Prefiltered input signal */ |
| 119 opus_int8 pulses[], /* O
Quantized pulse signal */ |
| 120 const opus_int16 PredCoef_Q12[ 2 * MAX_LPC_ORDER ], /* I
Short term prediction coefs */ |
| 121 const opus_int16 LTPCoef_Q14[ LTP_ORDER * MAX_NB_SUBFR ], /* I
Long term prediction coefs */ |
| 122 const opus_int16 AR2_Q13[ MAX_NB_SUBFR * MAX_SHAPE_LPC_ORDER ], /
* I Noise shaping coefs */ |
| 123 const opus_int HarmShapeGain_Q14[ MAX_NB_SUBFR ], /* I
Long term shaping coefs */ |
| 124 const opus_int Tilt_Q14[ MAX_NB_SUBFR ], /* I
Spectral tilt */ |
| 125 const opus_int32 LF_shp_Q14[ MAX_NB_SUBFR ], /* I
Low frequency shaping coefs */ |
| 126 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I
Quantization step sizes */ |
| 127 const opus_int pitchL[ MAX_NB_SUBFR ], /* I
Pitch lags */ |
| 128 const opus_int Lambda_Q10, /* I
Rate/distortion tradeoff */ |
| 129 const opus_int LTP_scale_Q14 /* I
LTP state scaling */ |
| 130 ) |
| 131 { |
| 132 opus_int i, k, lag, start_idx, LSF_interpolation_flag, Winner_ind
, subfr; |
| 133 opus_int last_smple_idx, smpl_buf_idx, decisionDelay; |
| 134 const opus_int16 *A_Q12, *B_Q14, *AR_shp_Q13; |
| 135 opus_int16 *pxq; |
| 136 VARDECL( opus_int32, sLTP_Q15 ); |
| 137 VARDECL( opus_int16, sLTP ); |
| 138 opus_int32 HarmShapeFIRPacked_Q14; |
| 139 opus_int offset_Q10; |
| 140 opus_int32 RDmin_Q10, Gain_Q10; |
| 141 VARDECL( opus_int32, x_sc_Q10 ); |
| 142 VARDECL( opus_int32, delayedGain_Q10 ); |
| 143 VARDECL( NSQ_del_dec_struct, psDelDec ); |
| 144 NSQ_del_dec_struct *psDD; |
| 145 SAVE_STACK; |
| 146 |
| 147 /* Set unvoiced lag to the previous one, overwrite later for voiced */ |
| 148 lag = NSQ->lagPrev; |
| 149 |
| 150 silk_assert( NSQ->prev_gain_Q16 != 0 ); |
| 151 |
| 152 /* Initialize delayed decision states */ |
| 153 ALLOC( psDelDec, psEncC->nStatesDelayedDecision, NSQ_del_dec_struct ); |
| 154 silk_memset( psDelDec, 0, psEncC->nStatesDelayedDecision * sizeof( NSQ_del_d
ec_struct ) ); |
| 155 for( k = 0; k < psEncC->nStatesDelayedDecision; k++ ) { |
| 156 psDD = &psDelDec[ k ]; |
| 157 psDD->Seed = ( k + psIndices->Seed ) & 3; |
| 158 psDD->SeedInit = psDD->Seed; |
| 159 psDD->RD_Q10 = 0; |
| 160 psDD->LF_AR_Q14 = NSQ->sLF_AR_shp_Q14; |
| 161 psDD->Shape_Q14[ 0 ] = NSQ->sLTP_shp_Q14[ psEncC->ltp_mem_length - 1 ]; |
| 162 silk_memcpy( psDD->sLPC_Q14, NSQ->sLPC_Q14, NSQ_LPC_BUF_LENGTH * sizeof(
opus_int32 ) ); |
| 163 silk_memcpy( psDD->sAR2_Q14, NSQ->sAR2_Q14, sizeof( NSQ->sAR2_Q14 ) ); |
| 164 } |
| 165 |
| 166 offset_Q10 = silk_Quantization_Offsets_Q10[ psIndices->signalType >> 1 ][
psIndices->quantOffsetType ]; |
| 167 smpl_buf_idx = 0; /* index of oldest samples */ |
| 168 |
| 169 decisionDelay = silk_min_int( DECISION_DELAY, psEncC->subfr_length ); |
| 170 |
| 171 /* For voiced frames limit the decision delay to lower than the pitch lag */ |
| 172 if( psIndices->signalType == TYPE_VOICED ) { |
| 173 for( k = 0; k < psEncC->nb_subfr; k++ ) { |
| 174 decisionDelay = silk_min_int( decisionDelay, pitchL[ k ] - LTP_ORDER
/ 2 - 1 ); |
| 175 } |
| 176 } else { |
| 177 if( lag > 0 ) { |
| 178 decisionDelay = silk_min_int( decisionDelay, lag - LTP_ORDER / 2 - 1
); |
| 179 } |
| 180 } |
| 181 |
| 182 if( psIndices->NLSFInterpCoef_Q2 == 4 ) { |
| 183 LSF_interpolation_flag = 0; |
| 184 } else { |
| 185 LSF_interpolation_flag = 1; |
| 186 } |
| 187 |
| 188 ALLOC( sLTP_Q15, |
| 189 psEncC->ltp_mem_length + psEncC->frame_length, opus_int32 ); |
| 190 ALLOC( sLTP, psEncC->ltp_mem_length + psEncC->frame_length, opus_int16 ); |
| 191 ALLOC( x_sc_Q10, psEncC->subfr_length, opus_int32 ); |
| 192 ALLOC( delayedGain_Q10, DECISION_DELAY, opus_int32 ); |
| 193 /* Set up pointers to start of sub frame */ |
| 194 pxq = &NSQ->xq[ psEncC->ltp_mem_length ]; |
| 195 NSQ->sLTP_shp_buf_idx = psEncC->ltp_mem_length; |
| 196 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; |
| 197 subfr = 0; |
| 198 for( k = 0; k < psEncC->nb_subfr; k++ ) { |
| 199 A_Q12 = &PredCoef_Q12[ ( ( k >> 1 ) | ( 1 - LSF_interpolation_flag
) ) * MAX_LPC_ORDER ]; |
| 200 B_Q14 = <PCoef_Q14[ k * LTP_ORDER ]; |
| 201 AR_shp_Q13 = &AR2_Q13[ k * MAX_SHAPE_LPC_ORDER ]; |
| 202 |
| 203 /* Noise shape parameters */ |
| 204 silk_assert( HarmShapeGain_Q14[ k ] >= 0 ); |
| 205 HarmShapeFIRPacked_Q14 = silk_RSHIFT( HarmShap
eGain_Q14[ k ], 2 ); |
| 206 HarmShapeFIRPacked_Q14 |= silk_LSHIFT( (opus_int32)silk_RSHIFT( HarmShap
eGain_Q14[ k ], 1 ), 16 ); |
| 207 |
| 208 NSQ->rewhite_flag = 0; |
| 209 if( psIndices->signalType == TYPE_VOICED ) { |
| 210 /* Voiced */ |
| 211 lag = pitchL[ k ]; |
| 212 |
| 213 /* Re-whitening */ |
| 214 if( ( k & ( 3 - silk_LSHIFT( LSF_interpolation_flag, 1 ) ) ) == 0 )
{ |
| 215 if( k == 2 ) { |
| 216 /* RESET DELAYED DECISIONS */ |
| 217 /* Find winner */ |
| 218 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; |
| 219 Winner_ind = 0; |
| 220 for( i = 1; i < psEncC->nStatesDelayedDecision; i++ ) { |
| 221 if( psDelDec[ i ].RD_Q10 < RDmin_Q10 ) { |
| 222 RDmin_Q10 = psDelDec[ i ].RD_Q10; |
| 223 Winner_ind = i; |
| 224 } |
| 225 } |
| 226 for( i = 0; i < psEncC->nStatesDelayedDecision; i++ ) { |
| 227 if( i != Winner_ind ) { |
| 228 psDelDec[ i ].RD_Q10 += ( silk_int32_MAX >> 4 ); |
| 229 silk_assert( psDelDec[ i ].RD_Q10 >= 0 ); |
| 230 } |
| 231 } |
| 232 |
| 233 /* Copy final part of signals from winner state to output an
d long-term filter states */ |
| 234 psDD = &psDelDec[ Winner_ind ]; |
| 235 last_smple_idx = smpl_buf_idx + decisionDelay; |
| 236 for( i = 0; i < decisionDelay; i++ ) { |
| 237 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY
_MASK; |
| 238 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_R
OUND( psDD->Q_Q10[ last_smple_idx ], 10 ); |
| 239 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_
RSHIFT_ROUND( |
| 240 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gains_Q
16[ 1 ] ), 14 ) ); |
| 241 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay
+ i ] = psDD->Shape_Q14[ last_smple_idx ]; |
| 242 } |
| 243 |
| 244 subfr = 0; |
| 245 } |
| 246 |
| 247 /* Rewhiten with new A coefs */ |
| 248 start_idx = psEncC->ltp_mem_length - lag - psEncC->predictLPCOrd
er - LTP_ORDER / 2; |
| 249 silk_assert( start_idx > 0 ); |
| 250 |
| 251 silk_LPC_analysis_filter( &sLTP[ start_idx ], &NSQ->xq[ start_id
x + k * psEncC->subfr_length ], |
| 252 A_Q12, psEncC->ltp_mem_length - start_idx, psEncC->predictLP
COrder, psEncC->arch ); |
| 253 |
| 254 NSQ->sLTP_buf_idx = psEncC->ltp_mem_length; |
| 255 NSQ->rewhite_flag = 1; |
| 256 } |
| 257 } |
| 258 |
| 259 silk_nsq_del_dec_scale_states_sse4_1( psEncC, NSQ, psDelDec, x_Q3, x_sc_
Q10, sLTP, sLTP_Q15, k, |
| 260 psEncC->nStatesDelayedDecision, LTP_scale_Q14, Gains_Q16, pitchL, ps
Indices->signalType, decisionDelay ); |
| 261 |
| 262 silk_noise_shape_quantizer_del_dec_sse4_1( NSQ, psDelDec, psIndices->sig
nalType, x_sc_Q10, pulses, pxq, sLTP_Q15, |
| 263 delayedGain_Q10, A_Q12, B_Q14, AR_shp_Q13, lag, HarmShapeFIRPacked_Q
14, Tilt_Q14[ k ], LF_shp_Q14[ k ], |
| 264 Gains_Q16[ k ], Lambda_Q10, offset_Q10, psEncC->subfr_length, subfr+
+, psEncC->shapingLPCOrder, |
| 265 psEncC->predictLPCOrder, psEncC->warping_Q16, psEncC->nStatesDelayed
Decision, &smpl_buf_idx, decisionDelay ); |
| 266 |
| 267 x_Q3 += psEncC->subfr_length; |
| 268 pulses += psEncC->subfr_length; |
| 269 pxq += psEncC->subfr_length; |
| 270 } |
| 271 |
| 272 /* Find winner */ |
| 273 RDmin_Q10 = psDelDec[ 0 ].RD_Q10; |
| 274 Winner_ind = 0; |
| 275 for( k = 1; k < psEncC->nStatesDelayedDecision; k++ ) { |
| 276 if( psDelDec[ k ].RD_Q10 < RDmin_Q10 ) { |
| 277 RDmin_Q10 = psDelDec[ k ].RD_Q10; |
| 278 Winner_ind = k; |
| 279 } |
| 280 } |
| 281 |
| 282 /* Copy final part of signals from winner state to output and long-term filt
er states */ |
| 283 psDD = &psDelDec[ Winner_ind ]; |
| 284 psIndices->Seed = psDD->SeedInit; |
| 285 last_smple_idx = smpl_buf_idx + decisionDelay; |
| 286 Gain_Q10 = silk_RSHIFT32( Gains_Q16[ psEncC->nb_subfr - 1 ], 6 ); |
| 287 for( i = 0; i < decisionDelay; i++ ) { |
| 288 last_smple_idx = ( last_smple_idx - 1 ) & DECISION_DELAY_MASK; |
| 289 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q_Q1
0[ last_smple_idx ], 10 ); |
| 290 pxq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( |
| 291 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], Gain_Q10 ), 8 ) ); |
| 292 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay + i ] = psDD->S
hape_Q14[ last_smple_idx ]; |
| 293 } |
| 294 silk_memcpy( NSQ->sLPC_Q14, &psDD->sLPC_Q14[ psEncC->subfr_length ], NSQ_LPC
_BUF_LENGTH * sizeof( opus_int32 ) ); |
| 295 silk_memcpy( NSQ->sAR2_Q14, psDD->sAR2_Q14, sizeof( psDD->sAR2_Q14 ) ); |
| 296 |
| 297 /* Update states */ |
| 298 NSQ->sLF_AR_shp_Q14 = psDD->LF_AR_Q14; |
| 299 NSQ->lagPrev = pitchL[ psEncC->nb_subfr - 1 ]; |
| 300 |
| 301 /* Save quantized speech signal */ |
| 302 /* DEBUG_STORE_DATA( enc.pcm, &NSQ->xq[psEncC->ltp_mem_length], psEncC->fram
e_length * sizeof( opus_int16 ) ) */ |
| 303 silk_memmove( NSQ->xq, &NSQ->xq[ psEncC->frame_length ],
psEncC->ltp_mem_length * sizeof( opus_int16 ) ); |
| 304 silk_memmove( NSQ->sLTP_shp_Q14, &NSQ->sLTP_shp_Q14[ psEncC->frame_length ],
psEncC->ltp_mem_length * sizeof( opus_int32 ) ); |
| 305 RESTORE_STACK; |
| 306 } |
| 307 |
| 308 /******************************************/ |
| 309 /* Noise shape quantizer for one subframe */ |
| 310 /******************************************/ |
| 311 static OPUS_INLINE void silk_noise_shape_quantizer_del_dec_sse4_1( |
| 312 silk_nsq_state *NSQ, /* I/O NSQ state
*/ |
| 313 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision states
*/ |
| 314 opus_int signalType, /* I Signal type
*/ |
| 315 const opus_int32 x_Q10[], /* I
*/ |
| 316 opus_int8 pulses[], /* O
*/ |
| 317 opus_int16 xq[], /* O
*/ |
| 318 opus_int32 sLTP_Q15[], /* I/O LTP filter state
*/ |
| 319 opus_int32 delayedGain_Q10[], /* I/O Gain delay buffer
*/ |
| 320 const opus_int16 a_Q12[], /* I Short term prediction co
efs */ |
| 321 const opus_int16 b_Q14[], /* I Long term prediction coe
fs */ |
| 322 const opus_int16 AR_shp_Q13[], /* I Noise shaping coefs
*/ |
| 323 opus_int lag, /* I Pitch lag
*/ |
| 324 opus_int32 HarmShapeFIRPacked_Q14, /* I
*/ |
| 325 opus_int Tilt_Q14, /* I Spectral tilt
*/ |
| 326 opus_int32 LF_shp_Q14, /* I
*/ |
| 327 opus_int32 Gain_Q16, /* I
*/ |
| 328 opus_int Lambda_Q10, /* I
*/ |
| 329 opus_int offset_Q10, /* I
*/ |
| 330 opus_int length, /* I Input length
*/ |
| 331 opus_int subfr, /* I Subframe number
*/ |
| 332 opus_int shapingLPCOrder, /* I Shaping LPC filter order
*/ |
| 333 opus_int predictLPCOrder, /* I Prediction filter order
*/ |
| 334 opus_int warping_Q16, /* I
*/ |
| 335 opus_int nStatesDelayedDecision, /* I Number of states in deci
sion tree */ |
| 336 opus_int *smpl_buf_idx, /* I Index to newest samples
in buffers */ |
| 337 opus_int decisionDelay /* I
*/ |
| 338 ) |
| 339 { |
| 340 opus_int i, j, k, Winner_ind, RDmin_ind, RDmax_ind, last_smple_idx; |
| 341 opus_int32 Winner_rand_state; |
| 342 opus_int32 LTP_pred_Q14, LPC_pred_Q14, n_AR_Q14, n_LTP_Q14; |
| 343 opus_int32 n_LF_Q14, r_Q10, rr_Q10, rd1_Q10, rd2_Q10, RDmin_Q10, RDmax_Q10
; |
| 344 opus_int32 q1_Q0, q1_Q10, q2_Q10, exc_Q14, LPC_exc_Q14, xq_Q14, Gain_Q10; |
| 345 opus_int32 tmp1, tmp2, sLF_AR_shp_Q14; |
| 346 opus_int32 *pred_lag_ptr, *shp_lag_ptr, *psLPC_Q14; |
| 347 VARDECL( NSQ_sample_pair, psSampleState ); |
| 348 NSQ_del_dec_struct *psDD; |
| 349 NSQ_sample_struct *psSS; |
| 350 |
| 351 __m128i a_Q12_0123, a_Q12_4567, a_Q12_89AB, a_Q12_CDEF; |
| 352 __m128i b_Q12_0123, b_sr_Q12_0123; |
| 353 SAVE_STACK; |
| 354 |
| 355 silk_assert( nStatesDelayedDecision > 0 ); |
| 356 ALLOC( psSampleState, nStatesDelayedDecision, NSQ_sample_pair ); |
| 357 |
| 358 shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_
FIR_TAPS / 2 ]; |
| 359 pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ]; |
| 360 Gain_Q10 = silk_RSHIFT( Gain_Q16, 6 ); |
| 361 |
| 362 a_Q12_0123 = OP_CVTEPI16_EPI32_M64( a_Q12 ); |
| 363 a_Q12_4567 = OP_CVTEPI16_EPI32_M64( a_Q12 + 4 ); |
| 364 |
| 365 if( opus_likely( predictLPCOrder == 16 ) ) { |
| 366 a_Q12_89AB = OP_CVTEPI16_EPI32_M64( a_Q12 + 8 ); |
| 367 a_Q12_CDEF = OP_CVTEPI16_EPI32_M64( a_Q12 + 12 ); |
| 368 } |
| 369 |
| 370 if( signalType == TYPE_VOICED ){ |
| 371 b_Q12_0123 = OP_CVTEPI16_EPI32_M64( b_Q14 ); |
| 372 b_sr_Q12_0123 = _mm_shuffle_epi32( b_Q12_0123, _MM_SHUFFLE( 0, 3, 2, 1 )
); /* equal shift right 4 bytes */ |
| 373 } |
| 374 for( i = 0; i < length; i++ ) { |
| 375 /* Perform common calculations used in all states */ |
| 376 |
| 377 /* Long-term prediction */ |
| 378 if( signalType == TYPE_VOICED ) { |
| 379 /* Unrolled loop */ |
| 380 /* Avoids introducing a bias because silk_SMLAWB() always rounds to
-inf */ |
| 381 LTP_pred_Q14 = 2; |
| 382 { |
| 383 __m128i tmpa, tmpb, pred_lag_ptr_tmp; |
| 384 pred_lag_ptr_tmp = _mm_loadu_si128( (__m128i *)(&pred_lag_ptr
[ -3 ] ) ); |
| 385 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, 0x1B
); |
| 386 tmpa = _mm_mul_epi32( pred_lag_ptr_tmp, b_Q12_012
3 ); |
| 387 tmpa = _mm_srli_si128( tmpa, 2 ); |
| 388 |
| 389 pred_lag_ptr_tmp = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUF
FLE( 0, 3, 2, 1 ) );/* equal shift right 4 bytes */ |
| 390 pred_lag_ptr_tmp = _mm_mul_epi32( pred_lag_ptr_tmp, b_sr_Q12_
0123 ); |
| 391 pred_lag_ptr_tmp = _mm_srli_si128( pred_lag_ptr_tmp, 2 ); |
| 392 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpa ); |
| 393 |
| 394 tmpb = _mm_shuffle_epi32( pred_lag_ptr_tmp, _MM_SHUFFLE( 0, 0, 3
, 2 ) );/* equal shift right 8 bytes */ |
| 395 pred_lag_ptr_tmp = _mm_add_epi32( pred_lag_ptr_tmp, tmpb ); |
| 396 LTP_pred_Q14 += _mm_cvtsi128_si32( pred_lag_ptr_tmp ); |
| 397 |
| 398 LTP_pred_Q14 = silk_SMLAWB( LTP_pred_Q14, pred_lag_ptr[ -4 ], b_
Q14[ 4 ] ); |
| 399 LTP_pred_Q14 = silk_LSHIFT( LTP_pred_Q14, 1 );
/* Q13 -> Q14 */ |
| 400 pred_lag_ptr++; |
| 401 } |
| 402 } else { |
| 403 LTP_pred_Q14 = 0; |
| 404 } |
| 405 |
| 406 /* Long-term shaping */ |
| 407 if( lag > 0 ) { |
| 408 /* Symmetric, packed FIR coefficients */ |
| 409 n_LTP_Q14 = silk_SMULWB( silk_ADD32( shp_lag_ptr[ 0 ], shp_lag_ptr[
-2 ] ), HarmShapeFIRPacked_Q14 ); |
| 410 n_LTP_Q14 = silk_SMLAWT( n_LTP_Q14, shp_lag_ptr[ -1 ],
HarmShapeFIRPacked_Q14 ); |
| 411 n_LTP_Q14 = silk_SUB_LSHIFT32( LTP_pred_Q14, n_LTP_Q14, 2 );
/* Q12 -> Q14 */ |
| 412 shp_lag_ptr++; |
| 413 } else { |
| 414 n_LTP_Q14 = 0; |
| 415 } |
| 416 { |
| 417 __m128i tmpa, tmpb, psLPC_Q14_tmp, a_Q12_tmp; |
| 418 |
| 419 for( k = 0; k < nStatesDelayedDecision; k++ ) { |
| 420 /* Delayed decision state */ |
| 421 psDD = &psDelDec[ k ]; |
| 422 |
| 423 /* Sample state */ |
| 424 psSS = psSampleState[ k ]; |
| 425 |
| 426 /* Generate dither */ |
| 427 psDD->Seed = silk_RAND( psDD->Seed ); |
| 428 |
| 429 /* Pointer used in short term prediction and shaping */ |
| 430 psLPC_Q14 = &psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH - 1 + i ]; |
| 431 /* Short-term prediction */ |
| 432 silk_assert( predictLPCOrder == 10 || predictLPCOrder == 16 ); |
| 433 /* Avoids introducing a bias because silk_SMLAWB() always rounds
to -inf */ |
| 434 LPC_pred_Q14 = silk_RSHIFT( predictLPCOrder, 1 ); |
| 435 |
| 436 tmpb = _mm_setzero_si128(); |
| 437 |
| 438 /* step 1 */ |
| 439 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -3 ]
) ); /* -3, -2 , -1, 0 */ |
| 440 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B );
/* 0, -1, -2, -3 */ |
| 441 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_0123 );
/* 0, -1, -2, -3 * 0123 -> 0*0, 2*-2 */ |
| 442 |
| 443 tmpa = _mm_srli_epi64( tmpa, 16 ); |
| 444 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 445 |
| 446 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0
, 3, 2, 1 ) ); /* equal shift right 4 bytes */ |
| 447 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_0123, _MM_SHUFFLE(0, 3, 2,
1 ) ); /* equal shift right 4 bytes */ |
| 448 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); /*
1*-1, 3*-3 */ |
| 449 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); |
| 450 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); |
| 451 |
| 452 /* step 2 */ |
| 453 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -7 ]
) ); |
| 454 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); |
| 455 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_4567 ); |
| 456 tmpa = _mm_srli_epi64( tmpa, 16 ); |
| 457 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 458 |
| 459 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFLE( 0
, 3, 2, 1 ) ); /* equal shift right 4 bytes */ |
| 460 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_4567, _MM_SHUFFLE(0, 3, 2,
1 ) ); /* equal shift right 4 bytes */ |
| 461 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); |
| 462 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); |
| 463 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); |
| 464 |
| 465 if ( opus_likely( predictLPCOrder == 16 ) ) |
| 466 { |
| 467 /* step 3 */ |
| 468 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -
11 ] ) ); |
| 469 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); |
| 470 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_89AB )
; |
| 471 tmpa = _mm_srli_epi64( tmpa, 16 ); |
| 472 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 473 |
| 474 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFL
E( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ |
| 475 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_89AB, _MM_SHUFFLE(0, 3,
2, 1 ) );/* equal shift right 4 bytes */ |
| 476 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); |
| 477 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); |
| 478 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); |
| 479 |
| 480 /* setp 4 */ |
| 481 psLPC_Q14_tmp = _mm_loadu_si128( (__m128i *)(&psLPC_Q14[ -
15 ] ) ); |
| 482 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, 0x1B ); |
| 483 tmpa = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_CDEF )
; |
| 484 tmpa = _mm_srli_epi64( tmpa, 16 ); |
| 485 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 486 |
| 487 psLPC_Q14_tmp = _mm_shuffle_epi32( psLPC_Q14_tmp, _MM_SHUFFL
E( 0, 3, 2, 1 ) ); /* equal shift right 4 bytes */ |
| 488 a_Q12_tmp = _mm_shuffle_epi32( a_Q12_CDEF, _MM_SHUFFLE(0, 3,
2, 1 ) ); /* equal shift right 4 bytes */ |
| 489 psLPC_Q14_tmp = _mm_mul_epi32( psLPC_Q14_tmp, a_Q12_tmp ); |
| 490 psLPC_Q14_tmp = _mm_srli_epi64( psLPC_Q14_tmp, 16 ); |
| 491 tmpb = _mm_add_epi32( tmpb, psLPC_Q14_tmp ); |
| 492 |
| 493 /* add at last */ |
| 494 /* equal shift right 8 bytes*/ |
| 495 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0
, 3, 2 ) ); |
| 496 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 497 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); |
| 498 } |
| 499 else |
| 500 { |
| 501 /* add at last */ |
| 502 tmpa = _mm_shuffle_epi32( tmpb, _MM_SHUFFLE( 0, 0
, 3, 2 ) ); /* equal shift right 8 bytes*/ |
| 503 tmpb = _mm_add_epi32( tmpb, tmpa ); |
| 504 LPC_pred_Q14 += _mm_cvtsi128_si32( tmpb ); |
| 505 |
| 506 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -8 ], a
_Q12[ 8 ] ); |
| 507 LPC_pred_Q14 = silk_SMLAWB( LPC_pred_Q14, psLPC_Q14[ -9 ], a
_Q12[ 9 ] ); |
| 508 } |
| 509 |
| 510 LPC_pred_Q14 = silk_LSHIFT( LPC_pred_Q14, 4 ); /* Q10 -> Q14 */ |
| 511 |
| 512 /* Noise shape feedback */ |
| 513 silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that ord
er is even */ |
| 514 /* Output of lowpass section */ |
| 515 tmp2 = silk_SMLAWB( psLPC_Q14[ 0 ], psDD->sAR2_Q14[ 0 ], warping
_Q16 ); |
| 516 /* Output of allpass section */ |
| 517 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ 0 ], psDD->sAR2_Q14[ 1 ] - t
mp2, warping_Q16 ); |
| 518 psDD->sAR2_Q14[ 0 ] = tmp2; |
| 519 n_AR_Q14 = silk_RSHIFT( shapingLPCOrder, 1 ); |
| 520 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ 0 ] ); |
| 521 /* Loop over allpass sections */ |
| 522 for( j = 2; j < shapingLPCOrder; j += 2 ) { |
| 523 /* Output of allpass section */ |
| 524 tmp2 = silk_SMLAWB( psDD->sAR2_Q14[ j - 1 ], psDD->sAR2_Q14[
j + 0 ] - tmp1, warping_Q16 ); |
| 525 psDD->sAR2_Q14[ j - 1 ] = tmp1; |
| 526 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ j - 1 ]
); |
| 527 /* Output of allpass section */ |
| 528 tmp1 = silk_SMLAWB( psDD->sAR2_Q14[ j + 0 ], psDD->sAR2_Q14[
j + 1 ] - tmp2, warping_Q16 ); |
| 529 psDD->sAR2_Q14[ j + 0 ] = tmp2; |
| 530 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp2, AR_shp_Q13[ j ] ); |
| 531 } |
| 532 psDD->sAR2_Q14[ shapingLPCOrder - 1 ] = tmp1; |
| 533 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, tmp1, AR_shp_Q13[ shapingLPCOr
der - 1 ] ); |
| 534 |
| 535 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 1 );
/* Q11 -> Q12 */ |
| 536 n_AR_Q14 = silk_SMLAWB( n_AR_Q14, psDD->LF_AR_Q14, Tilt_Q14 );
/* Q12 */ |
| 537 n_AR_Q14 = silk_LSHIFT( n_AR_Q14, 2 );
/* Q12 -> Q14 */ |
| 538 |
| 539 n_LF_Q14 = silk_SMULWB( psDD->Shape_Q14[ *smpl_buf_idx ], LF_shp
_Q14 ); /* Q12 */ |
| 540 n_LF_Q14 = silk_SMLAWT( n_LF_Q14, psDD->LF_AR_Q14, LF_shp_Q14 );
/* Q12 */ |
| 541 n_LF_Q14 = silk_LSHIFT( n_LF_Q14, 2 );
/* Q12 -> Q14 */ |
| 542 |
| 543 /* Input minus prediction plus noise feedback
*/ |
| 544 /* r = x[ i ] - LTP_pred - LPC_pred + n_AR + n_Tilt + n_LF + n_L
TP */ |
| 545 tmp1 = silk_ADD32( n_AR_Q14, n_LF_Q14 );
/* Q14 */ |
| 546 tmp2 = silk_ADD32( n_LTP_Q14, LPC_pred_Q14 );
/* Q13 */ |
| 547 tmp1 = silk_SUB32( tmp2, tmp1 );
/* Q13 */ |
| 548 tmp1 = silk_RSHIFT_ROUND( tmp1, 4 );
/* Q10 */ |
| 549 |
| 550 r_Q10 = silk_SUB32( x_Q10[ i ], tmp1 );
/* residual error Q10 */ |
| 551 |
| 552 /* Flip sign depending on dither */ |
| 553 if ( psDD->Seed < 0 ) { |
| 554 r_Q10 = -r_Q10; |
| 555 } |
| 556 r_Q10 = silk_LIMIT_32( r_Q10, -(31 << 10), 30 << 10 ); |
| 557 |
| 558 /* Find two quantization level candidates and measure their rate
-distortion */ |
| 559 q1_Q10 = silk_SUB32( r_Q10, offset_Q10 ); |
| 560 q1_Q0 = silk_RSHIFT( q1_Q10, 10 ); |
| 561 if( q1_Q0 > 0 ) { |
| 562 q1_Q10 = silk_SUB32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_
ADJUST_Q10 ); |
| 563 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); |
| 564 q2_Q10 = silk_ADD32( q1_Q10, 1024 ); |
| 565 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); |
| 566 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); |
| 567 } else if( q1_Q0 == 0 ) { |
| 568 q1_Q10 = offset_Q10; |
| 569 q2_Q10 = silk_ADD32( q1_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10
); |
| 570 rd1_Q10 = silk_SMULBB( q1_Q10, Lambda_Q10 ); |
| 571 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); |
| 572 } else if( q1_Q0 == -1 ) { |
| 573 q2_Q10 = offset_Q10; |
| 574 q1_Q10 = silk_SUB32( q2_Q10, 1024 - QUANT_LEVEL_ADJUST_Q10
); |
| 575 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); |
| 576 rd2_Q10 = silk_SMULBB( q2_Q10, Lambda_Q10 ); |
| 577 } else { /* q1_Q0 < -1 */ |
| 578 q1_Q10 = silk_ADD32( silk_LSHIFT( q1_Q0, 10 ), QUANT_LEVEL_
ADJUST_Q10 ); |
| 579 q1_Q10 = silk_ADD32( q1_Q10, offset_Q10 ); |
| 580 q2_Q10 = silk_ADD32( q1_Q10, 1024 ); |
| 581 rd1_Q10 = silk_SMULBB( -q1_Q10, Lambda_Q10 ); |
| 582 rd2_Q10 = silk_SMULBB( -q2_Q10, Lambda_Q10 ); |
| 583 } |
| 584 rr_Q10 = silk_SUB32( r_Q10, q1_Q10 ); |
| 585 rd1_Q10 = silk_RSHIFT( silk_SMLABB( rd1_Q10, rr_Q10, rr_Q10 ), 1
0 ); |
| 586 rr_Q10 = silk_SUB32( r_Q10, q2_Q10 ); |
| 587 rd2_Q10 = silk_RSHIFT( silk_SMLABB( rd2_Q10, rr_Q10, rr_Q10 ), 1
0 ); |
| 588 |
| 589 if( rd1_Q10 < rd2_Q10 ) { |
| 590 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); |
| 591 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); |
| 592 psSS[ 0 ].Q_Q10 = q1_Q10; |
| 593 psSS[ 1 ].Q_Q10 = q2_Q10; |
| 594 } else { |
| 595 psSS[ 0 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd2_Q10 ); |
| 596 psSS[ 1 ].RD_Q10 = silk_ADD32( psDD->RD_Q10, rd1_Q10 ); |
| 597 psSS[ 0 ].Q_Q10 = q2_Q10; |
| 598 psSS[ 1 ].Q_Q10 = q1_Q10; |
| 599 } |
| 600 |
| 601 /* Update states for best quantization */ |
| 602 |
| 603 /* Quantized excitation */ |
| 604 exc_Q14 = silk_LSHIFT32( psSS[ 0 ].Q_Q10, 4 ); |
| 605 if ( psDD->Seed < 0 ) { |
| 606 exc_Q14 = -exc_Q14; |
| 607 } |
| 608 |
| 609 /* Add predictions */ |
| 610 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); |
| 611 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); |
| 612 |
| 613 /* Update states */ |
| 614 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); |
| 615 psSS[ 0 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); |
| 616 psSS[ 0 ].LF_AR_Q14 = sLF_AR_shp_Q14; |
| 617 psSS[ 0 ].LPC_exc_Q14 = LPC_exc_Q14; |
| 618 psSS[ 0 ].xq_Q14 = xq_Q14; |
| 619 |
| 620 /* Update states for second best quantization */ |
| 621 |
| 622 /* Quantized excitation */ |
| 623 exc_Q14 = silk_LSHIFT32( psSS[ 1 ].Q_Q10, 4 ); |
| 624 if ( psDD->Seed < 0 ) { |
| 625 exc_Q14 = -exc_Q14; |
| 626 } |
| 627 |
| 628 |
| 629 /* Add predictions */ |
| 630 LPC_exc_Q14 = silk_ADD32( exc_Q14, LTP_pred_Q14 ); |
| 631 xq_Q14 = silk_ADD32( LPC_exc_Q14, LPC_pred_Q14 ); |
| 632 |
| 633 /* Update states */ |
| 634 sLF_AR_shp_Q14 = silk_SUB32( xq_Q14, n_AR_Q14 ); |
| 635 psSS[ 1 ].sLTP_shp_Q14 = silk_SUB32( sLF_AR_shp_Q14, n_LF_Q14 ); |
| 636 psSS[ 1 ].LF_AR_Q14 = sLF_AR_shp_Q14; |
| 637 psSS[ 1 ].LPC_exc_Q14 = LPC_exc_Q14; |
| 638 psSS[ 1 ].xq_Q14 = xq_Q14; |
| 639 } |
| 640 } |
| 641 *smpl_buf_idx = ( *smpl_buf_idx - 1 ) & DECISION_DELAY_MASK;
/* Index to newest samples */ |
| 642 last_smple_idx = ( *smpl_buf_idx + decisionDelay ) & DECISION_DELAY_MASK
; /* Index to decisionDelay old samples */ |
| 643 |
| 644 /* Find winner */ |
| 645 RDmin_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; |
| 646 Winner_ind = 0; |
| 647 for( k = 1; k < nStatesDelayedDecision; k++ ) { |
| 648 if( psSampleState[ k ][ 0 ].RD_Q10 < RDmin_Q10 ) { |
| 649 RDmin_Q10 = psSampleState[ k ][ 0 ].RD_Q10; |
| 650 Winner_ind = k; |
| 651 } |
| 652 } |
| 653 |
| 654 /* Increase RD values of expired states */ |
| 655 Winner_rand_state = psDelDec[ Winner_ind ].RandState[ last_smple_idx ]; |
| 656 for( k = 0; k < nStatesDelayedDecision; k++ ) { |
| 657 if( psDelDec[ k ].RandState[ last_smple_idx ] != Winner_rand_state )
{ |
| 658 psSampleState[ k ][ 0 ].RD_Q10 = silk_ADD32( psSampleState[ k ][
0 ].RD_Q10, silk_int32_MAX >> 4 ); |
| 659 psSampleState[ k ][ 1 ].RD_Q10 = silk_ADD32( psSampleState[ k ][
1 ].RD_Q10, silk_int32_MAX >> 4 ); |
| 660 silk_assert( psSampleState[ k ][ 0 ].RD_Q10 >= 0 ); |
| 661 } |
| 662 } |
| 663 |
| 664 /* Find worst in first set and best in second set */ |
| 665 RDmax_Q10 = psSampleState[ 0 ][ 0 ].RD_Q10; |
| 666 RDmin_Q10 = psSampleState[ 0 ][ 1 ].RD_Q10; |
| 667 RDmax_ind = 0; |
| 668 RDmin_ind = 0; |
| 669 for( k = 1; k < nStatesDelayedDecision; k++ ) { |
| 670 /* find worst in first set */ |
| 671 if( psSampleState[ k ][ 0 ].RD_Q10 > RDmax_Q10 ) { |
| 672 RDmax_Q10 = psSampleState[ k ][ 0 ].RD_Q10; |
| 673 RDmax_ind = k; |
| 674 } |
| 675 /* find best in second set */ |
| 676 if( psSampleState[ k ][ 1 ].RD_Q10 < RDmin_Q10 ) { |
| 677 RDmin_Q10 = psSampleState[ k ][ 1 ].RD_Q10; |
| 678 RDmin_ind = k; |
| 679 } |
| 680 } |
| 681 |
| 682 /* Replace a state if best from second set outperforms worst in first se
t */ |
| 683 if( RDmin_Q10 < RDmax_Q10 ) { |
| 684 silk_memcpy( ( (opus_int32 *)&psDelDec[ RDmax_ind ] ) + i, |
| 685 ( (opus_int32 *)&psDelDec[ RDmin_ind ] ) + i, sizeof( N
SQ_del_dec_struct ) - i * sizeof( opus_int32) ); |
| 686 silk_memcpy( &psSampleState[ RDmax_ind ][ 0 ], &psSampleState[ RDmin
_ind ][ 1 ], sizeof( NSQ_sample_struct ) ); |
| 687 } |
| 688 |
| 689 /* Write samples from winner to output and long-term filter states */ |
| 690 psDD = &psDelDec[ Winner_ind ]; |
| 691 if( subfr > 0 || i >= decisionDelay ) { |
| 692 pulses[ i - decisionDelay ] = (opus_int8)silk_RSHIFT_ROUND( psDD->Q
_Q10[ last_smple_idx ], 10 ); |
| 693 xq[ i - decisionDelay ] = (opus_int16)silk_SAT16( silk_RSHIFT_ROUND( |
| 694 silk_SMULWW( psDD->Xq_Q14[ last_smple_idx ], delayedGain_Q10[ la
st_smple_idx ] ), 8 ) ); |
| 695 NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - decisionDelay ] = psDD->S
hape_Q14[ last_smple_idx ]; |
| 696 sLTP_Q15[ NSQ->sLTP_buf_idx - decisionDelay ] = psDD->P
red_Q15[ last_smple_idx ]; |
| 697 } |
| 698 NSQ->sLTP_shp_buf_idx++; |
| 699 NSQ->sLTP_buf_idx++; |
| 700 |
| 701 /* Update states */ |
| 702 for( k = 0; k < nStatesDelayedDecision; k++ ) { |
| 703 psDD = &psDelDec[ k ]; |
| 704 psSS = &psSampleState[ k ][ 0 ]; |
| 705 psDD->LF_AR_Q14 = psSS->LF_AR_Q14; |
| 706 psDD->sLPC_Q14[ NSQ_LPC_BUF_LENGTH + i ] = psSS->xq_Q14; |
| 707 psDD->Xq_Q14[ *smpl_buf_idx ] = psSS->xq_Q14; |
| 708 psDD->Q_Q10[ *smpl_buf_idx ] = psSS->Q_Q10; |
| 709 psDD->Pred_Q15[ *smpl_buf_idx ] = silk_LSHIFT32( psSS->LPC_
exc_Q14, 1 ); |
| 710 psDD->Shape_Q14[ *smpl_buf_idx ] = psSS->sLTP_shp_Q14; |
| 711 psDD->Seed = silk_ADD32_ovflw( psDD->S
eed, silk_RSHIFT_ROUND( psSS->Q_Q10, 10 ) ); |
| 712 psDD->RandState[ *smpl_buf_idx ] = psDD->Seed; |
| 713 psDD->RD_Q10 = psSS->RD_Q10; |
| 714 } |
| 715 delayedGain_Q10[ *smpl_buf_idx ] = Gain_Q10; |
| 716 } |
| 717 /* Update LPC states */ |
| 718 for( k = 0; k < nStatesDelayedDecision; k++ ) { |
| 719 psDD = &psDelDec[ k ]; |
| 720 silk_memcpy( psDD->sLPC_Q14, &psDD->sLPC_Q14[ length ], NSQ_LPC_BUF_LENG
TH * sizeof( opus_int32 ) ); |
| 721 } |
| 722 RESTORE_STACK; |
| 723 } |
| 724 |
| 725 static OPUS_INLINE void silk_nsq_del_dec_scale_states_sse4_1( |
| 726 const silk_encoder_state *psEncC, /* I Encoder State
*/ |
| 727 silk_nsq_state *NSQ, /* I/O NSQ state
*/ |
| 728 NSQ_del_dec_struct psDelDec[], /* I/O Delayed decision sta
tes */ |
| 729 const opus_int32 x_Q3[], /* I Input in Q3
*/ |
| 730 opus_int32 x_sc_Q10[], /* O Input scaled with 1/
Gain in Q10 */ |
| 731 const opus_int16 sLTP[], /* I Re-whitened LTP stat
e in Q0 */ |
| 732 opus_int32 sLTP_Q15[], /* O LTP state matching s
caled input */ |
| 733 opus_int subfr, /* I Subframe number
*/ |
| 734 opus_int nStatesDelayedDecision, /* I Number of del dec st
ates */ |
| 735 const opus_int LTP_scale_Q14, /* I LTP state scaling
*/ |
| 736 const opus_int32 Gains_Q16[ MAX_NB_SUBFR ], /* I
*/ |
| 737 const opus_int pitchL[ MAX_NB_SUBFR ], /* I Pitch lag
*/ |
| 738 const opus_int signal_type, /* I Signal type
*/ |
| 739 const opus_int decisionDelay /* I Decision delay
*/ |
| 740 ) |
| 741 { |
| 742 opus_int i, k, lag; |
| 743 opus_int32 gain_adj_Q16, inv_gain_Q31, inv_gain_Q23; |
| 744 NSQ_del_dec_struct *psDD; |
| 745 __m128i xmm_inv_gain_Q23, xmm_x_Q3_x2x0, xmm_x_Q3_x3x1; |
| 746 |
| 747 lag = pitchL[ subfr ]; |
| 748 inv_gain_Q31 = silk_INVERSE32_varQ( silk_max( Gains_Q16[ subfr ], 1 ), 47 ); |
| 749 |
| 750 silk_assert( inv_gain_Q31 != 0 ); |
| 751 |
| 752 /* Calculate gain adjustment factor */ |
| 753 if( Gains_Q16[ subfr ] != NSQ->prev_gain_Q16 ) { |
| 754 gain_adj_Q16 = silk_DIV32_varQ( NSQ->prev_gain_Q16, Gains_Q16[ subfr ],
16 ); |
| 755 } else { |
| 756 gain_adj_Q16 = (opus_int32)1 << 16; |
| 757 } |
| 758 |
| 759 /* Scale input */ |
| 760 inv_gain_Q23 = silk_RSHIFT_ROUND( inv_gain_Q31, 8 ); |
| 761 |
| 762 /* prepare inv_gain_Q23 in packed 4 32-bits */ |
| 763 xmm_inv_gain_Q23 = _mm_set1_epi32(inv_gain_Q23); |
| 764 |
| 765 for( i = 0; i < psEncC->subfr_length - 3; i += 4 ) { |
| 766 xmm_x_Q3_x2x0 = _mm_loadu_si128( (__m128i *)(&(x_Q3[ i ] ) ) ); |
| 767 /* equal shift right 4 bytes*/ |
| 768 xmm_x_Q3_x3x1 = _mm_shuffle_epi32( xmm_x_Q3_x2x0, _MM_SHUFFLE( 0, 3, 2,
1 ) ); |
| 769 |
| 770 xmm_x_Q3_x2x0 = _mm_mul_epi32( xmm_x_Q3_x2x0, xmm_inv_gain_Q23 ); |
| 771 xmm_x_Q3_x3x1 = _mm_mul_epi32( xmm_x_Q3_x3x1, xmm_inv_gain_Q23 ); |
| 772 |
| 773 xmm_x_Q3_x2x0 = _mm_srli_epi64( xmm_x_Q3_x2x0, 16 ); |
| 774 xmm_x_Q3_x3x1 = _mm_slli_epi64( xmm_x_Q3_x3x1, 16 ); |
| 775 |
| 776 xmm_x_Q3_x2x0 = _mm_blend_epi16( xmm_x_Q3_x2x0, xmm_x_Q3_x3x1, 0xCC ); |
| 777 |
| 778 _mm_storeu_si128( (__m128i *)(&(x_sc_Q10[ i ])), xmm_x_Q3_x2x0 ); |
| 779 } |
| 780 |
| 781 for( ; i < psEncC->subfr_length; i++ ) { |
| 782 x_sc_Q10[ i ] = silk_SMULWW( x_Q3[ i ], inv_gain_Q23 ); |
| 783 } |
| 784 |
| 785 /* Save inverse gain */ |
| 786 NSQ->prev_gain_Q16 = Gains_Q16[ subfr ]; |
| 787 |
| 788 /* After rewhitening the LTP state is un-scaled, so scale with inv_gain_Q16
*/ |
| 789 if( NSQ->rewhite_flag ) { |
| 790 if( subfr == 0 ) { |
| 791 /* Do LTP downscaling */ |
| 792 inv_gain_Q31 = silk_LSHIFT( silk_SMULWB( inv_gain_Q31, LTP_scale_Q14
), 2 ); |
| 793 } |
| 794 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_buf_idx;
i++ ) { |
| 795 silk_assert( i < MAX_FRAME_LENGTH ); |
| 796 sLTP_Q15[ i ] = silk_SMULWB( inv_gain_Q31, sLTP[ i ] ); |
| 797 } |
| 798 } |
| 799 |
| 800 /* Adjust for changing gain */ |
| 801 if( gain_adj_Q16 != (opus_int32)1 << 16 ) { |
| 802 /* Scale long-term shaping state */ |
| 803 { |
| 804 __m128i xmm_gain_adj_Q16, xmm_sLTP_shp_Q14_x2x0, xmm_sLTP_shp_Q14_x3
x1; |
| 805 |
| 806 /* prepare gain_adj_Q16 in packed 4 32-bits */ |
| 807 xmm_gain_adj_Q16 = _mm_set1_epi32( gain_adj_Q16 ); |
| 808 |
| 809 for( i = NSQ->sLTP_shp_buf_idx - psEncC->ltp_mem_length; i < NSQ->sL
TP_shp_buf_idx - 3; i += 4 ) |
| 810 { |
| 811 xmm_sLTP_shp_Q14_x2x0 = _mm_loadu_si128( (__m128i *)(&(NSQ->sLTP
_shp_Q14[ i ] ) ) ); |
| 812 /* equal shift right 4 bytes*/ |
| 813 xmm_sLTP_shp_Q14_x3x1 = _mm_shuffle_epi32( xmm_sLTP_shp_Q14_x2x0
, _MM_SHUFFLE( 0, 3, 2, 1 ) ); |
| 814 |
| 815 xmm_sLTP_shp_Q14_x2x0 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x2x0, xm
m_gain_adj_Q16 ); |
| 816 xmm_sLTP_shp_Q14_x3x1 = _mm_mul_epi32( xmm_sLTP_shp_Q14_x3x1, xm
m_gain_adj_Q16 ); |
| 817 |
| 818 xmm_sLTP_shp_Q14_x2x0 = _mm_srli_epi64( xmm_sLTP_shp_Q14_x2x0, 1
6 ); |
| 819 xmm_sLTP_shp_Q14_x3x1 = _mm_slli_epi64( xmm_sLTP_shp_Q14_x3x1, 1
6 ); |
| 820 |
| 821 xmm_sLTP_shp_Q14_x2x0 = _mm_blend_epi16( xmm_sLTP_shp_Q14_x2x0,
xmm_sLTP_shp_Q14_x3x1, 0xCC ); |
| 822 |
| 823 _mm_storeu_si128( (__m128i *)(&(NSQ->sLTP_shp_Q14[ i ] ) ), xmm_
sLTP_shp_Q14_x2x0 ); |
| 824 } |
| 825 |
| 826 for( ; i < NSQ->sLTP_shp_buf_idx; i++ ) { |
| 827 NSQ->sLTP_shp_Q14[ i ] = silk_SMULWW( gain_adj_Q16, NSQ->sLTP_sh
p_Q14[ i ] ); |
| 828 } |
| 829 |
| 830 /* Scale long-term prediction state */ |
| 831 if( signal_type == TYPE_VOICED && NSQ->rewhite_flag == 0 ) { |
| 832 for( i = NSQ->sLTP_buf_idx - lag - LTP_ORDER / 2; i < NSQ->sLTP_
buf_idx - decisionDelay; i++ ) { |
| 833 sLTP_Q15[ i ] = silk_SMULWW( gain_adj_Q16, sLTP_Q15[ i ] ); |
| 834 } |
| 835 } |
| 836 |
| 837 for( k = 0; k < nStatesDelayedDecision; k++ ) { |
| 838 psDD = &psDelDec[ k ]; |
| 839 |
| 840 /* Scale scalar states */ |
| 841 psDD->LF_AR_Q14 = silk_SMULWW( gain_adj_Q16, psDD->LF_AR_Q14 ); |
| 842 |
| 843 /* Scale short-term prediction and shaping states */ |
| 844 for( i = 0; i < NSQ_LPC_BUF_LENGTH; i++ ) { |
| 845 psDD->sLPC_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sLPC_
Q14[ i ] ); |
| 846 } |
| 847 for( i = 0; i < MAX_SHAPE_LPC_ORDER; i++ ) { |
| 848 psDD->sAR2_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->sAR2_
Q14[ i ] ); |
| 849 } |
| 850 for( i = 0; i < DECISION_DELAY; i++ ) { |
| 851 psDD->Pred_Q15[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Pred
_Q15[ i ] ); |
| 852 psDD->Shape_Q14[ i ] = silk_SMULWW( gain_adj_Q16, psDD->Shap
e_Q14[ i ] ); |
| 853 } |
| 854 } |
| 855 } |
| 856 } |
| 857 } |
| OLD | NEW |